# Example 1
def get_embeddings(melspecs: dict[str, np.ndarray], architectures: dict, predictors: dict) -> Optional[dict]:
    """Run every (dataset, architecture) predictor over its mel-spectrogram.

    Args:
        melspecs: mel-spectrograms keyed by essentia algorithm name.
        architectures: per-architecture metadata ('essentia-algorithm',
            'datasets', 'layers').
        predictors: callables keyed by '{dataset}-{architecture}'.

    Returns:
        Dict of 2-d embedding arrays keyed by '{dataset}-{architecture}-{layer}',
        or None if any prediction fails or yields an empty output.
    """
    data = {}
    for architecture, metadata in architectures.items():
        input_pool = Pool()
        input_pool.set('model/Placeholder', melspecs[metadata['essentia-algorithm']])

        for dataset in metadata['datasets']:
            # TODO: chunk the input melspecs to avoid OOM error
            try:
                output_pool = predictors[f'{dataset}-{architecture}'](input_pool)
            except RuntimeError:
                # Inference failure (e.g. TensorFlow OOM) surfaces as RuntimeError.
                return None

            for layer, layer_data in metadata['layers'].items():
                embeddings = output_pool[layer_data['name']].squeeze()

                # .size is safe on 0-d arrays, unlike len() which raises TypeError
                # when squeeze() collapses the output to a scalar.
                if embeddings.size == 0:
                    return None

                # Guarantee a 2-d (num_patches, embedding_dim) shape; atleast_2d
                # also promotes the 0-d scalar case that expand_dims missed.
                if embeddings.ndim < 2:
                    embeddings = np.atleast_2d(embeddings)

                data[f'{dataset}-{architecture}-{layer}'] = embeddings

    return data
    # NOTE(review): orphaned snippet — `opt`, `pool`, `INFO`, `segmentation`,
    # `computeLowLevel`, `computeMidLevel`, `highlevel`, `segments_namespace`,
    # `input_file` and `numpy` are all undefined in this file; presumably pasted
    # from an essentia extractor script. `xrange` implies Python 2.
    if opt.segmentation:
        INFO('Process step 2: Low Level')
        # Whole-file low-level pass, then segment-boundary detection.
        computeLowLevel(input_file, pool, startTime, endTime)
        segmentation.compute(input_file, pool, startTime, endTime)
        segments = pool['segmentation.timestamps']
        # Process each [segments[i], segments[i+1]) span under its own namespace.
        for i in xrange(len(segments)-1):
            startTime = segments[i]
            endTime = segments[i+1]

            INFO('**************************************************************************')
            INFO('Segment ' + str(i) + ': processing audio from ' + str(startTime) + 's to ' + str(endTime) + 's')
            INFO('**************************************************************************')

            # set segment name:
            segment_name = 'segment_'+ str(i)
            pool.set('segments.'+segment_name+'.name', segment_name)
            # set segment scope:
            pool.set('segments.'+segment_name+'.scope', numpy.array([startTime, endTime]))
            # compute descriptors:
            namespace = 'segments.'+segment_name+'.descriptors'
            segments_namespace.append(namespace)
            INFO('\tProcess step 2: Low Level')
            computeLowLevel(input_file, pool, startTime, endTime, namespace)
            INFO('\tProcess step 3: Mid Level')
            computeMidLevel(input_file, pool, startTime, endTime, namespace)
            INFO('\tProcess step 4: High Level')
            highlevel.compute(pool, namespace)

        # compute the rest of the descriptors for the entire audio. LowLevel
        # descriptors were already computed during segmentation
        # NOTE(review): this assignment is immediately overwritten inside the
        # loop below, which itself duplicates the loop above (same segments,
        # same namespaces appended twice) — looks like a merge/paste artifact;
        # confirm intent before deduplicating.
        startTime = float(opt.startTime)
        for i in xrange(len(segments) - 1):
            startTime = segments[i]
            endTime = segments[i + 1]

            INFO(
                '**************************************************************************'
            )
            INFO('Segment ' + str(i) + ': processing audio from ' +
                 str(startTime) + 's to ' + str(endTime) + 's')
            INFO(
                '**************************************************************************'
            )

            # set segment name:
            segment_name = 'segment_' + str(i)
            pool.set('segments.' + segment_name + '.name', segment_name)
            # set segment scope:
            pool.set('segments.' + segment_name + '.scope',
                     numpy.array([startTime, endTime]))
            # compute descriptors:
            namespace = 'segments.' + segment_name + '.descriptors'
            segments_namespace.append(namespace)
            INFO('\tProcess step 2: Low Level')
            computeLowLevel(input_file, pool, startTime, endTime, namespace)
            INFO('\tProcess step 3: Mid Level')
            computeMidLevel(input_file, pool, startTime, endTime, namespace)
            INFO('\tProcess step 4: High Level')
            highlevel.compute(pool, namespace)

        # compute the rest of the descriptors for the entire audio. LowLevel
        # descriptors were already computed during segmentation
# Example 4
class ModelsWrapper:
    """Wrapper around an essentia TensorFlow audio model (MusiCNN or VGGish).

    Bundles the architecture-specific feature extractor and framing
    parameters with a lazily created TensorflowPredict instance, and
    converts extracted feature frames into the zero-padded batch tensor
    the model expects.
    """

    def __init__(self, arch):
        """Configure feature extraction for *arch* ('musicnn' or 'vggish').

        Raises:
            ValueError: if *arch* is not a supported architecture name.
        """
        # Fail fast: the original silently produced a half-initialised object
        # (no feature_extractor attribute) for unknown architecture names.
        if arch not in ('musicnn', 'vggish'):
            raise ValueError(f'unsupported architecture: {arch!r}')
        # NOTE: the (misspelled) attribute name is kept as-is because it is
        # part of the public interface existing callers may rely on.
        self.architechture = arch
        self.in_layer = None
        self.out_layer = None
        if arch == 'musicnn':
            self.feature_extractor = es.TensorflowInputMusiCNN()
            self.frame_size = 512
            self.hop_size = 256
            self.patch_size = 187
            self.num_bands = 96
        else:  # arch == 'vggish'
            self.feature_extractor = es.TensorflowInputVGGish()
            self.frame_size = 400
            self.hop_size = 200
            self.patch_size = 96
            self.num_bands = 64
        self.feature_frames = []
        self.in_pool = Pool()
        self.out_pool = Pool()
        # The predictor is created lazily by load_model().
        self.predict = None

    def load_model(self, model_path, in_layer, out_layer):
        """Create the TensorflowPredict instance once; later calls are no-ops."""
        if not self.predict:
            self.predict = es.TensorflowPredict(graphFilename=model_path,
                                                inputs=[in_layer],
                                                outputs=[out_layer],
                                                squeeze=True)
            self.in_layer = in_layer
            self.out_layer = out_layer

    def compute_features(self, audio):
        """Extract per-frame model input features from *audio*.

        Returns the list of feature frames, which is also cached on the
        instance for make_prediction().
        """
        # Rebuild from scratch so repeated calls don't accumulate frames.
        self.feature_frames = [
            self.feature_extractor(frame)
            for frame in es.FrameGenerator(audio,
                                           frameSize=self.frame_size,
                                           hopSize=self.hop_size,
                                           startFromZero=True)
        ]
        return self.feature_frames

    def make_prediction(self):
        """Run the loaded model over the cached feature frames.

        load_model() and compute_features() must have been called first.
        Returns the tensor stored under the configured output layer.
        """
        self._featuresToTensorAsBatch()
        self.out_pool.clear()
        self.out_pool = self.predict(self.in_pool)
        return self.out_pool[self.out_layer]

    def _featuresToTensorAsBatch(self):
        """Reshape cached frames into a (batch, 1, patch_size, num_bands) tensor.

        Zero-pads the final patch only when the frame count is not an exact
        multiple of patch_size. (The original unconditionally appended
        `patch_size - count % patch_size` zero rows, producing a spurious
        all-zero patch whenever the count was an exact multiple.)
        """
        # NOTE(review): assumes at least one feature frame exists; an empty
        # list would break the reshape below — confirm callers guarantee it.
        feature_frames_as_np = np.array(self.feature_frames, dtype=np.single)
        incomplete_patch_size = feature_frames_as_np.shape[0] % self.patch_size

        # (patch_size - r) % patch_size is 0 when r == 0, fixing the
        # exact-multiple case.
        zero_frame_size = (self.patch_size - incomplete_patch_size) % self.patch_size
        if zero_frame_size:
            zero_frames = np.zeros((zero_frame_size, self.num_bands),
                                   dtype=np.single)
            zero_padded_features = np.append(feature_frames_as_np,
                                             zero_frames,
                                             axis=0)
        else:
            zero_padded_features = feature_frames_as_np
        batch = np.expand_dims(
            np.reshape(zero_padded_features,
                       [-1, self.patch_size, self.num_bands]), 1)
        self.in_pool.set(self.in_layer, batch)

    def dispose(self):
        """Release the model and layer bindings so memory can be reclaimed."""
        self.predict = None
        self.in_layer = None
        # Also drop the output binding, for symmetry with in_layer.
        self.out_layer = None
    # NOTE(review): second orphaned Python 2 snippet (xrange), a variant of the
    # fragment above that maintains two pools — an equal-loudness-filtered one
    # (eqPool) and an unfiltered one (neqPool). All referenced names (opt,
    # INFO, computeLowLevel, computeMidLevel, highlevel, segmentation,
    # segments_namespace, input_file, numpy) are undefined in this file.
    # The fragment appears truncated at the end of the visible source.
    if opt.segmentation:
        INFO("Process step 2: Low Level")
        computeLowLevel(input_file, neqPool, eqPool, startTime, endTime)
        # Segment boundaries are detected on the equal-loudness pool only.
        segmentation.compute(input_file, eqPool, startTime, endTime)
        segments = eqPool["segmentation.timestamps"]
        for i in xrange(len(segments) - 1):
            startTime = segments[i]
            endTime = segments[i + 1]

            INFO("**************************************************************************")
            INFO("Segment " + str(i) + ": processing audio from " + str(startTime) + "s to " + str(endTime) + "s")
            INFO("**************************************************************************")

            # set segment name:
            segment_name = "segment_" + str(i)
            neqPool.set("segments." + segment_name + ".name", segment_name)
            eqPool.set("segments." + segment_name + ".name", segment_name)
            # set segment scope:
            neqPool.set("segments." + segment_name + ".scope", numpy.array([startTime, endTime]))
            eqPool.set("segments." + segment_name + ".scope", numpy.array([startTime, endTime]))
            # compute descriptors:
            namespace = "segments." + segment_name + ".descriptors"
            segments_namespace.append(namespace)
            INFO("\tProcess step 2: Low Level")
            computeLowLevel(input_file, neqPool, eqPool, startTime, endTime, namespace)
            INFO("\tProcess step 3: Mid Level")
            computeMidLevel(input_file, neqPool, eqPool, startTime, endTime, namespace)
            INFO("\tProcess step 4: High Level")
            # High-level descriptors are computed on both pools.
            highlevel.compute(eqPool, namespace)
            highlevel.compute(neqPool, namespace)