Example #1
def _calculate_similarity(sim, runner):
    dm = sim.dm
    ord = sim.ord

    assert dm.task is None or dm.task.is_completed(), \
        'Cannot calculate similarity because previous error occurred when extracting features'
    assert ord is None or ord.task is None or ord.task.is_completed(),\
        'Cannot calculate similarity because previous error occurred when constructing ordination'

    if ord:
        sids_path = ord.get_sids_path()
        source_bytes_path = ord.get_bytes_path()
    else:
        sids_path = dm.get_sids_path()
        source_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(sids_path, np.int32)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))
    coordinates[np.where(np.logical_not(np.isfinite(coordinates)))] = 0

    runner.start()
    tree = linkage(coordinates, method='average')

    order = natural_order(tree)
    sorted_order = np.argsort(order).astype(np.int32)

    runner.wrapping_up()

    sim_sids_path = sim.get_sids_path()
    sim_bytes_path = sim.get_bytes_path()

    ndarray_to_bytes(sorted_order, sim_bytes_path)
    ndarray_to_bytes(sids, sim_sids_path)
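
A hedged aside, not part of the koe source: natural_order above is a project helper, but scipy's own leaves_list produces a comparable leaf ordering from the same linkage tree, which is enough to reproduce this example's ordering step in isolation.

# Sketch only (assumption: leaves_list stands in for the project's natural_order helper).
import numpy as np
from scipy.cluster.hierarchy import linkage, leaves_list

coordinates = np.random.rand(10, 3).astype(np.float32)  # toy stand-in for the loaded matrix
tree = linkage(coordinates, method='average')            # same linkage call as above
leaf_order = leaves_list(tree)                            # leaf indices in dendrogram order
sorted_order = np.argsort(leaf_order).astype(np.int32)   # rank of each row in that order
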
Example #2
    def test_bytes_to_ndarray(self):
        django.setup()
        from koe.ts_utils import bytes_to_ndarray, ndarray_to_bytes

        arr = np.random.rand(100, 200).astype(np.float32)
        filename = '/tmp/{}.bytes'.format(uuid4().hex)

        ndarray_to_bytes(arr, filename)
        arr_ = bytes_to_ndarray(filename).reshape((100, 200))

        os.remove(filename)

        self.assertTrue(np.allclose(arr, arr_))
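
The round-trip test above pins down the contract of the two helpers without showing their bodies: the writer dumps the raw float32 buffer and the reader returns a flat array that the caller reshapes. A minimal stand-in under that assumption (not the actual koe.ts_utils implementation):

import numpy as np

def ndarray_to_bytes_sketch(arr, filename):
    # write the raw buffer only; the shape is the caller's responsibility
    arr.astype(np.float32).tofile(filename)

def bytes_to_ndarray_sketch(filename, dtype=np.float32):
    # read back a flat 1-D array of the requested dtype
    return np.fromfile(filename, dtype=dtype)
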
Example #3
def construct_ordination(task_id):
    task = get_or_wait(task_id)
    runner = TaskRunner(task)
    try:
        runner.preparing()

        cls, ord_id = task.target.split(':')
        ord_id = int(ord_id)
        assert cls == Ordination.__name__
        ord = Ordination.objects.get(id=ord_id)

        dm = ord.dm
        method_name = ord.method
        ndims = ord.ndims
        param_kwargs = Ordination.params_to_kwargs(ord.params)

        assert dm.task is None or dm.task.is_completed()
        assert method_name in methods.keys(), 'Unknown method {}'.format(
            method_name)
        assert 2 <= ndims <= 3, 'Only support 2 or 3 dimensional ordination'

        runner.start()
        dm_sids_path = dm.get_sids_path()
        dm_bytes_path = dm.get_bytes_path()

        sids = bytes_to_ndarray(dm_sids_path, np.int32)
        dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))

        data = zscore(dm_data)
        data[np.where(np.isnan(data))] = 0
        data[np.where(np.isinf(data))] = 0

        method = methods[method_name]
        result = method(data, ndims, **param_kwargs)

        runner.wrapping_up()

        ord_sids_path = ord.get_sids_path()
        ord_bytes_path = ord.get_bytes_path()

        ndarray_to_bytes(result, ord_bytes_path)
        ndarray_to_bytes(sids, ord_sids_path)

        runner.complete()
    except Exception as e:
        runner.error(e)
Example #4
def create_full_tensor(database, recreate):
    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.all().order_by('id')
    features_hash = '-'.join(
        list(map(str, features.values_list('id', flat=True))))
    aggregations_hash = '-'.join(
        list(map(str, aggregations.values_list('id', flat=True))))
    aggregators = [aggregator_map[x.name] for x in aggregations]

    full_tensor = FullTensorData.objects.filter(
        database=database,
        features_hash=features_hash,
        aggregations_hash=aggregations_hash).first()

    if full_tensor and not recreate:
        print(
            'Full tensor {} already exists. If you want to recreate, turn on flag --recreate'
            .format(full_tensor.name))
        return full_tensor, False

    if full_tensor is None:
        full_tensors_name = uuid.uuid4().hex
        full_tensor = FullTensorData(name=full_tensors_name,
                                     database=database,
                                     features_hash=features_hash,
                                     aggregations_hash=aggregations_hash)

    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()
    full_cols_path = full_tensor.get_cols_path()

    sids, tids = get_sids_tids(database)
    f2bs, fa2bs = get_binstorage_locations(features, aggregators)
    data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features, aggregators)

    ndarray_to_bytes(data, full_bytes_path)
    ndarray_to_bytes(sids, full_sids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)

    full_tensor.save()
    return full_tensor, True
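
Each matrix in these examples is persisted as sibling files: the value bytes, the sids, and a JSON "cols" file mapping each feature name to its [start, end) column range (see col_inds = {'s2s_autoencoded': [0, ndims]} in later examples). A hedged sketch of reading one feature's columns back out under that assumption:

# Sketch only; assumes the bytes file is a flat float32 dump with one row per sid.
import json
import numpy as np

def load_feature_columns_sketch(bytes_path, cols_path, nrows, feature_name):
    data = np.fromfile(bytes_path, dtype=np.float32).reshape((nrows, -1))
    with open(cols_path, 'r', encoding='utf-8') as f:
        col_inds = json.load(f)
    start, end = col_inds[feature_name]
    return data[:, start:end]
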
Example #5
def calculate_similarity(task_id):
    task = get_or_wait(task_id)
    runner = TaskRunner(task)
    try:
        runner.preparing()

        cls, sim_id = task.target.split(':')
        sim_id = int(sim_id)
        assert cls == SimilarityIndex.__name__
        sim = SimilarityIndex.objects.get(id=sim_id)

        dm = sim.dm
        ord = sim.ord

        assert dm.task is None or dm.task.is_completed()
        assert ord is None or ord.task is None or ord.task.is_completed()

        if ord:
            sids_path = ord.get_sids_path()
            source_bytes_path = ord.get_bytes_path()
        else:
            sids_path = dm.get_sids_path()
            source_bytes_path = dm.get_bytes_path()

        runner.start()

        sids, sorted_order = _calculate_similarity(sids_path,
                                                   source_bytes_path)

        runner.wrapping_up()

        sim_sids_path = sim.get_sids_path()
        sim_bytes_path = sim.get_bytes_path()

        ndarray_to_bytes(sorted_order, sim_bytes_path)
        ndarray_to_bytes(sids, sim_sids_path)

        runner.complete()
    except Exception as e:
        runner.error(e)
Example #6
def _construct_ordination(ord, runner):
    dm = ord.dm
    method_name = ord.method
    ndims = ord.ndims
    param_kwargs = Ordination.params_to_kwargs(ord.params)

    assert dm.task is None or dm.task.is_completed(), \
        'Cannot construct ordination because its DataMatrix failed'
    assert method_name in methods.keys(), 'Unknown method {}'.format(
        method_name)
    assert 2 <= ndims <= 3, 'Only support 2 or 3 dimensional ordination'

    runner.start()
    dm_sids_path = dm.get_sids_path()
    dm_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(dm_sids_path, np.int32)
    dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))

    dm_dims = dm_data.shape[1]

    assert dm_data.shape[1] >= ndims, \
        'Data has only {} dimension(s), not enough to construct a {}-dimensional ordination'.format(dm_dims, ndims)

    data = zscore(dm_data)
    data[np.where(np.isnan(data))] = 0
    data[np.where(np.isinf(data))] = 0

    method = methods[method_name]
    result = method(data, ndims, **param_kwargs)
    result = result.astype(np.float32)

    runner.wrapping_up()

    ord_sids_path = ord.get_sids_path()
    ord_bytes_path = ord.get_bytes_path()

    ndarray_to_bytes(result, ord_bytes_path)
    ndarray_to_bytes(sids, ord_sids_path)
Example #7
def encode_into_datamatrix(variables, encoder, session, database_name,
                           kernel_only):
    with_duration = variables['with_duration']
    dm_name = variables['dm_name']
    ndims = encoder.latent_dims

    database = get_or_error(Database, dict(name__iexact=database_name))
    audio_files = AudioFile.objects.filter(database=database)
    segments = Segment.objects.filter(audio_file__in=audio_files)

    encoding_result = encode_syllables(variables, encoder, session, segments,
                                       kernel_only)
    features_value = np.array(list(encoding_result.values()))
    sids = np.array(list(encoding_result.keys()), dtype=np.int32)

    sid_sorted_inds = np.argsort(sids)
    sids = sids[sid_sorted_inds]
    features_value = features_value[sid_sorted_inds]

    preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
    segments = segments.order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    features = [feature_map['s2s_autoencoded']]
    col_inds = {'s2s_autoencoded': [0, ndims]}
    if with_duration:
        features.append(feature_map['duration'])
        col_inds['duration'] = [ndims, ndims + 1]
        durations = list(
            segments.annotate(duration=F('end_time_ms') -
                              F('start_time_ms')).values_list('duration',
                                                              flat=True))
        durations = np.array(durations)
        assert len(durations) == len(sids)
        features_value = np.concatenate(
            (features_value, durations.reshape(-1, 1)), axis=1)

    features_value = features_value.astype(np.float32)

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = '-'.join([str(x.id) for x in features])
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    ndarray_to_bytes(features_value, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
Example #8
    def handle(self, *args, **options):
        path = options['path']
        if not os.path.isfile(path):
            raise Exception('File {} not found'.format(path))

        database_name = options['database_name']
        dm_name = options['dm_name']
        database = get_or_error(Database, dict(name__iexact=database_name))

        dataset = data_set.load(Path(path))
        features = dataset.features
        filenames = dataset.filenames
        sids = [int(x[:-4]) for x in filenames]

        nobs, ndims = dataset.features.shape

        preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
        segments = Segment.objects.filter(id__in=sids).order_by(preserved)
        tids = segments.values_list('tid', flat=True)

        col_inds = {'s2s_autoencoded': [0, ndims]}

        dm = DataMatrix(database=database)
        dm.name = dm_name
        dm.ndims = ndims
        dm.features_hash = 's2s_autoencoded'
        dm.aggregations_hash = ''
        dm.save()

        full_sids_path = dm.get_sids_path()
        full_tids_path = dm.get_tids_path()
        full_bytes_path = dm.get_bytes_path()
        full_cols_path = dm.get_cols_path()

        ndarray_to_bytes(features, full_bytes_path)
        ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
        ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

        with open(full_cols_path, 'w', encoding='utf-8') as f:
            json.dump(col_inds, f)
Example #9
def extract_database_measurements(arg=None,
                                  force=False,
                                  send_email='always',
                                  raise_err=False,
                                  *args,
                                  **kwargs):
    if isinstance(arg, int):
        task = get_or_wait(arg)
    else:
        task = arg

    send_email = 'error-only' if settings.DEBUG else send_email
    runner = TaskRunner(task, send_email=send_email)
    try:
        runner.preparing()

        if isinstance(task, Task):
            cls, dm_id = task.target.split(':')
            dm_id = int(dm_id)
            assert cls == DataMatrix.__name__
            dm = DataMatrix.objects.get(id=dm_id)

            if dm.database:
                segments = Segment.objects.filter(
                    audio_file__database=dm.database)
                sids = segments.values_list('id', flat=True)
            else:
                sids = dm.tmpdb.ids
            features_hash = dm.features_hash
            aggregations_hash = dm.aggregations_hash
        else:
            sids = task.sids
            features_hash = task.features_hash
            aggregations_hash = task.aggregations_hash

        if len(sids) == 0:
            raise CustomAssertionError(
                'Measurement cannot be extracted because your database doesn\'t contain any segments.'
            )

        segments = Segment.objects.filter(id__in=sids)
        tids = np.array(segments.values_list('tid', flat=True), dtype=np.int32)

        features = Feature.objects.filter(id__in=features_hash.split('-'))
        aggregations = Aggregation.objects.filter(
            id__in=aggregations_hash.split('-'))

        available_feature_names = feature_extractors.keys()
        disabled_features_names = [
            x.name for x in features if x.name not in available_feature_names
        ]

        if len(disabled_features_names):
            warning('Task #{}: Features {} are no longer available'.format(
                task.id, disabled_features_names))
            features = [
                x for x in features if x.name in available_feature_names
            ]

        available_aggregator_names = aggregator_map.keys()
        disabled_aggregators_names = [
            x.name for x in aggregations
            if x.name not in available_aggregator_names
        ]

        if len(disabled_aggregators_names):
            warning('Task #{}: Aggregations {} are no longer available'.format(
                task.id, disabled_aggregators_names))
            aggregations = [
                x for x in aggregations if x.name in available_aggregator_names
            ]

        aggregators = [aggregator_map[x.name] for x in aggregations]

        extract_segment_features_for_segments(runner,
                                              sids,
                                              features,
                                              force=force)

        runner.wrapping_up()
        child_task = task.__class__(user=task.user, parent=task)
        child_task.save()
        child_runner = TaskRunner(child_task)
        child_runner.preparing()

        aggregate_feature_values(child_runner,
                                 tids,
                                 features,
                                 aggregators,
                                 force=force)
        child_runner.complete()

        if isinstance(task, Task):
            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

            data, col_inds = extract_rawdata(tids, features, aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            dm.ndims = data.shape[1]
            dm.save()
        runner.complete()

    except Exception as e:
        if raise_err:
            raise e
        runner.error(e)
Example #10
def create_derived_tensor(full_tensor, annotator, dim_reduce, ndims, recreate):
    admin = get_or_error(User, dict(username__iexact='superuser'))
    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()

    sids = bytes_to_ndarray(full_sids_path, np.int32)
    full_data = get_rawdata_from_binary(full_bytes_path, len(sids))

    if dim_reduce != 'none':
        dim_reduce_fun = reduce_funcs[dim_reduce]
        n_feature_cols = full_data.shape[1]
        n_components = min(n_feature_cols // 2, ndims)
    else:
        dim_reduce_fun = None
        n_components = None

    derived_tensor = DerivedTensorData.objects.filter(
        database=full_tensor.database,
        full_tensor=full_tensor,
        features_hash=full_tensor.features_hash,
        aggregations_hash=full_tensor.aggregations_hash,
        ndims=n_components,
        dimreduce=dim_reduce,
        creator=admin,
        annotator=annotator).first()
    if derived_tensor and not recreate:
        print(
            'Derived tensor {} already exists. If you want to recreate, turn on flag --recreate'
            .format(derived_tensor.name))
        return derived_tensor, False

    if derived_tensor is None:
        derived_tensors_name = uuid.uuid4().hex
        derived_tensor = DerivedTensorData(
            name=derived_tensors_name,
            database=full_tensor.database,
            full_tensor=full_tensor,
            features_hash=full_tensor.features_hash,
            aggregations_hash=full_tensor.aggregations_hash,
            dimreduce=dim_reduce,
            ndims=n_components,
            creator=admin,
            annotator=annotator)

    derived_cfg_path = derived_tensor.get_config_path()

    if dim_reduce_fun:

        # TSNE needs normalisation first
        if dim_reduce.startswith('tsne'):
            full_data = zscore(full_data)
            full_data[np.where(np.isnan(full_data))] = 0
            full_data[np.where(np.isinf(full_data))] = 0

        dim_reduced_data = dim_reduce_fun(full_data, n_components)
        derived_bytes_path = derived_tensor.get_bytes_path()
        ndarray_to_bytes(dim_reduced_data, derived_bytes_path)
        tensor_shape = dim_reduced_data.shape
        tensor_path = '/' + derived_bytes_path
    else:
        tensor_shape = full_data.shape
        tensor_path = '/' + full_bytes_path

    # Always write config last - to make sure it's not missing anything
    embedding = dict(
        tensorName=derived_tensor.name,
        tensorShape=tensor_shape,
        tensorPath=tensor_path,
        metadataPath=reverse('tsne-meta',
                             kwargs={'tensor_name': derived_tensor.name}),
    )
    config = dict(embeddings=[embedding])
    write_config(config, derived_cfg_path)

    derived_tensor.save()
    return derived_tensor, True
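
write_config is not shown in these examples; given that the config dict mirrors what the standalone TensorBoard Embedding Projector reads (tensorName, tensorShape, tensorPath, metadataPath), it most likely just serialises the dict to JSON. A hedged stand-in under that assumption:

import json

def write_config_sketch(config, config_path):
    # assumption: write_config simply serialises the embeddings config to JSON
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2)
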
Example #11
    def perform_action(self, when, remove_dead):
        for dm in DataMatrix.objects.all():
            need_reconstruct = self.check_rebuild_necessary(dm, when)

            if not need_reconstruct:
                continue

            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

            if dm.database:
                if os.path.isfile(full_sids_path):
                    sids = bytes_to_ndarray(full_sids_path, np.int32)
                else:
                    sids = Segment.objects.filter(
                        audio_file__database=dm.database).values_list(
                            'id', flat=True)
                dbname = dm.database.name
            else:
                sids = dm.tmpdb.ids
                dbname = dm.tmpdb.name

            segments = Segment.objects.filter(id__in=sids)

            if len(segments) == 0:
                print('Skip DM #{}-{}-{}: no segments found'.format(dm.id, dbname, dm.name))

                if remove_dead:
                    print('Delete {}'.format(dm))
                    for f in [full_sids_path, full_bytes_path, full_cols_path]:
                        print('Remove binary file {}'.format(f))
                        try:
                            os.remove(f)
                        except FileNotFoundError:
                            pass
                    dm.delete()
                continue

            tids = np.array(segments.values_list('tid', flat=True),
                            dtype=np.int32)

            features_ids = dm.features_hash.split('-')
            features = list(Feature.objects.filter(id__in=features_ids))

            aggregations_ids = dm.aggregations_hash.split('-')
            aggregations = Aggregation.objects.filter(id__in=aggregations_ids)

            available_feature_names = feature_extractors.keys()
            disabled_features_names = [
                x.name for x in features
                if x.name not in available_feature_names
            ]

            if len(disabled_features_names):
                warning(
                    'DM #{}-{}-{}: Features {} are no longer available'.format(
                        dm.id, dbname, dm.name, disabled_features_names))
                features = [
                    x for x in features if x.name in available_feature_names
                ]

            available_aggregator_names = aggregator_map.keys()
            disabled_aggregators_names = [
                x.name for x in aggregations
                if x.name not in available_aggregator_names
            ]

            if len(disabled_aggregators_names):
                warning('DM #{}-{}-{}: Aggregations {} are no longer available'.
                        format(dm.id, dbname, dm.name,
                               disabled_aggregators_names))
                aggregations = [
                    x for x in aggregations
                    if x.name in available_aggregator_names
                ]

            aggregators = [aggregator_map[x.name] for x in aggregations]

            runner = ConsoleTaskRunner(
                prefix='Extract measurement for DM #{}-{}-{}: '.format(
                    dm.id, dbname, dm.name))
            runner.preparing()
            extract_segment_features_for_segments(runner,
                                                  sids,
                                                  features,
                                                  force=False)
            runner.wrapping_up()

            child_runner = ConsoleTaskRunner(
                prefix='Aggregate measurement for DM #{}-{}-{}: '.format(
                    dm.id, dbname, dm.name))
            child_runner.preparing()

            aggregate_feature_values(child_runner, tids, features, aggregators)
            child_runner.wrapping_up()
            child_runner.complete()

            runner.complete()

            data, col_inds = extract_rawdata(tids, features, aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            dm.ndims = data.shape[1]
            dm.save()
Example #12
def extract_database_measurements(arg=None, force=False):
    if isinstance(arg, int):
        task = get_or_wait(arg)
    else:
        task = arg
    runner = TaskRunner(task)
    try:
        runner.preparing()

        if isinstance(task, Task):
            cls, dm_id = task.target.split(':')
            dm_id = int(dm_id)
            assert cls == DataMatrix.__name__
            dm = DataMatrix.objects.get(id=dm_id)

            if dm.database:
                segments = Segment.objects.filter(
                    audio_file__database=dm.database)
                sids = segments.values_list('id', flat=True)
            else:
                sids = dm.tmpdb.ids
            features_hash = dm.features_hash
            aggregations_hash = dm.aggregations_hash
        else:
            sids = task.sids
            features_hash = task.features_hash
            aggregations_hash = task.aggregations_hash

        features = Feature.objects.filter(id__in=features_hash.split('-'))
        aggregations = Aggregation.objects.filter(
            id__in=aggregations_hash.split('-'))
        aggregators = [aggregator_map[x.name] for x in aggregations]

        # feature to binstorage's files
        f2bs = {}
        # feature+aggregation to binstorage's files
        fa2bs = {}

        for feature in features:
            feature_name = feature.name
            index_filename = data_path('binary/features',
                                       '{}.idx'.format(feature_name),
                                       for_url=False)
            value_filename = data_path('binary/features',
                                       '{}.val'.format(feature_name),
                                       for_url=False)
            f2bs[feature] = (index_filename, value_filename)

            if feature not in fa2bs:
                fa2bs[feature] = {}

            for aggregator in aggregators:
                aggregator_name = aggregator.get_name()
                folder = os.path.join('binary', 'features', feature_name)
                mkdirp(os.path.join(settings.MEDIA_URL, folder)[1:])

                index_filename = data_path(folder,
                                           '{}.idx'.format(aggregator_name),
                                           for_url=False)
                value_filename = data_path(folder,
                                           '{}.val'.format(aggregator_name),
                                           for_url=False)
                fa2bs[feature][aggregator] = (index_filename, value_filename)

        tids, f2tid2fvals = extract_segment_features_for_segments(
            runner, sids, features, f2bs, force)

        for feature, (index_filename, value_filename) in f2bs.items():
            _tids, _fvals = f2tid2fvals.get(feature, (None, None))
            if _tids:
                _tids = np.array(_tids, dtype=np.int32)
                ensure_parent_folder_exists(index_filename)
                binstorage.store(_tids, _fvals, index_filename, value_filename)

        runner.wrapping_up()
        child_task = task.__class__(user=task.user, parent=task)
        child_task.save()
        child_runner = TaskRunner(child_task)
        child_runner.preparing()

        aggregate_feature_values(child_runner, sids, f2bs, fa2bs, features,
                                 aggregators)
        child_runner.complete()

        if isinstance(task, Task):
            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

            data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features,
                                             aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            dm.ndims = data.shape[1]
            dm.save()
        runner.complete()

    except Exception as e:
        runner.error(e)