Example #1
def encode_into_datamatrix(variables, encoder, session, database_name,
                           kernel_only):
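    """Encode all segments of a database into a feature matrix and persist
    it as a DataMatrix, together with the sids, tids and column layout.
    """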
    with_duration = variables['with_duration']
    dm_name = variables['dm_name']
    ndims = encoder.latent_dims

    database = get_or_error(Database, dict(name__iexact=database_name))
    audio_files = AudioFile.objects.filter(database=database)
    segments = Segment.objects.filter(audio_file__in=audio_files)

    encoding_result = encode_syllables(variables, encoder, session, segments,
                                       kernel_only)
    features_value = np.array(list(encoding_result.values()))
    sids = np.array(list(encoding_result.keys()), dtype=np.int32)

    sid_sorted_inds = np.argsort(sids)
    sids = sids[sid_sorted_inds]
    features_value = features_value[sid_sorted_inds]

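    # Reorder the Segment queryset to match the sorted sids, via a CASE
    # expression mapping each id to its position in the array.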
    preserved = Case(*[When(id=pk, then=pos) for pos, pk in enumerate(sids)])
    segments = segments.order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    features = [feature_map['s2s_autoencoded']]
    col_inds = {'s2s_autoencoded': [0, ndims]}
    if with_duration:
        features.append(feature_map['duration'])
        col_inds['duration'] = [ndims, ndims + 1]
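        # Duration in milliseconds, computed in the database and appended
        # as one extra column after the autoencoded features.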
        durations = segments.annotate(
            duration=F('end_time_ms') - F('start_time_ms')
        ).values_list('duration', flat=True)
        durations = np.array(list(durations))
        assert len(durations) == len(sids)
        features_value = np.concatenate(
            (features_value, durations.reshape(-1, 1)), axis=1)

    features_value = features_value.astype(np.float32)

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = '-'.join([str(x.id) for x in features])
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    ndarray_to_bytes(features_value, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
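
The Case/When ordering idiom above (used again in Example #3 below) is worth isolating: it lets a queryset come back in the exact order of an arbitrary id list. A minimal sketch of the pattern with a hypothetical queryset and id list, not code from this project:

from django.db.models import Case, IntegerField, When

def order_by_id_list(queryset, id_list):
    # Build a CASE expression mapping each primary key to its position in
    # id_list, then sort on that computed value so rows come back in
    # exactly the order of id_list.
    preserved = Case(
        *[When(id=pk, then=pos) for pos, pk in enumerate(id_list)],
        output_field=IntegerField(),
    )
    return queryset.filter(id__in=id_list).order_by(preserved)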
Example #2
    def handle(self, *args, **options):
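        """Schedule (or run inline) feature extraction for every segment of
        a database, via a persisted Task or an in-memory NonDbTask.
        """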
        database_name = options['database_name']
        celery = options['celery']
        save_db = options['save_db']

        if not save_db and celery:
            warning('celery reverted to False because save_db is False')
            celery = False

        database = get_or_error(Database, dict(name__iexact=database_name))

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

        enabled_features = [f for f in features if f.name in feature_map]

        features_hash = '-'.join(str(x.id) for x in enabled_features)
        aggregations_hash = '-'.join(
            map(str, aggregations.values_list('id', flat=True)))

        user = User.objects.get(username='******')

        if save_db:
            dm = DataMatrix(database=database)
            dm.ndims = 0
            dm.name = uuid.uuid4().hex
            dm.features_hash = features_hash
            dm.aggregations_hash = aggregations_hash
            dm.save()
            task = Task(user=user,
                        target='{}:{}'.format(DataMatrix.__name__, dm.id))
            task.save()
            dm.task = task
            dm.save()
        else:
            task = NonDbTask(user=user)
            segments = Segment.objects.filter(audio_file__database=database)
            sids = segments.values_list('id', flat=True)
            task.sids = sids
            task.features_hash = features_hash
            task.aggregations_hash = aggregations_hash

        if celery:
            extract_database_measurements.delay(task.id)
        else:
            extract_database_measurements(task, force=True)
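
The options consumed by handle() above would normally be declared on the same management command. A plausible add_arguments sketch, with flag names inferred from the options dict rather than taken from the source:

    def add_arguments(self, parser):
        # Hypothetical flag names matching options['database_name'],
        # options['celery'] and options['save_db'] used above.
        parser.add_argument('--database-name', dest='database_name',
                            required=True, type=str)
        parser.add_argument('--celery', dest='celery', action='store_true',
                            default=False)
        parser.add_argument('--save-db', dest='save_db', action='store_true',
                            default=False)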
Example #3
    def handle(self, *args, **options):
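        """Import a pre-computed feature dataset from disk and register it
        as a DataMatrix for the given database.
        """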
        path = options['path']
        if not os.path.isfile(path):
            raise FileNotFoundError('File {} not found'.format(path))

        database_name = options['database_name']
        dm_name = options['dm_name']
        database = get_or_error(Database, dict(name__iexact=database_name))

        dataset = data_set.load(Path(path))
        features = dataset.features
        filenames = dataset.filenames
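        # File names are '<segment id>' plus a four-character extension
        # (e.g. '.wav'); strip the extension to recover the id.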
        sids = [int(x[:-4]) for x in filenames]

        nobs, ndims = dataset.features.shape

        preserved = Case(*[When(id=pk, then=pos) for pos, pk in enumerate(sids)])
        segments = Segment.objects.filter(id__in=sids).order_by(preserved)
        tids = segments.values_list('tid', flat=True)

        col_inds = {'s2s_autoencoded': [0, ndims]}

        dm = DataMatrix(database=database)
        dm.name = dm_name
        dm.ndims = ndims
        dm.features_hash = 's2s_autoencoded'
        dm.aggregations_hash = ''
        dm.save()

        full_sids_path = dm.get_sids_path()
        full_tids_path = dm.get_tids_path()
        full_bytes_path = dm.get_bytes_path()
        full_cols_path = dm.get_cols_path()

        ndarray_to_bytes(features, full_bytes_path)
        ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
        ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

        with open(full_cols_path, 'w', encoding='utf-8') as f:
            json.dump(col_inds, f)
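
ndarray_to_bytes is a project utility whose implementation is not shown here. A minimal stand-in, assuming it simply persists the raw array buffer (which may well differ from the project's actual format):

import numpy as np

def ndarray_to_bytes(arr, path):
    # Hypothetical stand-in: dump the raw buffer; reading it back with
    # np.fromfile requires knowing the dtype and shape out of band.
    arr.tofile(path)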
Example #4
    def form_valid(self, form):
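        """Create a new DataMatrix, or re-extract an existing one, from the
        submitted form, then schedule the extraction task plus any dependent
        ordination and similarity tasks.
        """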
        post_data = self.request.POST
        user = self.request.user
        form_data = form.cleaned_data
        name = form_data.get('name')
        dmid = form_data.get('data_matrix')

        has_error = False
        is_recreating = False

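        # A submitted DataMatrix id means re-extraction of an existing
        # matrix; otherwise a new one is created against a Database or a
        # TemporaryDatabase.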
        if dmid:
            dm = get_or_error(DataMatrix, dict(id=dmid))
            is_recreating = True

        else:
            if 'database' in post_data:
                database_id = int(post_data['database'])
                database = get_or_error(Database, dict(id=database_id))
                if DataMatrix.objects.filter(database=database,
                                             name=name).exists():
                    form.add_error('name', 'This name is already taken')
                    has_error = True
                dm = DataMatrix(database=database)
            else:
                database_id = get_or_error(post_data, 'tmpdb')
                database = get_or_error(TemporaryDatabase,
                                        dict(id=int(database_id)))
                if DataMatrix.objects.filter(tmpdb=database,
                                             name=name).exists():
                    form.add_error('name', 'This name is already taken')
                    has_error = True
                dm = DataMatrix(tmpdb=database)

            if has_error:
                context = self.get_context_data()
                context['form'] = form
                rendered = render_to_string(
                    'partials/feature-selection-form.html', context=context)
                return HttpResponse(
                    json.dumps(
                        dict(message=dict(success=False, html=rendered))))

            features = form_data['features'].order_by('id')
            aggregations = form_data['aggregations'].order_by('id')

            dm.name = name
            dm.ndims = 0
            dm.features_hash = '-'.join(
                map(str, features.values_list('id', flat=True)))
            dm.aggregations_hash = '-'.join(
                map(str, aggregations.values_list('id', flat=True)))
            dm.save()

        task = Task(user=user,
                    target='{}:{}'.format(DataMatrix.__name__, dm.id))
        task.save()
        dm.task = task
        dm.save()

        delay_in_production(extract_database_measurements, task.id)
        if is_recreating:
            ord_tasks = recreate_associated_ordination_tasks(dmid, user)
            for ord_task in ord_tasks:
                delay_in_production(construct_ordination, ord_task.id)

            sim_tasks = recreate_associated_similarity_tasks(dmid, user)
            for sim_task in sim_tasks:
                delay_in_production(calculate_similarity, sim_task.id)

        context = self.get_context_data()
        context['task'] = task
        rendered = render_to_string('partials/feature-extraction-tasks.html',
                                    context=context)
        return HttpResponse(
            json.dumps(dict(message=dict(success=True, html=rendered))))
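
delay_in_production and the recreate_* helpers are likewise project utilities not shown here. For orientation only, a sketch of what a helper by that name might do, assuming it queues through Celery outside debug mode:

from django.conf import settings

def delay_in_production(task_func, *args):
    # Hypothetical helper: run inline during development so no Celery
    # worker is needed; queue asynchronously in production.
    if settings.DEBUG:
        task_func(*args)
    else:
        task_func.delay(*args)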