def _calculate_similarity(sim, runner):
    dm = sim.dm
    ord = sim.ord

    assert dm.task is None or dm.task.is_completed(), \
        'Cannot calculate similarity because previous error occurred when extracting features'
    assert ord is None or ord.task is None or ord.task.is_completed(), \
        'Cannot calculate similarity because previous error occurred when constructing ordination'

    # Prefer the ordination's coordinates if one exists, otherwise fall back
    # to the raw data matrix
    if ord:
        sids_path = ord.get_sids_path()
        source_bytes_path = ord.get_bytes_path()
    else:
        sids_path = dm.get_sids_path()
        source_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(sids_path, np.int32)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))
    # Zero out NaN/inf values so the clustering below cannot fail on them
    coordinates[np.where(np.logical_not(np.isfinite(coordinates)))] = 0

    runner.start()
    tree = linkage(coordinates, method='average')
    order = natural_order(tree)
    sorted_order = np.argsort(order).astype(np.int32)
    runner.wrapping_up()

    sim_sids_path = sim.get_sids_path()
    sim_bytes_path = sim.get_bytes_path()
    ndarray_to_bytes(sorted_order, sim_bytes_path)
    ndarray_to_bytes(sids, sim_sids_path)
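# A minimal, runnable sketch of the ordering step above, on synthetic data.
# Koe's natural_order() is not reproduced here; scipy's leaves_list() is used
# as an assumed stand-in that likewise derives a leaf ordering from the
# linkage tree. sorted_order then gives each row's rank within that ordering.
import numpy as np
from scipy.cluster.hierarchy import linkage, leaves_list

coordinates = np.random.rand(10, 3).astype(np.float32)
tree = linkage(coordinates, method='average')       # hierarchical clustering
order = leaves_list(tree)                           # leaf indices in tree order
sorted_order = np.argsort(order).astype(np.int32)   # rank of each row in that order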
def test_bytes_to_ndarray(self):
    django.setup()
    from koe.ts_utils import bytes_to_ndarray, ndarray_to_bytes

    arr = np.random.rand(100, 200).astype(np.float32)
    filename = '/tmp/{}.bytes'.format(uuid4().hex)

    ndarray_to_bytes(arr, filename)
    arr_ = bytes_to_ndarray(filename).reshape((100, 200))
    os.remove(filename)

    self.assertTrue(np.allclose(arr, arr_))
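# The test implies the helpers round-trip raw float32 bytes without storing
# the array's shape (hence the explicit reshape on read). A sketch consistent
# with that behaviour, assuming thin wrappers around tobytes()/frombuffer()
# -- not necessarily Koe's actual implementation:
import numpy as np

def ndarray_to_bytes_sketch(arr, filename):
    # Dump the array's raw buffer; shape and dtype are the caller's concern
    with open(filename, 'wb') as f:
        f.write(arr.tobytes())

def bytes_to_ndarray_sketch(filename, dtype=np.float32):
    # Read the raw buffer back as a flat array of the given dtype
    with open(filename, 'rb') as f:
        return np.frombuffer(f.read(), dtype=dtype)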
def construct_ordination(task_id):
    task = get_or_wait(task_id)
    runner = TaskRunner(task)
    try:
        runner.preparing()

        cls, ord_id = task.target.split(':')
        ord_id = int(ord_id)
        assert cls == Ordination.__name__
        ord = Ordination.objects.get(id=ord_id)

        dm = ord.dm
        method_name = ord.method
        ndims = ord.ndims
        param_kwargs = Ordination.params_to_kwargs(ord.params)

        assert dm.task is None or dm.task.is_completed()
        assert method_name in methods.keys(), 'Unknown method {}'.format(method_name)
        assert 2 <= ndims <= 3, 'Only support 2 or 3 dimensional ordination'

        runner.start()
        dm_sids_path = dm.get_sids_path()
        dm_bytes_path = dm.get_bytes_path()

        sids = bytes_to_ndarray(dm_sids_path, np.int32)
        dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))

        data = zscore(dm_data)
        data[np.where(np.isnan(data))] = 0
        data[np.where(np.isinf(data))] = 0

        method = methods[method_name]
        result = method(data, ndims, **param_kwargs)

        runner.wrapping_up()

        ord_sids_path = ord.get_sids_path()
        ord_bytes_path = ord.get_bytes_path()

        ndarray_to_bytes(result, ord_bytes_path)
        ndarray_to_bytes(sids, ord_sids_path)

        runner.complete()
    except Exception as e:
        runner.error(e)
def create_full_tensor(database, recreate):
    features = Feature.objects.all().order_by('id')
    aggregations = Aggregation.objects.all().order_by('id')

    features_hash = '-'.join(list(map(str, features.values_list('id', flat=True))))
    aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))
    aggregators = [aggregator_map[x.name] for x in aggregations]

    full_tensor = FullTensorData.objects.filter(database=database,
                                                features_hash=features_hash,
                                                aggregations_hash=aggregations_hash).first()
    if full_tensor and not recreate:
        print('Full tensor {} already exists. If you want to recreate, turn on flag --recreate'
              .format(full_tensor.name))
        return full_tensor, False

    if full_tensor is None:
        full_tensors_name = uuid.uuid4().hex
        full_tensor = FullTensorData(name=full_tensors_name,
                                     database=database,
                                     features_hash=features_hash,
                                     aggregations_hash=aggregations_hash)

    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()
    full_cols_path = full_tensor.get_cols_path()

    sids, tids = get_sids_tids(database)
    f2bs, fa2bs = get_binstorage_locations(features, aggregators)
    data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features, aggregators)

    ndarray_to_bytes(data, full_bytes_path)
    ndarray_to_bytes(sids, full_sids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)

    full_tensor.save()
    return full_tensor, True
def calculate_similarity(task_id):
    task = get_or_wait(task_id)
    runner = TaskRunner(task)
    try:
        runner.preparing()

        cls, sim_id = task.target.split(':')
        sim_id = int(sim_id)
        assert cls == SimilarityIndex.__name__
        sim = SimilarityIndex.objects.get(id=sim_id)

        dm = sim.dm
        ord = sim.ord

        assert dm.task is None or dm.task.is_completed()
        assert ord is None or ord.task is None or ord.task.is_completed()

        if ord:
            sids_path = ord.get_sids_path()
            source_bytes_path = ord.get_bytes_path()
        else:
            sids_path = dm.get_sids_path()
            source_bytes_path = dm.get_bytes_path()

        runner.start()
        sids, sorted_order = _calculate_similarity(sids_path, source_bytes_path)
        runner.wrapping_up()

        sim_sids_path = sim.get_sids_path()
        sim_bytes_path = sim.get_bytes_path()

        ndarray_to_bytes(sorted_order, sim_bytes_path)
        ndarray_to_bytes(sids, sim_sids_path)

        runner.complete()
    except Exception as e:
        runner.error(e)
def _construct_ordination(ord, runner):
    dm = ord.dm
    method_name = ord.method
    ndims = ord.ndims
    param_kwargs = Ordination.params_to_kwargs(ord.params)

    assert dm.task is None or dm.task.is_completed(), \
        'Cannot construct ordination because its DataMatrix failed'
    assert method_name in methods.keys(), 'Unknown method {}'.format(method_name)
    assert 2 <= ndims <= 3, 'Only support 2 or 3 dimensional ordination'

    runner.start()
    dm_sids_path = dm.get_sids_path()
    dm_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(dm_sids_path, np.int32)
    dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))

    dm_dims = dm_data.shape[1]
    assert dm_dims >= ndims, \
        'Data has only {} dimension(s), not enough to construct a {}-dimensional ordination'.format(dm_dims, ndims)

    data = zscore(dm_data)
    data[np.where(np.isnan(data))] = 0
    data[np.where(np.isinf(data))] = 0

    method = methods[method_name]
    result = method(data, ndims, **param_kwargs)
    result = result.astype(np.float32)

    runner.wrapping_up()

    ord_sids_path = ord.get_sids_path()
    ord_bytes_path = ord.get_bytes_path()

    ndarray_to_bytes(result, ord_bytes_path)
    ndarray_to_bytes(sids, ord_sids_path)
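# _construct_ordination dispatches to one of the registered `methods`. A
# self-contained sketch of what one such method could look like, using PCA as
# an assumed example (the registry's actual entries are not shown here):
import numpy as np
from scipy.stats import zscore
from sklearn.decomposition import PCA

def pca_ordination(data, ndims):
    # Project the z-scored feature matrix onto its first `ndims` components
    return PCA(n_components=ndims).fit_transform(data)

data = zscore(np.random.rand(50, 8))
data[np.where(np.isnan(data))] = 0
data[np.where(np.isinf(data))] = 0
result = pca_ordination(data, 2).astype(np.float32)   # shape (50, 2)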
def encode_into_datamatrix(variables, encoder, session, database_name, kernel_only):
    with_duration = variables['with_duration']
    dm_name = variables['dm_name']
    ndims = encoder.latent_dims

    database = get_or_error(Database, dict(name__iexact=database_name))
    audio_files = AudioFile.objects.filter(database=database)
    segments = Segment.objects.filter(audio_file__in=audio_files)

    encoding_result = encode_syllables(variables, encoder, session, segments, kernel_only)
    features_value = np.array(list(encoding_result.values()))
    sids = np.array(list(encoding_result.keys()), dtype=np.int32)

    # Sort the feature rows by segment id so rows and sids stay aligned
    sid_sorted_inds = np.argsort(sids)
    sids = sids[sid_sorted_inds]
    features_value = features_value[sid_sorted_inds]

    # Preserve the sids ordering in the queryset (see the Case/When sketch below)
    preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
    segments = segments.order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    features = [feature_map['s2s_autoencoded']]
    col_inds = {'s2s_autoencoded': [0, ndims]}

    if with_duration:
        features.append(feature_map['duration'])
        col_inds['duration'] = [ndims, ndims + 1]
        durations = list(
            segments.annotate(duration=F('end_time_ms') - F('start_time_ms'))
                    .values_list('duration', flat=True))
        durations = np.array(durations)
        assert len(durations) == len(sids)
        features_value = np.concatenate((features_value, durations.reshape(-1, 1)), axis=1)

    features_value = features_value.astype(np.float32)

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = '-'.join([str(x.id) for x in features])
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    ndarray_to_bytes(features_value, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
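# The Case/When construct above is the standard Django idiom for ordering a
# queryset by an arbitrary Python-side list of ids: each id is annotated with
# its position in the list, and that position becomes the sort key. A generic
# sketch of the same idiom (the ids are illustrative):
from django.db.models import Case, When

ids = [42, 7, 19]
preserved = Case(*[When(id=pk, then=pos) for pos, pk in enumerate(ids)])
rows = Segment.objects.filter(id__in=ids).order_by(preserved)
# rows now come back in the order 42, 7, 19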
def handle(self, *args, **options):
    path = options['path']
    if not os.path.isfile(path):
        raise Exception('File {} not found'.format(path))

    database_name = options['database_name']
    dm_name = options['dm_name']
    database = get_or_error(Database, dict(name__iexact=database_name))

    dataset = data_set.load(Path(path))
    features = dataset.features
    filenames = dataset.filenames
    # File names encode the segment id; strip the 4-character extension
    sids = [int(x[:-4]) for x in filenames]
    nobs, ndims = dataset.features.shape

    preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
    segments = Segment.objects.filter(id__in=sids).order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    col_inds = {'s2s_autoencoded': [0, ndims]}

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = 's2s_autoencoded'
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    ndarray_to_bytes(features, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
def extract_database_measurements(arg=None, force=False, send_email='always', raise_err=False, *args, **kwargs):
    if isinstance(arg, int):
        task = get_or_wait(arg)
    else:
        task = arg
    send_email = 'error-only' if settings.DEBUG else send_email
    runner = TaskRunner(task, send_email=send_email)
    try:
        runner.preparing()

        if isinstance(task, Task):
            cls, dm_id = task.target.split(':')
            dm_id = int(dm_id)
            assert cls == DataMatrix.__name__
            dm = DataMatrix.objects.get(id=dm_id)

            if dm.database:
                segments = Segment.objects.filter(audio_file__database=dm.database)
                sids = segments.values_list('id', flat=True)
            else:
                sids = dm.tmpdb.ids
            features_hash = dm.features_hash
            aggregations_hash = dm.aggregations_hash
        else:
            sids = task.sids
            features_hash = task.features_hash
            aggregations_hash = task.aggregations_hash

        if len(sids) == 0:
            raise CustomAssertionError(
                'Measurement cannot be extracted because your database doesn\'t contain any segments.')

        segments = Segment.objects.filter(id__in=sids)
        tids = np.array(segments.values_list('tid', flat=True), dtype=np.int32)

        features = Feature.objects.filter(id__in=features_hash.split('-'))
        aggregations = Aggregation.objects.filter(id__in=aggregations_hash.split('-'))

        # Drop features/aggregations whose extractors have been removed from
        # the code base, warning about each
        available_feature_names = feature_extractors.keys()
        disabled_features_names = [x.name for x in features if x.name not in available_feature_names]
        if len(disabled_features_names):
            warning('Task #{}: Features {} are no longer available'.format(task.id, disabled_features_names))
        features = [x for x in features if x.name in available_feature_names]

        available_aggregator_names = aggregator_map.keys()
        disabled_aggregators_names = [x.name for x in aggregations if x.name not in available_aggregator_names]
        if len(disabled_aggregators_names):
            warning('Task #{}: Aggregations {} are no longer available'.format(task.id, disabled_aggregators_names))
        aggregations = [x for x in aggregations if x.name in available_aggregator_names]

        aggregators = [aggregator_map[x.name] for x in aggregations]

        extract_segment_features_for_segments(runner, sids, features, force=force)
        runner.wrapping_up()

        child_task = task.__class__(user=task.user, parent=task)
        child_task.save()
        child_runner = TaskRunner(child_task)
        child_runner.preparing()

        aggregate_feature_values(child_runner, tids, features, aggregators, force=force)
        child_runner.complete()

        if isinstance(task, Task):
            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

            data, col_inds = extract_rawdata(tids, features, aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            dm.ndims = data.shape[1]
            dm.save()
        runner.complete()
    except Exception as e:
        if raise_err:
            raise e
        runner.error(e)
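# Task.target encodes the object a task operates on as '<ClassName>:<id>',
# which is why the handlers above split on ':' and assert the class name
# before looking the object up. A sketch with an illustrative id:
target = '{}:{}'.format(DataMatrix.__name__, 42)   # -> 'DataMatrix:42'
cls, dm_id = target.split(':')
assert cls == DataMatrix.__name__
dm_id = int(dm_id)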
def create_derived_tensor(full_tensor, annotator, dim_reduce, ndims, recreate):
    admin = get_or_error(User, dict(username__iexact='superuser'))

    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()

    sids = bytes_to_ndarray(full_sids_path, np.int32)
    full_data = get_rawdata_from_binary(full_bytes_path, len(sids))

    if dim_reduce != 'none':
        dim_reduce_fun = reduce_funcs[dim_reduce]
        n_feature_cols = full_data.shape[1]
        n_components = min(n_feature_cols // 2, ndims)
    else:
        dim_reduce_fun = None
        n_components = None

    derived_tensor = DerivedTensorData.objects.filter(
        database=full_tensor.database,
        full_tensor=full_tensor,
        features_hash=full_tensor.features_hash,
        aggregations_hash=full_tensor.aggregations_hash,
        ndims=n_components,
        dimreduce=dim_reduce,
        creator=admin,
        annotator=annotator).first()
    if derived_tensor and not recreate:
        print('Derived tensor {} already exists. If you want to recreate, turn on flag --recreate'
              .format(derived_tensor.name))
        return derived_tensor, False

    if derived_tensor is None:
        derived_tensors_name = uuid.uuid4().hex
        derived_tensor = DerivedTensorData(
            name=derived_tensors_name,
            database=full_tensor.database,
            full_tensor=full_tensor,
            features_hash=full_tensor.features_hash,
            aggregations_hash=full_tensor.aggregations_hash,
            dimreduce=dim_reduce,
            ndims=n_components,
            creator=admin,
            annotator=annotator)

    derived_cfg_path = derived_tensor.get_config_path()

    if dim_reduce_fun:
        # TSNE needs normalisation first
        if dim_reduce.startswith('tsne'):
            full_data = zscore(full_data)
            full_data[np.where(np.isnan(full_data))] = 0
            full_data[np.where(np.isinf(full_data))] = 0

        dim_reduced_data = dim_reduce_fun(full_data, n_components)
        derived_bytes_path = derived_tensor.get_bytes_path()
        ndarray_to_bytes(dim_reduced_data, derived_bytes_path)

        tensor_shape = dim_reduced_data.shape
        tensor_path = '/' + derived_bytes_path
    else:
        tensor_shape = full_data.shape
        tensor_path = '/' + full_bytes_path

    # Always write config last - to make sure it's not missing anything
    embedding = dict(
        tensorName=derived_tensor.name,
        tensorShape=tensor_shape,
        tensorPath=tensor_path,
        metadataPath=reverse('tsne-meta', kwargs={'tensor_name': derived_tensor.name}),
    )
    config = dict(embeddings=[embedding])
    write_config(config, derived_cfg_path)

    derived_tensor.save()
    return derived_tensor, True
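# write_config presumably serialises the dict above to JSON; the field names
# mirror the TensorFlow Embedding Projector's standalone config format. A
# sketch of the resulting structure (all values illustrative):
import json

embedding = dict(
    tensorName='3f2a9c',
    tensorShape=(1200, 2),
    tensorPath='/path/to/derived.bytes',
    metadataPath='/tsne/meta/3f2a9c',
)
print(json.dumps(dict(embeddings=[embedding]), indent=2))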
def perform_action(self, when, remove_dead):
    for dm in DataMatrix.objects.all():
        need_reconstruct = self.check_rebuild_necessary(dm, when)
        if not need_reconstruct:
            continue

        full_sids_path = dm.get_sids_path()
        full_bytes_path = dm.get_bytes_path()
        full_cols_path = dm.get_cols_path()

        if dm.database:
            if os.path.isfile(full_sids_path):
                sids = bytes_to_ndarray(full_sids_path, np.int32)
            else:
                sids = Segment.objects.filter(audio_file__database=dm.database).values_list('id', flat=True)
            dbname = dm.database.name
        else:
            sids = dm.tmpdb.ids
            dbname = dm.tmpdb.name

        segments = Segment.objects.filter(id__in=sids)
        if len(segments) == 0:
            print('Skip DM #{}-{}-{}: no segments found'.format(dm.id, dbname, dm.name))
            if remove_dead:
                print('Delete {}'.format(dm))
                for f in [full_sids_path, full_bytes_path, full_cols_path]:
                    print('Remove binary file {}'.format(f))
                    try:
                        os.remove(f)
                    except FileNotFoundError:
                        pass
                dm.delete()
            continue

        tids = np.array(segments.values_list('tid', flat=True), dtype=np.int32)

        features_ids = dm.features_hash.split('-')
        features = list(Feature.objects.filter(id__in=features_ids))

        aggregations_ids = dm.aggregations_hash.split('-')
        aggregations = Aggregation.objects.filter(id__in=aggregations_ids)

        available_feature_names = feature_extractors.keys()
        disabled_features_names = [x.name for x in features if x.name not in available_feature_names]
        if len(disabled_features_names):
            warning('DM #{}-{}-{}: Features {} are no longer available'.format(
                dm.id, dbname, dm.name, disabled_features_names))
        features = [x for x in features if x.name in available_feature_names]

        available_aggregator_names = aggregator_map.keys()
        disabled_aggregators_names = [x.name for x in aggregations if x.name not in available_aggregator_names]
        if len(disabled_aggregators_names):
            warning('DM #{}-{}-{}: Aggregations {} are no longer available'.format(
                dm.id, dbname, dm.name, disabled_aggregators_names))
        aggregations = [x for x in aggregations if x.name in available_aggregator_names]

        aggregators = [aggregator_map[x.name] for x in aggregations]

        runner = ConsoleTaskRunner(prefix='Extract measurement for DM #{}-{}-{}: '.format(dm.id, dbname, dm.name))
        runner.preparing()
        extract_segment_features_for_segments(runner, sids, features, force=False)
        runner.wrapping_up()

        child_runner = ConsoleTaskRunner(prefix='Aggregate measurement for DM #{}-{}-{}: '.format(
            dm.id, dbname, dm.name))
        child_runner.preparing()
        aggregate_feature_values(child_runner, tids, features, aggregators)
        child_runner.wrapping_up()
        child_runner.complete()
        runner.complete()

        data, col_inds = extract_rawdata(tids, features, aggregators)

        ndarray_to_bytes(data, full_bytes_path)
        ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

        with open(full_cols_path, 'w', encoding='utf-8') as f:
            json.dump(col_inds, f)

        dm.ndims = data.shape[1]
        dm.save()
def extract_database_measurements(arg=None, force=False):
    if isinstance(arg, int):
        task = get_or_wait(arg)
    else:
        task = arg
    runner = TaskRunner(task)
    try:
        runner.preparing()

        if isinstance(task, Task):
            cls, dm_id = task.target.split(':')
            dm_id = int(dm_id)
            assert cls == DataMatrix.__name__
            dm = DataMatrix.objects.get(id=dm_id)

            if dm.database:
                segments = Segment.objects.filter(audio_file__database=dm.database)
                sids = segments.values_list('id', flat=True)
            else:
                sids = dm.tmpdb.ids
            features_hash = dm.features_hash
            aggregations_hash = dm.aggregations_hash
        else:
            sids = task.sids
            features_hash = task.features_hash
            aggregations_hash = task.aggregations_hash

        features = Feature.objects.filter(id__in=features_hash.split('-'))
        aggregations = Aggregation.objects.filter(id__in=aggregations_hash.split('-'))
        aggregators = [aggregator_map[x.name] for x in aggregations]

        # Map feature -> its binstorage (index, value) files
        f2bs = {}
        # Map feature -> aggregator -> its binstorage (index, value) files
        fa2bs = {}

        for feature in features:
            feature_name = feature.name
            index_filename = data_path('binary/features', '{}.idx'.format(feature_name), for_url=False)
            value_filename = data_path('binary/features', '{}.val'.format(feature_name), for_url=False)
            f2bs[feature] = (index_filename, value_filename)

            if feature not in fa2bs:
                fa2bs[feature] = {}
            for aggregator in aggregators:
                aggregator_name = aggregator.get_name()
                folder = os.path.join('binary', 'features', feature_name)
                mkdirp(os.path.join(settings.MEDIA_URL, folder)[1:])

                index_filename = data_path(folder, '{}.idx'.format(aggregator_name), for_url=False)
                value_filename = data_path(folder, '{}.val'.format(aggregator_name), for_url=False)
                fa2bs[feature][aggregator] = (index_filename, value_filename)

        tids, f2tid2fvals = extract_segment_features_for_segments(runner, sids, features, f2bs, force)

        for feature, (index_filename, value_filename) in f2bs.items():
            _tids, _fvals = f2tid2fvals.get(feature, (None, None))
            if _tids:
                _tids = np.array(_tids, dtype=np.int32)
                ensure_parent_folder_exists(index_filename)
                binstorage.store(_tids, _fvals, index_filename, value_filename)

        runner.wrapping_up()

        child_task = task.__class__(user=task.user, parent=task)
        child_task.save()
        child_runner = TaskRunner(child_task)
        child_runner.preparing()

        aggregate_feature_values(child_runner, sids, f2bs, fa2bs, features, aggregators)
        child_runner.complete()

        if isinstance(task, Task):
            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

            data, col_inds = extract_rawdata(f2bs, fa2bs, tids, features, aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            dm.ndims = data.shape[1]
            dm.save()
        runner.complete()
    except Exception as e:
        runner.error(e)
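# The f2bs/fa2bs maps above lay binary storage out on disk roughly as follows
# (paths relative to the data root; feature and aggregator names illustrative):
#
#   binary/features/mean_frequency.idx        # per-segment index into the raw feature values
#   binary/features/mean_frequency.val        # the raw feature values themselves
#   binary/features/mean_frequency/mean.idx   # index for the 'mean'-aggregated values
#   binary/features/mean_frequency/mean.val   # the aggregated values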