    def handle(self, *args, **options):

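        # Migrate every FullTensorData record to a DataMatrix of the same name, copying its sids/bytes/cols binaries.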
        tensor_to_dm = {}
        for tensor in FullTensorData.objects.all():
            sids_path = tensor.get_sids_path()
            bytes_path = tensor.get_bytes_path()
            cols_path = tensor.get_cols_path()

            sids = bytes_to_ndarray(sids_path, np.int32)
            data = get_rawdata_from_binary(bytes_path, len(sids))

            dm = DataMatrix.objects.filter(name=tensor.name).first()
            if dm is None:
                dm = DataMatrix.objects.create(
                    database=tensor.database,
                    name=tensor.name,
                    features_hash=tensor.features_hash,
                    aggregations_hash=tensor.aggregations_hash,
                    ndims=data.shape[1])

            dm_sids_path = dm.get_sids_path()
            dm_bytes_path = dm.get_bytes_path()
            dm_cols_path = dm.get_cols_path()

            ensure_parent_folder_exists(dm_sids_path)

            shutil.copy(sids_path, dm_sids_path)
            shutil.copy(bytes_path, dm_bytes_path)
            shutil.copy(cols_path, dm_cols_path)

            tensor_to_dm[tensor] = dm

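        # Each dimensionality-reduced derived tensor becomes an Ordination attached to the corresponding DataMatrix.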
        for tensor in DerivedTensorData.objects.exclude(dimreduce='none'):
            dm = tensor_to_dm[tensor.full_tensor]
            sids_path = tensor.full_tensor.get_sids_path()
            bytes_path = tensor.get_bytes_path()

            if not os.path.exists(bytes_path):
                bytes_path = tensor.full_tensor.get_bytes_path()

            method = tensor.dimreduce
            ndims = tensor.ndims
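            # Method names such as 'tsne2' or 'tsne3' carry the target dimensionality as a suffix.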
            if method.startswith('tsne'):
                ndims = int(method[4:])
                method = 'tsne'

            ord = Ordination.objects.filter(dm=dm, method=method,
                                            ndims=ndims).first()
            if ord is None:
                ord = Ordination.objects.create(dm=dm,
                                                method=method,
                                                ndims=ndims)

            ord_sids_path = ord.get_sids_path()
            ord_bytes_path = ord.get_bytes_path()

            ensure_parent_folder_exists(ord_sids_path)

            shutil.copy(sids_path, ord_sids_path)
            shutil.copy(bytes_path, ord_bytes_path)
Example #2
    def test_pca(self):
        django.setup()
        from koe.models import Feature, Aggregation, FullTensorData, Database
        from koe.ts_utils import bytes_to_ndarray, get_rawdata_from_binary

        database = Database.objects.get(name='Bellbird_TMI')
        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.all().order_by('id')
        features_hash = '-'.join(
            list(map(str, features.values_list('id', flat=True))))
        aggregations_hash = '-'.join(
            list(map(str, aggregations.values_list('id', flat=True))))

        full_tensor = FullTensorData.objects.filter(
            database=database,
            features_hash=features_hash,
            aggregations_hash=aggregations_hash).first()
        if full_tensor is None:
            raise Exception('Tensor not found')

        full_sids_path = full_tensor.get_sids_path()
        full_bytes_path = full_tensor.get_bytes_path()

        sids = bytes_to_ndarray(full_sids_path, np.int32)
        full_data = get_rawdata_from_binary(full_bytes_path, len(sids))

        with tictoc('PCA'):
            dim_reduce_func = pca(n_components=50)
            dim_reduce_func.fit_transform(full_data)
Example #3
def _calculate_similarity(sim, runner):
    dm = sim.dm
    ord = sim.ord

    assert dm.task is None or dm.task.is_completed(), \
        'Cannot calculate similarity because a previous error occurred when extracting features'
    assert ord is None or ord.task is None or ord.task.is_completed(),\
        'Cannot calculate similarity because a previous error occurred when constructing the ordination'

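    # Use the ordination's coordinates if one is attached; otherwise use the data matrix directly.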
    if ord:
        sids_path = ord.get_sids_path()
        source_bytes_path = ord.get_bytes_path()
    else:
        sids_path = dm.get_sids_path()
        source_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(sids_path, np.int32)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))
    coordinates[np.where(np.logical_not(np.isfinite(coordinates)))] = 0

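    # Average-linkage hierarchical clustering; the tree's natural leaf order (via argsort) becomes the stored similarity ranking.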
    runner.start()
    tree = linkage(coordinates, method='average')

    order = natural_order(tree)
    sorted_order = np.argsort(order).astype(np.int32)

    runner.wrapping_up()

    sim_sids_path = sim.get_sids_path()
    sim_bytes_path = sim.get_bytes_path()

    ndarray_to_bytes(sorted_order, sim_bytes_path)
    ndarray_to_bytes(sids, sim_sids_path)
Example #4
    def post_init(self, options):
        super(Command, self).post_init(options)

        dmid = options['dmid']
        ordid = options['ordid']
        self.class_aggregation = options['class_aggregation']

        if (dmid is None) == (ordid is None):
            raise Exception(
                'Exactly one of --dm-id and --ord-id must be given')

        if dmid:
            self.dm = get_or_error(DataMatrix, dict(id=dmid))
            self.ord = None
        else:
            self.ord = get_or_error(Ordination, dict(id=ordid))
            self.dm = self.ord.dm

        sids_path = self.dm.get_sids_path()
        source_bytes_path = self.dm.get_bytes_path()

        self.sids = bytes_to_ndarray(sids_path, np.int32)
        self.tids = get_tids(self.sids)
        coordinates = get_rawdata_from_binary(source_bytes_path,
                                              len(self.sids))
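        # Drop uninformative columns, z-score normalise, then zero out any NaN/Inf values.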
        coordinates = drop_useless_columns(coordinates)
        coordinates = zscore(coordinates)
        coordinates[np.where(np.isinf(coordinates))] = 0
        coordinates[np.where(np.isnan(coordinates))] = 0
        self.coordinates = coordinates
Example #5
def _calculate_similarity(sids_path, source_bytes_path, return_tree=False):
    sids = bytes_to_ndarray(sids_path, np.int32)
    coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))

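    # Cluster with average linkage; argsort of the natural leaf order gives each sid its rank in the similarity ordering.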
    tree = linkage(coordinates, method='average')
    order = natural_order(tree)
    sorted_order = np.argsort(order).astype(np.int32)
    if return_tree:
        return sids, sorted_order, tree
    return sids, sorted_order
Example #6
    def prepare_data_for_analysis(self, pkl_filename, options):
        label_level = options['label_level']
        cdm = options['cdm']
        dmid = options['dmid']
        annotator_name = options['annotator_name']

        methods = dict(mean=np.mean, median=np.median)
        method = get_or_error(
            methods, cdm,
            'Unknown value {} for --class-distance-method.'.format(cdm))
        dm = get_dm(dmid)
        sids_path = dm.get_sids_path()
        source_bytes_path = dm.get_bytes_path()

        sids = bytes_to_ndarray(sids_path, np.int32)
        coordinates = get_rawdata_from_binary(source_bytes_path, len(sids))
        coordinates = drop_useless_columns(coordinates)
        coordinates = zscore(coordinates)
        coordinates[np.where(np.isinf(coordinates))] = 0
        coordinates[np.where(np.isnan(coordinates))] = 0

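        # With an annotator, distances are computed between labelled classes; otherwise each syllable is treated as its own class.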
        if annotator_name is not None:
            annotator = get_or_error(User,
                                     dict(username__iexact=annotator_name))
            label_arr, syl_label_enum_arr = get_syllable_labels(
                annotator, label_level, sids)
            nlabels = len(label_arr)
            distmat, classes_info = calc_class_dist_by_syl_features(
                syl_label_enum_arr, nlabels, coordinates, method)
            dist_triu = mat2triu(distmat)
        else:
            dist_triu = distance.pdist(coordinates, 'euclidean')
            label_arr = []
            syl_label_enum_arr = []
            classes_info = []
            for sind, sid in enumerate(sids):
                label = str(sind)
                label_arr.append(label)
                syl_label_enum_arr.append(sind)
                classes_info.append([sind])

        tree = linkage(dist_triu, method='average')

        saved_dict = dict(tree=tree,
                          dbid=dm.database.id,
                          sids=sids,
                          unique_labels=label_arr,
                          classes_info=classes_info)

        with open(pkl_filename, 'wb') as f:
            pickle.dump(saved_dict, f)

        return saved_dict
Example #7
    def test_bytes_to_ndarray(self):
        django.setup()
        from koe.ts_utils import bytes_to_ndarray, ndarray_to_bytes

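        # Round-trip a random float32 matrix through ndarray_to_bytes/bytes_to_ndarray and check it survives intact.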
        arr = np.random.rand(100, 200).astype(np.float32)
        filename = '/tmp/{}.bytes'.format(uuid4().hex)

        ndarray_to_bytes(arr, filename)
        arr_ = bytes_to_ndarray(filename).reshape((100, 200))

        os.remove(filename)

        self.assertTrue(np.allclose(arr, arr_))
Example #8
def get_metadata(request, tensor_name):
    tensor = get_or_error(DerivedTensorData, dict(name=tensor_name))
    full_tensor = tensor.full_tensor

    full_sids_path = full_tensor.get_sids_path()
    sids = bytes_to_ndarray(full_sids_path, np.int32)

    metadata, headers = extract_tensor_metadata(sids, tensor.annotator)
    content = write_metadata(metadata, sids, headers)

    response = HttpResponse()
    response.write(content)
    response['Content-Type'] = 'text/tsv'
    response['Content-Length'] = len(content)
    return response
Example #9
def get_sid_info(request):
    sids_path = get_or_error(request.POST, 'path')
    if sids_path.startswith('/'):
        sids_path = sids_path[1:]
    sids = bytes_to_ndarray(sids_path, np.int32)
    preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
    ordered_segs = Segment.objects.filter(id__in=sids).order_by(preserved)
    value_list = ordered_segs.values_list('audio_file__id', 'audio_file__name',
                                          'start_time_ms', 'end_time_ms')
    seg_info = []
    song_info = {}

    for aid, aname, start, end in value_list:
        seg_info.append((aid, start, end))
        song_info[aid] = aname
    return seg_info, song_info
Example #10
def construct_ordination(task_id):
    task = get_or_wait(task_id)
    runner = TaskRunner(task)
    try:
        runner.preparing()

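        # The task target has the form '<ClassName>:<id>', here 'Ordination:<id>'.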
        cls, ord_id = task.target.split(':')
        ord_id = int(ord_id)
        assert cls == Ordination.__name__
        ord = Ordination.objects.get(id=ord_id)

        dm = ord.dm
        method_name = ord.method
        ndims = ord.ndims
        param_kwargs = Ordination.params_to_kwargs(ord.params)

        assert dm.task is None or dm.task.is_completed()
        assert method_name in methods.keys(), 'Unknown method {}'.format(
            method_name)
        assert 2 <= ndims <= 3, 'Only 2- or 3-dimensional ordinations are supported'

        runner.start()
        dm_sids_path = dm.get_sids_path()
        dm_bytes_path = dm.get_bytes_path()

        sids = bytes_to_ndarray(dm_sids_path, np.int32)
        dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))

        data = zscore(dm_data)
        data[np.where(np.isnan(data))] = 0
        data[np.where(np.isinf(data))] = 0

        method = methods[method_name]
        result = method(data, ndims, **param_kwargs)

        runner.wrapping_up()

        ord_sids_path = ord.get_sids_path()
        ord_bytes_path = ord.get_bytes_path()

        ndarray_to_bytes(result, ord_bytes_path)
        ndarray_to_bytes(sids, ord_sids_path)

        runner.complete()
    except Exception as e:
        runner.error(e)
Example #11
    def post_init(self, options):
        super(Command, self).post_init(options)

        dmid = options['dmid']
        self.dm = get_or_error(DataMatrix, dict(id=dmid))

        sids_path = self.dm.get_sids_path()
        source_bytes_path = self.dm.get_bytes_path()

        self.sids = bytes_to_ndarray(sids_path, np.int32)
        self.tids = get_tids(self.sids)
        coordinates = get_rawdata_from_binary(source_bytes_path,
                                              len(self.sids))
        coordinates = drop_useless_columns(coordinates)
        coordinates = zscore(coordinates)
        coordinates[np.where(np.isinf(coordinates))] = 0
        coordinates[np.where(np.isnan(coordinates))] = 0
        self.coordinates = coordinates
Example #12
    def check_rebuild_necessary(obj, when):
        ord_sids_path = obj.get_sids_path()
        ord_bytes_path = obj.get_bytes_path()

        status = ALL_GOOD

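        # MISSING: a binary file is absent. INCONSISTENT: some stored sids no longer exist as Segments.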
        is_missing = not (os.path.isfile(ord_sids_path)
                          and os.path.isfile(ord_bytes_path))

        if is_missing:
            status = MISSING
        else:
            sids = bytes_to_ndarray(ord_sids_path, np.int32)
            existing_count = Segment.objects.filter(id__in=sids).count()

            if len(sids) != existing_count:
                status = INCONSISTENT

        if when == 'missing':
            if status == MISSING:
                print('Re-constructing {} due to missing binary'.format(obj))
                return True
            else:
                print('Skip {} because its binary files are not missing'.
                      format(obj))
        elif when == 'inconsistent':
            if status == INCONSISTENT:
                print(
                    'Re-constructing {} due to binary being inconsistent with database'
                    .format(obj))
                return True
            elif status == MISSING:
                print('Re-constructing {} due to missing binary'.format(obj))
                return True
            else:
                print(
                    'Skip {} because its binary files are consistent with the database'
                    .format(obj))
        else:
            print('Forced re-constructing {}'.format(obj))
            return True

        return False
Example #13
def _construct_ordination(ord, runner):
    dm = ord.dm
    method_name = ord.method
    ndims = ord.ndims
    param_kwargs = Ordination.params_to_kwargs(ord.params)

    assert dm.task is None or dm.task.is_completed(
    ), 'Cannot construct ordination because its DataMatrix failed'
    assert method_name in methods.keys(), 'Unknown method {}'.format(
        method_name)
    assert 2 <= ndims <= 3, 'Only 2- or 3-dimensional ordinations are supported'

    runner.start()
    dm_sids_path = dm.get_sids_path()
    dm_bytes_path = dm.get_bytes_path()

    sids = bytes_to_ndarray(dm_sids_path, np.int32)
    dm_data = get_rawdata_from_binary(dm_bytes_path, len(sids))

    dm_dims = dm_data.shape[1]

    assert dm_data.shape[1] >= ndims, \
        'Data has only {} dimension(s), not enough to construct a {}-dimensional ordination'.format(dm_dims, ndims)

    data = zscore(dm_data)
    data[np.where(np.isnan(data))] = 0
    data[np.where(np.isinf(data))] = 0

    method = methods[method_name]
    result = method(data, ndims, **param_kwargs)
    result = result.astype(np.float32)

    runner.wrapping_up()

    ord_sids_path = ord.get_sids_path()
    ord_bytes_path = ord.get_bytes_path()

    ndarray_to_bytes(result, ord_bytes_path)
    ndarray_to_bytes(sids, ord_sids_path)
Example #14
def get_ordination_metadata(request, ord_id, viewas):
    ord = get_or_error(Ordination, dict(id=ord_id))

    sids_path = ord.get_sids_path()
    sids = bytes_to_ndarray(sids_path, np.int32)

    viewas = get_or_error(User, dict(username=viewas))

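    # extract_tensor_metadata raises KeyError for sids whose segments were deleted after the ordination was created.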
    try:
        metadata, headers = extract_tensor_metadata(sids, viewas)
    except KeyError as e:
        err_message = 'Syllable #{} has been deleted from the database since the creation of this ordination and ' \
                      'thus renders it invalid. Please choose another one.'.format(str(e))
        raise CustomAssertionError(err_message)

    content = write_metadata(metadata, sids, headers)

    response = HttpResponse()
    response.write(content)
    response['Content-Type'] = 'text/tsv'
    response['Content-Length'] = len(content)
    return response
Example #15
def bulk_get_segment_info(segs, extras):
    """
    Return rows contains Segments' information to display in SlickGrid
    :param segs: an array of segment object (or a QuerySet)
    :param extras: Must specify the user to get the correct ExtraAttrValue columns
    :return: [row]
    """
    viewas = extras.viewas
    holdout = extras.get('_holdout', 'false') == 'true'
    user = extras.user

    if 'database' in extras:
        database_id = extras.database
        current_database = get_or_error(Database, dict(id=database_id))
    else:
        database_id = extras.tmpdb
        current_database = get_or_error(TemporaryDatabase,
                                        dict(id=database_id))

    similarity_id = extras.similarity
    current_similarity = None
    if similarity_id:
        current_similarity = get_or_error(SimilarityIndex,
                                          dict(id=similarity_id))

    rows = []
    ids = []
    if current_database is None:
        return ids, rows

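    # Restrict the segments to the user's held-out ids, the temporary database's ids, or the selected database.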
    if holdout:
        ids_holder = ExtraAttrValue.objects.filter(
            attr=settings.ATTRS.user.hold_ids_attr,
            owner_id=user.id,
            user=user).first()

        if ids_holder is not None and ids_holder.value != '':
            ids = ids_holder.value.split(',')
            segs = segs.filter(id__in=ids)
    elif isinstance(current_database, TemporaryDatabase):
        ids = current_database.ids
        segs = segs.filter(id__in=ids)
    else:
        segs = segs.filter(audio_file__database=current_database.id)

    values = list(
        segs.values_list(
            'id',
            'tid',
            'start_time_ms',
            'end_time_ms',
            'audio_file__name',
            'audio_file__id',
            'audio_file__quality',
            'audio_file__added',
            'audio_file__track__name',
            'audio_file__track__date',
            'audio_file__individual__name',
            'audio_file__individual__gender',
        ))

    segids = [x[0] for x in values]
    song_ids = [x[5] for x in values]

    extra_attr_values_list = ExtraAttrValue.objects \
        .filter(user__username=viewas, attr__klass=Segment.__name__, owner_id__in=segids) \
        .values_list('owner_id', 'attr__name', 'value')

    song_extra_attr_values_list = ExtraAttrValue.objects \
        .filter(user__username=viewas, attr__klass=AudioFile.__name__, owner_id__in=song_ids) \
        .values_list('owner_id', 'attr__name', 'value')

    extra_attr_values_lookup = {}
    for id, attr, value in extra_attr_values_list:
        if id not in extra_attr_values_lookup:
            extra_attr_values_lookup[id] = {}
        extra_attr_dict = extra_attr_values_lookup[id]
        extra_attr_dict[attr] = value

    song_extra_attr_values_lookup = {}
    for id, attr, value in song_extra_attr_values_list:
        if id not in song_extra_attr_values_lookup:
            song_extra_attr_values_lookup[id] = {}
        extra_attr_dict = song_extra_attr_values_lookup[id]
        extra_attr_dict[attr] = value

    ids = np.array([x[0] for x in values], dtype=np.int32)

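    # If a similarity index is selected, map each segment id to its position in the stored ordering.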
    if current_similarity is None:
        id2order = {}
    else:
        sim_sids_path = current_similarity.get_sids_path()
        sim_bytes_path = current_similarity.get_bytes_path()

        sim_sids = bytes_to_ndarray(sim_sids_path, np.int32).tolist()
        sim_order = np.squeeze(
            get_rawdata_from_binary(sim_bytes_path, len(sim_sids),
                                    np.int32)).tolist()
        id2order = dict(zip(sim_sids, sim_order))

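    # Build one grid row per segment, then merge in segment-level and song-level extra attributes.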
    for id, tid, start, end, song_name, song_id, quality, added, track, date, individual, gender in values:
        sim_index = id2order.get(id, None)

        duration = end - start
        url = reverse('segmentation', kwargs={'file_id': song_id})
        url = '[{}]({})'.format(url, song_name)
        row = dict(
            id=id,
            start_time_ms=start,
            end_time_ms=end,
            duration=duration,
            song=url,
            sim_index=sim_index,
            song_track=track,
            song_individual=individual,
            sex=gender,
            song_quality=quality,
            record_date=date,
            song_added=added.date(),
            spectrogram=tid,
        )
        extra_attr_dict = extra_attr_values_lookup.get(id, {})
        song_extra_attr_dict = song_extra_attr_values_lookup.get(song_id, {})

        for attr in extra_attr_dict:
            row[attr] = extra_attr_dict[attr]

        for song_attr in song_extra_attr_dict:
            attr = 'song_{}'.format(song_attr)
            row[attr] = song_extra_attr_dict[song_attr]

        rows.append(row)

    return ids, rows
Example #16
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        profile = options['profile']
        dm_name = options['dm_name']

        tsv_file = profile + '.tsv'
        trials_file = profile + '.trials'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio, test_ratio = get_ratios(ratio_)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

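        # Locate the data matrix either by the enabled feature/aggregation hashes or by an explicit name.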
        if dm_name is None:
            features = Feature.objects.all().order_by('id')
            aggregations = Aggregation.objects.filter(
                enabled=True).order_by('id')
            aggregators = [aggregator_map[x.name] for x in aggregations]

            enabled_features = []
            for f in features:
                if f.name in feature_map:
                    enabled_features.append(f)

            features_hash = '-'.join(
                list(map(str, [x.id for x in enabled_features])))
            aggregations_hash = '-'.join(
                list(map(str, aggregations.values_list('id', flat=True))))

            dm = DataMatrix.objects.filter(
                database=database,
                features_hash=features_hash,
                aggregations_hash=aggregations_hash).last()
            if dm is None:
                raise Exception('No full data matrix for database {}'.format(
                    database_name))
        else:
            dm = DataMatrix.objects.filter(database=database,
                                           name=dm_name).first()
            if dm is None:
                raise Exception('No such matrix {} for database {}'.format(
                    dm_name, database_name))

            if dm.aggregations_hash:
                aggregations_list = dm.aggregations_hash.split('-')
                aggregators = [aggregator_map[x] for x in aggregations_list]
            else:
                aggregators = []

            features = Feature.objects.filter(
                id__in=dm.features_hash.split('-'))
            ftgroup_names = {
                'custom': list(features.values_list('name', flat=True))
            }

        dm_sids_path = dm.get_sids_path()
        dm_tids_path = dm.get_tids_path()
        dm_bytes_path = dm.get_bytes_path()
        feature_cols = dm.get_cols_path()
        with open(feature_cols, 'r', encoding='utf-8') as f:
            col_inds = json.load(f)

        _sids = bytes_to_ndarray(dm_sids_path, np.int32)
        _sids, sort_order = np.unique(_sids, return_index=True)

        try:
            _tids = bytes_to_ndarray(dm_tids_path, np.int32)
            _tids = _tids[sort_order]
        except FileNotFoundError:
            _tids = get_tids(_sids)

        full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
        full_data = full_data[sort_order, :]

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels,
                                                   no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        for ftgroup_name, feature_names in ftgroup_names.items():
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggregators:
                        col_name = '{}_{}'.format(feature.name,
                                                  aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

            dp = EnumDataProvider(data, labels, balanced=True)
            trainvalidset, testset = dp.split(test_ratio,
                                              limits=(ipc_min, ipc_max))

            v2t_ratio = valid_ratio / (train_ratio + valid_ratio)
            nfolds = int(np.floor(1. / v2t_ratio + 0.01))

            params_names = []
            params_converters = []
            params_count = 0

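            # Hyperopt objective: convert sampled values into classifier arguments, run k-fold CV, return 1 - accuracy.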
            def loss(params):
                classifier_args = {}
                for i in range(params_count):
                    param_name = params_names[i]
                    param_converter = params_converters[i]
                    param_value = params[i]
                    classifier_args[param_name] = param_converter(param_value)

                print(classifier_args)
                score = perform_k_fold(classifier, trainvalidset, nfolds,
                                       v2t_ratio, nlabels, **classifier_args)
                return 1. - score

            n_estimators_choices = hp.uniform('n_estimators', 40, 100)
            min_samples_split_choices = hp.uniform('min_samples_split', 2, 21)
            min_samples_leaf_choices = hp.uniform('min_samples_leaf', 1, 20)

            n_features = data.shape[1]
            auto_gamma = 1 / n_features
            gamma_choices = hp.uniform('gamma', auto_gamma / 10,
                                       auto_gamma * 10)
            c_choices = hp.uniform('C', -1, 2)
            hidden_layer_size_choices = hp.uniform('hidden_layer_sizes', 100,
                                                   5000)
            n_neighbors_choices = hp.uniform('n_neighbors', 1, 10)

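            # Per-classifier search space: each argument maps to (converter, hyperopt distribution).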
            choices = {
                'rf': {
                    'n_estimators':
                    (lambda x: int(np.round(x)), n_estimators_choices),
                    'min_samples_split':
                    (lambda x: int(np.round(x)), min_samples_split_choices),
                    'min_samples_leaf':
                    (lambda x: int(np.round(x)), min_samples_leaf_choices),
                },
                'svm_rbf': {
                    'gamma': (float, gamma_choices),
                    'C': (lambda x: 10**x, c_choices),
                },
                'svm_linear': {
                    'C': (lambda x: 10**x, c_choices),
                },
                'nnet': {
                    'hidden_layer_sizes':
                    (lambda x: (int(np.round(x)), ), hidden_layer_size_choices)
                },
                'knn': {
                    'n_neighbors':
                    (lambda x: int(np.round(x)), n_neighbors_choices)
                }
            }

            space = []
            for arg_name, (converter,
                           arg_values) in choices[clsf_type].items():
                space.append(arg_values)
                params_names.append(arg_name)
                params_converters.append(converter)
                params_count += 1

            trials = Trials()
            max_evals = params_count * 30
            best = fmin(fn=loss,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=max_evals,
                        trials=trials)
            print(best)

            with open(trials_file, 'wb') as f:
                pickle.dump(trials, f)

            best_trial = trials.best_trial
            best_trial_args_values_ = best_trial['misc']['vals']
            best_trial_args_values = {}
            for arg_name, arg_values in best_trial_args_values_.items():
                converter = choices[clsf_type][arg_name][0]
                arg_value = converter(arg_values[0])
                best_trial_args_values[arg_name] = arg_value

            model_args = ['id'] + list(
                best_trial_args_values.keys()) + ['accuracy']

            model_args_values = {x: [] for x in model_args}
            for idx, trial in enumerate(trials.trials):
                if trial == best_trial:
                    idx = 'Best'
                trial_args_values = trial['misc']['vals']
                for arg_name in model_args:
                    if arg_name == 'id':
                        model_args_values['id'].append(idx)
                    elif arg_name == 'accuracy':
                        trial_accuracy = 1. - trial['result']['loss']
                        model_args_values['accuracy'].append(trial_accuracy)
                    else:
                        # choice = choices[clsf_type][arg_name]
                        converter = choices[clsf_type][arg_name][0]
                        val = converter(trial_args_values[arg_name][0])
                        # val = choice[choice_idx]
                        model_args_values[arg_name].append(val)

            # Perform classification on the test set
            train_x = np.array(trainvalidset.data)
            train_y = np.array(trainvalidset.labels, dtype=np.int32)
            test_x = np.array(testset.data)
            test_y = np.array(testset.labels, dtype=np.int32)

            score, label_hits, label_misses, cfmat, importances =\
                classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_trial_args_values)
            lb_hitrates = label_hits / (label_hits + label_misses).astype(
                np.float)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                for arg in model_args:
                    values = model_args_values[arg]
                    f.write('{}\t'.format(arg))
                    f.write('\t'.join(map(str, values)))
                    f.write('\n')

                f.write('Results using best-model\'s parameters on testset\n')

                if source == 'full':
                    f.write(
                        'Feature group\tNdims\tLabel prediction score\t{}\n'.
                        format('\t '.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, score,
                        '\t'.join(map(str, lb_hitrates))))
                else:
                    f.write(
                        'Feature group\tNdims\tPCA explained\tPCA Dims\tLabel prediction score\t{}\n'
                        .format('\t '.join(unique_labels)))
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, explained, pca_dims, score,
                        '\t'.join(map(str, lb_hitrates))))
                f.write('\n')
                open_mode = 'a'
Example #17
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ipc = options['ipc']
        ratio_ = options['ratio']
        niters = options['niters']
        profile = options.get('profile', None)
        tsv_file = profile + '.tsv'
        if ipc is not None:
            assert ipc <= min_occur, 'Instances per class cannot exceed min-occur'
            ipc_min = ipc
            ipc_max = ipc
        else:
            ipc_min = min_occur
            ipc_max = int(np.floor(min_occur * 1.5))

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        open_mode = 'w'

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(
            clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')
        aggregators = [aggregator_map[x.name] for x in aggregations]

        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        features_hash = '-'.join(
            list(map(str, [x.id for x in enabled_features])))
        aggregations_hash = '-'.join(
            list(map(str, aggregations.values_list('id', flat=True))))

        dm = DataMatrix.objects.filter(
            database=database,
            features_hash=features_hash,
            aggregations_hash=aggregations_hash).last()
        if dm is None:
            raise Exception(
                'No full data matrix for database {}'.format(database_name))

        dm_sids_path = dm.get_sids_path()
        dm_tids_path = dm.get_tids_path()
        dm_bytes_path = dm.get_bytes_path()
        feature_cols = dm.get_cols_path()
        with open(feature_cols, 'r', encoding='utf-8') as f:
            col_inds = json.load(f)

        _sids = bytes_to_ndarray(dm_sids_path, np.int32)
        _sids, sort_order = np.unique(_sids, return_index=True)

        try:
            _tids = bytes_to_ndarray(dm_tids_path, np.int32)
            _tids = _tids[sort_order]
        except FileNotFoundError:
            _tids = get_tids(_sids)

        full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
        full_data = full_data[sort_order, :]

        labels, no_label_ids = get_labels_by_sids(_sids, label_level,
                                                  annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels,
                                                   no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        for ftgroup_name, feature_names in ftgroup_names.items():
            if ftgroup_name == 'all':
                features = list(feature_map.values())
            else:
                features = [feature_map[x] for x in feature_names]
            ft_col_inds = []
            for feature in features:
                if feature.is_fixed_length:
                    col_name = feature.name
                    col_range = col_inds[col_name]
                    ft_col_inds += range(col_range[0], col_range[1])
                else:
                    for aggregator in aggregators:
                        col_name = '{}_{}'.format(feature.name,
                                                  aggregator.get_name())
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])

            ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
            ndims = len(ft_col_inds)
            data = full_data[:, ft_col_inds]

            if source == 'pca':
                explained, data = pca_optimal(data, ndims, 0.9)
                pca_dims = data.shape[1]

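            # Load results of a previous hyperopt run and pick the best parameters for this feature group and source.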
            with open('/tmp/hyperopt.pkl', 'rb') as f:
                saved = pickle.load(f)

            performance_data = saved[clsf_type]
            accuracies = performance_data['accuracies']
            groups = performance_data['groups']
            params = performance_data['params']

            group_name = '{}-{}'.format(ftgroup_name, source)
            group_member_inds = np.where(groups == group_name)
            group_accuracies = accuracies[group_member_inds]

            best_acc_idx = np.argmax(group_accuracies)

            group_params = {}
            best_params = {}
            for param_name in params:
                param_values = np.array(params[param_name])
                group_param_values = param_values[group_member_inds]
                group_params[param_name] = group_param_values

                converter = converters[clsf_type][param_name]
                best_params[param_name] = converter(
                    group_param_values[best_acc_idx])

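            # Repeat k-fold evaluation niters times with the best parameters, accumulating scores, hit rates and confusion matrices.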
            dp = EnumDataProvider(data, labels, balanced=True)

            nfolds = int(np.floor(1 / valid_ratio + 0.01))
            ntrials = nfolds * niters
            label_prediction_scores = [0] * ntrials
            label_hitss = [0] * ntrials
            label_missess = [0] * ntrials
            label_hitrates = np.empty((ntrials, nlabels))
            label_hitrates[:] = np.nan
            importancess = np.empty((ntrials, data.shape[1]))
            cfmats = np.ndarray((ntrials, nlabels, nlabels))

            ind = 0

            bar = Bar('Features: {}. Classifier: {} Data type: {}...'.format(
                ftgroup_name, clsf_type, source),
                      max=ntrials)

            for iter in range(niters):
                traintetset, _ = dp.split(0, limits=(ipc_min, ipc_max))
                traintetset.make_folds(nfolds, valid_ratio)
                for k in range(nfolds):
                    trainset, testset = traintetset.get_fold(k)
                    train_x = np.array(trainset.data)
                    train_y = np.array(trainset.labels, dtype=np.int32)
                    test_x = np.array(testset.data)
                    test_y = np.array(testset.labels, dtype=np.int32)

                    score, label_hits, label_misses, cfmat, importances = \
                        classifier(train_x, train_y, test_x, test_y, nlabels, True, **best_params)

                    label_prediction_scores[ind] = score
                    label_hitss[ind] = label_hits
                    label_missess[ind] = label_misses

                    label_hitrate = label_hits / (
                        label_hits + label_misses).astype(np.float)

                    label_hitrates[ind, :] = label_hitrate
                    importancess[ind, :] = importances
                    cfmats[ind, :, :] = cfmat

                    bar.next()
                    ind += 1
            bar.finish()

            mean_label_prediction_scores = np.nanmean(label_prediction_scores)
            std_label_prediction_scores = np.nanstd(label_prediction_scores)
            sum_cfmat = np.nansum(cfmats, axis=0)

            with open(tsv_file, open_mode, encoding='utf-8') as f:
                if source == 'full':
                    f.write('{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, mean_label_prediction_scores,
                        std_label_prediction_scores,
                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                else:
                    f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        ftgroup_name, ndims, explained, pca_dims,
                        mean_label_prediction_scores,
                        std_label_prediction_scores,
                        '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))

                f.write('Accuracy: \n')
                f.write('\t'.join(list(map(str, label_prediction_scores))))
                f.write('\n')
                f.write('\t')
                f.write('\t'.join(unique_labels))
                f.write('\n')
                for i in range(nlabels):
                    label = unique_labels[i]
                    cfrow = sum_cfmat[:, i]
                    f.write(label)
                    f.write('\t')
                    f.write('\t'.join(map(str, cfrow)))
                    f.write('\n')
                f.write('\n')
                open_mode = 'a'
Example #18
    def perform_action(self, when, remove_dead):
        for dm in DataMatrix.objects.all():
            need_reconstruct = self.check_rebuild_necessary(dm, when)

            if not need_reconstruct:
                continue

            full_sids_path = dm.get_sids_path()
            full_bytes_path = dm.get_bytes_path()
            full_cols_path = dm.get_cols_path()

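            # Resolve the sids from the stored binary if present, otherwise from the database (or the temporary database).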
            if dm.database:
                if os.path.isfile(full_sids_path):
                    sids = bytes_to_ndarray(full_sids_path, np.int32)
                else:
                    sids = Segment.objects.filter(
                        audio_file__database=dm.database).values_list(
                            'id', flat=True)
                dbname = dm.database.name
            else:
                sids = dm.tmpdb.ids
                dbname = dm.tmpdb.name

            segments = Segment.objects.filter(id__in=sids)

            if len(segments) == 0:
                print('Skip DM #{}-{}-{}: '.format(dm.id, dbname, dm.name))

                if remove_dead:
                    print('Delete {}'.format(dm))
                    for f in [full_sids_path, full_bytes_path, full_cols_path]:
                        print('Remove binary file {}'.format(f))
                        try:
                            os.remove(f)
                        except FileNotFoundError:
                            pass
                    dm.delete()
                continue

            tids = np.array(segments.values_list('tid', flat=True),
                            dtype=np.int32)

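            # Recover feature and aggregation lists from the stored hashes, dropping any that are no longer available.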
            features_ids = dm.features_hash.split('-')
            features = list(Feature.objects.filter(id__in=features_ids))

            aggregations_ids = dm.aggregations_hash.split('-')
            aggregations = Aggregation.objects.filter(id__in=aggregations_ids)

            available_feature_names = feature_extractors.keys()
            disabled_features_names = [
                x.name for x in features
                if x.name not in available_feature_names
            ]

            if len(disabled_features_names):
                warning(
                    'DM #{}-{}-{}: Features {} are no longer available'.format(
                        dm.id, dbname, dm.name, disabled_features_names))
                features = [
                    x for x in features if x.name in available_feature_names
                ]

            available_aggregator_names = aggregator_map.keys()
            disabled_aggregators_names = [
                x.name for x in aggregations
                if x.name not in available_aggregator_names
            ]

            if len(disabled_aggregators_names):
                warning('DM #{}-{}-{}: Aggregations {} are no longer available'.
                        format(dm.id, dbname, dm.name,
                               disabled_aggregators_names))
                aggregations = [
                    x for x in aggregations
                    if x.name in available_aggregator_names
                ]

            aggregators = [aggregator_map[x.name] for x in aggregations]

            runner = ConsoleTaskRunner(
                prefix='Extract measurement for DM #{}-{}-{}: '.format(
                    dm.id, dbname, dm.name))
            runner.preparing()
            extract_segment_features_for_segments(runner,
                                                  sids,
                                                  features,
                                                  force=False)
            runner.wrapping_up()

            child_runner = ConsoleTaskRunner(
                prefix='Aggregate measurement for DM #{}-{}-{}: '.format(
                    dm.id, dbname, dm.name))
            child_runner.preparing()

            aggregate_feature_values(child_runner, tids, features, aggregators)
            child_runner.wrapping_up()
            child_runner.complete()

            runner.complete()

            data, col_inds = extract_rawdata(tids, features, aggregators)

            ndarray_to_bytes(data, full_bytes_path)
            ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)

            with open(full_cols_path, 'w', encoding='utf-8') as f:
                json.dump(col_inds, f)

            dm.ndims = data.shape[1]
            dm.save()
Example #19
    def handle(self, *args, **options):
        clsf_type = options['clsf_type']
        database_name = options['database_name']
        source = options['source']
        annotator_name = options['annotator_name']
        label_level = options['label_level']
        min_occur = options['min_occur']
        ratio_ = options['ratio']
        niters = options['niters']
        csv_filename = options.get('csv_filename', None)

        train_ratio, valid_ratio = get_ratios(ratio_, 2)

        assert clsf_type in classifiers.keys(), 'Unknown _classify: {}'.format(clsf_type)
        classifier = classifiers[clsf_type]

        database = get_or_error(Database, dict(name__iexact=database_name))
        annotator = get_or_error(User, dict(username__iexact=annotator_name))

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.filter(enabled=True).order_by('id')

        enabled_features = []
        for f in features:
            if f.name in feature_map:
                enabled_features.append(f)

        features_hash = '-'.join(list(map(str, [x.id for x in enabled_features])))
        aggregations_hash = '-'.join(list(map(str, aggregations.values_list('id', flat=True))))

        dm = DataMatrix.objects.filter(database=database, features_hash=features_hash,
                                       aggregations_hash=aggregations_hash).last()
        if dm is None:
            raise Exception('No full data matrix for database {}'.format(database_name))

        dm_sids_path = dm.get_sids_path()
        dm_tids_path = dm.get_tids_path()
        dm_bytes_path = dm.get_bytes_path()
        feature_cols = dm.get_cols_path()
        with open(feature_cols, 'r', encoding='utf-8') as f:
            col_inds = json.load(f)

        _sids = bytes_to_ndarray(dm_sids_path, np.int32)
        _sids, sort_order = np.unique(_sids, return_index=True)

        try:
            _tids = bytes_to_ndarray(dm_tids_path, np.int32)
            _tids = _tids[sort_order]
        except FileNotFoundError:
            _tids = get_tids(_sids)

        full_data = get_rawdata_from_binary(dm_bytes_path, len(_sids))
        full_data = full_data[sort_order, :]

        labels, no_label_ids = get_labels_by_sids(_sids, label_level, annotator, min_occur)

        if len(no_label_ids) > 0:
            sids, tids, labels = exclude_no_labels(_sids, _tids, labels, no_label_ids)
            lookup_ids_rows = np.searchsorted(_sids, sids)
            full_data = full_data[lookup_ids_rows, :]

        full_data = zscore(full_data)
        full_data[np.where(np.isnan(full_data))] = 0
        full_data[np.where(np.isinf(full_data))] = 0

        unique_labels = np.unique(labels)
        nlabels = len(unique_labels)

        if csv_filename:
            with open(csv_filename, 'w', encoding='utf-8') as f:
                if source == 'pca':
                    f.write('Feature group\tAggregators\tNdims\tPCA explained\tPCA Dims\tLabel prediction mean\tstdev'
                            '\t{}\n'.format('\t '.join(unique_labels)))
                else:
                    f.write('Feature group\tAggregators\tNdims\tLabel prediction mean\tstdev\t{}\n'
                            .format('\t '.join(unique_labels)))

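        # Evaluate every combination of feature group and aggregator group with repeated k-fold cross-validation.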
        for ftgroup_name, feature_names in ftgroup_names.items():
            for agggroup_name, aggs in list(enabled_aggregators.items()) + [('all', None)]:
                if agggroup_name == 'all':
                    aggs = [aggregator_map[x.name] for x in aggregations]
                if ftgroup_name == 'all':
                    features = list(feature_map.values())
                else:
                    features = [feature_map[x] for x in feature_names]
                ft_col_inds = []
                for feature in features:
                    if feature.is_fixed_length:
                        col_name = feature.name
                        col_range = col_inds[col_name]
                        ft_col_inds += range(col_range[0], col_range[1])
                    else:
                        for aggregator in aggs:
                            col_name = '{}_{}'.format(feature.name, aggregator.get_name())
                            col_range = col_inds[col_name]
                            ft_col_inds += range(col_range[0], col_range[1])

                ft_col_inds = np.array(ft_col_inds, dtype=np.int32)
                ndims = len(ft_col_inds)
                data = full_data[:, ft_col_inds]

                if source == 'pca':
                    explained, data = pca_optimal(data, ndims, 0.9)
                    pca_dims = data.shape[1]

                dp = EnumDataProvider(data, labels, balanced=True)

                nfolds = int(np.floor(1 / valid_ratio + 0.01))
                ntrials = nfolds * niters
                label_prediction_scores = [0] * ntrials
                label_hitss = [0] * ntrials
                label_missess = [0] * ntrials
                label_hitrates = np.empty((ntrials, nlabels))
                label_hitrates[:] = np.nan
                importancess = np.empty((ntrials, data.shape[1]))
                cfmats = np.ndarray((ntrials, nlabels, nlabels))

                ind = 0

                bar = Bar('Features: {}. Aggregator: {}. Classifier: {} Data type: {}...'
                          .format(ftgroup_name, agggroup_name, clsf_type, source), max=ntrials)

                for iter in range(niters):
                    traintetset, _ = dp.split(0, limits=(min_occur, int(np.floor(min_occur * 1.5))))
                    traintetset.make_folds(nfolds, valid_ratio)
                    for k in range(nfolds):
                        trainset, testset = traintetset.get_fold(k)
                        train_x = np.array(trainset.data)
                        train_y = np.array(trainset.labels, dtype=np.int32)
                        test_x = np.array(testset.data)
                        test_y = np.array(testset.labels, dtype=np.int32)

                        score, label_hits, label_misses, cfmat, importances = \
                            classifier(train_x, train_y, test_x, test_y, nlabels, True)

                        label_prediction_scores[ind] = score
                        label_hitss[ind] = label_hits
                        label_missess[ind] = label_misses

                        label_hitrate = label_hits / (label_hits + label_misses).astype(np.float)

                        label_hitrates[ind, :] = label_hitrate
                        importancess[ind, :] = importances
                        cfmats[ind, :, :] = cfmat

                        bar.next()
                        ind += 1
                bar.finish()

                mean_label_prediction_scores = np.nanmean(label_prediction_scores)
                std_label_prediction_scores = np.nanstd(label_prediction_scores)
                sum_cfmat = np.nansum(cfmats, axis=0)

                if csv_filename:
                    with open(csv_filename, 'a', encoding='utf-8') as f:
                        if source == 'full':
                            f.write('{}\t{}\t{}\t{}\t{}\t{}\n'
                                    .format(ftgroup_name, agggroup_name, ndims, mean_label_prediction_scores,
                                            std_label_prediction_scores,
                                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                        else:
                            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'
                                    .format(ftgroup_name, agggroup_name, ndims, explained, pca_dims,
                                            mean_label_prediction_scores, std_label_prediction_scores,
                                            '\t'.join(map(str, np.nanmean(label_hitrates, 0)))))
                        f.write('\t')
                        f.write('\t'.join(unique_labels))
                        f.write('\n')
                        for i in range(nlabels):
                            label = unique_labels[i]
                            cfrow = sum_cfmat[:, i]
                            f.write(label)
                            f.write('\t')
                            f.write('\t'.join(map(str, cfrow)))
                            f.write('\n')
                        f.write('\n')
                else:
                    print('{}/{}: {} by {}: mean = {} std = {}'
                          .format(ftgroup_name, agggroup_name, clsf_type, source, mean_label_prediction_scores,
                                  std_label_prediction_scores))
Example #20
def create_derived_tensor(full_tensor, annotator, dim_reduce, ndims, recreate):
    admin = get_or_error(User, dict(username__iexact='superuser'))
    full_sids_path = full_tensor.get_sids_path()
    full_bytes_path = full_tensor.get_bytes_path()

    sids = bytes_to_ndarray(full_sids_path, np.int32)
    full_data = get_rawdata_from_binary(full_bytes_path, len(sids))

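    # The number of output components is capped at half the number of feature columns.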
    if dim_reduce != 'none':
        dim_reduce_fun = reduce_funcs[dim_reduce]
        n_feature_cols = full_data.shape[1]
        n_components = min(n_feature_cols // 2, ndims)
    else:
        dim_reduce_fun = None
        n_components = None

    derived_tensor = DerivedTensorData.objects.filter(
        database=full_tensor.database,
        full_tensor=full_tensor,
        features_hash=full_tensor.features_hash,
        aggregations_hash=full_tensor.aggregations_hash,
        ndims=n_components,
        dimreduce=dim_reduce,
        creator=admin,
        annotator=annotator).first()
    if derived_tensor and not recreate:
        print(
            'Derived tensor {} already exists. If you want to recreate it, use the --recreate flag'
            .format(derived_tensor.name))
        return derived_tensor, False

    if derived_tensor is None:
        derived_tensors_name = uuid.uuid4().hex
        derived_tensor = DerivedTensorData(
            name=derived_tensors_name,
            database=full_tensor.database,
            full_tensor=full_tensor,
            features_hash=full_tensor.features_hash,
            aggregations_hash=full_tensor.aggregations_hash,
            dimreduce=dim_reduce,
            ndims=n_components,
            creator=admin,
            annotator=annotator)

    derived_cfg_path = derived_tensor.get_config_path()

    if dim_reduce_fun:

        # TSNE needs normalisation first
        if dim_reduce.startswith('tsne'):
            full_data = zscore(full_data)
            full_data[np.where(np.isnan(full_data))] = 0
            full_data[np.where(np.isinf(full_data))] = 0

        dim_reduced_data = dim_reduce_fun(full_data, n_components)
        derived_bytes_path = derived_tensor.get_bytes_path()
        ndarray_to_bytes(dim_reduced_data, derived_bytes_path)
        tensor_shape = dim_reduced_data.shape
        tensor_path = '/' + derived_bytes_path
    else:
        tensor_shape = full_data.shape
        tensor_path = '/' + full_bytes_path

    # Always write config last - to make sure it's not missing anything
    embedding = dict(
        tensorName=derived_tensor.name,
        tensorShape=tensor_shape,
        tensorPath=tensor_path,
        metadataPath=reverse('tsne-meta',
                             kwargs={'tensor_name': derived_tensor.name}),
    )
    config = dict(embeddings=[embedding])
    write_config(config, derived_cfg_path)

    derived_tensor.save()
    return derived_tensor, True
Example #21
    def handle(self, database_name, population_name, type, perplexity,
               normalised, *args, **kwargs):
        database = get_or_error(Database, dict(name__iexact=database_name))
        assert type in ['tsne2', 'tsne3', 'mds', 'mdspca']

        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.all().order_by('id')

        features_hash = '-'.join(
            list(map(str, features.values_list('id', flat=True))))
        aggregations_hash = '-'.join(
            list(map(str, aggregations.values_list('id', flat=True))))

        full_tensor = FullTensorData.objects.filter(
            database=database,
            features_hash=features_hash,
            aggregations_hash=aggregations_hash).first()

        if full_tensor is None:
            raise Exception(
                'Full feature matrix not found. Need to create FullTensor first.'
            )

        full_sids_path = full_tensor.get_sids_path()
        full_bytes_path = full_tensor.get_bytes_path()

        full_sids = bytes_to_ndarray(full_sids_path, np.int32)
        full_data = get_rawdata_from_binary(full_bytes_path, len(full_sids))

        sids, tids = get_sids_tids(database, population_name)

        normalised_str = 'normed' if normalised else 'raw'
        if type.startswith('tsne'):
            file_name = '{}_{}_{}_{}_{}.pkl'.format(database_name,
                                                    population_name, type,
                                                    perplexity, normalised_str)
        else:
            file_name = '{}_{}_{}_{}.pkl'.format(database_name,
                                                 population_name, type,
                                                 normalised_str)
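        # Reuse previously computed coordinates if a pickle for this exact configuration already exists.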
        if os.path.isfile(file_name):
            with open(file_name, 'rb') as f:
                saved = pickle.load(f)
                coordinate = saved['coordinate']
                stress = saved['stress']
        else:
            population_data = cherrypick_tensor_data_by_sids(
                full_data, full_sids, sids).astype(np.float64)

            if normalised:
                population_data = zscore(population_data)

            population_data[np.where(np.isnan(population_data))] = 0
            population_data[np.where(np.isinf(population_data))] = 0

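            # MDS runs on a precomputed Euclidean distance matrix (after PCA for 'mdspca'); t-SNE runs on 50 PCA components.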
            if type.startswith('mds'):
                if type == 'mdspca':
                    dim_reduce_func = PCA(n_components=50)
                    population_data = dim_reduce_func.fit_transform(
                        population_data, y=None)
                    if hasattr(dim_reduce_func, 'explained_variance_ratio_'):
                        print(
                            'Cumulative explained variation for {} principal components: {}'
                            .format(
                                50,
                                np.sum(dim_reduce_func.
                                       explained_variance_ratio_)))

                similarities = squareform(pdist(population_data, 'euclidean'))

                model = MDS(n_components=3,
                            dissimilarity='precomputed',
                            random_state=7,
                            verbose=1,
                            max_iter=1000)
                coordinate = model.fit_transform(similarities)
                stress = model.stress_
            else:
                ntsne_dims = int(type[4:])
                dim_reduce_func = PCA(n_components=50)
                population_data = dim_reduce_func.fit_transform(
                    population_data, y=None)

                print('Cumulative explained variation: {}'.format(
                    np.sum(dim_reduce_func.explained_variance_ratio_)))

                time_start = time.time()
                tsne = TSNE(n_components=ntsne_dims,
                            verbose=1,
                            perplexity=perplexity,
                            n_iter=4000)
                coordinate = tsne.fit_transform(population_data)
                print(
                    't-SNE done! Time elapsed: {} seconds'.format(time.time() -
                                                                  time_start))
                stress = None

        with open(file_name, 'wb') as f:
            pickle.dump(dict(coordinate=coordinate,
                             stress=stress,
                             sids=sids,
                             tids=tids),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)