Example #1
def __dump_numpy_txt():
    import hashlib
    def dump_to_dir(dataset, out_dir, dataset_name):
        if not os.path.exists(os.path.join(out_dir, dataset_name)):
            os.makedirs(os.path.join(out_dir, dataset_name))

        for i, item in enumerate(dataset.data):
            # deterministic file name derived from the dataset name and example index
            name = hashlib.sha256('{name}{idx}'.format(name=dataset_name, idx=i).encode()).hexdigest() + ".txt"
            word_dir = os.path.join(out_dir, dataset_name, dataset.idx2word[i])
            if not os.path.exists(word_dir):
                os.makedirs(word_dir)
            np.savetxt(os.path.join(word_dir, name), item)

    start = time.time()

    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')
    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                              supplement_rare_with_noisy=False,
                              supplement_seed=112)
    dump_to_dir(data_train, current_dataset, 'train')

    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
    dump_to_dir(data_dev, current_dataset, 'dev')

    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
    dump_to_dir(data_test, current_dataset, 'test')

    print('dump: {0}'.format(time.time() - start))
Example #2
def do_calculate_ap(run_dir, epoch, dataset=None, partition='dev'):
    run_name2get_embeddings = {
        'classifier': get_classifier_embeddings,
        'siamese': get_siamese_embeddings
    }

    net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
        load_net(run_dir, epoch=None, logger=None, train=False)
    get_embeddings = run_name2get_embeddings[run_name]

    if dataset is None:
        dataset = current_dataset

    train_path, dev_path, test_path = get_dataset_paths(dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=None)

    if len(checkpoints) == 0:
        print('No checkpoints found in {0} for run {1}'.format(
            checkpoint_dir, run_dir))
        print('Exiting')
        sys.exit(-1)

    if partition == 'train':
        data_train = DatasetClass(train_path,
                                  parent_dataset_path=train_scp,
                                  training=False,
                                  logger=None,
                                  mean_subtraction=mean_sub,
                                  variance_normalization=var_norm)
        return get_epoch_ap(net,
                            config,
                            checkpoints,
                            loss,
                            data_train,
                            epoch,
                            get_embeddings,
                            subsample_size=3000)

    if partition == 'dev':
        data_dev = DatasetClass(dev_path,
                                parent_dataset_path=train_scp,
                                training=False,
                                logger=None,
                                mean_subtraction=mean_sub,
                                variance_normalization=var_norm)
        return get_epoch_ap(net, config, checkpoints, loss, data_dev, epoch,
                            get_embeddings)

    if partition == 'test':
        data_test = DatasetClass(test_path,
                                 parent_dataset_path=train_scp,
                                 training=False,
                                 logger=None,
                                 mean_subtraction=mean_sub,
                                 variance_normalization=var_norm)
        return get_epoch_ap(net, config, checkpoints, loss, data_test, epoch,
                            get_embeddings)
Example #3
def split_snodgrass_dataset(source_sub_dir,
                            snodgrass_file,
                            same_split_as=None):
    """~50% train, ~25% percent dev, ~25% test, while taking care that each patient's data is present exclusively
     in either train, dev, or test sets."""
    snodgrass = os.path.join(processed_data_dir, source_sub_dir,
                             snodgrass_file)
    lines = []
    with open(snodgrass, 'r') as f:
        lines.extend(f.readlines())

    lines = np.array(lines)
    words = np.array([key2word(x) for x in lines])
    patients = np.array([snodgrass_key2patient(x) for x in lines])
    patients_with_counts = [(key, value)
                            for key, value in Counter(patients).items()]

    data_train = []
    data_test = []
    data_dev = []

    words_train = []
    words_test = []
    words_dev = []

    if same_split_as is None:
        # surprise knapsack problem :)
        patients_train = knapsack(patients_with_counts, len(lines) / 2)[1]
        patients_left = remove_all(patients_with_counts, patients_train)
        patients_test = knapsack(patients_left, len(lines) / 4)[1]
        patients_dev = remove_all(patients_left, patients_test)
    else:
        train_path, dev_path, test_path = get_dataset_paths(same_split_as,
                                                            fmt='scp')
        patients_train = scp2snodgrass_patients(train_path)
        patients_test = scp2snodgrass_patients(test_path)
        patients_dev = scp2snodgrass_patients(dev_path)

    for patient, _ in patients_train:
        data_train.extend(lines[np.where(patients == patient)])
        words_train.extend(words[np.where(patients == patient)])

    for patient, _ in patients_test:
        data_test.extend(lines[np.where(patients == patient)])
        words_test.extend(words[np.where(patients == patient)])

    for patient, _ in patients_dev:
        data_dev.extend(lines[np.where(patients == patient)])
        words_dev.extend(words[np.where(patients == patient)])

    print(
        'Unique words in train dataset: {0}, in test: {1}, in dev: {2}'.format(
            len(Counter(words_train)), len(Counter(words_test)),
            len(Counter(words_dev))))

    return data_train, data_dev, data_test
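
The split above depends on two helpers, knapsack and remove_all, that are defined elsewhere in the project and not shown here. The sketch below only illustrates the contract they need to satisfy, under the assumption that knapsack(items, capacity) returns a (total, chosen_items) tuple of (key, count) pairs; a simple greedy approximation stands in for whatever solver the real code uses.

def knapsack(items_with_counts, capacity):
    """Greedily picks (key, count) pairs until the summed counts would exceed `capacity`."""
    chosen = []
    total = 0
    # largest counts first, so the target size is approached quickly
    for key, count in sorted(items_with_counts, key=lambda kv: kv[1], reverse=True):
        if total + count <= capacity:
            chosen.append((key, count))
            total += count
    return total, chosen


def remove_all(items, to_remove):
    """Returns `items` without any entry whose key appears in `to_remove`."""
    removed_keys = set(key for key, _ in to_remove)
    return [(key, count) for key, count in items if key not in removed_keys]

Any implementation with the same return shapes slots into split_snodgrass_dataset unchanged; only the quality of the 50/25/25 approximation would differ.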
Example #4
def __dump_lmdb():
    from base.data_io.dataset2lmdb import dataset2lmdb
    start = time.time()

    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')
    train_path_lmdb, dev_path_lmdb, test_path_lmdb = get_dataset_paths(current_dataset, fmt='lmdb')

    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                              supplement_rare_with_noisy=False,
                              supplement_seed=112)
    dataset2lmdb(data_train, train_path_lmdb)

    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
    dataset2lmdb(data_dev, dev_path_lmdb)

    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
    dataset2lmdb(data_test, test_path_lmdb)

    print('dump to LMDB: {0}'.format(time.time() - start))
Example #5
def select_independent_words():
    """For the new dataset the words are selected as follows: the emuDB datasets are the source of train data, the SWC
    dataset is used as the test set (SWC is read speech only, which should somewhat resemble patient speech: slower and
    more deliberate than typical spontaneous speech).

    The words are selected randomly from words vaguely matching the train dataset words: same or plus one number of
    characters, also nouns (detected by capitalization), and with no fewer test set examples than in the original
    dev set."""
    counts_per_emu_db = get_emu_word_counts()
    counts_emu_total = collapse_nested_dict(counts_per_emu_db)
    counts_swc = get_swc_word_counts()
    train_path, dev_path, _ = get_dataset_paths('all_snodgrass_cleaned_v5',
                                                fmt='scp')
    counts_train = get_dataset_word_counts(train_path)
    counts_dev = get_dataset_word_counts(dev_path)

    selected_words = {}
    for word, count in counts_train.items():
        num_letters = len(word)

        emu_similar = {}
        for emu_word, emu_count in counts_emu_total.items():
            # skip words with non-alphabetic characters
            if not emu_word.isalpha():
                continue
            if not emu_word[0].isupper():  # budget noun detector
                continue
            if emu_word in selected_words.values():
                continue
            # factor of 1.5 for the original counts to account for the cleaning removing a lot of examples
            if (len(emu_word) in (num_letters, num_letters + 1)
                    and emu_count >= count * 1.5):
                emu_similar[emu_word] = emu_count

        similar_list = [
            x for x in emu_similar.keys() if not any([
                y.startswith(x) or x.startswith(y)
                for y in selected_words.values()
            ])
        ]
        random.shuffle(similar_list)
        if len(similar_list) == 0:
            raise RuntimeError(
                'No candidate replacement found for {0}'.format(word))
        selected = similar_list[0]
        if word in counts_dev:
            # factor of 1.5 for the original counts to account for the cleaning removing a lot of examples
            while selected not in counts_swc or len(
                    counts_swc[selected]) < counts_dev[word] * 1.5:
                similar_list = similar_list[1:]
                # check for exhaustion before indexing to avoid an IndexError
                if len(similar_list) == 0:
                    raise RuntimeError(
                        'No candidate replacement found for {0}'.format(word))
                selected = similar_list[0]
        selected_words[word] = selected

    return selected_words
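
As a toy illustration of the candidate filter described in the docstring (hypothetical words and counts, not project data), the same criteria can be restated as a standalone predicate:

def is_candidate(train_word, train_count, emu_word, emu_count):
    # alphabetic, capitalized ("budget noun detector"), same length or one character longer,
    # and at least 1.5x the train count to survive the later cleaning
    if not emu_word.isalpha() or not emu_word[0].isupper():
        return False
    if len(emu_word) not in (len(train_word), len(train_word) + 1):
        return False
    return emu_count >= train_count * 1.5


toy_counts = {'Fenster': 40, 'Tisch': 12, 'laufen': 50, 'Garten': 30}
print([w for w, c in toy_counts.items() if is_candidate('Flasche', 20, w, c)])
# -> ['Fenster']: seven letters like 'Flasche', capitalized, and 40 >= 20 * 1.5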
Example #6
def __main():
    start = time.time()

    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')
    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                              supplement_rare_with_noisy=False,
                              supplement_seed=112)
    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)

    _print_patients(data_train, data_dev, data_test)

    # smoke test: draw one batch from each of the siamese batch generators
    test = next(data_train.siamese_triplet_epoch(32, augment_parts=True))
    test = next(data_train.siamese_margin_loss_epoch(50, 5))

    print('scp: {0}'.format(time.time() - start))
Example #7
def __main_independent_test():
    swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'
    data_swc = KaldiDataset(swc_path)

    print(data_swc.counts)

    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3', fmt='scp')
    data_train = KaldiDataset(train_path)
    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path)
    data_test = KaldiDataset(test_path, parent_dataset_path=train_path)

    print(data_dev.counts)

    swc_keys = set(data_swc.idx2key)
    dev_keys = set(data_dev.idx2key)
    difference = swc_keys.difference(dev_keys)

    left_words = np.array([key2word(x) for x in difference])
    left_counts = Counter(left_words)
    print(left_counts)
Example #8
def split_independent_words(output_name, data_sub_dir, dataset_comparable_to):
    output_path = os.path.join(processed_data_dir, output_name)

    train_path, dev_path, _ = get_dataset_paths(dataset_comparable_to)
    counts_train = get_dataset_word_counts(train_path)
    counts_dev = get_dataset_word_counts(dev_path)

    selected_words = load_pickled('selected_words.pckl')

    all_scp = glob.glob(
        os.path.join(processed_data_dir, data_sub_dir,
                     '*independent_test.scp'))
    swc_scp = [x for x in all_scp if os.path.basename(x).startswith('SWC')][0]
    all_scp.remove(swc_scp)

    emu_lines = []  # this will be the train data
    swc_lines = []  # this will be the test data
    for scp in all_scp:
        read_scp_lines(scp, emu_lines)
    read_scp_lines(swc_scp, swc_lines)
    emu_lines = np.array(emu_lines)
    swc_lines = np.array(swc_lines)

    emu_words = np.array([key2word(x) for x in emu_lines])
    swc_words = np.array([key2word(x) for x in swc_lines])

    emu_counts = Counter(emu_words)
    swc_counts = Counter(swc_words)

    # for word in emu_counts:
    #     print('{0:<20}: train {1}, test {2}'.format(word, emu_counts[word], swc_counts.get(word, 0)))

    # for word in counts_train:
    #     new_word = selected_words[word]
    #     print('{0}, train: {1}, dev: {2}'.format(word, counts_train[word], counts_dev.get(word, 0)))
    #     print('{word}: new train count: {0}, new test count: {1}'.format(emu_counts[new_word], swc_counts[new_word],
    #                                                                      word=new_word))

    new_train = []
    new_dev = []
    for word, new_word in selected_words.items():
        train_new_lines = emu_lines[emu_words == new_word]
        np.random.shuffle(train_new_lines)
        new_train.extend(train_new_lines[:counts_train[word]])

        dev_new_lines = swc_lines[swc_words == new_word]
        np.random.shuffle(dev_new_lines)
        # new_dev.extend(dev_new_lines[:counts_dev.get(word, 5)])  # didn't work at all, maybe bad labels?
        new_dev.extend(dev_new_lines[:35])

    train_scp = '{0}_train.scp'.format(output_path)
    dev_scp = '{0}_dev.scp'.format(output_path)

    with open(train_scp, 'w') as train_file:
        for line in new_train:
            train_file.write(line)

    with open(dev_scp, 'w') as dev_file:
        for line in new_dev:
            dev_file.write(line)

    return train_scp, dev_scp, None
Example #9
def compose_test_from_non_validation_words(swc_path, dev_path, test_path):
    """Builds the test set from the SWC examples that are not already in the dev set (up to 35 per word)."""
    # and afterwards I switched the dev and test sets, to make sure the test set is the more complete one
    swc_lines = []
    read_scp_lines(swc_path, swc_lines)
    dev_lines = []
    read_scp_lines(dev_path, dev_lines)

    left_lines = np.array([x for x in swc_lines if x not in dev_lines])
    left_words = np.array([key2word(x) for x in left_lines])
    test_lines = []
    for word in np.unique(left_words):
        left_word_lines = left_lines[left_words == word]
        np.random.shuffle(left_word_lines)
        test_lines.extend(left_word_lines[:35])

    with open(test_path, 'w') as test_file:
        for line in test_lines:
            test_file.write(line)


if __name__ == '__main__':
    # train_scp, dev_scp, test_scp = split_independent_words('independent_cleaned_v3', 'independent_test_v2',
    #                                                        'all_snodgrass_cleaned_v4')
    # train_data = KaldiDataset(train_scp)
    # train_data.dump_derived_data()

    train_path, dev_path, test_path = get_dataset_paths(
        'independent_cleaned_v3', fmt='scp')
    swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'

    compose_test_from_non_validation_words(swc_path, dev_path, test_path)
Example #10
def __main():
    args, config, logger, checkpoint_dir, log_dir, use_gru, noise_mult, noise_prob, mean_sub, var_norm = \
        setup_training_run('classifier')

    supplement_rare = getattr(config.general_training,
                              'supplement_rare_with_noisy', False)
    supplement_seed = getattr(config.general_training, 'supplement_seed', 112)
    train_path, dev_path, _ = get_dataset_paths(current_dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=logger)

    data_train = DatasetClass(train_path,
                              logger=logger,
                              noise_multiplier=noise_mult,
                              noise_prob=noise_prob,
                              mean_subtraction=mean_sub,
                              variance_normalization=var_norm,
                              supplement_rare_with_noisy=supplement_rare,
                              supplement_seed=supplement_seed)
    data_dev = DatasetClass(dev_path,
                            parent_dataset_path=train_path,
                            training=False,
                            logger=logger,
                            mean_subtraction=mean_sub,
                            variance_normalization=var_norm)

    data_parallel = args.gpu_count > 1
    batch_first = data_parallel
    if not use_gru:
        net = LSTMClassifier(logger, config, batch_first=batch_first)
    else:
        net = GRUClassifier(logger, config, batch_first=batch_first)
    if args.load_weights is not None:
        net.restore_weights(args.load_weights,
                            exclude_params=['fc.6'],
                            freeze_except=None)
    net.train(True)

    if args.gpu_count > 1:
        net = torch.nn.DataParallel(net)
        config.classifier_training.batch_size = config.classifier_training.batch_size * args.gpu_count
        if config.model.use_cuda:
            net = net.cuda()

    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=config.classifier_training.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           'max',
                                                           factor=0.5,
                                                           patience=7,
                                                           min_lr=1e-5,
                                                           verbose=True)

    # log initial performance level
    dev_losses, dev_accuracy = process_classifier_epoch(net,
                                                        config,
                                                        optimizer,
                                                        data_dev,
                                                        batch_first,
                                                        data_parallel,
                                                        train=False)
    logger.info(
        'Initial avg dev loss = {0:.4f}, dev accuracy = {1:.4f}'.format(
            np.mean(dev_losses), dev_accuracy))

    for epoch in range(config.classifier_training.train_epochs):
        logger.info('Starting epoch {0}, learning_rate = {1}'.format(
            epoch, [group['lr'] for group in optimizer.param_groups][0]))

        start = time.time()
        train_losses, train_accuracy = process_classifier_epoch(net,
                                                                config,
                                                                optimizer,
                                                                data_train,
                                                                batch_first,
                                                                data_parallel,
                                                                train=True)
        dev_losses, dev_accuracy = process_classifier_epoch(net,
                                                            config,
                                                            optimizer,
                                                            data_dev,
                                                            batch_first,
                                                            data_parallel,
                                                            train=False)

        if config.classifier_training.lr_schedule:
            scheduler.step(dev_accuracy)

        torch.save(
            net.state_dict(),
            os.path.join(
                checkpoint_dir,
                '{0}_epoch_{1}.ckpt'.format(net.__class__.__name__, epoch)))

        logger.info(
            'Finished epoch {0}, avg training loss = {1:.4f}, avg dev loss = {2:.4f}, epoch time = {3:.3f} sec'
            .format(epoch, np.mean(train_losses), np.mean(dev_losses),
                    time.time() - start))
        logger.info(
            'Epoch {0} training accuracy = {1:.4f}, dev accuracy = {2:.4f}'.
            format(epoch, train_accuracy, dev_accuracy))
Example #11
def __main(run_dir,
           dataset=None,
           for_epochs=None,
           gen_train=False,
           gen_dev=True,
           gen_test=False):
    run_name2get_embeddings = {
        'classifier': get_classifier_embeddings,
        'siamese': get_siamese_embeddings
    }

    net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
        load_net(run_dir, epoch=None, logger=None, train=False)
    get_embeddings = run_name2get_embeddings[run_name]

    if dataset is None:
        dataset = current_dataset

    train_path, dev_path, test_path = get_dataset_paths(dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=None)

    if len(checkpoints) == 0:
        print('No checkpoints found in {0} for run {1}'.format(
            checkpoint_dir, run_dir))
        print('Exiting')
        sys.exit(-1)

    if for_epochs is None:
        for_epochs = sorted(list(checkpoints.keys()))

    if gen_train:
        data_train = DatasetClass(train_path,
                                  parent_dataset_path=train_scp,
                                  training=False,
                                  logger=None,
                                  mean_subtraction=mean_sub,
                                  variance_normalization=var_norm)
        _print_ap_per_epoch(net,
                            config,
                            checkpoints,
                            loss,
                            data_train,
                            'train',
                            for_epochs,
                            get_embeddings,
                            subsample_size=3000)

    if gen_dev:
        data_dev = DatasetClass(dev_path,
                                parent_dataset_path=train_scp,
                                training=False,
                                logger=None,
                                mean_subtraction=mean_sub,
                                variance_normalization=var_norm)
        _print_ap_per_epoch(net, config, checkpoints, loss, data_dev, 'dev',
                            for_epochs, get_embeddings)

    if gen_test:
        data_test = DatasetClass(test_path,
                                 parent_dataset_path=train_scp,
                                 training=False,
                                 logger=None,
                                 mean_subtraction=mean_sub,
                                 variance_normalization=var_norm)
        _print_ap_per_epoch(net, config, checkpoints, loss, data_test, 'test',
                            for_epochs, get_embeddings)
Example #12
def __main():
    args, config, logger, checkpoint_dir, log_dir, use_gru, noise_mult, noise_prob, mean_sub, var_norm = \
        setup_training_run('siamese')

    supplement_rare = getattr(config.general_training,
                              'supplement_rare_with_noisy', False)
    supplement_seed = getattr(config.general_training, 'supplement_seed', 112)
    train_path, dev_path, _ = get_dataset_paths(current_dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=logger)
    data_train = DatasetClass(train_path,
                              logger=logger,
                              noise_multiplier=noise_mult,
                              noise_prob=noise_prob,
                              mean_subtraction=mean_sub,
                              variance_normalization=var_norm,
                              supplement_rare_with_noisy=supplement_rare,
                              supplement_seed=supplement_seed)
    data_dev = DatasetClass(dev_path,
                            parent_dataset_path=train_path,
                            training=False,
                            logger=logger,
                            mean_subtraction=mean_sub,
                            variance_normalization=var_norm)

    loss_fn = create_embedding_loss(config, len(data_train.word2id))
    data_parallel = args.gpu_count > 1
    batch_first = data_parallel

    if not use_gru:
        net = SiameseLSTM(logger,
                          config,
                          batch_first=batch_first,
                          loss=loss_fn)
    else:
        net = SiameseGRU(logger, config, batch_first=batch_first, loss=loss_fn)

    if args.load_weights is not None:
        # exclude_params specifies the layers to drop when applying pre-trained weights
        net.restore_weights(args.load_weights,
                            exclude_params=['fc.2'],
                            freeze_except=None)
        # net.restore_weights(args.load_weights, exclude_params=[], freeze_except=None)
    net.train(True)

    if args.gpu_count > 1:
        net = torch.nn.DataParallel(net)
        config.siamese_training.batch_size = config.siamese_training.batch_size * args.gpu_count
        if config.model.use_cuda:
            net = net.cuda()

    optimizer = create_optimizer(net, config, loss_fn, wrapped=data_parallel)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           'max',
                                                           factor=0.5,
                                                           patience=6,
                                                           min_lr=1e-5,
                                                           verbose=True)

    # other settings
    augment_parts = getattr(config.siamese_training, 'augment_parts', False)

    # log initial performance level
    dev_ap = get_ap(net, loss_fn.metric(), optimizer, config, data_dev,
                    batch_first)
    logger.info('Initial avg dev precision = {0:.4f}'.format(dev_ap))

    for epoch in range(config.siamese_training.train_epochs):
        logger.info('Starting epoch {0}, learning_rate = {1}'.format(
            epoch, [group['lr'] for group in optimizer.param_groups]))

        start = time.time()
        epoch_losses = []
        for data, lengths, sample_idx, orig_order in init_batch_generator(
                config, data_train, loss_fn, augment_parts, batch_first):
            loss, _ = process_siamese_batch(data,
                                            lengths,
                                            data_train.classes(sample_idx),
                                            orig_order,
                                            net,
                                            optimizer,
                                            config,
                                            train=True,
                                            data_parallel=data_parallel)
            del _
            gc.collect()
            epoch_losses.append(loss)

        dev_ap = get_ap(net, loss_fn.metric(), optimizer, config, data_dev,
                        batch_first)

        if config.siamese_training.lr_schedule:
            scheduler.step(dev_ap)

        torch.save(
            net.state_dict(),
            os.path.join(
                checkpoint_dir,
                '{0}_epoch_{1}.ckpt'.format(net.__class__.__name__, epoch)))

        logger.info(
            'Finished epoch {0}, average training loss = {1:.4f}, epoch time = {2:.3f} sec'
            .format(epoch, np.mean(epoch_losses),
                    time.time() - start))
        logger.info('Epoch {0} avg dev precision = {1:.4f}'.format(
            epoch, dev_ap))
Example #13
def do_calculate_accuracy(run_dir,
                          epoch,
                          is_classifier,
                          dataset=None,
                          partition='dev',
                          return_percent=False):
    if not is_classifier:
        train_epoch_embeddings, dev_epoch_embeddings, test_epoch_embeddings = \
            get_or_generate_embeddings(run_dir, epoch, dataset=dataset,
                                       dev_needed=(partition == 'dev'), test_needed=(partition == 'test'))

        words_train, datasets_train, vecs_train, counts_train, word_idxs_train = load_embeddings(
            train_epoch_embeddings[epoch], data_name='train')

        if partition == 'dev':
            words_part, datasets_part, vecs_part, counts_part, word_idxs_part = load_embeddings(
                dev_epoch_embeddings[epoch])
        elif partition == 'test':
            words_part, datasets_part, vecs_part, counts_part, word_idxs_part = load_embeddings(
                test_epoch_embeddings[epoch], data_name='test')
        else:
            raise RuntimeError(
                'Cannot calculate accuracy for partition {0}'.format(
                    partition))

        test_word_idxs = word_idxs_part
        test_vecs = vecs_part

        all_words = set()
        all_words.update(word_idxs_train.keys())
        all_words.update(test_word_idxs.keys())

        max_reference_examples = 40
        train_x = []
        train_y = []
        test_x = []
        test_y = []
        for i, word in enumerate(all_words):
            if word in word_idxs_train:
                all_word_idxs = word_idxs_train[word]
                n_train = min(int(np.floor(all_word_idxs.shape[0] / 2)),
                              max_reference_examples)
                train_x.extend(vecs_train[all_word_idxs[:n_train]])
                train_y.extend([i] * n_train)

                # identity check; comparing the dicts with != would try to compare their numpy-array values
                if word_idxs_train is not test_word_idxs:
                    if word in test_word_idxs:
                        test_dat = test_vecs[test_word_idxs[word]]
                        test_x.extend(test_dat)
                        test_y.extend([i] * test_dat.shape[0])
                else:
                    test_dat = vecs_train[all_word_idxs[n_train:]]
                    test_x.extend(test_dat)
                    test_y.extend([i] * test_dat.shape[0])

            elif word in test_word_idxs:
                all_word_idxs = test_word_idxs[word]
                n_train = min(int(np.floor(all_word_idxs.shape[0] / 2)),
                              max_reference_examples)
                train_x.extend(test_vecs[all_word_idxs[:n_train]])
                train_y.extend([i] * n_train)

                test_dat = test_vecs[all_word_idxs[n_train:]]
                test_x.extend(test_dat)
                test_y.extend([i] * test_dat.shape[0])

        loss_name = read_embedding_loss(run_dir, throw=False)
        metric = (loss_name2class[loss_name].metric(None)
                  if loss_name is not None else 'cosine')

        print(run_dir, metric)
        print('N train: {0}, N test: {1}'.format(len(train_x), len(test_x)))

        knn = KNeighborsClassifier(n_neighbors=3, metric=metric, n_jobs=8)
        knn.fit(train_x, train_y)
        k_pred_y = knn.predict(test_x)

        if not return_percent:
            return np.sum(k_pred_y == test_y) / len(test_y)
        else:
            return np.sum(k_pred_y == test_y) / len(test_y) * 100.0
    else:
        net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
            load_net(run_dir, epoch=epoch, logger=None, train=False)

        if dataset is None:
            dataset = current_dataset
        train_path, dev_path, test_path = get_dataset_paths(dataset)
        # noinspection PyPep8Naming
        DatasetClass = get_dataset_class_for_path(train_path, logger=None)

        if partition == 'train':
            dataset = DatasetClass(train_path,
                                   parent_dataset_path=train_scp,
                                   training=False,
                                   logger=None,
                                   mean_subtraction=mean_sub,
                                   variance_normalization=var_norm)
        if partition == 'dev':
            dataset = DatasetClass(dev_path,
                                   parent_dataset_path=train_scp,
                                   training=False,
                                   logger=None,
                                   mean_subtraction=mean_sub,
                                   variance_normalization=var_norm)
        if partition == 'test':
            dataset = DatasetClass(test_path,
                                   parent_dataset_path=train_scp,
                                   training=False,
                                   logger=None,
                                   mean_subtraction=mean_sub,
                                   variance_normalization=var_norm)

        # TODO: no automatic detection for batch_first and data_parallel
        losses, accuracy = process_classifier_epoch(net,
                                                    config,
                                                    optimizer=None,
                                                    dataset=dataset,
                                                    batch_first=False,
                                                    data_parallel=False,
                                                    train=False)
        if not return_percent:
            return accuracy / 100.0
        else:
            return accuracy
Example #14
def generate_embeddings(run_dir,
                        dataset=None,
                        gen_train=False,
                        gen_dev=False,
                        gen_test=False,
                        gen_new=False,
                        gen_background=False,
                        for_epochs=None):
    run_name2emb = {
        'classifier': get_classifier_embeddings,
        'siamese': get_siamese_embeddings
    }

    net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
        load_net(run_dir, epoch=None, logger=None, train=False)
    get_embeddings = run_name2emb[run_name]

    # XXX: currently if embeddings exists in train/dev/test folder they are not regenerated,
    #  remove manually when switching datasets
    if dataset is None:
        dataset = current_dataset
    train_path, dev_path, test_path = get_dataset_paths(dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=None)

    if gen_train:
        data_train = DatasetClass(train_path,
                                  parent_dataset_path=train_scp,
                                  training=False,
                                  logger=None,
                                  mean_subtraction=mean_sub,
                                  variance_normalization=var_norm)
    if gen_dev:
        data_dev = DatasetClass(dev_path,
                                parent_dataset_path=train_scp,
                                training=False,
                                logger=None,
                                mean_subtraction=mean_sub,
                                variance_normalization=var_norm)
    if gen_test:
        data_test = DatasetClass(test_path,
                                 parent_dataset_path=train_scp,
                                 training=False,
                                 logger=None,
                                 mean_subtraction=mean_sub,
                                 variance_normalization=var_norm)
    if gen_new:
        # new_path is assumed to be defined at module level; it is not set anywhere in this function
        data_new = DatasetClass(new_path,
                                parent_dataset_path=train_scp,
                                training=False,
                                logger=None,
                                mean_subtraction=mean_sub,
                                variance_normalization=var_norm)
    if gen_background:
        background_path = os.path.join(processed_data_dir,
                                       'background_train_v4',
                                       'background_data.scp')
        data_background = DatasetClass(background_path,
                                       parent_dataset_path=train_scp,
                                       training=False,
                                       logger=None,
                                       mean_subtraction=mean_sub,
                                       variance_normalization=var_norm)

    train_embeddings_dir = os.path.join(run_dir, 'train_embeddings')
    dev_embeddings_dir = os.path.join(run_dir, 'dev_embeddings')
    test_embeddings_dir = os.path.join(run_dir, 'test_embeddings')
    new_embeddings_dir = os.path.join(run_dir, 'new_embeddings')
    background_embeddings_dir = os.path.join(run_dir, 'background_embeddings')

    if len(checkpoints) == 0:
        print('No checkpoints found in {0} for run {1}'.format(
            checkpoint_dir, run_dir))
        print('Exiting')
        sys.exit(-1)

    if for_epochs is None:
        for_epochs = sorted(list(checkpoints.keys()))

    for epoch in for_epochs:
        checkpoint = checkpoints[epoch]
        if gen_train:
            gen_and_save_dataset_embeddings(net, config, checkpoint,
                                            data_train, get_embeddings,
                                            train_embeddings_dir)

        if gen_dev:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_dev,
                                            get_embeddings, dev_embeddings_dir)

        if gen_test:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_test,
                                            get_embeddings,
                                            test_embeddings_dir)

        if gen_new:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_new,
                                            get_embeddings, new_embeddings_dir)

        if gen_background:
            gen_and_save_dataset_embeddings(net, config, checkpoint,
                                            data_background, get_embeddings,
                                            background_embeddings_dir)