Example #1
def split_snodgrass_dataset(source_sub_dir,
                            snodgrass_file,
                            same_split_as=None):
    """~50% train, ~25% percent dev, ~25% test, while taking care that each patient's data is present exclusively
     in either train, dev, or test sets."""
    snodgrass = os.path.join(processed_data_dir, source_sub_dir,
                             snodgrass_file)
    lines = []
    with open(snodgrass, 'r') as f:
        lines.extend(f.readlines())

    lines = np.array(lines)
    words = np.array([key2word(x) for x in lines])
    patients = np.array([snodgrass_key2patient(x) for x in lines])
    patients_with_counts = [(key, value)
                            for key, value in Counter(patients).items()]

    data_train = []
    data_test = []
    data_dev = []

    words_train = []
    words_test = []
    words_dev = []

    if same_split_as is None:
        # surprise knapsack problem :)
        patients_train = knapsack(patients_with_counts, len(lines) / 2)[1]
        patients_left = remove_all(patients_with_counts, patients_train)
        patients_test = knapsack(patients_left, len(lines) / 4)[1]
        patients_dev = remove_all(patients_left, patients_test)
    else:
        train_path, dev_path, test_path = get_dataset_paths(same_split_as,
                                                            fmt='scp')
        patients_train = scp2snodgrass_patients(train_path)
        patients_test = scp2snodgrass_patients(test_path)
        patients_dev = scp2snodgrass_patients(dev_path)

    for patient, _ in patients_train:
        data_train.extend(lines[np.where(patients == patient)])
        words_train.extend(words[np.where(patients == patient)])

    for patient, _ in patients_test:
        data_test.extend(lines[np.where(patients == patient)])
        words_test.extend(words[np.where(patients == patient)])

    for patient, _ in patients_dev:
        data_dev.extend(lines[np.where(patients == patient)])
        words_dev.extend(words[np.where(patients == patient)])

    print(
        'Unique words in train dataset: {0}, in test: {1}, in dev: {2}'.format(
            len(Counter(words_train)), len(Counter(words_test)),
            len(Counter(words_dev))))

    return data_train, data_dev, data_test
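
Example #1 depends on a few helpers that are not shown on this page. The sketch below is only an assumption of how `remove_all` and a greedy substitute for `knapsack` might look; `key2word` and `snodgrass_key2patient` depend on the actual utterance-key format and are therefore not sketched.

def remove_all(items, to_remove):
    # Keep only the (patient, count) pairs that were not selected.
    return [item for item in items if item not in to_remove]


def knapsack_greedy(items, capacity):
    # Greedy stand-in for the knapsack call above: take patients with the
    # largest line counts first until the target number of lines is reached.
    # Returns (total, picked) so that knapsack_greedy(...)[1] mirrors the
    # knapsack(...)[1] usage in Example #1.
    picked, total = [], 0
    for patient, count in sorted(items, key=lambda x: -x[1]):
        if total + count <= capacity:
            picked.append((patient, count))
            total += count
    return total, picked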
Example #2
def split_train_dev_test(output_name,
                         external_sub_dir,
                         snodgrass_sub_dir,
                         snodgrass_file,
                         same_split_as=None):
    output_path = os.path.join(processed_data_dir, output_name)
    external_snodgrass = glob.glob(
        os.path.join(processed_data_dir, external_sub_dir,
                     '*snodgrass_words.scp'))

    lines = []
    for scp in external_snodgrass:
        with open(scp, 'r') as f:
            lines.extend(f.readlines())

    words = np.array([key2word(line) for line in lines])
    datasets = np.array([key2dataset(line) for line in lines])
    counts = Counter(words)

    word_dataset2idx = {
        key: {dset: []
              for dset in np.unique(datasets)}
        for key in counts
    }
    for i in range(len(lines)):
        word_dataset2idx[words[i]][datasets[i]].append(i)

    # All external lines go into the train split; the dev and test splits below
    # only receive snodgrass data, so idx_dev stays empty.
    idx_train = list(range(len(lines)))
    idx_dev = []

    snodgrass_train, snodgrass_dev, snodgrass_test = split_snodgrass_dataset(
        snodgrass_sub_dir, snodgrass_file, same_split_as=same_split_as)

    train_scp = '{0}_train.scp'.format(output_path)
    dev_scp = '{0}_dev.scp'.format(output_path)
    test_scp = '{0}_test.scp'.format(output_path)

    with open(train_scp, 'w') as train_file:
        for idx in idx_train:
            train_file.write(lines[idx])
        for line in snodgrass_train:
            train_file.write(line)

    with open(dev_scp, 'w') as dev_file:
        for idx in idx_dev:
            dev_file.write(lines[idx])
        for line in snodgrass_dev:
            dev_file.write(line)

    with open(test_scp, 'w') as test_file:
        for line in snodgrass_test:
            test_file.write(line)

    return train_scp, dev_scp, test_scp
Example #3
def compose_test_from_non_validation_words(swc_path, dev_path, test_path):
    # Afterwards the dev and test sets were switched, to make sure the test set is the more complete one.
    swc_lines = []
    read_scp_lines(swc_path, swc_lines)
    dev_lines = []
    read_scp_lines(dev_path, dev_lines)

    left_lines = np.array([x for x in swc_lines if x not in dev_lines])
    left_words = np.array([key2word(x) for x in left_lines])
    test_lines = []
    for word in np.unique(left_words):
        left_word_lines = left_lines[left_words == word]
        np.random.shuffle(left_word_lines)
        test_lines.extend(left_word_lines[:35])

    with open(test_path, 'w') as test_file:
        for line in test_lines:
            test_file.write(line)
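
The `read_scp_lines` helper used in Examples #3 and #7 is not shown on this page; a minimal sketch, assuming it simply appends the raw lines of an .scp file to the given list:

def read_scp_lines(scp_path, target):
    # Assumed behaviour: append the raw lines of a Kaldi .scp file to `target`.
    with open(scp_path, 'r') as f:
        target.extend(f.readlines())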
Example #4
def load_embeddings(path, data_name='dev', return_keys=False):
    with open(path, 'rb') as f:
        data_dict = pickle.load(f)

    words = np.array([key2word(key) for key in data_dict])
    datasets = np.array([key2dataset(key) for key in data_dict])
    datasets[datasets == 'PHATTSESSIONZ'] = 'PHA'
    vecs = np.array([vec for vec in data_dict.values()])
    counts = Counter(words)
    word_idxs = {key: np.where(words == key)[0] for key in counts}
    print('There are {0} unique words in the {1} set.'.format(
        len(counts), data_name))

    if not return_keys:
        return words, datasets, vecs, counts, word_idxs
    else:
        return words, datasets, vecs, counts, word_idxs, np.array(
            list(data_dict.keys()))
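
A hedged usage sketch for `load_embeddings`, assuming the pickle maps utterance keys to embedding vectors of equal length (as the code above implies); the path is a placeholder:

# Hypothetical usage; 'dev_embeddings.pckl' is a placeholder path.
words, datasets, vecs, counts, word_idxs = load_embeddings(
    'dev_embeddings.pckl', data_name='dev')

# Mean embedding of one word over all of its dev examples.
some_word = words[0]
mean_vec = vecs[word_idxs[some_word]].mean(axis=0)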
Example #5
def collect_session_embeddings_data(session, vecs_dev, keys_dev):
    sessions_vecs = []
    session_keys = []

    for i, key in enumerate(keys_dev):
        if key2dataset(key) == 'snodgrass' and snodgrass_key2date(
                key) == session:
            sessions_vecs.append(vecs_dev[i])
            session_keys.append(key)

    sessions_vecs = np.array(sessions_vecs)
    session_keys = np.array(session_keys)
    session_words = np.array([key2word(key) for key in session_keys])
    session_word_idxs = {
        key: np.where(session_words == key)[0]
        for key in np.unique(session_words)
    }

    return sessions_vecs, session_word_idxs
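
A hedged sketch combining Examples #4 and #5: load the dev embeddings together with their keys and collect the vectors of a single snodgrass session (the pickle path and session date are placeholders):

# Hypothetical glue code; the path and the session date string are placeholders.
words, datasets, vecs, counts, word_idxs, keys = load_embeddings(
    'dev_embeddings.pckl', data_name='dev', return_keys=True)
session_vecs, session_word_idxs = collect_session_embeddings_data(
    '2018-01-15', vecs, keys)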
Example #6
def __main_independent_test():
    swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'
    data_swc = KaldiDataset(swc_path)

    print(data_swc.counts)

    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3', fmt='scp')
    data_train = KaldiDataset(train_path)
    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path)
    data_test = KaldiDataset(test_path, parent_dataset_path=train_path)

    print(data_dev.counts)

    swc_keys = set(data_swc.idx2key)
    dev_keys = set(data_dev.idx2key)
    difference = swc_keys.difference(dev_keys)

    left_words = np.array([key2word(x) for x in difference])
    left_counts = Counter(left_words)
    print(left_counts)
Example #7
def split_independent_words(output_name, data_sub_dir, dataset_comparable_to):
    output_path = os.path.join(processed_data_dir, output_name)

    train_path, dev_path, _ = get_dataset_paths(dataset_comparable_to)
    counts_train = get_dataset_word_counts(train_path)
    counts_dev = get_dataset_word_counts(dev_path)

    selected_words = load_pickled('selected_words.pckl')

    all_scp = glob.glob(
        os.path.join(processed_data_dir, data_sub_dir,
                     '*independent_test.scp'))
    swc_scp = [x for x in all_scp if os.path.basename(x).startswith('SWC')][0]
    all_scp.remove(swc_scp)

    emu_lines = []  # this will be the train data
    swc_lines = []  # this will be the test data
    for scp in all_scp:
        read_scp_lines(scp, emu_lines)
    read_scp_lines(swc_scp, swc_lines)
    emu_lines = np.array(emu_lines)
    swc_lines = np.array(swc_lines)

    emu_words = np.array([key2word(x) for x in emu_lines])
    swc_words = np.array([key2word(x) for x in swc_lines])

    emu_counts = Counter(emu_words)
    swc_counts = Counter(swc_words)

    # for word in emu_counts:
    #     print('{0:<20}: train {1}, test {2}'.format(word, emu_counts[word], swc_counts.get(word, 0)))

    # for word in counts_train:
    #     new_word = selected_words[word]
    #     print('{0}, train: {1}, dev: {2}'.format(word, counts_train[word], counts_dev.get(word, 0)))
    #     print('{word}: new train count: {0}, new test count: {1}'.format(emu_counts[new_word], swc_counts[new_word],
    #                                                                      word=new_word))

    new_train = []
    new_dev = []
    for word, new_word in selected_words.items():
        train_new_lines = emu_lines[emu_words == new_word]
        np.random.shuffle(train_new_lines)
        new_train.extend(train_new_lines[:counts_train[word]])

        dev_new_lines = swc_lines[swc_words == new_word]
        np.random.shuffle(dev_new_lines)
        # new_dev.extend(dev_new_lines[:counts_dev.get(word, 5)])  # didn't work at all, maybe bad labels?
        new_dev.extend(dev_new_lines[:35])

    train_scp = '{0}_train.scp'.format(output_path)
    dev_scp = '{0}_dev.scp'.format(output_path)

    with open(train_scp, 'w') as train_file:
        for line in new_train:
            train_file.write(line)

    with open(dev_scp, 'w') as dev_file:
        for line in new_dev:
            dev_file.write(line)

    return train_scp, dev_scp, None
Example #8
    def __init__(self,
                 data_path,
                 parent_dataset_path=None,
                 training=True,
                 logger=None,
                 variance_normalization=False,
                 noise_multiplier=0,
                 noise_prob=1,
                 mean_subtraction=False,
                 supplement_rare_with_noisy=False,
                 supplement_seed=112):
        self.data_path = data_path
        self.word2idxs = {}
        self.idx2word = []
        self.idx2source_dataset = []
        self.idx2key = []
        self.data = []
        self.training = training
        self.noise_multiplier = noise_multiplier
        self.noise_prob = noise_prob

        util.warn_or_print(
            logger, 'Loading {0}, train = {1}'.format(data_path, training))
        if not training and parent_dataset_path is None:
            util.warn_or_print(
                logger,
                'Non-training mode is selected, but parent_dataset_path is None'
            )
            util.warn_or_print(
                logger,
                'A non-training dataset must always have the parent specified, otherwise '
                'the data mean and other derived values will be incorrect. Aborting.'
            )
            sys.exit(-1)

        for i, (key, mat) in enumerate(self._raw_data_iterator()):
            word = key2word(key)
            dataset = key2dataset(key)

            word_example_idx = self.word2idxs.get(word, [])
            word_example_idx.append(i)
            self.word2idxs[word] = word_example_idx
            self.idx2word.append(word)
            self.idx2source_dataset.append(dataset)
            self.idx2key.append(key)
            self.data.append(mat)

        self.idx2word = np.array(self.idx2word)
        self.idx2source_dataset = np.array(self.idx2source_dataset)
        self.idx2key = np.array(self.idx2key)
        for key in self.word2idxs:
            self.word2idxs[key] = np.array(self.word2idxs[key])
        self.counts = {
            key: self.word2idxs[key].shape[0]
            for key in self.word2idxs
        }

        if parent_dataset_path is None:
            self.mean = self._calculate_feature_mean()
            self.std = self._calculate_feature_std()
        else:
            self.load_derived_data(parent_dataset_path)

        if mean_subtraction:
            util.warn_or_print(logger, 'Applying mean subtraction')
            self.data = np.array([(segment - self.mean).astype(np.float32)
                                  for segment in self.data])
        elif variance_normalization:
            util.warn_or_print(logger, 'Applying variance normalization')
            self.data = np.array([
                ((segment - self.mean) / self.std).astype(np.float32)
                for segment in self.data
            ])
        else:
            util.warn_or_print(logger, 'No mean subtraction')
            self.data = np.array(
                [segment.astype(np.float32) for segment in self.data])

        # TODO: sort keys before doing anything else? (for identical behaviour between Kaldi and LMDB-exported datasets)

        self.noisy = np.zeros(len(self.data), dtype=bool)
        if supplement_rare_with_noisy and training:
            state_before = np.random.get_state()
            np.random.seed(supplement_seed)

            before_mean_examples = int(
                np.ceil(np.mean(list(self.counts.values()))))
            util.warn_or_print(
                logger,
                'Supplementing rare classes with noisy examples up to {0} total'
                .format(before_mean_examples))
            util.warn_or_print(
                logger, 'Original example count: {0}'.format(len(self.data)))
            for word, count in self.counts.items():
                if count < before_mean_examples:
                    to_add = before_mean_examples - count
                    augmented_source_idx = []
                    augmented_examples = []
                    for orig_idx in islice(cycle(self.word2idxs[word]),
                                           to_add):
                        orig_data = self.data[orig_idx]
                        augmented_source_idx.append(orig_idx)
                        augmented_examples.append(
                            orig_data + noise_multiplier * np.random.normal(
                                loc=0, scale=self.std, size=orig_data.shape))
                    augmented_source_idx = np.array(augmented_source_idx)
                    augmented_examples = np.array(augmented_examples)
                    self.data = np.concatenate((self.data, augmented_examples))
                    self.noisy = np.concatenate(
                        (self.noisy, np.ones(to_add, dtype=bool)))
                    self.word2idxs[word] = np.concatenate(
                        (self.word2idxs[word],
                         np.arange(len(self.data) - to_add,
                                   len(self.data),
                                   dtype=np.int32)))
                    self.idx2word = np.concatenate(
                        (self.idx2word, [word for _ in augmented_examples]))
                    self.idx2key = np.concatenate(
                        (self.idx2key,
                         [self.idx2key[x] for x in augmented_source_idx]))
                    self.idx2source_dataset = np.concatenate(
                        (self.idx2source_dataset, [
                            self.idx2source_dataset[x]
                            for x in augmented_source_idx
                        ]))

            self.counts = {
                key: self.word2idxs[key].shape[0]
                for key in self.word2idxs
            }
            util.warn_or_print(
                logger, 'Augmented example count: {0}'.format(len(self.data)))
            np.random.set_state(state_before)

        self.feature_dim = self.data[0].shape[1]
        self.mean_examples = int(np.ceil(np.mean(list(self.counts.values()))))

        # siamese training setup, ignoring words with 1 example
        self.siamese_words = np.array(
            sorted([key for key in self.word2idxs if self.counts[key] > 1]))
        self.num_siamese_words = self.siamese_words.shape[0]

        # classifier training setup
        self.all_words = np.array(sorted(list(self.counts.keys())))
        if parent_dataset_path is None:
            self.word2id = {key: i for i, key in enumerate(self.all_words)}
        else:
            new_words = np.array(
                [x not in self.word2id for x in self.all_words])
            if np.any(new_words):
                max_given_id = max(self.word2id.values())
                for i, x in enumerate(self.all_words[new_words]):
                    self.word2id[x] = max_given_id + i + 1
        self.idx2word_id = np.array(
            [self.word2id[word] for word in self.idx2word], dtype=np.int32)
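
A usage sketch for the constructor above, following the call pattern of Example #6; the mean-subtraction and noise settings are illustrative assumptions, not values taken from this codebase:

# Illustrative usage; mirrors Example #6's pattern of loading dev/test with the
# train set as parent so that mean, std, and word ids stay consistent.
train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3',
                                                    fmt='scp')
data_train = KaldiDataset(train_path,
                          training=True,
                          mean_subtraction=True,
                          supplement_rare_with_noisy=True,
                          noise_multiplier=0.1)
data_dev = KaldiDataset(dev_path,
                        parent_dataset_path=train_path,
                        training=False,
                        mean_subtraction=True)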