Example #1
def scp2snodgrass_patients(scp_path):
    # Collect the unique snodgrass patient IDs referenced in an .scp file.
    scp_data = []
    with open(scp_path, 'r') as f:
        scp_data.extend(f.readlines())

    patients = [
        snodgrass_key2patient(line) for line in scp_data
        if key2dataset(line) == 'snodgrass'
    ]
    return [(patient, None) for patient in np.unique(patients)]
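
A minimal usage sketch, assuming the helper lives in a module together with key2dataset and snodgrass_key2patient; the module name data_utils and the .scp path are placeholders:

from data_utils import scp2snodgrass_patients

# Hypothetical .scp file; each line's key encodes the source dataset and patient.
patient_pairs = scp2snodgrass_patients('features/snodgrass_words.scp')
for patient, _ in patient_pairs:
    print(patient)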
Example #2
def split_train_dev_test(output_name,
                         external_sub_dir,
                         snodgrass_sub_dir,
                         snodgrass_file,
                         same_split_as=None):
    output_path = os.path.join(processed_data_dir, output_name)
    external_snodgrass = glob.glob(
        os.path.join(processed_data_dir, external_sub_dir,
                     '*snodgrass_words.scp'))

    lines = []
    for scp in external_snodgrass:
        with open(scp, 'r') as f:
            lines.extend(f.readlines())

    words = np.array([key2word(line) for line in lines])
    datasets = np.array([key2dataset(line) for line in lines])
    counts = Counter(words)

    # Group line indices by word and by source dataset.
    word_dataset2idx = {
        key: {dset: [] for dset in np.unique(datasets)}
        for key in counts
    }
    for i in range(len(lines)):
        word_dataset2idx[words[i]][datasets[i]].append(i)

    # All external lines go into the training split; dev and test are
    # populated only from the snodgrass split below.
    idx_train = list(range(len(lines)))
    idx_dev = []

    snodgrass_train, snodgrass_dev, snodgrass_test = split_snodgrass_dataset(
        snodgrass_sub_dir, snodgrass_file, same_split_as=same_split_as)

    train_scp = '{0}_train.scp'.format(output_path)
    dev_scp = '{0}_dev.scp'.format(output_path)
    test_scp = '{0}_test.scp'.format(output_path)

    with open(train_scp, 'w') as train_file:
        for idx in idx_train:
            train_file.write(lines[idx])
        for line in snodgrass_train:
            train_file.write(line)

    with open(dev_scp, 'w') as dev_file:
        for idx in idx_dev:
            dev_file.write(lines[idx])
        for line in snodgrass_dev:
            dev_file.write(line)

    with open(test_scp, 'w') as test_file:
        for line in snodgrass_test:
            test_file.write(line)

    return train_scp, dev_scp, test_scp
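
A hedged usage sketch; the sub-directory and file names below are placeholders, and processed_data_dir together with split_snodgrass_dataset are assumed to be defined in the same module:

train_scp, dev_scp, test_scp = split_train_dev_test(
    'external_plus_snodgrass',
    external_sub_dir='external',
    snodgrass_sub_dir='snodgrass',
    snodgrass_file='snodgrass_all.scp',
    same_split_as=None)
print(train_scp, dev_scp, test_scp)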
Example #3
def load_embeddings(path, data_name='dev', return_keys=False):
    # Load a pickled dict of {key: embedding vector} and derive per-word indices.
    with open(path, 'rb') as f:
        data_dict = pickle.load(f)

    words = np.array([key2word(key) for key in data_dict])
    datasets = np.array([key2dataset(key) for key in data_dict])
    # Normalise the long dataset label to its short form.
    datasets[datasets == 'PHATTSESSIONZ'] = 'PHA'
    vecs = np.array(list(data_dict.values()))
    counts = Counter(words)
    # Map each word to the indices of its embeddings.
    word_idxs = {key: np.where(words == key)[0] for key in counts}
    print('There are {0} unique words in the {1} set.'.format(
        len(counts), data_name))

    if not return_keys:
        return words, datasets, vecs, counts, word_idxs
    else:
        return words, datasets, vecs, counts, word_idxs, np.array(
            list(data_dict.keys()))
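
The per-word index lookup is the core of this function; a self-contained toy version of that step, with made-up words and independent of the pickled embeddings, looks like this:

import numpy as np
from collections import Counter

words = np.array(['cat', 'dog', 'cat', 'bird', 'dog', 'cat'])
counts = Counter(words)
# Map each word to the indices of its occurrences, exactly as in load_embeddings.
word_idxs = {key: np.where(words == key)[0] for key in counts}
print(word_idxs['cat'])  # -> [0 2 5]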
Example #4
def collect_session_embeddings_data(session, vecs_dev, keys_dev):
    # Gather the dev-set embeddings that belong to a single snodgrass session.
    sessions_vecs = []
    session_keys = []

    for i, key in enumerate(keys_dev):
        if key2dataset(key) == 'snodgrass' and snodgrass_key2date(
                key) == session:
            sessions_vecs.append(vecs_dev[i])
            session_keys.append(key)

    sessions_vecs = np.array(sessions_vecs)
    session_keys = np.array(session_keys)
    session_words = np.array([key2word(key) for key in session_keys])
    session_word_idxs = {
        key: np.where(session_words == key)[0]
        for key in np.unique(session_words)
    }

    return sessions_vecs, session_word_idxs
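
A hypothetical end-to-end usage with load_embeddings from Example #3; the pickle path and the session string are placeholders (the exact format returned by snodgrass_key2date is not shown here):

words, datasets, vecs_dev, counts, word_idxs, keys_dev = load_embeddings(
    'embeddings_dev.pickle', data_name='dev', return_keys=True)
session_vecs, session_word_idxs = collect_session_embeddings_data(
    '2016-05-12', vecs_dev, keys_dev)
print(len(session_word_idxs), 'distinct words in this session')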
Example #5
    def __init__(self,
                 data_path,
                 parent_dataset_path=None,
                 training=True,
                 logger=None,
                 variance_normalization=False,
                 noise_multiplier=0,
                 noise_prob=1,
                 mean_subtraction=False,
                 supplement_rare_with_noisy=False,
                 supplement_seed=112):
        self.data_path = data_path
        self.word2idxs = {}
        self.idx2word = []
        self.idx2source_dataset = []
        self.idx2key = []
        self.data = []
        self.training = training
        self.noise_multiplier = noise_multiplier
        self.noise_prob = noise_prob

        util.warn_or_print(
            logger, 'Loading {0}, train = {1}'.format(data_path, training))
        if not training and parent_dataset_path is None:
            util.warn_or_print(
                logger,
                'Non-training mode is selected, but parent_dataset_path is None'
            )
            util.warn_or_print(
                logger,
                'A non-training dataset must always have the parent specified, otherwise '
                'the data mean and other derived values will be incorrect. Aborting.'
            )
            sys.exit(-1)

        # Index every example by its word, source dataset, and key.
        for i, (key, mat) in enumerate(self._raw_data_iterator()):
            word = key2word(key)
            dataset = key2dataset(key)

            word_example_idx = self.word2idxs.get(word, [])
            word_example_idx.append(i)
            self.word2idxs[word] = word_example_idx
            self.idx2word.append(word)
            self.idx2source_dataset.append(dataset)
            self.idx2key.append(key)
            self.data.append(mat)

        self.idx2word = np.array(self.idx2word)
        self.idx2source_dataset = np.array(self.idx2source_dataset)
        self.idx2key = np.array(self.idx2key)
        for key in self.word2idxs:
            self.word2idxs[key] = np.array(self.word2idxs[key])
        self.counts = {
            key: self.word2idxs[key].shape[0]
            for key in self.word2idxs
        }

        if parent_dataset_path is None:
            self.mean = self._calculate_feature_mean()
            self.std = self._calculate_feature_std()
        else:
            self.load_derived_data(parent_dataset_path)

        if mean_subtraction:
            util.warn_or_print(logger, 'Applying mean subtraction')
            self.data = np.array([(segment - self.mean).astype(np.float32)
                                  for segment in self.data])
        elif variance_normalization:
            util.warn_or_print(logger, 'Applying variance normalization')
            self.data = np.array([
                ((segment - self.mean) / self.std).astype(np.float32)
                for segment in self.data
            ])
        else:
            util.warn_or_print(logger, 'No mean subtraction')
            self.data = np.array(
                [segment.astype(np.float32) for segment in self.data])

        # TODO: sort keys before doing anything else? (for identical behaviour between Kaldi and LMDB-exported datasets)

        self.noisy = np.zeros(len(self.data), dtype=bool)
        if supplement_rare_with_noisy and training:
            # Seed deterministically for reproducible augmentation; the global
            # RNG state is restored at the end of this block.
            state_before = np.random.get_state()
            np.random.seed(supplement_seed)

            # Pad each rare word up to the mean example count, computed before
            # augmentation.
            before_mean_examples = int(
                np.ceil(np.mean(list(self.counts.values()))))
            util.warn_or_print(
                logger,
                'Supplementing rare classes with noisy examples up to {0} total'
                .format(before_mean_examples))
            util.warn_or_print(
                logger, 'Original example count: {0}'.format(len(self.data)))
            for word, count in self.counts.items():
                if count < before_mean_examples:
                    to_add = before_mean_examples - count
                    augmented_source_idx = []
                    augmented_examples = []
                    for orig_idx in islice(cycle(self.word2idxs[word]),
                                           to_add):
                        orig_data = self.data[orig_idx]
                        augmented_source_idx.append(orig_idx)
                        augmented_examples.append(
                            orig_data + noise_multiplier * np.random.normal(
                                loc=0, scale=self.std, size=orig_data.shape))
                    augmented_source_idx = np.array(augmented_source_idx)
                    augmented_examples = np.array(augmented_examples)
                    self.data = np.concatenate((self.data, augmented_examples))
                    self.noisy = np.concatenate(
                        (self.noisy, np.ones(to_add, dtype=bool)))
                    self.word2idxs[word] = np.concatenate(
                        (self.word2idxs[word],
                         np.arange(len(self.data) - to_add,
                                   len(self.data),
                                   dtype=np.int32)))
                    self.idx2word = np.concatenate(
                        (self.idx2word, [word for _ in augmented_examples]))
                    self.idx2key = np.concatenate(
                        (self.idx2key,
                         [self.idx2key[x] for x in augmented_source_idx]))
                    self.idx2source_dataset = np.concatenate(
                        (self.idx2source_dataset, [
                            self.idx2source_dataset[x]
                            for x in augmented_source_idx
                        ]))

            self.counts = {
                key: self.word2idxs[key].shape[0]
                for key in self.word2idxs
            }
            util.warn_or_print(
                logger, 'Augmented example count: {0}'.format(len(self.data)))
            np.random.set_state(state_before)

        self.feature_dim = self.data[0].shape[1]
        self.mean_examples = int(np.ceil(np.mean(list(self.counts.values()))))

        # siamese training setup, ignoring words with 1 example
        self.siamese_words = np.array(
            sorted([key for key in self.word2idxs if self.counts[key] > 1]))
        self.num_siamese_words = self.siamese_words.shape[0]

        # classifier training setup
        self.all_words = np.array(sorted(list(self.counts.keys())))
        if parent_dataset_path is None:
            self.word2id = {key: i for i, key in enumerate(self.all_words)}
        else:
            new_words = np.array(
                [x not in self.word2id for x in self.all_words])
            if np.any(new_words):
                max_given_id = max(self.word2id.values())
                for i, x in enumerate(self.all_words[new_words]):
                    self.word2id[x] = max_given_id + i + 1
        self.idx2word_id = np.array(
            [self.word2id[word] for word in self.idx2word], dtype=np.int32)
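
The rare-class supplementation in this constructor cycles over a word's existing examples and perturbs them with Gaussian noise scaled by the per-feature std. A self-contained sketch of just that augmentation step, with toy data and a hypothetical target count, might look like this:

import numpy as np
from itertools import cycle, islice

state_before = np.random.get_state()
np.random.seed(112)  # mirrors the default supplement_seed above

# Toy stand-in: three examples of one word, each a (frames x features) matrix.
examples = [np.random.rand(5, 4).astype(np.float32) for _ in range(3)]
feature_std = np.std(np.concatenate(examples), axis=0)

target_count = 7   # hypothetical per-word target (before_mean_examples above)
to_add = target_count - len(examples)
noise_multiplier = 0.1

augmented = [
    orig + noise_multiplier * np.random.normal(
        loc=0, scale=feature_std, size=orig.shape)
    for orig in islice(cycle(examples), to_add)
]
print(len(examples) + len(augmented))  # -> 7
np.random.set_state(state_before)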