Example #1
def save_data_test(data_test):
    filename = format_processed_filename(
        PROCESSED_DATA_DIR,
        PROCESSED_CONCEPTS_DATA_TEST_FILENAME_TEMPLATE,
        genre='clinical')
    save_pickle(filename, data_test)

    print('Saved:', filename.name)
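All of the examples on this page call a shared save_pickle helper (several also use format_processed_filename), whose definitions are not shown. A minimal sketch, assuming save_pickle simply wraps pickle.dump and format_processed_filename fills the keyword arguments into a filename template, might look like this:

import pickle
from pathlib import Path

def save_pickle(filename, data):
    # assumed helper: serialize `data` to `filename` with pickle
    with Path(filename).open('wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

def format_processed_filename(directory, template, **kwargs):
    # assumed helper: fill the template (e.g. genre='clinical') and return a Path
    return Path(directory) / template.format(**kwargs)

Note that Example #4 below calls a save_pickle with a different argument order (data, path, name), which presumably comes from a different project.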
Example #2
    def completed_event(self, stop_time, result):
        self.result = result

        # bundle the experiment metadata together with its result
        exp_results = {
            'id': self.id,
            'name': self.name,
            'repository': self.repository,
            'config': self.config,
            'result': self.result,
        }

        filename = self._get_experiment_filename(self.id)
        save_pickle(filename, exp_results)
Example #3
def process_test():
    # load the test data
    _, _, data_mli_test = load_data(load_test=True)
    logging.info('Data: %s', data_mli_test.shape)

    for genre in ['clinical']:
        genre_test = data_mli_test.loc[genre]
        logging.info('Genre: %s, test: %s', genre, genre_test.shape)

        tokenized_test = tokenize_data(genre_test)

        # save the tokenized test data as a pickle file
        filename = format_processed_filename(PROCESSED_DATA_DIR, PROCESSED_DATA_TEST_FILENAME_TEMPLATE, genre=genre)
        save_pickle(filename, tokenized_test)
Example #4
    def save(self, weights_path, weights_name):
        sess = K.get_session()
        variables = tf.trainable_variables()

        # group the trainable variables of the "transition" scope as
        # {layer_name: {variable_name: tf.Variable}}
        var_dict = dict()
        for v in variables:
            if "transition" in v.name:
                # strip the scope prefix and the ":0" tensor suffix,
                # e.g. "transition/dense_1/kernel:0" -> layer "1", variable "kernel"
                name = re.sub('transition/', '', v.name)
                name = re.sub(':0', '', name)
                layer_name, var_name = name.split('/')
                layer_name = re.sub('dense_', '', layer_name)
                if not var_dict.get(layer_name):
                    var_dict[layer_name] = dict()
                var_dict[layer_name][var_name] = v

        for k in var_dict.keys():
            print(k)
            for j in var_dict[k].keys():
                print('---{0}'.format(j))

        # evaluate the variables to numpy arrays and pickle them
        weights = sess.run(var_dict)
        save_pickle(weights, weights_path, weights_name)
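The save() method above collects the trainable variables of the "transition" scope into a nested dict of numpy arrays and pickles them. A hypothetical counterpart that pushes such a pickled dict back into a live TF1 graph, assuming the same naming convention and a plain pickle file, could look like:

import pickle
import tensorflow as tf
from keras import backend as K

def load_transition_weights(weights_file):
    # assumed inverse of save(): read {layer: {var: np.ndarray}} and assign into the graph
    with open(weights_file, 'rb') as f:
        var_dict = pickle.load(f)

    assign_ops = []
    for v in tf.trainable_variables():
        if 'transition' not in v.name:
            continue
        name = v.name.replace('transition/', '').replace(':0', '')
        layer_name, var_name = name.split('/')
        layer_name = layer_name.replace('dense_', '')
        assign_ops.append(tf.assign(v, var_dict[layer_name][var_name]))

    K.get_session().run(assign_ops)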
Example #5
def main():
    # load SNLI, MultiNLI and MLI datasets
    data_dev, data_train = load_data()
    logging.info('Data: train - %s, dev - %s', data_train.shape, data_dev.shape)

    if not PROCESSED_DATA_DIR.exists():
        PROCESSED_DATA_DIR.mkdir()

    for genre in GENRES:
        if genre not in data_train.index:
            continue

        genre_train = data_train.loc[genre]
        genre_dev = data_dev.loc[genre]
        logging.info('Genre: %s, train: %s, dev: %s', genre, genre_train.shape, genre_dev.shape)

        tokenized_train = tokenize_data(genre_train)
        tokenized_dev = tokenize_data(genre_dev)

        # save the tokenized train and dev data as a pickle file
        filename = format_processed_filename(PROCESSED_DATA_DIR, PROCESSED_DATA_FILENAME_TEMPLATE, genre=genre)
        save_pickle(filename, (tokenized_train, tokenized_dev))
Example #6
def process_data(genre_source,
                 genre_target,
                 genre_tune,
                 max_len,
                 lowercase,
                 stem,
                 clean,
                 downsample_source,
                 word_vectors_type,
                 word_vectors_replace_cui,
                 use_umls_attention,
                 use_token_level_attention,
                 padding='pre'):
    """Load data for the target genres, create and fit tokenizer, and return the input matrices"""

    data_source_train, data_source_dev, data_target_train, data_target_dev, data_tune_train, data_tune_dev = \
        load_processed_genre_data(PROCESSED_DATA_DIR, PROCESSED_CONCEPTS_DATA_FILENAME_TEMPLATE,
                                  genre_source, genre_target, genre_tune)

    _, _, data_clinical_test = load_single_genre_data(
        PROCESSED_DATA_DIR,
        PROCESSED_CONCEPTS_DATA_FILENAME_TEMPLATE,
        genre='clinical',
        filename_test_template=PROCESSED_CONCEPTS_DATA_TEST_FILENAME_TEMPLATE)

    if clean:
        data_source_train = clean_data(data_source_train)
        data_source_dev = clean_data(data_source_dev)

        data_target_train = clean_data(data_target_train)
        data_target_dev = clean_data(data_target_dev)

        data_tune_train = clean_data(data_tune_train)
        data_tune_dev = clean_data(data_tune_dev)

        data_clinical_test = clean_data(data_clinical_test)

        logging.info('Data cleaned')

    if stem:
        data_source_train = stem_data(data_source_train)
        data_source_dev = stem_data(data_source_dev)

        data_target_train = stem_data(data_target_train)
        data_target_dev = stem_data(data_target_dev)

        data_tune_train = stem_data(data_tune_train)
        data_tune_dev = stem_data(data_tune_dev)

        data_clinical_test = stem_data(data_clinical_test)

        logging.info('Data stemmed')

    if use_token_level_attention:
        data_source_train = create_token_cuis(data_source_train)
        data_source_dev = create_token_cuis(data_source_dev)
        data_target_dev = create_token_cuis(data_target_dev)

        data_clinical_test = create_token_cuis(data_clinical_test)

    word_vectors_replacement = None
    if word_vectors_replace_cui != '':
        word_vectors_replacement_filename = get_word_vectors_filename(
            word_vectors_replace_cui)
        word_vectors_replacement = load_pickle(
            word_vectors_replacement_filename)
        logging.info('Replacements word vectors loaded: %s',
                     word_vectors_replacement_filename.name)

        target_cuis = set(word_vectors_replacement.keys())
        logging.info('Target CUIs: %s', len(target_cuis))

        data_source_train = replace_cui_data(data_source_train, target_cuis)
        data_source_dev = replace_cui_data(data_source_dev, target_cuis)

        data_target_train = replace_cui_data(data_target_train, target_cuis)
        data_target_dev = replace_cui_data(data_target_dev, target_cuis)

        data_tune_train = replace_cui_data(data_tune_train, target_cuis)
        data_tune_dev = replace_cui_data(data_tune_dev, target_cuis)

        data_clinical_test = replace_cui_data(data_clinical_test, target_cuis)

        logging.info('CUIs replaced')

    if downsample_source != 0:
        # downsample train and dev sets to the size of the clinical dataset
        nb_clinical_train = 11232
        nb_clinical_dev = 1395

        data_source_train = downsample_data(data_source_train,
                                            nb_needed=nb_clinical_train)
        data_source_dev = downsample_data(data_source_dev,
                                          nb_needed=nb_clinical_dev)

    # create tokenizer and vocabulary
    sentences_train = data_source_train['premise'] + data_source_train[
        'hypothesis']
    if data_tune_train is not None:
        sentences_train += data_tune_train['premise'] + data_tune_train[
            'hypothesis']

    tokenizer = Tokenizer(lower=lowercase, filters='')
    tokenizer.fit_on_texts(sentences_train)

    # create data matrices
    m_source_train = create_data_matrices(tokenizer, data_source_train,
                                          max_len, padding)
    m_source_dev = create_data_matrices(tokenizer, data_source_dev, max_len,
                                        padding)
    logging.info('Source: %s - train: %s, %s, %s, dev: %s, %s, %s',
                 genre_source, m_source_train['premise'].shape,
                 m_source_train['hypothesis'].shape,
                 m_source_train['label'].shape, m_source_dev['premise'].shape,
                 m_source_dev['hypothesis'].shape, m_source_dev['label'].shape)

    m_tune_train = None
    m_tune_dev = None
    if data_tune_train is not None:
        m_tune_train = create_data_matrices(tokenizer, data_tune_train,
                                            max_len, padding)
        m_tune_dev = create_data_matrices(tokenizer, data_tune_dev, max_len,
                                          padding)
        logging.info('Tune: %s - train: %s, %s, %s, dev: %s, %s, %s',
                     genre_tune, m_tune_train['premise'].shape,
                     m_tune_train['hypothesis'].shape,
                     m_tune_train['label'].shape, m_tune_dev['premise'].shape,
                     m_tune_dev['hypothesis'].shape, m_tune_dev['label'].shape)

    m_target_train = None
    m_target_dev = None
    if data_target_train is not None:
        m_target_train = create_data_matrices(tokenizer, data_target_train,
                                              max_len, padding)
        m_target_dev = create_data_matrices(tokenizer, data_target_dev,
                                            max_len, padding)
        logging.info('Target: %s - train: %s, %s, %s, dev: %s, %s, %s',
                     genre_target, m_target_train['premise'].shape,
                     m_target_train['hypothesis'].shape,
                     m_target_train['label'].shape,
                     m_target_dev['premise'].shape,
                     m_target_dev['hypothesis'].shape,
                     m_target_dev['label'].shape)

    else:
        m_target_dev = m_source_dev  # target domain was not specified - use the dev set of the source domain
        data_target_dev = data_source_dev
        logging.info('Target: %s - dev: %s, %s, %s', genre_source,
                     m_target_dev['premise'].shape,
                     m_target_dev['hypothesis'].shape,
                     m_target_dev['label'].shape)

    m_clinical_test = create_data_matrices(tokenizer, data_clinical_test,
                                           max_len, padding)
    logging.info('Clinical test: %s, %s, %s', m_clinical_test['premise'].shape,
                 m_clinical_test['hypothesis'].shape,
                 m_clinical_test['label'].shape)

    # create embedding matrix
    if word_vectors_type != 'random':
        word_vectors_filename = get_word_vectors_filename(word_vectors_type)
        word_vectors = load_pickle(word_vectors_filename)
        logging.info('Word vectors loaded: %s', word_vectors_filename.name)

        if word_vectors_replacement is not None:
            word_vectors.update(word_vectors_replacement)

    else:
        random_vectors_params = (-0.5, 0.5, 300)
        word_vectors = {}
        for token in tokenizer.word_index.keys():
            word_vectors[token] = np.random.uniform(*random_vectors_params)

        logging.info('Random vectors created: %s', random_vectors_params)

    W_emb = create_embedding_matrix(word_vectors, tokenizer.word_index)

    id_to_token = {i: t for t, i in tokenizer.word_index.items()}
    logging.info('Id to token: %s', len(id_to_token))

    if word_vectors_replace_cui != '' or use_token_level_attention:
        concepts_graph = nx.read_gpickle(str(UMLS_CONCEPTS_GRAPH_FILENAME))
        logging.info('UMLS concepts graph: %s', len(concepts_graph))

        # create UMLS-based attention
        if use_token_level_attention:
            att_source_train = create_umls_attention(
                m_source_train, id_to_token, concepts_graph,
                use_token_level_attention,
                data_source_train['premise_token_cuis'],
                data_source_train['hypothesis_token_cuis'])
            att_source_dev = create_umls_attention(
                m_source_dev, id_to_token, concepts_graph,
                use_token_level_attention,
                data_source_dev['premise_token_cuis'],
                data_source_dev['hypothesis_token_cuis'])
            att_target_dev = create_umls_attention(
                m_target_dev, id_to_token, concepts_graph,
                use_token_level_attention,
                data_target_dev['premise_token_cuis'],
                data_target_dev['hypothesis_token_cuis'])

            att_clinical_test = create_umls_attention(
                m_clinical_test, id_to_token, concepts_graph,
                use_token_level_attention,
                data_clinical_test['premise_token_cuis'],
                data_clinical_test['hypothesis_token_cuis'])

            m_source_train.update(att_source_train)
            m_source_dev.update(att_source_dev)
            m_target_dev.update(att_target_dev)

            m_clinical_test.update(att_clinical_test)

        # create memory
        if not use_token_level_attention:
            memory_source_train = create_memory_matrix(
                m_source_train, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)
            memory_source_dev = create_memory_matrix(
                m_source_dev, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)
            memory_target_dev = create_memory_matrix(
                m_target_dev, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)

            memory_clinical_test = create_memory_matrix(
                m_clinical_test, id_to_token, concepts_graph, word_vectors,
                use_token_level_attention)

            m_source_train.update(memory_source_train)
            m_source_dev.update(memory_source_dev)
            m_target_dev.update(memory_target_dev)

            m_clinical_test.update(memory_clinical_test)

    # use WordNet attention
    if genre_source != 'clinical' and word_vectors_replace_cui == '' and use_umls_attention:
        att_source_train = create_wordnet_attention(m_source_train,
                                                    id_to_token)
        att_source_dev = create_wordnet_attention(m_source_dev, id_to_token)
        att_target_dev = create_wordnet_attention(m_target_dev, id_to_token)

        att_clinical_test = create_wordnet_attention(m_clinical_test,
                                                     id_to_token)

        m_source_train.update(att_source_train)
        m_source_dev.update(att_source_dev)
        m_target_dev.update(att_target_dev)

        m_clinical_test.update(att_clinical_test)

    # save tokenizer and embeddings matrix for demo server
    save_pickle(
        DATA_DIR / 'tokenizer_{}_{}.pickled'.format(genre_source, genre_tune),
        tokenizer)
    save_pickle(
        DATA_DIR / 'embeddings_{}_{}.pickled'.format(genre_source, genre_tune),
        W_emb)

    return m_source_train, m_source_dev, m_tune_train, m_tune_dev, m_target_train, m_target_dev, m_clinical_test, W_emb
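process_data returns the padded input matrices for the source, tune, target and clinical test sets together with the embedding matrix. An illustrative call, with parameter values that are assumptions rather than the repository's actual configuration, might be:

# hypothetical invocation; genre names, max_len and word_vectors_type are assumed values
(m_source_train, m_source_dev, m_tune_train, m_tune_dev,
 m_target_train, m_target_dev, m_clinical_test, W_emb) = process_data(
    genre_source='snli',
    genre_target='clinical',
    genre_tune='clinical',
    max_len=50,
    lowercase=True,
    stem=False,
    clean=True,
    downsample_source=0,
    word_vectors_type='glove',
    word_vectors_replace_cui='',
    use_umls_attention=False,
    use_token_level_attention=False,
)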
Example #7
def save_embeddings(filename, data_train, data_dev, word_vectors, target_cuis,
                    mode):
    # find which surface tokens each concept is mentioned with in the data
    concepts_tokens = defaultdict(list)
    data_all = [
        zip(data_train['premise'], data_train['premise_concepts']),
        zip(data_train['hypothesis'], data_train['hypothesis_concepts']),
    ]
    for i, (sentence,
            concepts) in enumerate(itertools.chain.from_iterable(data_all)):
        for concept in concepts:
            cui = concept['cui']

            if cui not in target_cuis:
                continue

            pos_info = concept['pos_info']

            tokens = [sentence[p[0]:p[1]] for p in pos_info]
            concepts_tokens[cui].extend(tokens)

    logging.info('Concepts mode: %s', mode)
    logging.info('Concepts: %s', len(concepts_tokens))

    # keep only single-token mentions and drop concepts that have none;
    # the rest of the processing is the same as in the cbow_most_common mode
    if mode == 'single_token':
        concepts_tokens = {
            concept: [t for t in tokens_list if ' ' not in t]
            for concept, tokens_list in concepts_tokens.items()
        }
        concepts_tokens = {
            c: t
            for c, t in concepts_tokens.items() if len(t) > 0
        }
        logging.info('Concepts single tokens: %s', len(concepts_tokens))

    if mode == 'cbow_all':
        concepts_tokens = {
            cui: [tok for tokens in tokens_list for tok in tokens.split()]
            for cui, tokens_list in concepts_tokens.items()
        }
        concepts_tokens = {
            cui: set(tokens)
            for cui, tokens in concepts_tokens.items()
        }

    elif mode == 'single_most_common':
        concepts_tokens = {
            cui: [tok for tokens in tokens_list for tok in tokens.split()]
            for cui, tokens_list in concepts_tokens.items()
        }
        concepts_tokens_counter = {
            cui: Counter(tokens)
            for cui, tokens in concepts_tokens.items()
        }
        concepts_tokens = {}
        for concept, tokens_counts in concepts_tokens_counter.items():
            # there might be several tokens with the same frequency - take the longest one in this case
            _, nb_most_common = tokens_counts.most_common(1)[0]
            tokens = [
                t for t, c in tokens_counts.most_common()
                if c == nb_most_common
            ]
            tokens = sorted(tokens, key=lambda x: len(x), reverse=True)
            concepts_tokens[concept] = tokens[:1]

    elif mode == 'cbow_most_common' or mode == 'single_token':
        concepts_tokens_counter = {
            cui: Counter(tokens)
            for cui, tokens in concepts_tokens.items()
        }

        concepts_tokens = {}
        for concept, tokens_counts in concepts_tokens_counter.items():
            # take the first most common mention whose tokens have at least one embedding
            for tokens, counts in tokens_counts.most_common():
                tokens = tokens.split(' ')
                if any(t in word_vectors for t in tokens):
                    concepts_tokens[concept] = tokens
                    break

    else:
        raise ValueError('Unknown mode: {}'.format(mode))

    logging.info('Concepts tokens: %s', len(concepts_tokens))

    # create a word vector for each CUI as the average of its token embeddings
    cuis_embeddings = {}
    for cui, tokens in concepts_tokens.items():
        cui_embeds = []
        for token in tokens:
            if token in word_vectors:
                cui_embeds.append(word_vectors[token])

        if len(cui_embeds) > 0:
            cuis_embeddings[cui] = np.mean(cui_embeds, axis=0)

    logging.info('Concepts with embeddings: %s', len(cuis_embeddings))

    del word_vectors
    if filename.suffix == '.txt':
        # save embeddings in the retrofitting format
        with open(str(filename), 'w') as f:
            for cui, embeddings in cuis_embeddings.items():
                row = '{} {}\n'.format(cui, ' '.join(embeddings.astype(str)))
                f.write(row)
    else:
        save_pickle(filename, cuis_embeddings)

    logging.info('Embeddings saved: %s', filename.name)
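When the target filename has a .txt suffix, the embeddings are written one CUI per line in the plain retrofitting format ("CUI v1 v2 ... vN"). A small illustrative reader for that format (not part of the original code) could be:

import numpy as np

def load_retrofitting_embeddings(filename):
    # illustrative helper: read "CUI v1 v2 ..." lines back into {cui: np.ndarray}
    cuis_embeddings = {}
    with open(str(filename)) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            cuis_embeddings[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return cuis_embeddings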