コード例 #1
0
def word2vec_initialization(data_dir, metamap_map):
    """Write word and doc node files initialized with learned embeddings.

    For each word in ``<data_dir>/word.ids`` an embedding is obtained from
    ``get_word_embedding`` and written to ``<data_dir>/word.nodes`` as
    ``word_id<TAB>v1<TAB>...<TAB>vk``.  For each document in
    ``<data_dir>/doc.ids`` an embedding is obtained from
    ``get_doc_embedding`` and written to ``<data_dir>/doc.nodes`` as
    ``doc_id<TAB>v1...vk<TAB>label``.

    Args:
        data_dir: directory containing ``word.ids`` / ``doc.ids`` and
            receiving ``word.nodes`` / ``doc.nodes``.
        metamap_map: mapping passed through to ``get_doc_embedding``
            (semantics defined by that helper).
    """
    # Encode words: one line per word id.
    df = str_utils.read_csv('{}/word.ids'.format(data_dir))
    with open('{}/word.nodes'.format(data_dir), 'w') as fw:
        # Fix: dropped the redundant fw.close() — the `with` block closes
        # the file — and the unused enumerate index.
        for word_id, word in zip(df['word_id'], df['word']):
            word_embedding = get_word_embedding(word_id, word)
            fw.write('{}\t{}\n'.format(word_id, '\t'.join(str(x) for x in word_embedding)))

    # Encode docs: id, tab-separated embedding values, then the label.
    df = str_utils.read_csv('{}/doc.ids'.format(data_dir))
    with open('{}/doc.nodes'.format(data_dir), 'w') as fw:
        for doc_id, text, label in zip(df['doc_id'], df['text'], df['label']):
            doc_embedding = get_doc_embedding(text, metamap_map)
            fw.write('{}\t{}\t{}\n'.format(doc_id, '\t'.join(str(x) for x in doc_embedding), label))
コード例 #2
0
def randim_initialization(data_dir):
    """Write doc/word node files initialized with random embeddings.

    Every node receives a fresh 768-dimensional vector drawn uniformly
    from [0, 1).  Output formats match ``word2vec_initialization``:
    ``doc.nodes``  -> ``doc_id<TAB>values<TAB>label``
    ``word.nodes`` -> ``word_id<TAB>values``

    NOTE(review): the name looks like a typo for "random_initialization";
    kept as-is because external callers may reference it.

    Args:
        data_dir: directory containing ``doc.ids`` / ``word.ids`` and
            receiving ``doc.nodes`` / ``word.nodes``.
    """
    emb_size = 768  # presumably chosen to match a BERT-base hidden size — TODO confirm

    # Random doc embeddings, one row per document.
    # Fix: dropped the redundant fw.close() calls — the `with` blocks
    # already close the files — and the unused `n` temporaries.
    df = str_utils.read_csv('{}/doc.ids'.format(data_dir))
    embeddings = np.random.rand(len(df), emb_size)
    with open('{}/doc.nodes'.format(data_dir), 'w') as fw:
        for i, (doc_id, label) in enumerate(zip(df['doc_id'], df['label'])):
            fw.write('{}\t{}\t{}\n'.format(doc_id, '\t'.join(str(x) for x in embeddings[i].tolist()), label))

    # Random word embeddings, one row per word.
    df = str_utils.read_csv('{}/word.ids'.format(data_dir))
    embeddings = np.random.rand(len(df), emb_size)
    with open('{}/word.nodes'.format(data_dir), 'w') as fw:
        for i, word_id in enumerate(df['word_id']):
            fw.write('{}\t{}\n'.format(word_id, '\t'.join(str(x) for x in embeddings[i].tolist())))
コード例 #3
0
import mmlrestclient
import sys
import pandas as pd
import argparse
import json
from str_utils import preprocess, read_csv

if __name__ == '__main__':
    # Usage: python <script> <csv_with_text_and_label_columns>
    infile = sys.argv[1]
    df = read_csv(infile)

    # Build one "index|text" line per document; the MetaMap REST interface
    # echoes the index prefix so results can be attributed to documents.
    # NOTE(review): r'\n' removes the literal two-character sequence
    # backslash + "n" as stored in the CSV, NOT actual newline characters —
    # confirm the CSV really escapes newlines this way.
    texts = df.text.tolist()
    texts = ['{}|{}'.format(i, preprocess(text.replace(r'\n', ''))) for i, text in enumerate(texts)]
    texts = '\n'.join(texts)

    args = mmlrestclient.construct_args(texts)
    response = mmlrestclient.process(args)

    # Each pipe-delimited response line with exactly 10 fields is one
    # concept mention; other lines (headers, blanks, diagnostics) are
    # skipped.  Field 6 carries the raw surface form as the 4th
    # dash-separated component; field 3 is the normalized term.
    hashtbl = {}
    for r in response.text.split('\n'):
        arr = r.split('|')
        if len(arr) != 10:
            continue
        raw_text = arr[6].split('-')[3].strip('"')
        norm_term = arr[3].lower()

        # Count occurrences of each normalized term per raw surface form.
        # Fix: idiomatic setdefault/get instead of two nested membership
        # checks; removed the commented-out debug print.
        counts = hashtbl.setdefault(raw_text, {})
        counts[norm_term] = counts.get(norm_term, 0) + 1
コード例 #4
0
def get_pmi(pair_count, a_count, b_count, total_count):
    """Pointwise mutual information of two events from raw counts.

    PMI = log( P(a, b) / (P(a) * P(b)) ), where each probability is
    estimated by dividing the corresponding count by ``total_count``.

    Args:
        pair_count: number of co-occurrences of a and b.
        a_count: number of occurrences of a.
        b_count: number of occurrences of b.
        total_count: normalizer for all probability estimates.

    Returns:
        The natural-log PMI value.
    """
    joint = pair_count / total_count
    independent = (a_count / total_count) * (b_count / total_count)
    return math.log(joint / independent)

if __name__ == '__main__':
    # Entry point: expects the data directory as the only CLI argument.
    data_path = sys.argv[1]

    # Conventional split layout inside the data directory.
    train = '{}/train.csv'.format(data_path)
    dev = '{}/dev.csv'.format(data_path)
    test = '{}/test.csv'.format(data_path)

    train_df = read_csv(train)
    dev_df = read_csv(dev)
    test_df = read_csv(test)

    #docs = train_df.text.tolist() + dev_df.text.tolist() + test_df.text.tolist()
    # Deliberately restricted to training texts so the vocabulary built
    # below does not leak information from dev/test.
    docs = train_df.text.tolist()
    labels = train_df.label.tolist()
    docs = [preprocess(doc) for doc in docs]

    #docs = docs
    #labels = labels
    #docs = train_df.text.tolist() + dev_df.text.tolist() + test_df.text.tolist()
    #labels = train_df.label.tolist() + dev_df.label.tolist() + test_df.label.tolist()

    # Only use the vocabulary from the training data
    # (continues past this chunk; tfidf_mtx/vocab/tfidf_vectorizer are
    # presumably consumed below — outside the visible source.)
    tfidf_mtx, vocab, tfidf_vectorizer = get_tfidf(docs)