def word2vec_initialization(data_dir, metamap_map):
    """Write embedding-based node files for words and documents.

    Reads ``word.ids`` and ``doc.ids`` CSVs under *data_dir* and writes
    tab-separated node files next to them:

    - ``word.nodes``: ``<word_id>\\t<emb_0>\\t...\\t<emb_k>``
    - ``doc.nodes``:  ``<doc_id>\\t<emb_0>\\t...\\t<emb_k>\\t<label>``

    Parameters
    ----------
    data_dir : str
        Directory containing the ``*.ids`` input files; outputs are
        written to the same directory.
    metamap_map : mapping
        Passed through to ``get_doc_embedding`` (MetaMap term lookup —
        exact structure defined by that helper; TODO confirm).
    """
    # Encode words. The explicit fw.close() in the original was redundant:
    # the `with` statement already closes the file (and would have raised
    # on the second close if the handle were not idempotent).
    df = str_utils.read_csv('{}/word.ids'.format(data_dir))
    with open('{}/word.nodes'.format(data_dir), 'w') as fw:
        for word_id, word in zip(df['word_id'], df['word']):
            word_embedding = get_word_embedding(word_id, word)
            fw.write('{}\t{}\n'.format(
                word_id, '\t'.join(str(x) for x in word_embedding)))

    # Encode docs (embedding columns followed by the label column).
    df = str_utils.read_csv('{}/doc.ids'.format(data_dir))
    with open('{}/doc.nodes'.format(data_dir), 'w') as fw:
        for doc_id, text, label in zip(df['doc_id'], df['text'], df['label']):
            doc_embedding = get_doc_embedding(text, metamap_map)
            fw.write('{}\t{}\t{}\n'.format(
                doc_id, '\t'.join(str(x) for x in doc_embedding), label))
def randim_initialization(data_dir):
    """Write randomly initialized node embedding files for docs and words.

    (Function name kept as-is for existing callers; "randim" is presumably
    a typo for "random".) Reads ``doc.ids`` / ``word.ids`` CSVs under
    *data_dir* and writes ``doc.nodes`` / ``word.nodes`` with one
    uniform-random embedding row per id, tab-separated:

    - ``doc.nodes``:  ``<doc_id>\\t<emb_0>\\t...\\t<emb_767>\\t<label>``
    - ``word.nodes``: ``<word_id>\\t<emb_0>\\t...\\t<emb_767>``

    Parameters
    ----------
    data_dir : str
        Directory containing the ``*.ids`` input files; outputs are
        written to the same directory.
    """
    # 768 matches BERT-base hidden size — presumably chosen so random
    # init is shape-compatible with the learned init; TODO confirm.
    emb_size = 768

    # Encode docs. Redundant fw.close() removed: `with` handles closing.
    df = str_utils.read_csv('{}/doc.ids'.format(data_dir))
    embeddings = np.random.rand(len(df), emb_size)
    with open('{}/doc.nodes'.format(data_dir), 'w') as fw:
        for i, (doc_id, label) in enumerate(zip(df['doc_id'], df['label'])):
            fw.write('{}\t{}\t{}\n'.format(
                doc_id,
                '\t'.join(str(x) for x in embeddings[i].tolist()),
                label))

    # Encode words.
    df = str_utils.read_csv('{}/word.ids'.format(data_dir))
    embeddings = np.random.rand(len(df), emb_size)
    with open('{}/word.nodes'.format(data_dir), 'w') as fw:
        for i, word_id in enumerate(df['word_id']):
            fw.write('{}\t{}\n'.format(
                word_id,
                '\t'.join(str(x) for x in embeddings[i].tolist())))
import mmlrestclient
import sys
import pandas as pd
import argparse
import json
from str_utils import preprocess, read_csv

if __name__ == '__main__':
    # Script: run MetaMap (via the REST client) over the `text` column of a
    # CSV and tally, per raw surface string, how often each normalized
    # concept term was matched.
    infile = sys.argv[1]
    df = read_csv(infile)
    texts = df.text.tolist()
    # Batch format expected by the MetaMap service: one "<id>|<text>" record
    # per line. NOTE(review): r'\n' is the two-character literal backslash-n,
    # not a real newline — this only works if the CSV stores escaped
    # newlines; confirm against the data.
    texts = ['{}|{}'.format(i, preprocess(text.replace(r'\n', '')))
             for i, text in enumerate(texts)]
    texts = '\n'.join(texts)
    args = mmlrestclient.construct_args(texts)
    response = mmlrestclient.process(args)
    # response.text is pipe-delimited fielded output, one match per line;
    # lines without exactly 10 fields (headers, blanks, trailers) are skipped.
    responses = response.text.split('\n')
    hashtbl = {}  # raw surface string -> {normalized term -> match count}
    for r in responses:
        arr = r.split('|')
        if len(arr) != 10:
            continue
        # arr[6] holds a '-'-separated positional field whose 4th component
        # is the quoted surface form. NOTE(review): this breaks if the
        # surface text itself contains '-' — confirm upstream guarantees.
        raw_text = arr[6].split('-')[3].strip('"')
        norm_term = arr[3].lower()  # normalized/preferred term, lowercased
        if raw_text not in hashtbl:
            hashtbl[raw_text] = {}
        if norm_term not in hashtbl[raw_text]:
            hashtbl[raw_text][norm_term] = 0
        hashtbl[raw_text][norm_term] += 1
def get_pmi(pair_count, a_count, b_count, total_count):
    """Pointwise mutual information of two co-occurring events.

    PMI = log( p(i,j) / (p(i) * p(j)) ), with each probability estimated
    as the corresponding raw count divided by *total_count*.

    Parameters
    ----------
    pair_count : number
        Joint co-occurrence count of the two events.
    a_count, b_count : number
        Marginal counts of each event.
    total_count : number
        Normalizing total (e.g. number of sliding windows).

    Returns
    -------
    float
        The PMI value (natural log; negative when the pair co-occurs
        less than expected under independence).

    Raises
    ------
    ZeroDivisionError
        If *total_count*, *a_count*, or *b_count* is zero.
    ValueError
        If *pair_count* is zero (log of zero).
    """
    p_i = a_count / total_count
    p_j = b_count / total_count
    p_ij = pair_count / total_count
    return math.log(p_ij / (p_i * p_j))


if __name__ == '__main__':
    data_path = sys.argv[1]
    train = '{}/train.csv'.format(data_path)
    dev = '{}/dev.csv'.format(data_path)
    test = '{}/test.csv'.format(data_path)
    train_df = read_csv(train)
    dev_df = read_csv(dev)
    test_df = read_csv(test)
    # Only the training split contributes documents/labels here, so the
    # TF-IDF vocabulary is built from training data alone (dev/test stay
    # unseen). Dead commented-out variants of this choice were removed.
    docs = [preprocess(doc) for doc in train_df.text.tolist()]
    labels = train_df.label.tolist()
    tfidf_mtx, vocab, tfidf_vectorizer = get_tfidf(docs)