def main():
    args = argparse.ArgumentParser()
    args.add_argument(
        'source',
        help='source language word vector file (.magnitude)',
    )
    args.add_argument(
        'target',
        help='target language word vector file (.magnitude)',
    )
    args.add_argument(
        '-n',
        dest='count',
        type=int,
        default=10,
        help='number of neighbors per word',
    )
    argv = args.parse_args()
    src = mag.Magnitude(argv.source)
    tgt = mag.Magnitude(argv.target)
    for word in sys.stdin:
        word = word.rstrip()
        v = src.query(word)
        neighbors = tgt.most_similar(v)
        show(word, neighbors, argv.count)
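# A hedged usage note for the CLI above: assuming the script is saved as nn.py and
# the two .magnitude paths are placeholders, one word per line is read from stdin,
# embedded with the source model, and looked up among the target model's neighbors
# (pymagnitude's most_similar accepts a raw query vector as well as a key):
#
#   $ printf 'dog\ncat\n' | python nn.py en.magnitude fr.magnitude -n 5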
def __init__(self):
    self.q_and_as = [{
        'question':
        '''Following a C3-C7 laminoplasty in a myelopathic patient with cervical stenosis, the most common neurologic complication would manifest with which of the following new postoperative exam findings?''',
        'keywords': [['bicep'], ['weakness']],
        'answer': 'Bicep weakness'
    }, {
        'question':
        '''Which variable has the strongest association with poor clinical outcomes in patients who undergo expansive laminoplasty for cervical spondylotic myelopathy?''',
        'keywords': [['angle'], ['small', 'few', '13', 'degrees'], ['kyphosis']],
        'answer': 'Local kyphosis angle > 13 degrees'
    }, {
        'question':
        '''Which classification system for cervical myelopathy focuses exclusively on lower extremity function?''',
        'keywords': [['nurick']],
        'answer': 'Nurick'
    }, {
        'question': '''Motor-dominant radiculopathy with weakness of the deltoid''',
        'keywords': [['motor'], ['radiculopathy'], ['weak'], ['deltoid']]
    }]
    self.q_index = 0
    self.wv = pymagnitude.Magnitude(
        '../wiki-news-300d-1M-subword.magnitude')
def create_doodle_vocab(doodle_class_path, w2v_magnitdue_path, wordmap_path,
                        out_doodle_path, topn=10):
    # Read W2v retrofitted
    wv = pymagnitude.Magnitude(w2v_magnitdue_path)
    # read doodle
    with open(doodle_class_path, 'r') as j:
        doodle = json.load(j)
    # Build doodle+ list X 10
    doodle_plus = []
    for d in doodle:
        doodle_plus.append(d)
        doodle_plus += set(
            [k[0].lower() for k in wv.most_similar(d, topn=topn)])
    # Read wordmap
    with open(wordmap_path, 'r') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word
    # find the intersection with vocab
    doodle_map = {
        d: word_map.get(d)
        for d in doodle_plus if word_map.get(d) is not None
    }
    with open(out_doodle_path, 'w') as j:
        json.dump(doodle_map, j)
def sketch2caption(doodle_class, checkpoint, word_map_path, w2v_magnitdue_path,
                   beam_size=1, num_sen=5):
    # read wordmap
    with open(word_map_path, 'r') as j:
        word_map = json.load(j)
    rev_word_map = {v: k for k, v in word_map.items()}  # ix2word
    # read w2v
    w2v = pymagnitude.Magnitude(w2v_magnitdue_path)
    # Load model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    checkpoint = torch.load(checkpoint)
    decoder = checkpoint['decoder']
    decoder = decoder.to(device)
    decoder.eval()
    keyword_size = checkpoint['keyword_size']
    failure = 0
    # generate keywords
    key_candidates = [
        w[0].lower()
        for w in w2v.most_similar(doodle_class, topn=keyword_size * 10)
    ]
    key_candidates = [w for w in key_candidates if word_map.get(w) is not None]
    # if len(key_candidates) < keyword_size:
    #     failure = 100
    # Encode, decode with attention and beam search
    sentences = []
    keys = []
    sent_count = 0
    while sent_count < num_sen:
        random.shuffle(key_candidates)
        key = [doodle_class]
        key += key_candidates[:keyword_size - 1]
        if len(key) < keyword_size:
            key = [doodle_class] * keyword_size
        seq = caption_beam_search(decoder, key, word_map, beam_size)
        unk_count = [s for s in seq if s in {word_map['<unk>']}]
        if len(seq) == 0:  # or len(unk_count) > 0:
            # print('Caption is not generated on ', key)
            failure += 1
            if failure > num_sen * 10:
                break
            else:
                continue
        sentences.append([
            rev_word_map.get(s) for s in seq if s not in
            {word_map['<start>'], word_map['<end>'], word_map['<pad>']}
        ])
        keys.append(key)
        sent_count += 1
    return keys, sentences, failure > 0
def main(opts):
    wvecs = pymagnitude.Magnitude(opts.wordvecfile)
    dim = wvecs.dim
    word_vecs = {}
    for k in tqdm.trange(len(wvecs)):
        word_vecs[wvecs[k][0]] = wvecs[k][1]
    alpha = 0.075
    ontology = read_lexicon(opts.lexicon)
    for t in tqdm.trange(10):
        for i in range(len(wvecs)):
            wordvec = wvecs[i]
            sums = edge_vectors(word_vecs, wordvec[0], ontology, dim)
            word_vecs[wordvec[0]] = (sums[0] + alpha * wordvec[1]) / (sums[1] + alpha)
    f = open(opts.output, 'w', encoding='utf-8')
    for k, v in word_vecs.items():
        f.write(str(k))
        for x in v:
            f.write(' ' + str(x))
        f.write('\n')
    f.close()
def readMagnitude(self, wv_magnitude_file):
    wv = pymagnitude.Magnitude(wv_magnitude_file)
    word_vectors = {}
    for key, vectors in wv:
        word_vectors[key] = np.zeros(len(vectors))
        for index, vector in enumerate(vectors):
            word_vectors[key][index] = vector
    return word_vectors
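# A behavior-equivalent sketch of the copy loop above, assuming the same
# pymagnitude iteration that yields (key, vector) pairs; np.array builds each
# per-key array in one step instead of filling a zero vector element by element:
#
#   word_vectors = {key: np.array(vec) for key, vec in wv}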
def create_res_fr_en(eng_mag, fr_mag, b_dict, t_dict, output_f):
    print("first")
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)
    # we create two dictionaries from the dict data
    # one goes en->fr and the other fr->en
    # this is the training set
    data_dict_en_to_fr = {}
    data_dict_fr_to_en = {}
    with open(b_dict) as f:
        for line in f:
            pair = line.split(" ")
            pair[1] = pair[1][:-1]
            data_dict_en_to_fr[pair[0]] = pair[1]
            data_dict_fr_to_en[pair[1]] = pair[0]
    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(data_dict_fr_to_en[key])
        fr = fr_vectors.query(key)
        en_mat.append(en)
        fr_mat.append(fr)
    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)
    u, sig, vt = np.linalg.svd(np.matmul(en_mat.transpose(), fr_mat))
    W = np.matmul(np.transpose(vt), np.transpose(u))
    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            line = line[:-1]
            pair = line.split(" ")
            word = eng_vectors.most_similar(
                np.matmul(fr_vectors.query(pair[1]), W), topn=5)  # [0][0]
            to_add = ""
            # print(i)
            for w in word:
                to_add += (" " + w[0])
            line = pair[1] + " " + pair[0] + " " + to_add[1:]
            final.append(line)
            if i % 100 == 0:
                print(i)
    np.savetxt(output_f, final, fmt="%s")
def create_res_fr_en(eng_mag, fr_mag, b_dict, t_dict, output_f):
    print("first")
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)
    some_keys = []
    i = 0
    for key, vector in eng_vectors:
        if key in fr_vectors:
            some_keys.append(key)
            i += 1
    data_dict_fr_to_en = {}
    for line in some_keys:
        data_dict_fr_to_en[line] = line
    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(data_dict_fr_to_en[key])
        fr = fr_vectors.query(key)
        en_mat.append(en)
        fr_mat.append(fr)
    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)
    u, sig, vt = np.linalg.svd(np.matmul(en_mat.transpose(), fr_mat))
    W = np.matmul(np.transpose(vt), np.transpose(u))
    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            line = line[:-1]
            pair = line.split(" ")
            word = eng_vectors.most_similar(
                np.matmul(fr_vectors.query(pair[1]), W), topn=5)  # [0][0]
            to_add = ""
            # print(i)
            for w in word:
                to_add += (" " + w[0])
            line = pair[1] + " " + pair[0] + " " + to_add[1:]
            final.append(line)
            if i % 100 == 0:
                print(i)
    np.savetxt(output_f, final, fmt="%s")
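# A self-contained sketch of the orthogonal Procrustes step used in the two
# functions above: given paired row matrices X (source-language vectors) and Y
# (target-language vectors), the orthogonal map W minimizing ||X @ W - Y|| comes
# from the SVD of Y^T X, which is exactly the W = V @ U^T computed above. All
# names and the synthetic data below are illustrative only.
import numpy as np


def procrustes_map(X, Y):
    # SVD of the cross-covariance between target and source vectors
    u, _, vt = np.linalg.svd(Y.T @ X)
    # W maps source rows into the target space: X @ W ~= Y
    return vt.T @ u.T


rng = np.random.default_rng(0)
Y = rng.normal(size=(100, 50))                 # stand-in "English" vectors
Q, _ = np.linalg.qr(rng.normal(size=(50, 50)))  # a random rotation
X = Y @ Q.T                                    # stand-in "French" vectors: rotated copy of Y
W = procrustes_map(X, Y)
print(np.allclose(X @ W, Y, atol=1e-6))        # True: the rotation is recovered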
def __init__(self, retrofitted_magnitude, wvec_file, retrofitted_vector, topn=10):
    self.retrofitted_magnitude = pymagnitude.Magnitude(
        retrofitted_magnitude
    )  # This is the Q_hat vector: 100-dimensional GloVe word vectors
    self.topn = topn
    self.wvecs = wvec_file
    self.wvecKey = set(self.wvecs.keys())
    self.retrofitted_vector = retrofitted_vector
def load_pymagnitude_model(self, given_model_name=None, language=None):
    '''load models; simple wrapper'''
    t0 = time()
    # ugly but from tut:
    import pymagnitude
    print("loading pymagnitude model {} ...".format(given_model_name))
    if language is None:
        self.embedding_model = pymagnitude.Magnitude(given_model_name)
    else:
        self.embedding_model = pymagnitude.Magnitude(given_model_name,
                                                     language=language)
    print("... done in %0.3fs." % (time() - t0))
    print("initializing for most_similar-searches...")
    t0 = time()
    print(self.embedding_model.most_similar(positive=["test"]))
    print("... done in %0.3fs." % (time() - t0))
    return
def __init__(self, filepath='source/wiki-news-300d-1M-subword.magnitude',
             dimensions=300):
    """
    Load the pretrained Embeddings
    :param string filepath: Path to pymagnitude file as *.magnitude
    :param int dimensions: Dimensions of the Vectors (to generate zeros for padding)
    """
    self.dimensions = dimensions
    self.filepath = filepath
    self.vectors = magnitude.Magnitude(filepath)
def main(opts):
    wvecs = pymagnitude.Magnitude(opts.wordvecfile)
    dim = wvecs.dim
    word_vecs = {}
    # copy the read-only word vectors into a dictionary for modification
    for k in tqdm.trange(len(wvecs)):
        word_vecs[wvecs[k][0]] = wvecs[k][1]
    alpha = 0.075
    # normalize the lexicon to all lower case letters
    ontology = read_lexicon(opts.lexicon)
    for t in tqdm.trange(10):
        for i in range(len(wvecs)):
            wordvec = wvecs[i]
            # calculate the retrofitted vector by increasing the similarity
            # between synonyms as per the lexicon file
            sums = edge_vectors(word_vecs, wordvec[0], ontology, dim)
            word_vecs[wordvec[0]] = (sums[0] + alpha * wordvec[1]) / (sums[1] + alpha)
    # write the retrofitted word vectors to the output file
    f = open(opts.output, 'w', encoding='utf-8')
    for k, v in word_vecs.items():
        f.write(str(k))
        for x in v:
            f.write(' ' + str(x))
        f.write('\n')
    f.close()


if __name__ == '__main__':
    optparser = optparse.OptionParser()
    optparser.add_option("-w", "--wordvecfile", dest="wordvecfile",
                         default=os.path.join('data', 'glove.6B.100d.magnitude'),
                         help="word vectors file")
    optparser.add_option("-l", "--lexicon", dest="lexicon",
                         default=os.path.join('data', 'lexicons', 'wordnet-synonyms.txt'),
                         help="lexicon path")
    optparser.add_option("-o", "--output", dest="output",
                         default=os.path.join('data', 'glove.6B.100d.retrofit.txt'),
                         help="output txt file path")
    (opts, _) = optparser.parse_args()
    main(opts)
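# The edge_vectors helper is not shown in this snippet; below is a hedged sketch of
# what the update above assumes it returns: the sum of the current vectors of a
# word's lexicon neighbours plus the neighbour count (names are hypothetical). With
# that, the update is the standard retrofitting step of Faruqui et al. (2015) with
# uniform edge weights: q_i <- (sum_j q_j + alpha * q_hat_i) / (deg_i + alpha).
import numpy as np


def edge_vectors(word_vecs, word, ontology, dim):
    neighbour_sum = np.zeros(dim)
    count = 0
    for neighbour in ontology.get(word, []):
        if neighbour in word_vecs:
            neighbour_sum += word_vecs[neighbour]
            count += 1
    return neighbour_sum, count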
def __init__(self, vectors_path: str, scaling: float = 1.0) -> None:
    self.vectors = pymagnitude.Magnitude(vectors_path, normalized=False)
    self.scaling = scaling
def topic_extractor(data_df, type_of_extraction):
    """Topic extractor function extracts topics from a csv file entered in file path.
    Depending on type it can extract topics of individual papers or the whole session."""

    def topic_params_object(topics, words, vectors_300):
        # normalization parameters
        start_range = 1
        end_range = 10
        topic_parameters = []
        topics_ids = topics.columns.values.tolist()
        topic_weight = 0
        for each_topic in topics:
            topic_average_vector = []
            this_topic = {
                'topic_id': each_topic,
                'vector': [],
                'vector300': [],
                'words': [],
                'weight': 0
            }
            this_topic['words'] = topics[each_topic].dropna().values.tolist()
            for word in this_topic['words']:
                if word[len(word) - 1] == '*':
                    this_topic['vector'].append(
                        words[words['word*'] == word]['vector'].values.tolist()[0])
                    this_topic['weight'] = this_topic['weight'] + words[
                        words['word*'] == word]['sigma_nor'].values.tolist()[0]
                    this_topic['vector300'].append(
                        words[words['word*'] == word]['vector300'].values.tolist()[0])
            # pdb.set_trace()
            this_topic['vector'] = numpy.mean(this_topic['vector'], axis=0)
            this_topic['vector300'] = numpy.mean(this_topic['vector300'], axis=0)
            topic_parameters.append(this_topic)
        # Normalize values between 0-1
        df_topic_parameters = pd.DataFrame(topic_parameters)
        df_topic_parameters_weight = df_topic_parameters['weight']
        df_topic_parameters['weight'] = (end_range - start_range) * (
            df_topic_parameters_weight - df_topic_parameters_weight.min()) / (
                df_topic_parameters_weight.max() -
                df_topic_parameters_weight.min()) + start_range
        # pdb.set_trace()
        return df_topic_parameters

    # SETUP LANGUAGE MODEL AND PIPELINE VARIABLES
    lang = 'en'
    language_model = {
        'en': './classes/nsaSrc/data/external/wiki-news-300d-1M.magnitude'
    }
    if type_of_extraction == 'session':
        percentile_C = 95
    else:
        percentile_C = 80
    target_dim = 10
    cluster_selection_method = 'leaf'
    # We only get the ones that have text
    data_df = data_df[data_df['text'] != 'Parsing Error']

    def en_filter(text):
        spacey_doc = nlp(text)
        # pdb.set_trace()
        sentences = []
        for sentence in spacey_doc.sents:
            for token in sentence:
                if (not token.__len__() < 4 and not token.is_stop
                        and not token.like_num and not token.is_digit):
                    sentences.append(str(token))
                # else:
                #     print(token)
        return sentences

    # data_subset = [record for record in data if record['Text'] != False and 'sagepub' in record['Text'].lower()]
    # data_subset = data_df
    text_fn = {
        'en': './classes/nsaSrc/data/processed/en_flat.txt',
        'ko': './classes/nsaSrc/data/processed/ko_flat.txt'
    }
    # THIS CODE CREATES THE DATA FOR PROCESSING AND STORES IT IN
    # ./vizlit/data/processed AS A FLAT TEXT FILE
    if type_of_extraction == 'session':
        with open(text_fn[lang], 'w', encoding='utf-8') as fp:
            for record_text in data_df['text']:
                # if record['Text'] != False:
                sentences = en_filter(record_text)
                for s in sentences:
                    fp.write(s + '\n')
    elif type_of_extraction == 'document':
        with open(text_fn[lang], 'w', encoding='utf-8') as fp:
            # for x in range(2):
            for record_text in data_df['text']:
                # if record['Text'] != False:
                sentences = en_filter(record_text)  # needs to be an array to work
                for sent in sentences:
                    fp.write(sent + '\n')

    # FIND SIGNIFICANT TERMS IN CORPUS
    word_level_statistics = WordLevelStatistics(corpus_file=[text_fn[lang]],
                                                percentile_C=percentile_C)
    word_level_statistics.compute_spectra()
    full_collection = pd.DataFrame(word_level_statistics.level_stat)
    lvls_df = pd.DataFrame(word_level_statistics.level_stat_thresholded)
    lvls_df['threshold'] = word_level_statistics.threshold
    # pdb.set_trace()
    # Minimize corpus to most important words
    significant_terms = word_level_statistics.significant_terms
    # SOMETHING BROKE HERE FOR SOME REASON: LVLS_DF IS ONE VALUE BIGGER THAN IT SHOULD BE AFTER FILTERING
    # if type_of_extraction == 'session':
    # Remove numbers and short words
    spacey_significant_terms = nlp(' '.join(
        word_level_statistics.significant_terms))
    significant_terms = []
    for sentence in spacey_significant_terms.sents:
        for token in sentence:
            if (not token.__len__() < 4 and not token.is_stop
                    and not token.like_num and not token.is_digit):
                significant_terms.append(str(token))
            # else:
            #     # pdb.set_trace()
            #     # print(token)
            #     # Remove token from dataframe
            #     # pdb.set_trace()
            #     lvls_df = lvls_df[lvls_df.word != str(token)]
    lvls_df_filtered = pd.DataFrame()
    for each_word in significant_terms:
        lvls_df_filtered = lvls_df_filtered.append(
            lvls_df[lvls_df.word == each_word])
    lvls_df = lvls_df_filtered
    # print('With threshold = {}, ({} percentile) find {} significant terms.'.format(
    #     word_level_statistics.threshold, word_level_statistics.percentile_C, len(significant_terms)))

    # CLUSTER WORD EMBEDDINGS
    vectors = {}
    for l in ['en']:
        vectors[l] = pymagnitude.Magnitude(language_model[l])
    significant_vectors = vectors[lang].query(significant_terms)
    try:
        fit = umap.UMAP(n_neighbors=15, n_components=target_dim,
                        metric='euclidean')
        data_d2v = fit.fit_transform(
            significant_vectors)  # np.asfarray(significant_vectors, dtype='float64')
        if type_of_extraction == 'session':
            # store model
            joblib.dump(fit, model_file_name)
        fit = umap.UMAP(n_neighbors=15, n_components=2, metric='euclidean')
        vec_2d = fit.fit_transform(data_d2v)
    except Exception as ex:
        pdb.set_trace()
        logging.error(
            "Retrying with fewer neighbors. Got exception {}".format(ex))
        # data_d2v = bhtsne.tsne(np.asfarray(significant_vectors, dtype='float64'), dimensions=2)
        # vec_2d = data_d2v
        # Try again with fewer neighbors; this is just a TEMPORARY FIX
        fit = umap.UMAP(n_neighbors=7, n_components=target_dim,
                        metric='euclidean')
        data_d2v = fit.fit_transform(significant_vectors)
        fit = umap.UMAP(n_neighbors=7, n_components=2, metric='euclidean')
        vec_2d = fit.fit_transform(data_d2v)
    try:
        lvls_df['vector'] = [v for v in data_d2v]
    except ValueError:
        pdb.set_trace()
        print('Error')
    lvls_df['vector300'] = [v for v in significant_vectors]
    significant_terms_enriched = enrich_significant_terms(
        lvls_df, data_d2v, vec_2d, cluster_selection_method)
    topics, top_columns = display_topics(significant_terms_enriched,
                                         n_rows=25, n_cols=250)
    # topics, top_columns = display_topics(significant_terms_enriched, n_rows=25, n_cols=10)  # testing with 10 topics
    print('{} topics'.format(significant_terms_enriched['topic'].max() + 1))
    print('\n')
    print(topics)
    topic_params = topic_params_object(topics, lvls_df, significant_vectors)
    return {'topics': topics, 'lvls_df': lvls_df, 'topic_params': topic_params}
def __init__(self, fname):
    self._vectors = pymagnitude.Magnitude(fname, lazy_loading=-1,
                                          blocking=True)
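# Note on the constructor flags above, per pymagnitude's documented options:
# lazy_loading=-1 pre-loads every vector into memory instead of fetching them on
# demand, and blocking=True makes the call wait until that pre-load has finished,
# so first queries are fast at the cost of start-up time and a larger memory footprint.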
def create_input_embeddings(base_name, keyword_size, caption_json_path,
                            doodle_json_path, w2v_magnitdue_path,
                            min_word_freq=5, max_len=50):
    dataset_name = 'coco_' + base_name
    output_folder = os.path.join('data', base_name)
    captions_per_image = 5

    # Read Karpathy JSON
    with open(caption_json_path, 'r') as j:
        data = json.load(j)

    # Read doodle JSON
    with open(doodle_json_path, 'r') as j:
        doodle = json.load(j)

    # Read w2v
    w2v = pymagnitude.Magnitude(w2v_magnitdue_path)

    # Read image paths and captions for each image
    train_keywords = []
    train_image_captions = []
    val_keywords = []
    val_image_captions = []
    test_keywords = []
    test_image_captions = []
    word_freq = Counter()
    exclude_count = 0
    total_count = 0
    for img in data['images']:
        captions = []
        key_freq = Counter()
        for c in img['sentences']:
            # Update word frequency
            word_freq.update(c['tokens'])
            if len(c['tokens']) <= max_len:
                captions.append(c['tokens'])
                keys = [t for t in c['tokens'] if doodle.get(t) is not None]
                if len(keys) > 0:
                    key_freq.update(keys)
        if len(captions) == 0 or len(key_freq) == 0:
            print("----------> no key match: ", captions[-1])
            continue
        keywords = [nn for nn, c in key_freq.most_common(keyword_size)]
        total_count += 1
        if len(keywords) < keyword_size:
            # print(keywords)
            exclude_count += 1
            continue
        if img['split'] in {'train', 'restval'}:
            train_keywords.append(keywords)
            train_image_captions.append(captions)
        elif img['split'] in {'val'}:
            val_keywords.append(keywords)
            val_image_captions.append(captions)
        elif img['split'] in {'test'}:
            test_keywords.append(keywords)
            test_image_captions.append(captions)

    # Create word map
    words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    word_map = {k: v + 1 for v, k in enumerate(words)}
    word_map['<unk>'] = len(word_map) + 1
    word_map['<start>'] = len(word_map) + 1
    word_map['<end>'] = len(word_map) + 1
    word_map['<pad>'] = 0

    # Create a base/root name for all output files
    base_filename = dataset_name + '_' + str(keyword_size)
    # + str(captions_per_image) + '_cap_per_img_' + str(min_word_freq) + '_min_word_freq'

    # Save word map to a JSON
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    with open(
            os.path.join(output_folder, 'WORDMAP_' + base_filename + '.json'),
            'w') as j:
        json.dump(word_map, j)
    print("{} of {} will be excluded".format(exclude_count, total_count))

    # Sample captions for each image, save images to HDF5 file, and captions and
    # their lengths to JSON files
    seed(123)
    for keys, imcaps, split in [(train_keywords, train_image_captions, 'TRAIN'),
                                (val_keywords, val_image_captions, 'VAL'),
                                (test_keywords, test_image_captions, 'TEST')]:
        with h5py.File(
                os.path.join(output_folder,
                             split + '_IMAGES_' + base_filename + '.hdf5'),
                'a') as h:
            # Make a note of the number of captions we are sampling per image
            h.attrs['captions_per_image'] = captions_per_image
            print("\nReading %s images and captions, storing to file...\n" % split)
            enc_keywords = []
            enc_captions = []
            caplens = []
            for i in range(len(imcaps)):
                # Sample captions
                if len(imcaps[i]) < captions_per_image:
                    captions = imcaps[i] + [
                        choice(imcaps[i])
                        for _ in range(captions_per_image - len(imcaps[i]))
                    ]
                else:
                    captions = sample(imcaps[i], k=captions_per_image)
                # Sanity check
                assert len(captions) == captions_per_image
                for j, c in enumerate(captions):
                    if keys[i] in c:
                        print("found", keys[i], c)
                    # Encode keywords
                    enc_k = [
                        word_map.get(key, word_map['<unk>']) for key in keys[i]
                    ]
                    # Encode captions
                    enc_c = [word_map['<start>']] + [
                        word_map.get(word, word_map['<unk>']) for word in c
                    ] + [word_map['<end>']] + [word_map['<pad>']] * (max_len - len(c))
                    # Find caption lengths
                    c_len = len(c) + 2
                    enc_keywords.append(enc_k)
                    enc_captions.append(enc_c)
                    caplens.append(c_len)

            # Sanity check
            assert len(enc_keywords) == len(enc_captions) == len(caplens)

            # Save encoded captions and their lengths to JSON files
            with open(
                    os.path.join(
                        output_folder,
                        split + '_KEYWORDS_' + base_filename + '.json'),
                    'w') as j:
                json.dump(enc_keywords, j)
            with open(
                    os.path.join(
                        output_folder,
                        split + '_CAPTIONS_' + base_filename + '.json'),
                    'w') as j:
                json.dump(enc_captions, j)
            with open(
                    os.path.join(
                        output_folder,
                        split + '_CAPLENS_' + base_filename + '.json'),
                    'w') as j:
                json.dump(caplens, j)
# ppdb-xl
# alpha = 1
# beta = 1.0185
# dev.out score: 44.9207
# optparser.add_option("-r", action="context_word", dest="context_word", default=False)
(opts, _) = optparser.parse_args()

if opts.logfile is not None:
    logging.basicConfig(filename=opts.logfile, filemode='w', level=logging.DEBUG)

retrain = False
word_vector = load_wvecs(pymagnitude.Magnitude(opts.wordvecfile))
new_retrofitted_magnitude = os.path.join(
    'data', 'glove.6B.100d.retrofit.magnitude')
if retrain:
    new_retrofitted_txt = os.path.join('data', 'glove.6B.100d.retrofit.txt')
    lexicon = load_lexicon(opts.lexiconfile)
    # We need to do retrofitting here
    retrofitted_vector = retrofitting(word_vector, lexicon, opts.iteration,
                                      opts.alpha, opts.beta)
    save_word_vecs(retrofitted_vector, new_retrofitted_txt)
    os.system("python3 -m pymagnitude.converter -i " + new_retrofitted_txt +
              " -o " + new_retrofitted_magnitude)
else:
    # (continuation reconstructed from context: load the previously converted
    # retrofitted vectors instead of retraining)
    retrofitted_vector = load_wvecs(
        pymagnitude.Magnitude(new_retrofitted_magnitude))
def main(tmx_file, lang, percentile):
    """ Computes topic models by clustering dense word embeddings. """
    logger = logging.getLogger(__name__)
    logger.info('Compute topic model: {}, {}, {}'.format(
        tmx_file, lang, percentile))
    word_level_statistics = WordLevelStatistics(corpus_file=[text_fn[lang]],
                                                percentile_C=percentile)
    word_level_statistics.compute_spectra()
    lvls_df = pd.DataFrame(word_level_statistics.level_stat_thresholded)
    significant_terms = word_level_statistics.significant_terms
    print('Threshold: {}, ({} percentile) find {} significant terms.'.format(
        word_level_statistics.threshold, word_level_statistics.percentile_C,
        len(significant_terms)))
    vectors = {}
    for language in ['en']:
        vectors[language] = pymagnitude.Magnitude(language_model[language])
    significant_vectors = vectors[lang].query(significant_terms)
    try:
        fit = umap.UMAP(n_neighbors=15, n_components=target_dim,
                        metric='euclidean')
        data_d2v = fit.fit_transform(significant_vectors)
        fit = umap.UMAP(n_neighbors=15, n_components=2, metric='euclidean')
        vec_2d = fit.fit_transform(data_d2v)
    except Exception as ex:
        logging.error("Trying bhtsne. Got exception {}".format(ex))
        data_d2v = bhtsne.tsne(np.asfarray(significant_vectors,
                                           dtype='float64'),
                               dimensions=2)
        vec_2d = data_d2v
    lvls_df['vector'] = [v for v in data_d2v]
    significant_terms_enriched = enrich_significant_terms(
        lvls_df, data_d2v, vec_2d, cluster_selection_method)
    exemplar_scores, hovers = topic_exemplars(significant_terms_enriched)
    sents = [s['sentence'] for s in sentences]
    sent_ids = [s['sent_id'] for s in sentences]
    significant_terms_enriched['weight'] = significant_terms_enriched['sigma_nor']
    msg_topics = message_topics(topic_model=significant_terms_enriched,
                                sentences=sents,
                                sentences_ids=sent_ids,
                                significant_terms=significant_terms)
    msg_topics_df = pd.DataFrame(msg_topics).fillna(0.0).T
    K = significant_terms_enriched['topic'].max() + 1
    topics, top_columns = display_topics(significant_terms_enriched,
                                         n_rows=25, n_cols=K)
    pwd = os.environ.get('PWD')
    fmt = '{}/models/{}_{}_{}.csv'
    significant_terms_file_name = fmt.format(pwd, 'significant_terms', lang,
                                             str(percentile))
    msg_topics_file_name = fmt.format(pwd, 'msg_topics', lang, str(percentile))
    data_filename_fmt = '{}/models/significant_vectors_{}_{}.npy'
    data_filename = data_filename_fmt.format(pwd, lang, percentile)
    significant_terms_enriched.to_csv(significant_terms_file_name,
                                      index=False, encoding='utf-8')
    msg_topics_df.to_csv(msg_topics_file_name, index=False, encoding='utf-8')
    np.save(data_filename, data_d2v)
def extension3_eng_fr(eng_mag, fr_mag, b_dict, t_dict, output_f):
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)
    data_dict_en_to_fr = {}
    data_dict_fr_to_en = {}
    with open(b_dict) as f:
        for line in f:
            pair = line.split(" ")
            pair[1] = pair[1][:-1]
            data_dict_en_to_fr[pair[0]] = pair[0]
            data_dict_fr_to_en[pair[1]] = pair[1]
    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(data_dict_fr_to_en[key])  # vector of english word
        fr = fr_vectors.query(key)  # vector of french word
        en_mat.append(en)
        fr_mat.append(fr)
    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)
    u, sig, vt = np.linalg.svd(np.matmul(fr_mat.transpose(), en_mat))
    W = np.matmul(np.transpose(vt), np.transpose(u))
    mapped = np.matmul(en_mat, W)
    mat_avg = []
    for key in data_dict_fr_to_en.keys():
        en = eng_vectors.query(data_dict_fr_to_en[key])  # vector of english word
        fr = fr_vectors.query(key)  # vector of french word
        average = (np.matmul(en, W) + fr) / 2
        mat_avg.append(average)
    mat_avg = np.array(mat_avg)
    uu, sigsig, vtvt = np.linalg.svd(np.matmul(mat_avg.transpose(), mapped))
    WW = np.matmul(np.transpose(vtvt), np.transpose(uu))
    final_transform = np.matmul(W, WW)
    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            print(i)
            line = line[:-1]
            pair = line.split(" ")
            line = pair[0] + " " + pair[1]
            topn = fr_vectors.most_similar(np.matmul(
                eng_vectors.query(pair[0]), final_transform), topn=5)
            for j in range(5):
                word = topn[j][0]
                line = line + " " + word
            final.append(line)
    np.savetxt(output_f, final, fmt="%s")
def __init__(self, wvec_file, topn=10, lexicon=None):
    self.wvecs = pymagnitude.Magnitude(wvec_file)
    # self.wvecfile = wvec_file
    self.topn = topn
    self.lexicon = lexicon
import numpy as np

from wiki import data
from wiki import utils

from musket_core.datasets import PredictionItem

import pymagnitude

m_path = "/Users/dreamflyer/Downloads/glove-lemmatized.6B.300d.magnitude"

vectors = pymagnitude.Magnitude(m_path)

none = vectors.query("none")


def convert_name(name):
    tokens = utils.stoa_1(name)
    result = []
    size = len(tokens)
    for i in range(size):
        result.append(' '.join(tokens[0:i + 1]))
    result.reverse()
    return result
def __init__(self, wvec_file, topn=10):
    self.wvecs = pymagnitude.Magnitude(wvec_file)
    self.topn = topn
import pickle

import numpy as np
import pymagnitude

# load input data
X = pickle.load(open('X.pkl', 'rb'))

# load the pretrained word2vec model for feature assignment
pretrained_magnitude = r'../../../Downloads/pretrained/glove.6B.300d.magnitude'
vectors = pymagnitude.Magnitude(pretrained_magnitude)


# setup speciality cleaning
def get_document_features(data_in):
    """Used to clean 80k Mechanical Turk responses.

    Params:
        data_in -- text segment to process

    Returns:
        average embedding of the response body and of the highlighted text
    """
    data_in = data_in.replace('<span class=\"active_text\">', '').replace('</span>', '')
    body = data_in.split(r'\n ')[1].replace('\n', '')
    avg_vec = np.mean(vectors.query(body.split(' ')), axis=0)
    high_text = data_in.split(r'\n ')[0].replace('\n', '')
    high_avg_vec = np.mean(vectors.query(high_text.split(' ')), axis=0)
    return avg_vec, high_avg_vec
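# A hedged usage sketch of the cleaner above; the response string is made up and
# only mirrors the '<span class="active_text">...</span>' plus literal '\n ' layout
# the function expects, and the output depends on the .magnitude file loaded above.
example = ('<span class="active_text">key phrase</span>\\n '
           'the full response body goes here')
body_vec, highlight_vec = get_document_features(example)
print(body_vec.shape, highlight_vec.shape)  # each is a 300-d averaged GloVe vector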
def sewing_space_en_fr(eng_mag, fr_mag, b_dict, t_dict, output_f, alpha=0.1):
    eng_vectors = py.Magnitude(eng_mag)
    fr_vectors = py.Magnitude(fr_mag)
    data_dict_en_to_fr = {}
    data_dict_fr_to_en = {}
    with open(b_dict) as f:
        for line in f:
            pair = line.split(" ")
            pair[1] = pair[1][:-1]
            data_dict_en_to_fr[pair[0]] = pair[0]
            data_dict_fr_to_en[pair[1]] = pair[1]
    # print('i get here')
    en_d = {}
    fr_d = {}
    for key in data_dict_fr_to_en.keys():
        en_d[data_dict_fr_to_en[key]] = eng_vectors.query(
            data_dict_fr_to_en[key])
        fr_d[key] = fr_vectors.query(key)
    for key in data_dict_fr_to_en.keys():
        en = en_d[data_dict_fr_to_en[key]]  # vector of english word
        fr = fr_d[key]  # vector of french word
        diff_vec_fr = en - fr
        diff_vec_en = fr - en
        en = en + diff_vec_en * alpha
        fr = fr + diff_vec_fr * alpha
        en_d[data_dict_fr_to_en[key]] = en
        fr_d[key] = fr
    en_mat = []
    fr_mat = []
    for key in data_dict_fr_to_en.keys():
        en = en_d[data_dict_fr_to_en[key]]
        fr = fr_d[key]
        en_mat.append(en)
        fr_mat.append(fr)
    en_mat = np.array(en_mat)
    fr_mat = np.array(fr_mat)
    u, sig, vt = np.linalg.svd(np.matmul(fr_mat.transpose(), en_mat))
    W = np.matmul(np.transpose(vt), np.transpose(u))
    final_transform = W  # np.matmul(W, WW)
    final = []
    with open(t_dict) as f:
        for i, line in enumerate(f):
            print(i)
            line = line[:-1]
            pair = line.split(" ")
            line = pair[0] + " " + pair[1]
            topn = fr_vectors.most_similar(np.matmul(
                eng_vectors.query(pair[0]), final_transform), topn=5)
            for j in range(5):
                word = topn[j][0]
                line = line + " " + word
            final.append(line)
    np.savetxt(output_f, final, fmt="%s")
def load_embedding_model(path: str) -> pymagnitude.Magnitude:
    # github.com/plasticityai/magnitude
    logging.info(f"loading embedding model from:\n {path}")
    vectors = pymagnitude.Magnitude(path=path)
    return vectors
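# A hedged usage sketch for the loader above; the .magnitude path is a placeholder
# and the printed neighbours depend entirely on the model file actually supplied.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    vectors = load_embedding_model("glove.6B.300d.magnitude")
    print(vectors.dim)                            # embedding dimensionality
    print(vectors.most_similar("king", topn=3))   # [(word, similarity), ...] pairs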