def googlenews(allowed_str):
    # Word2vec (GoogleNews):
    #   non-normalized.
    #   unordered, from gensim's dict-like structure.
    an_w = an.load(fnames[2], verbosity=1)
    if an_w is not None:
        an_w.add_evaluators(get_e())
        an_w.analysis(print_report=False)
        an_w.save()
    else:
        import gensim
        model_w = gensim.models.KeyedVectors.load_word2vec_format(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "GoogleNews-vectors-negative300.bin", binary=True)
        #common_w = list(filter(lambda w: w in model_w.vocab.keys() \
        #    or bytes(w) in model_w.vocab.keys(), allowed_str))
        common_w = [w for w in allowed_str if w in model_w.vocab]
        embed_w = [model_w.get_vector(w) for w in common_w]
        an_w = an.Analyst(embeddings=embed_w, strings=common_w, metric=metric,
            auto_print=printing, desc="GoogleNews", parallel_count=cpus,
            evaluators=get_e(), auto_save=2, file_name=fnames[2],
            over_write=True)
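
# Compatibility note, an assumption about the environment rather than part of
# the original run: `KeyedVectors.vocab` is the gensim 3.x attribute; gensim
# 4.x replaced it with `key_to_index`. A version-tolerant membership check
# could be sketched as:
def _vocab_keys(model):
    # Return whichever vocabulary mapping this gensim version exposes.
    return model.key_to_index if hasattr(model, "key_to_index") else model.vocab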
def deps(allowed_str):
    # Dependency-Based Word Embeddings:
    #   appears to be ordered by frequency.
    #   normalized.
    a = an.load(fnames[8], verbosity=1)
    if a is not None:
        a.add_evaluators(get_e())
        a.analysis(print_report=False)
        a.save()
    else:
        strings, embed_g = read_text_table(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "dependency_based_word_embeddings/deps.words",
            firstline=False, limit_lines=MAX_LINES)
        common = [w for w in allowed_str if w in strings]
        indices = [strings.index(w) for w in common]
        embed_g = embed_g[indices]
        a = an.Analyst(embeddings=embed_g, strings=common, metric=metric,
            auto_print=printing, desc="DEPS", parallel_count=cpus,
            evaluators=get_e(), auto_save=2, file_name=fnames[8],
            over_write=True)
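
# Several loaders here (deps, glove, numberbatch, fasttext) share the same
# filter-and-gather pattern: keep only the allowed words, look up each word's
# row index, then gather those rows with numpy fancy indexing so the vectors
# line up with `common`. A tiny illustration with made-up data (not called
# anywhere):
def _demo_filter_and_gather():
    strings = ["the", "cat", "sat"]
    embed = np.arange(9).reshape(3, 3)   # one fake 3-d vector per token
    allowed = ["sat", "cat", "dog"]      # "dog" is absent from the table
    common = [w for w in allowed if w in strings]
    indices = [strings.index(w) for w in common]
    return common, embed[indices]        # (["sat", "cat"], rows 2 and 1)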
def glove(allowed_str):
    # GloVe:
    #   ordered by frequency.
    #   non-normalized.
    an_g = an.load(fnames[3], verbosity=1)
    if an_g is not None:
        an_g.add_evaluators(get_e())
        an_g.analysis(print_report=False)
        an_g.save()
    else:
        str_g, embed_g = read_text_table(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "glove.6B.300d.txt",
            firstline=False, limit_lines=MAX_LINES)
        #embed_g = [normalize(v) for v in embed_g]
        common = [w for w in allowed_str if w in str_g]
        indices = [str_g.index(w) for w in common]
        embed_g = embed_g[indices]
        an_g = an.Analyst(embeddings=embed_g, strings=common, metric=metric,
            auto_print=printing, desc="GloVe", parallel_count=cpus,
            evaluators=get_e(), auto_save=2, file_name=fnames[3],
            over_write=True)
def numberbatch(allowed_str):
    # ConceptNet Numberbatch:
    #   alphanumeric order.
    #   normalized.
    #if not os.path.isfile("embeddings/an_numberbatch"):
    an_nb = an.load(fnames[1], verbosity=1)
    if an_nb is not None:
        an_nb.add_evaluators(get_e())
        an_nb.analysis(print_report=False)
        an_nb.save()
    else:
        str_nb, embed_nb = read_text_table(
            "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
            "numberbatch-en-17.06.txt", firstline=True)
        common_nb = [w for w in allowed_str if w in str_nb]
        indices_nb = [str_nb.index(w) for w in common_nb]
        #embed_nb = np.array([embed_nb[i] for i in indices_nb])
        embed_nb = embed_nb[indices_nb]
        an_nb = an.Analyst(embeddings=embed_nb, strings=common_nb,
            metric=metric, auto_print=printing, parallel_count=cpus,
            desc="ConceptNet Numberbatch", evaluators=get_e(), auto_save=2,
            file_name=fnames[1], over_write=True)
def fasttext(allowed_str):
    # Fasttext:
    #   ordered by frequency.
    #   non-normalized.
    an_fnc = an.load(fnames[0], verbosity=1)
    if an_fnc is not None:
        an_fnc.add_evaluators(get_e())  # + get_e_freq()
        an_fnc.analysis(print_report=False)
        an_fnc.save()
    else:
        with open(
                "/mnt/pccfs/not_backed_up/nate/analyst_embeddings/"
                "fasttext.en.py2.pkl", 'rb') as f:
            data_ft = pkl.load(f)
        str_f = data_ft['tokens'][:MAX_LINES]
        str_f = list(map(str, str_f))
        embed_f = data_ft['vectors'][:MAX_LINES]
        #embed_fn = np.array([normalize(v) for v in embed_f])
        common = [w for w in allowed_str if w in str_f]
        indices = [str_f.index(w) for w in common]
        embed_f = embed_f[indices]
        an_fnc = an.Analyst(embeddings=embed_f, strings=common,
            auto_print=printing, metric=metric, desc="Fasttext",
            evaluators=get_e(),  # + get_e_freq()
            auto_save=2, file_name=fnames[0], over_write=True,
            parallel_count=cpus)
def sense_2_vec(allowed_str):
    # Sense2Vec:
    #   Vectors come from Reddit data run through sense2vec. I modify the
    #   lookup by taking a frequency-weighted average of all the
    #   part-of-speech senses of each word I seek, since those senses are
    #   often close together in the space.
    #   NOT normalized.
    #   128 dimensions.
    a = an.load(fnames[4], verbosity=1)
    if a is not None:
        a.add_evaluators(get_e())
        a.analysis(print_report=False)
        a.save()
    else:
        import sense2vec
        s2v = sense2vec.load('/mnt/pccfs/not_backed_up/nate/'
            'analyst_embeddings/reddit_vectors-1.1.0/')
        strings = []
        vectors = []
        endings = [
            '|ADJ', '|ADP', '|ADV', '|AUX', '|CONJ', '|DET', '|INTJ',
            '|NOUN', '|NUM', '|PART', '|PRON', '|PROPN', '|PUNCT',
            '|SCONJ', '|SYM', '|VERB', '|X'
        ]
        for s in allowed_str:
            senses = []
            freq_sum = 0
            for e in endings:
                try:
                    t = s2v[s + e]  # lookup returns (frequency, vector)
                    senses.append(t[1] * t[0])
                    freq_sum += t[0]
                except Exception:
                    pass
            if len(senses) > 0:
                strings.append(s)
                vectors.append(np.sum(senses, axis=0) / freq_sum)
        a = an.Analyst(embeddings=np.array(vectors), strings=strings,
            metric=metric, auto_print=printing, desc="Sense2Vec",
            parallel_count=cpus, evaluators=get_e(), auto_save=2,
            file_name=fnames[4], over_write=True)
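
# Worked micro-example of the frequency-weighted sense average in sense_2_vec,
# with made-up frequencies and 2-d vectors (real sense2vec vectors are 128-d):
# if "run|VERB" has frequency 300 with vector v1 and "run|NOUN" has frequency
# 100 with vector v2, the merged vector is (300*v1 + 100*v2) / 400.
def _demo_weighted_sense_average():
    senses = [(300, np.array([1.0, 0.0])),   # hypothetical (freq, vector) pairs
              (100, np.array([0.0, 1.0]))]
    weighted = [freq * vec for freq, vec in senses]
    freq_sum = sum(freq for freq, _ in senses)
    return np.sum(weighted, axis=0) / freq_sum   # -> array([0.75, 0.25])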
def use_large(allowed_str):
    # Universal Sentence Encoder (Large):
    #   embeddings must be computed by running the encoder on the strings to
    #   encode (no precomputed lookup).
    #   normalized.
    #   512 dimensions.
    an_u = an.load(fnames[7], verbosity=1)
    if an_u is not None:
        an_u.add_evaluators(get_e())
        an_u.analysis(print_report=False)
        an_u.save()
    else:
        import tensorflow as tf
        import tensorflow_hub as hub
        module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
        embed = hub.Module(module_url)
        tf.logging.set_verbosity(tf.logging.ERROR)
        batches = [
            allowed_str[b:b + 10000]
            for b in range(0, len(allowed_str), 10000)
        ]
        embeddings = []
        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.tables_initializer()
            ])
            for b in batches:
                embeddings.append(sess.run(embed(b)))
        embeddings = np.vstack(embeddings)
        an_u = an.Analyst(embeddings=embeddings, strings=allowed_str,
            metric=metric, auto_print=printing, desc="USE Large",
            parallel_count=cpus, evaluators=get_e(), auto_save=2,
            file_name=fnames[7], over_write=True)
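
# The 10000-string batching above (and in use_lite below) just slices the
# input into fixed-size chunks so each sess.run feed stays bounded. A tiny
# illustration of the slicing pattern with toy data (not called anywhere):
def _demo_batching(items, size=3):
    # e.g. _demo_batching(list("abcdefg")) -> [['a','b','c'], ['d','e','f'], ['g']]
    return [items[b:b + size] for b in range(0, len(items), size)]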
def use_lite(allowed_str):
    # Universal Sentence Encoder (Lite):
    #   embeddings must be computed by running the encoder on the strings to
    #   encode (no precomputed lookup).
    #   normalized.
    #   512 dimensions.
    an_u = an.load(fnames[6], verbosity=1)
    if an_u is not None:
        an_u.add_evaluators(get_e())
        an_u.analysis(print_report=False)
        an_u.save()
    else:
        import tensorflow as tf
        import tensorflow_hub as hub
        import sentencepiece as spm

        def process_to_IDs_in_sparse_format(sp, sentences):
            # A utility method that processes sentences with the SentencePiece
            # processor 'sp' and returns the results in tf.SparseTensor-like
            # format: (values, indices, dense_shape).
            ids = [sp.EncodeAsIds(x) for x in sentences]
            max_len = max(len(x) for x in ids)
            dense_shape = (len(ids), max_len)
            values = [item for sublist in ids for item in sublist]
            indices = [[row, col] for row in range(len(ids))
                       for col in range(len(ids[row]))]
            return (values, indices, dense_shape)

        with tf.Session() as sess:
            module = hub.Module(
                "https://tfhub.dev/google/universal-sentence-encoder-lite/2")
            spm_path = sess.run(module(signature="spm_path"))
            # spm_path now contains a path to the SentencePiece model stored
            # inside the TF-Hub module.
            sp = spm.SentencePieceProcessor()
            sp.Load(spm_path)
            input_placeholder = tf.sparse_placeholder(
                tf.int64, shape=[None, None])
            embedder = module(
                inputs=dict(values=input_placeholder.values,
                            indices=input_placeholder.indices,
                            dense_shape=input_placeholder.dense_shape))
            sess.run([
                tf.global_variables_initializer(),
                tf.tables_initializer()
            ])
            batches = [
                allowed_str[b:b + 10000]
                for b in range(0, len(allowed_str), 10000)
            ]
            embeddings = []
            for b in batches:
                values, indices, dense_shape = \
                    process_to_IDs_in_sparse_format(sp, b)
                embeddings.append(
                    sess.run(embedder, feed_dict={
                        input_placeholder.values: values,
                        input_placeholder.indices: indices,
                        input_placeholder.dense_shape: dense_shape
                    }))
        embeddings = np.vstack(embeddings)
        an_u = an.Analyst(embeddings=embeddings, strings=allowed_str,
            metric=metric, auto_print=printing, desc="USE Lite",
            parallel_count=cpus, evaluators=get_e(), auto_save=2,
            file_name=fnames[6], over_write=True)
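
# Illustrative sketch of the (values, indices, dense_shape) triple that
# process_to_IDs_in_sparse_format builds for tf.sparse_placeholder, using
# hypothetical token IDs rather than a real SentencePiece encoding:
def _demo_sparse_format():
    ids = [[5, 9], [7]]                       # two "sentences" of token IDs
    values = [i for row in ids for i in row]  # [5, 9, 7], row-major flattening
    indices = [[r, c] for r in range(len(ids))
               for c in range(len(ids[r]))]   # [[0, 0], [0, 1], [1, 0]]
    dense_shape = (len(ids), max(len(r) for r in ids))  # (2, 2)
    return values, indices, dense_shape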
# numvecs = len(lines) if limit_lines == None \
#     else min(len(lines), limit_lines)
# dim = len(lines[0].split(" ")) - 1
# strings = []
# embeddings = np.empty(shape=(numvecs, dim))
# for i in tqdm(range(numvecs), desc="Reading " + path):
#     row = lines[i + firstline].split(" ")
#     strings.append(row[0])  # str(row[0])
#     embeddings[i] = row[1:]
# return strings, embeddings

# def get_strings():
#     with open("embeddings/fasttext.en.py2.pkl", 'rb') as f:
#         data_ft = pkl.load(f)
#     str_f = data_ft['tokens'][:MAX_LINES]
#     return data_ft, list(map(str, str_f))


if __name__ == "__main__":
    #data_ft, str_f = get_strings()
    #fasttext(str_f, data_ft)
    #word2vec_analysis()

    a = an.load("Word2Vec Canonical Test.dill")
    #a.add_evaluators(get_e2())
    # for e in a.evaluators:
    #     if len(e.stats_dict) == 0 and "file_name" in dir(e):
    #         e.file_name = e.file_name[:53] + "text/" + e.file_name[53:]
    a.analysis()
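
# For reference only: a minimal sketch of the read_text_table helper that
# deps, glove, and numberbatch call, reconstructed from the commented-out
# remnant above. It assumes whitespace-separated rows of "token v1 v2 ... vD";
# the real helper may differ (e.g. progress reporting, encoding handling).
def _read_text_table_sketch(path, firstline=False, limit_lines=None):
    with open(path) as f:
        lines = f.readlines()[int(firstline):]   # optionally skip a header row
    if limit_lines is not None:
        lines = lines[:limit_lines]
    strings, vectors = [], []
    for line in lines:
        row = line.split()
        strings.append(row[0])
        vectors.append([float(x) for x in row[1:]])
    return strings, np.array(vectors)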
#assert len(set([str(unique_pts[i]) for i in range(len(unique_pts))])) == len(unique_pts)
#print("asserted uniqueness of vectors")

an_ccc = an.Analyst(
    embeddings=unique_pts[:MAX_LINES],
    strings=unique_lines[:MAX_LINES],
    metric=metric,
    auto_print=True,
    desc="ChitChatChallenge Utterance Hubs",
    #evaluators=["Nodal 4-Hubs"],
    calculate=True)

print("Success at saving ChitChatChallenge Utterance Hubs: " +
    str(an.Analyst.save(an_ccc, filename)))

a = an.load(filename)
hubber = a.find_evaluator("Nodal 4-Hubs")
hubs = hubber.get_clusters()
sizes = [len(h) for h in hubs]
order = np.argsort(sizes)[::-1]  # hub indices ordered by size, descending
#order = np.argsort([h.stats_dict["Dispersion"] for h in hubs])
hubs = np.array(hubs)[order]  #.tolist()
#print(np.array(sizes)[order])

print("Number of Utterances:", len(a.strings))
print("Number of Hubs:", len(hubs))

#"""
for i, h in enumerate(hubs):
    print("")
module_url = "https://tfhub.dev/google/universal-sentence-encoder/1"
embed = hub.Module(module_url)
tf.logging.set_verbosity(tf.logging.ERROR)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embed_u = sess.run(embed(str_f))
an_u = an.Analyst(embeddings=embed_u, strings=str_f, metric=metric,
    auto_print=False, desc="Universal Sentence Encoder")
print("Success at saving Universal Sentence Encoder: " +
    str(an.Analyst.save(an_u, "saved_analyses/an" + str(MAX_LINES) +
        "_universal_sentence_encoder")))
#messagebox.showinfo("Information","Analysis 5 complete!")'''

"""
an_fnc = an.load("saved_analyses/an" + str(MAX_LINES) + "_fasttext_normalized")
an_nb = an.load("saved_analyses/an" + str(MAX_LINES) + "_numberbatch")
an_w = an.load("saved_analyses/an" + str(MAX_LINES) + "_googlenews_normalized")
an_g = an.load("saved_analyses/an" + str(MAX_LINES) + "_glove_normalized")
an_u = an.load("saved_analyses/an" + str(MAX_LINES) +
    "_universal_sentence_encoder")
#an.Analyst.compare([an_fnc, an_fe, an_fne, an_fc])
#an.Analyst.compare([an_w, an_fnc, an_g, an_nb, an_u])
#an.Analyst.graph_comparison([an_w, an_fnc, an_g, an_nb, an_u], "Nodes", "Count")
an.Analyst.graph_multi([an_w, an_fnc, an_g, an_nb, an_u],
    [("Nodes", "Count"), ("Nuclei", "Count"), ("Nodal 4-Hubs", "Count")],
    group_by_stat=False)