import csv
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# Project-level imports; the exact module paths are assumptions based on the
# rest of the repository (tools.py provides the fastText helpers used below).
from retrogan import RetroCycleGAN
from tools import generate_fastext_embedding


def load_things():
    """Load the trained RetroCycleGAN and warm up the fastText model."""
    rcgan = RetroCycleGAN(save_folder="test", batch_size=32,
                          generator_lr=0.0001, discriminator_lr=0.001)
    # rcgan_folder and fasttext_folder are module-level configuration paths.
    rcgan.load_weights(preface="final", folder=rcgan_folder)
    print("Loading ft")
    # The first call loads the fastText binary into memory; "cat" is just a probe.
    generate_fastext_embedding("cat", ft_dir=fasttext_folder)
    print("Ready")
    return rcgan
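# Minimal usage sketch. rcgan_folder and fasttext_folder are configuration
# values, not defined in this snippet; the paths below are taken from the
# other snippets in this file and may differ in practice.
#
#   rcgan_folder = "trained_models/retrogans/ft_full_alldata_feb11"
#   fasttext_folder = "fasttext_model/cc.en.300.bin"
#   rcgan = load_things()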
def create_sentence_embedding(c1, retrofitted_embeddings, rcgan, newlist):
    """Embed a multi-word concept as the mean of its word vectors.

    Words missing from retrofitted_embeddings are first looked up in newlist
    (the (word, vector) pairs created earlier in the run) and otherwise
    synthesized by mapping their fastText vector through the generator g_AB.
    """
    s = c1.split(" ")
    concept_vecs = []
    if len(s) > 1:  # callers only use this for multi-word concepts
        for word in s:
            try:
                concept_vecs.append(retrofitted_embeddings.loc[word])
            except KeyError:
                print("Creating emb for", word)
                added = False
                for tup in newlist:
                    if word == tup[0]:
                        concept_vecs.append(tup[1])
                        added = True
                        break
                if not added:
                    concept_vecs.append(pd.Series(
                        rcgan.g_AB.predict(
                            np.array(generate_fastext_embedding(word)).reshape(1, 300)
                        ).reshape(300)))
    concept_vecs = np.array(concept_vecs)
    # The sentence embedding is the element-wise mean of the word vectors.
    avg = np.mean(concept_vecs, axis=0)
    return (c1, pd.Series(avg))
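# Example (sketch) mirroring how create_data2 below uses this function:
#
#   new = []
#   entry = create_sentence_embedding("hot dog", retrofitted_embeddings,
#                                     rcgan, new)   # -> ("hot dog", Series of 300)
#   new.append(entry)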
def get_embedding(param):
    """Return a cached 300-d embedding for a word or a multi-word phrase."""
    if param in word_dict:
        return word_dict[param]
    s = param.split(" ")
    if len(s) > 1:
        # Multi-word concept: average the individual word embeddings
        # (this calls the single-argument variant defined below).
        word_dict[param] = create_sentence_embedding(param)
    else:
        # Single word: map its fastText vector through the generator.
        word_dict[param] = rcgan.g_AB.predict(
            np.array(generate_fastext_embedding(param)).reshape(1, 300)
        ).reshape(300)
    return word_dict[param]
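# Usage sketch; word_dict is the module-level cache and rcgan the generator
# loaded by load_things():
#
#   word_dict = {}
#   rcgan = load_things()
#   vec = get_embedding("golden retriever")   # averaged over both words
#   vec is get_embedding("golden retriever")  # second call hits the cache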
def create_sentence_embedding(c1):
    """Single-argument variant used by get_embedding().

    Note: this redefinition shadows the four-argument version above when both
    live in the same module; it reads the module-level globals
    retrofitted_embeddings, rcgan and word_dict instead of taking parameters,
    and uses word_dict as its cache of newly created vectors.
    """
    s = c1.split(" ")
    concept_vecs = []
    for word in s:
        try:
            concept_vecs.append(retrofitted_embeddings.loc[word])
        except KeyError:
            if word in word_dict:
                concept_vecs.append(word_dict[word])
            else:
                concept_vecs.append(pd.Series(
                    rcgan.g_AB.predict(
                        np.array(generate_fastext_embedding(word)).reshape(1, 300)
                    ).reshape(300)))
    concept_vecs = np.array(concept_vecs)
    avg = np.mean(concept_vecs, axis=0)
    return pd.Series(avg)
def create_data2(use_cache=True):
    """Build the relation training data, creating embeddings for any
    concepts missing from retrofitted_embeddings along the way."""
    global retrofitted_embeddings
    rcgan = RetroCycleGAN(save_folder="test", batch_size=32,
                          generator_lr=0.0001, discriminator_lr=0.001)
    rcgan.load_weights(preface="final",
                       folder="trained_models/retrogans/ft_full_alldata_feb11")
    if os.path.exists("tmp/valid_rels.hd5") and use_cache:
        print("Using cache")
        a = pd.read_hdf("tmp/valid_rels.hd5", "mat")
        b = pd.read_hdf("tmp/updated_embeddings.hd5", "mat")
        b.drop_duplicates(inplace=True)
        return a, b

    assertionspath = "train600k.txt"
    valid_relations = []
    new = []
    with open(assertionspath) as assertionsfile:
        assertions = csv.reader(assertionsfile, delimiter="\t")
        row_num = 0
        skipped = 0
        for assertion_row in tqdm(assertions):
            row_num += 1
            if row_num % 100000 == 0:
                print(row_num)
            try:
                rel = assertion_row[0]
                if "/r/" not in rel:
                    rel = "/r/" + rel
                weight = float(assertion_row[3])
                c1 = assertion_row[1]
                c2 = assertion_row[2]
                if c1 not in retrofitted_embeddings.index or \
                        c2 not in retrofitted_embeddings.index or \
                        rel not in relations:
                    # relations is the module-level set of accepted relation names.
                    if rel not in relations:
                        print("Skipping relation", rel)
                        skipped += 1
                        continue
                    if len(c1.split(" ")) > 1:
                        # Multi-word concept: embed it as the average of its words.
                        a = create_sentence_embedding(c1, retrofitted_embeddings,
                                                      rcgan, new)
                        new.append(a)
                    elif c1 not in retrofitted_embeddings.index:
                        print("Single-word concept missing from index, c1:", c1)
                        a = rcgan.g_AB.predict(
                            np.array(generate_fastext_embedding(c1)).reshape(1, 300)
                        ).reshape(300)
                        new.append((c1, pd.Series(a)))
                    if len(c2.split(" ")) > 1:
                        a = create_sentence_embedding(c2, retrofitted_embeddings,
                                                      rcgan, new)
                        new.append(a)
                    elif c2 not in retrofitted_embeddings.index:
                        print("Single-word concept missing from index, c2:", c2)
                        a = rcgan.g_AB.predict(
                            np.array(generate_fastext_embedding(c2)).reshape(1, 300)
                        ).reshape(300)
                        new.append((c2, pd.Series(a)))
                valid_relations.append([rel, c1, c2, weight])
            except Exception as e:
                print("An error occurred in", row_num, assertion_row)
                print(e)
            if len(valid_relations) % 10000 == 0:
                print(len(valid_relations), skipped)
    print(skipped)

    # Append the newly created (concept, vector) pairs to the embedding table.
    new_index = [x for x in retrofitted_embeddings.index]
    new_vals = [x for x in retrofitted_embeddings.values]
    for i in range(len(new)):
        new_index.append(new[i][0])
        new_vals.append(new[i][1])
    print("Updating embeddings")
    retrofitted_embeddings = pd.DataFrame(data=new_vals, index=new_index)
    print("Dropping dupes")
    retrofitted_embeddings.drop_duplicates(inplace=True)
    print("SAVING TO FILE")
    retrofitted_embeddings.to_hdf("tmp/updated_embeddings.hd5", "mat")

    print("Generating the training data")
    af = pd.DataFrame(data=valid_relations, index=range(len(valid_relations)))
    print("Training data:")
    print(af)
    af.to_hdf("tmp/valid_rels.hd5", "mat")
    return af, retrofitted_embeddings
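# Usage sketch; retrofitted_embeddings and relations are module-level globals
# (the embedding table and the set of accepted relation names):
#
#   retrofitted_embeddings = pd.read_hdf(..., "mat")  # path elided
#   af, retrofitted_embeddings = create_data2(use_cache=True)
#   print(af.shape, retrofitted_embeddings.shape)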
# --- Separate snippet: augment the target vocabulary with missing entries ---
# NOTE: the statement that constructs `retrogan` is truncated in the source;
# only its trailing argument survives (presumably the tail of a Keras
# model-loading call):
#     ..., compile=False)
retrogan.compile(optimizer=Adam(), loss=['mae'])
retrogan.load_weights(trained_model_path)

# Load our vocabulary.
target_voc = pd.read_hdf(target_file_loc, 'mat')
triples = []

# Beef up our vocab with the entries that are missing from it.
clean_file_contents = profession_words
in_dataset = tools.check_index_in_dataset(clean_file_contents, target_voc)
for i, val in enumerate(in_dataset):
    if not val:
        missing_text = clean_file_contents[i]
        print(missing_text)
        we = tools.generate_fastext_embedding(
            missing_text, ft_dir="../fasttext_model/cc.en.300.bin")
        # names is a module-level list of proper names, which keep their raw
        # surface form instead of being passed through URI standardization.
        if missing_text in names:
            print("Name")
            index = "/c/en/" + missing_text
        else:
            print("Not name")
            index = tools.standardized_concept_uri("en", missing_text)
        # Map the fastText vector into the retrofitted space.
        rwe = tools.get_retrofitted_embedding(we, retrogan)
        df = pd.DataFrame(data=[rwe], index=[index])
        # DataFrame.append was removed in pandas 2.x; pd.concat is equivalent.
        target_voc = pd.concat([target_voc, df])
        print(target_voc.shape)
target_voc.to_hdf(output_file_loc, 'mat')