def train(rels, lang=LANG, epochs=VALUE_DEFAULT_EPOCHS, epochs_load=0, size=VALUE_DEFAULT_SIZE,
          negative=VALUE_DEFAULT_NEGATIVE, memo=VALUE_DEFAULT_MEMO, burnin=None, reg=None, resume=False):
    """Train a Poincare model, optionally continuing from a saved one.

    With ``resume`` set, the model stored under the file name derived from
    ``epochs_load`` is loaded and trained further.  If it cannot be loaded, or
    if ``resume`` is false, a new model is built from ``rels``.  The trained
    model is saved under the file name derived from ``epochs + epochs_load``.

    Args:
        rels: Relations handed to ``PoincareModel`` when a new model is built.
        lang: Passed to ``make_filename_model``.
        epochs: Number of epochs to train in this call.
        epochs_load: Epoch count used to name the model to resume from.
        size: Embedding size of a new model; also part of the file name.
        negative: Negative-sample count of a new model; also part of the file name.
        memo: Passed to ``make_filename_model``.
        burnin: Passed to ``make_filename_model``.
        reg: Passed to ``make_filename_model``.
        resume: Try to load a saved model before training.

    Returns:
        The trained model.
    """
    model = None
    if resume:
        # Any ordinary failure while locating/loading the saved model falls
        # back to a fresh model; KeyboardInterrupt/SystemExit now propagate.
        try:
            filename = make_filename_model(lang, epochs_load, size, negative, memo, burnin, reg)
            model = PoincareModel.load(filename)
            print("resume {}".format(filename))
        except Exception:
            model = None
            print("file not found")
    else:
        print("first training")

    if model is None:
        # NOTE(review): burn_in and regularization_coeff are fixed at 0 here;
        # ``burnin`` and ``reg`` only reach the file name - confirm intended.
        model = PoincareModel(rels, burn_in=0, regularization_coeff=0, negative=negative, size=size)

    model.train(epochs=epochs, print_every=1500)
    model.save(make_filename_model(lang, epochs + epochs_load, size, negative, memo, burnin, reg))
    return model
def test_persistence_separate_file(self):
    """Checks that a model saved with ``sep_limit=1`` loads back equal to the original."""
    trained = PoincareModel(self.data, burn_in=0, negative=3)
    trained.train(epochs=1)

    # sep_limit=1 is what the original test uses to store the arrays separately.
    trained.save(testfile(), sep_limit=1)
    restored = PoincareModel.load(testfile())

    self.models_equal(trained, restored)
def poincare(relations_file, key_epochs):
    """Train a 2-D Poincare embedding from a JSON relations file.

    Each entry of the JSON document contributes the pair
    ``(entry[0], entry[3])``; entries whose fourth element is the empty string
    are skipped.

    Args:
        relations_file: Path of the JSON file to read.
        key_epochs: Number of training epochs.

    Returns:
        A ``(vec, model)`` tuple, where ``vec`` maps every vocabulary term of
        the model to its vector.
    """
    with open(relations_file) as handle:
        entries = json.load(handle)
    pairs = [(entry[0], entry[3]) for entry in entries if entry[3] != ""]

    print(datetime.datetime.now(), "---poincare embedding Start", location())
    # workers=1 together with a fixed seed is used to get the same result for
    # the same input.
    # os.environ['PYTHONHASHSEED'] = '0'
    model = PoincareModel(train_data=pairs, size=2, negative=8, workers=1, seed=1)
    model.train(epochs=key_epochs)
    print(datetime.datetime.now(), "---poincare embedding End", location())

    # {term: 2D coordinate values}
    vec = {term: model.kv.get_vector(term) for term in model.kv.vocab.keys()}
    print(datetime.datetime.now(), "---model.kv.vocab.keys End", location())

    # Disabled plotting code:
    # poincare_map = gensim.viz.poincare.poincare_2d_visualization(
    #     model=model, tree=relations, figure_title="tutorial",
    #     show_node_labels=model.kv.vocab.keys())
    # offline.plot(poincare_map)
    return vec, model
def test_wrong_gradients_raises_assertion(self):
    """Checks that training with gradient checks fails when ``_loss_grad`` returns all zeros."""
    model = PoincareModel(self.data, negative=3)

    # Replace the gradient computation with a stub that always returns zeros.
    zero_gradients = np.zeros((2 + model.negative, model.size))
    model._loss_grad = Mock(return_value=zero_gradients)

    with self.assertRaises(AssertionError):
        model.train(epochs=1, batch_size=1, check_gradients_every=1)
def test_burn_in_only_done_once(self):
    """Checks that a second ``train`` call with zero epochs leaves the vectors untouched."""
    model = PoincareModel(self.data, negative=3, burn_in=1)

    model.train(epochs=0)
    vectors_after_first_call = np.copy(model.kv.syn0)

    model.train(epochs=0)
    unchanged = np.allclose(model.kv.syn0, vectors_after_first_call)
    self.assertTrue(unchanged)
def test_gradients_check(self):
    """Checks that training completes without raising when gradient checking is on."""
    model = PoincareModel(self.data, negative=3)
    try:
        model.train(epochs=1, batch_size=1, check_gradients_every=1)
    except Exception as error:
        message = 'Exception %s raised unexpectedly while training with gradient checking' % repr(error)
        self.fail(message)
def test_persistence(self):
    """Checks that a saved model loads back equal to the original."""
    trained = PoincareModel(self.data, burn_in=0, negative=3)
    trained.train(epochs=1)

    trained.save(testfile())
    restored = PoincareModel.load(testfile())

    self.models_equal(trained, restored)
def train_embeddings(
        input_path,
        delimiter,
        output_path,
        size=2,
        alpha=0.1,
        burn_in=10,
        burn_in_alpha=0.01,
        workers=1,
        negative=10,
        epochs=100,
        print_every=500,
        batch_size=10):
    """Train Poincare embeddings from an edge file and write them in word2vec format.

    Args:
        input_path: Path to the input edge relations.
        delimiter: Delimiter of the input file.
        output_path: Path the embedding vectors are written to.
        size: Embedding dimension.
        alpha: Learning rate.
        burn_in: Number of burn-in training rounds.
        burn_in_alpha: Learning rate during burn-in.
        workers: Number of training threads.
        negative: Negative sample size.
        epochs: Number of training rounds.
        print_every: How often training info is printed.
        batch_size: Number of samples per batch.
    """
    # Edge relations between entities.
    edges = PoincareRelations(file_path=input_path, delimiter=delimiter)

    model_options = dict(
        size=size,
        alpha=alpha,
        burn_in=burn_in,
        burn_in_alpha=burn_in_alpha,
        workers=workers,
        negative=negative,
    )
    embedder = PoincareModel(train_data=edges, **model_options)
    embedder.train(epochs=epochs, print_every=print_every, batch_size=batch_size)

    # Persist the output vectors.
    embedder.kv.save_word2vec_format(output_path)
def train_run(args):
    """Run one training experiment described by ``args`` and persist its results.

    An experiment name is derived from the current time and the
    hyper-parameters in ``args``.  Logging goes to ``logging.txt`` in a folder
    of that name below ``args.log_folder``; the trained model
    (``gensim_model.params``) and the pickled ``args`` (``args_model.pkl``) go
    to a folder of that name below ``args.params_folder``.

    Args:
        args: Object providing ``train_dir``, ``trn_file``, ``prob_threshold``,
            ``reg_coef``, ``embed_dim``, ``learning_rate``, ``negs``,
            ``epochs``, ``burn_in``, ``burn_in_alpha``, ``init_range``,
            ``random_seed``, ``batch_size``, ``print_every``, ``log_folder``
            and ``params_folder``.
    """
    exp_name = (
        'HB' + 'time' + str(datetime.now()) + '_EXP' + str(args.train_dir)
        + '_prbt' + str(args.prob_threshold) + '_reg' + str(args.reg_coef)
        + '_dim' + str(args.embed_dim) + '_lr' + str(args.learning_rate)
        + '_neg' + str(args.negs) + '_epoc' + str(args.epochs) + '_burnin' + str(args.burn_in)
    )
    # The name becomes a directory name, so replace characters that come from
    # the timestamp and from paths.
    for unwanted in (":", "/", " "):
        exp_name = exp_name.replace(unwanted, "-")
    print(exp_name)

    # Training logs folder.  exist_ok avoids the check-then-create race of a
    # separate os.path.exists() test.
    exp_log_folder = args.log_folder + exp_name + '/'
    os.makedirs(exp_log_folder, exist_ok=True)
    logging_file = exp_log_folder + 'logging.txt'
    logging.basicConfig(filename=logging_file, level=logging.INFO)

    # Model saving folder.
    exp_params_folder = args.params_folder + exp_name + '/'
    os.makedirs(exp_params_folder, exist_ok=True)

    training_file = args.train_dir + args.trn_file
    trn_dataset = data_loader.get_data_list(training_file, args.prob_threshold)
    print("Number of training examples: ", len(trn_dataset))

    # Create the model definition.
    model = PoincareModel(train_data=trn_dataset, size=args.embed_dim, alpha=args.learning_rate,
                          negative=args.negs, regularization_coeff=args.reg_coef,
                          burn_in=args.burn_in, burn_in_alpha=args.burn_in_alpha,
                          init_range=args.init_range, seed=args.random_seed)

    # Start the model training.
    model.train(epochs=args.epochs, batch_size=args.batch_size, print_every=args.print_every)

    # Save the model.
    model_save_name = exp_params_folder + 'gensim_model.params'
    model.save(model_save_name)

    # Save the arguments next to the model.
    args_fname = exp_params_folder + 'args_model.pkl'
    with open(args_fname, "wb") as f:
        pickle.dump(args, f)
def test_reproducible(self):
    """Checks that two independent models built with the same seed end up with the same vectors."""
    first = PoincareModel(self.data_large, seed=1, negative=3, burn_in=1)
    first.train(epochs=2)

    second = PoincareModel(self.data_large, seed=1, negative=3, burn_in=1)
    second.train(epochs=2)

    same_vectors = np.allclose(first.kv.syn0, second.kv.syn0)
    self.assertTrue(same_vectors)
def get_poincare_model(relations, emb_size, num_threads=1):
    """Train a Poincare model on ``relations`` for 50 epochs and report the training time.

    Args:
        relations: Sized collection of relations to train on.
        emb_size: Embedding size passed to ``PoincareModel``.
        num_threads: Accepted but not used by this function.

    Returns:
        The trained model.
    """
    print('Learning Poincare embeddings with %d relations' % len(relations))
    embedder = PoincareModel(relations, size=emb_size, negative=2)

    started = datetime.now()
    embedder.train(epochs=50)
    elapsed = datetime.now() - started

    print('Training time: %s' % elapsed)
    return embedder
def embedding(namespace, emb_fname):
    """Train and save a Poincare embedding for the part of ``go_graph`` in one namespace.

    Works on a copy of the module-level ``go_graph`` from which every node
    whose ``'namespace'`` attribute differs from ``namespace`` is removed.
    The embedding size and epoch count come from the module-level ``dim`` and
    ``num_epochs``.

    Args:
        namespace: Namespace value of the nodes to keep.
        emb_fname: File name the keyed vectors are saved to.

    Returns:
        The keyed vectors of the trained model.
    """
    subgraph = go_graph.copy()
    foreign_nodes = [
        node for node, attributes in go_graph._node.items()
        if attributes['namespace'] != namespace
    ]
    for node in foreign_nodes:
        subgraph.remove_node(node)

    trained = PoincareModel(train_data=subgraph.edges(), size=dim)
    trained.train(epochs=num_epochs, print_every=500)

    vectors = trained.kv
    vectors.save(emb_fname)
    return vectors
def test_vector_dtype(self):
    """Checks that the vectors are float32 both before and after training."""
    expected_dtype = np.float32
    model = PoincareModel(self.data_large, dtype=np.float32, burn_in=0, negative=3)
    self.assertEqual(model.kv.syn0.dtype, expected_dtype)

    model.train(epochs=1)
    self.assertEqual(model.kv.syn0.dtype, expected_dtype)
def test_train_after_load(self):
    """Checks that a loaded model stays equal to the original when both are trained further."""
    original = PoincareModel(self.data, burn_in=0, negative=3)
    original.train(epochs=1)

    original.save(testfile())
    restored = PoincareModel.load(testfile())

    # Train both for one more epoch, original first.
    original.train(epochs=1)
    restored.train(epochs=1)

    self.models_equal(original, restored)
def poincare_disk_model(relations, dimension = 2, workers = 1, negative_sample = 2, batch_number = 10):
    """Build a Poincare model from ``relations`` and train it for 50 epochs.

    Args:
        relations: Training relations.
        dimension: Embedding size.
        workers: Passed to ``PoincareModel`` as ``workers``.
        negative_sample: Passed to ``PoincareModel`` as ``negative``.
        batch_number: Not used; training always runs with ``batch_size=100000``.

    Returns:
        The trained model.
    """
    print("poincare ball model initialization")
    ball_model = PoincareModel(
        relations,
        negative=negative_sample,
        size=dimension,
        workers=workers,
    )

    print("start poincare ball model training")
    ball_model.train(epochs=50, print_every=1000, batch_size=100000)
    return ball_model
def hello_world():
    """Train a tiny example model, print a few queries and save it to disk."""
    edges = [
        ('math', 'science'),
        ('cs', 'science'),
        ('ml', 'cs'),
        ('db', 'cs'),
        ('linalg', 'math'),
    ]
    model = PoincareModel(edges, size=8, negative=2)
    model.train(epochs=50)
    vectors = model.kv

    # Poincare distance between two entities.
    print(vectors.distance('ml', 'db'))

    # Absolute position in the hierarchy of the input node or vector: values
    # range between 0 and 1, and a lower value means higher in the hierarchy.
    # The value is printed twice.
    print(vectors.norm('ml'))
    print(vectors.norm('ml'))

    # The vector itself.
    print(vectors.get_vector('ml'))

    model.save('test_embeddings.bin')
    vectors.save_word2vec_format('test_embeddings.w2v')
def test_poincare(self):
    """Trains a Poincare model on a 100-document citation graph and prints its vector size."""
    doc_count = 100
    citations, _ = self.get_citation_graph(doc_count)

    model_options = dict(
        size=300,
        alpha=0.1,
        negative=10,
        workers=1,
        epsilon=1e-05,
        regularization_coeff=1.0,
        burn_in=10,
        burn_in_alpha=0.01,
        init_range=(-0.001, 0.001),
    )
    model = PoincareModel(citations, **model_options)
    model.train(epochs=2)

    print(model.kv.vector_size)
def poincare_train(hypertouple_dataset, size=2, burn_in=0, epochs = 5, print_freq = 100):
    """Build and train a Poincare embedding, printing any error instead of raising.

    Args:
        hypertouple_dataset (list): Training data handed to ``PoincareModel``.
        size (int): Passed to ``PoincareModel`` as ``size``.
        burn_in (int): Passed to ``PoincareModel`` as ``burn_in``.
        epochs (int): Number of epochs to train.
        print_freq (int): Passed to ``train`` as ``print_every``.

    Returns:
        The model, or ``None`` when it could not be constructed.  If training
        raises after construction, the model is still returned as it stands.
    """
    trained = None
    try:
        trained = PoincareModel(train_data=hypertouple_dataset, size=size, burn_in=burn_in)
        trained.train(epochs=epochs, print_every=print_freq)
    except Exception as error:
        print(error)
    return trained
def test_training_multiple(self):
    """Checks that further epochs change the vectors and a zero-epoch call does not."""
    model = PoincareModel(self.data_large, burn_in=0, negative=3)
    model.train(epochs=2)

    # One more epoch must move the vectors.
    snapshot = np.copy(model.kv.syn0)
    model.train(epochs=1)
    self.assertFalse(np.allclose(snapshot, model.kv.syn0))

    # Zero epochs must leave them as they are.
    snapshot = np.copy(model.kv.syn0)
    model.train(epochs=0)
    self.assertTrue(np.allclose(snapshot, model.kv.syn0))
def _collect_vocabulary(*gold_sets):
    """Return the lower-cased first and second elements of every relation in ``gold_sets``."""
    vocabulary = set()
    for gold in gold_sets:
        for relation in gold:
            vocabulary.add(relation[0].lower())
            vocabulary.add(relation[1].lower())
    return vocabulary


def run(mode, language):
    """Train and save the embeddings selected by ``mode``.

    Args:
        mode: ``"train_poincare_custom"``, ``"train_poincare_wordnet"`` or
            ``"train_word2vec"``; any other value does nothing.
        language: Language code used in the data paths of the custom mode; the
            wordnet mode requires ``'EN'``.

    Raises:
        AssertionError: If the custom relations file has 10 lines or fewer, or
            if the wordnet mode is run with a language other than ``'EN'``.
    """
    if mode == "train_poincare_custom":
        gold_s, _ = read_all_data(domain="science", language=language)
        gold_e, _ = read_all_data(domain="environment", language=language)
        gold_f, _ = read_all_data(domain="food", language=language)
        # The vocabulary is not used by this mode; it is still built so the
        # mode keeps reading the gold data exactly as before.
        _collect_vocabulary(gold_s, gold_f, gold_e)

        relations = "data/" + language + "/poincare_common_and_domains_" + language + ".tsv"
        # Count the lines inside a context manager so the handle is closed.
        with open(relations, 'r') as relations_file:
            relation_count = sum(1 for _line in relations_file)
        assert relation_count > 10, "Not enough relations to train embeddings. Aborting ..."

        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_and_domains_5_3_" + language + "_" + str(dim))

    if mode == 'train_poincare_wordnet':
        assert language == 'EN', "Wordnet consists only of English nouns"
        gold_s, _ = read_all_data(domain="science")
        gold_e, _ = read_all_data(domain="environment")
        gold_f, _ = read_all_data(domain="food")
        vocabulary = _collect_vocabulary(gold_s, gold_f, gold_e)

        preprocess_wordnet('data/EN/noun_closure.tsv', vocabulary)
        poincare_rel = PoincareRelations('data/EN/noun_closure_filtered.tsv')
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/wordnet_filtered" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = _collect_vocabulary(gold_s, gold_f, gold_e)

        documents = list(read_input("/data/EN/wikipedia_utf8_filtered_20pageviews.csv", vocabulary))
        model = gensim.models.Word2Vec(documents, size=300, window=10, min_count=2, workers=10)
        model.train(documents, total_examples=len(documents), epochs=30)
        print("Finished building word2vec model")
        model.save("embeddings/own_embeddings_w2v")
def normalize_diacritics_text(text_string):
    """Return ``text_string`` normalized to Unicode NFC form."""
    return unicodedata.normalize("NFC", text_string)


# Input files (placeholder paths).
path = "/path/to/training_set.txt"  # poincare_dict.txt
path_lexical = "path/to/lexical.txt"  # example file_lexical_fon.txt following the HyperLex Format
path_validation = "/file/to/validation_set.txt"  # validation set poincare_embedding_validation.txt

# Training relations, read with load_doc from the training-set path.
relations_ = load_doc(path)

# parameters
size = 15  # dimension of the embedding space
c = 15  # passed to PoincareModel as `negative` below
epochs = 2000  # number of training epochs

# define the model
model = PoincareModel(relations_, size=size, negative=c)
model.train(epochs)

# save the model
model.save('/path/to/model')

# save model embedding
model.kv.save_word2vec_format("/path/to/embedding")

# load the model and the embedding
model = PoincareModel.load("/path/to/model")
# NOTE(review): the return value of this call is discarded, so it presumably
# does not change model.kv - confirm whether the result should be kept.
model.kv.load_word2vec_format("/path/to/embedding")

all_relations = set(relations_)

# add different classes to the labels to add them to the graph
labels = list(set([_[0] for _ in relations_])) + ["girl_name", "boy_name", "mixed_name", "body_part", "benin_city"]

title = "Title Figure"
import pdb

import pandas as pd
from gensim.models.poincare import PoincareModel, PoincareRelations, LexicalEntailmentEvaluation
from gensim.models.poincare import *
from gensim.test.utils import datapath
from gensim.models.poincare import PoincareKeyedVectors

# Trains Poincare embeddings for the "has_indication" edge list and writes
# them in word2vec text format.
data_dir = '../../../Documents/kg_embeddings_data/data/'
relations = []

# Edge list: one space-separated pair of node ids per row.
edgelist = pd.read_csv(data_dir + 'edgelist_has_indication.txt', header=None, sep=' ')
edgelist_data = edgelist.values

# Mapping file: tab-separated; column 1 is used as the key, column 0 as the value.
data = pd.read_csv(data_dir + 'mapping_RDFGraph.txt', header=None, sep='\t')
data = data.values
mapping_dict = dict(zip(data[:, 1], data[:, 0]))

# Translate both ends of every edge through the mapping and keep only the
# part after the last '/'.
for item in edgelist_data:
    it1 = mapping_dict[item[0]].split('/')[-1]
    it2 = mapping_dict[item[1]].split('/')[-1]
    relations.append((it1, it2))

model = PoincareModel(relations)
model.train(
    epochs=50, batch_size=200
)  # according to paper, good representations could be learned after only 20 epochs
model.kv.save_word2vec_format(
    data_dir + 'poincare/embeddings_has_indication_poincare_minibatch_200.txt')

# Drops into the debugger once the embeddings are written; `pdb` is now
# imported above so this call no longer depends on a name from elsewhere.
pdb.set_trace()
def compute_doc_vecs(experiment, data_dir='./data', workers=None, override=False, dense_vector_size=300,
                     sparse_vector_size=500000, gpu=None):
    """
    Compute and store the document vectors of all systems for one experiment.

    Every system writes one output file below the experiment's models
    directory. A system is skipped when its output file already exists, unless
    ``override`` is set.

    Examples:

    python cli.py compute_doc_vecs wikisource --override=1 --gpu 0
    python cli.py compute_doc_vecs ocb --override=1 --gpu 1

    :param data_dir: Path to data (for input and output)
    :param experiment: Experiment name (ocb or wikisource)
    :param workers: Number of workers (defaults to the ``workers`` entry of the environment)
    :param override: Override existing output
    :param dense_vector_size: Size of dense document vectors (avg word2vec, graph embeddings, ...)
    :param sparse_vector_size: Size of sparse document vectors (TF-IDF)
    :param gpu: CUDA device index, exported as ``CUDA_VISIBLE_DEVICES`` (``None`` leaves it untouched)
    :return:
    """
    env = get_env()

    data_dir = Path(data_dir)

    logger.info(f'Experiment: {experiment}')

    exp = Experiment(name=experiment, env=env, data_dir=data_dir)

    exp.load_data()
    exp.filter_docs()

    models_dir = exp.models_dir
    common_kwargs = exp.get_common_kwargs()

    if not workers:
        workers = env['workers']

    logger.info(f'Using {workers} workers')

    # Device index 0 is falsy, so test against None instead of truthiness;
    # otherwise `--gpu 0` would be silently ignored.
    if gpu is not None:
        logger.info(f'Using CUDA device={gpu}')
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    # TF-IDF
    out_fp = models_dir / 'tfidf.pickle'
    if override or not os.path.exists(out_fp):
        rs = TfIdfRecSys(vector_size=sparse_vector_size, **common_kwargs)
        rs.train(exp.texts)
        rs.save_to_disk(out_fp, override=override)

    # Doc2Vec
    out_fp = models_dir / 'doc2vec.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Avg GloVe
    out_fp = models_dir / 'avg_glove.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('glove'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    # With custom GloVe embeddings
    out_fp = models_dir / 'avg_glove_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('glove_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Transformers
    # BERT standard pooled
    out_fp = models_dir / 'bert-base-cased.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = TransformerRecSys(model_name_or_path=env['bert_dir'] + '/bert-base-cased', **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(models_dir / 'bert-base-cased.w2v.txt', override=override)

    # All "MEAN" transformers
    for tf_name in [
        'bert-base-cased',
        'bert-large-cased',
        'roberta-base',
        'roberta-large',
        'legal-bert',
    ]:
        out_fp = models_dir / f'{tf_name}_mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            rs = TransformerRecSys(model_name_or_path=env['bert_dir'] + '/' + tf_name,
                                   pooling_strategy='reduce_mean', **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

    # Long former
    if transformers.__version__ == '2.0.0':
        from longformer.longformer import Longformer
        from transformers import RobertaTokenizer

        out_fp = models_dir / 'longformer-base-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] + '/longformer-base-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] + '/roberta-base')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm, tokenizer=lf_tokenizer, max_length=4096,
                                   pooling_strategy='reduce_mean', **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

        out_fp = models_dir / 'longformer-large-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] + '/longformer-large-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] + '/roberta-large')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm, tokenizer=lf_tokenizer, max_length=4096,
                                   pooling_strategy='reduce_mean', **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)
    else:
        # Wait for https://github.com/allenai/longformer/pull/14
        logger.warning('Cannot run LongFormer with transformers!=2.0.0')

    # Sentence transformer
    if LooseVersion(transformers.__version__) >= LooseVersion('2.8.0'):
        # See https://github.com/UKPLab/sentence-transformers/blob/master/requirements.txt#L1
        st_models = [
            'bert-base-nli-mean-tokens',
            'bert-large-nli-mean-tokens',
            'roberta-base-nli-mean-tokens',
            'roberta-large-nli-mean-tokens',
            'bert-base-nli-stsb-mean-tokens',
            'bert-large-nli-stsb-mean-tokens',
            'roberta-base-nli-stsb-mean-tokens',
            'roberta-large-nli-stsb-mean-tokens',
        ]
        st_dir = env['datasets_dir'] + '/sentence_transformers/'

        for st_model_name in st_models:
            out_fp = models_dir / f's{st_model_name}.w2v.txt'
            if override or not os.path.exists(out_fp):
                rs = SentenceTransformerRecSys(model_name_or_path=st_dir + st_model_name, **common_kwargs)
                rs.train(exp.texts)
                rs.save_word2vec_format(out_fp, override=override)
            # break
    else:
        logger.warning(
            'Cannot run sentence-transformers with transformers==%s' % transformers.__version__)

    # Citation

    # DeepWalk
    out_fp = models_dir / 'deepwalk.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.DeepWalk',
                                  graph_model_kwargs=dict(
                                      dimensions=dense_vector_size, workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Diff2Vec
    """
    out_fp = models_dir / 'diff2vec.pickle'
    if override or not os.path.exists(out_fp):
        diff2vec = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.Diff2Vec',
            graph_model_kwargs=dict(dimensions=dense_vector_size, workers=workers),
            **common_kwargs
        )
        diff2vec.train(exp.cits)
        diff2vec.save_to_disk(out_fp, override=override)
    """

    # Walklets
    out_fp = models_dir / 'walklets.pickle'
    if override or not os.path.exists(out_fp):
        walklets_window_size = 5  # or 3
        walklets_dim = int(dense_vector_size / walklets_window_size)  # must be int
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.Walklets',
                                  graph_model_kwargs=dict(
                                      dimensions=walklets_dim,
                                      window_size=walklets_window_size,
                                      workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Node2Vec
    out_fp = models_dir / 'node2vec.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='node2vec.Node2Vec',
                                  graph_model_kwargs=dict(
                                      dimensions=dense_vector_size, workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # NodeSketch
    """
    out_fp = models_dir / 'nodesketch.pickle'
    if override or not os.path.exists(out_fp):
        nodesketch = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.NodeSketch',
            graph_model_kwargs=dict(dimensions=dense_vector_size),
            **common_kwargs
        )
        nodesketch.train(exp.cits)
        nodesketch.save_to_disk(out_fp, override=override)
    """

    # BoostNE
    out_fp = models_dir / 'boostne.pickle'
    if override or not os.path.exists(out_fp):
        boostne_iters = 9  # 14
        boostne_dim = 30  # 20

        # The per-iteration dimension times (iterations + 1) has to add up to
        # the dense vector size.
        assert boostne_dim * (boostne_iters + 1) == dense_vector_size

        boostne = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            # vector_size=dense_vector_size,
            graph_model_cls='karateclub.BoostNE',
            graph_model_kwargs=dict(
                dimensions=boostne_dim,  # 8
                order=2,  # 2
                iterations=boostne_iters,  # 16
                alpha=0.01,
            ),
            # Take only embedding from last boosting
            # node_embedding_slice=slice(dense_vector_size * boostne_iters, dense_vector_size * (boostne_iters + 1)),
            **common_kwargs)
        boostne.train(exp.cits)
        boostne.save_to_disk(out_fp, override=override)

    # Poincare
    from gensim.models.poincare import PoincareModel

    out_fp = models_dir / 'poincare.w2v.txt'
    if override or not os.path.exists(out_fp):
        poincare_model = PoincareModel(
            exp.cits,
            size=300,
            alpha=0.1,
            negative=10,
            workers=1,
            epsilon=1e-05,
            regularization_coeff=1.0,
            burn_in=10,
            burn_in_alpha=0.01,
            init_range=(-0.001, 0.001),
        )
        poincare_model.train(epochs=50, )

        # init empty model
        poincare = KeyedVectors(vector_size=poincare_model.kv.vector_size)

        # ignore items not part of gold standard; the seeds are fetched once
        # instead of once per vocabulary entry
        included_seeds = exp.get_included_seeds()
        for doc_id in list(poincare_model.kv.vocab.keys()):
            if doc_id in included_seeds:
                poincare.add(doc_id, poincare_model.kv.get_vector(doc_id))
        poincare.save_word2vec_format(out_fp)

    logger.info('Done')
#     writer = csv.writer(outfile, delimiter='\t')
#     writer.writerows(hyp)

# Write the edge lists as tab-separated files.  newline='' is what the csv
# module expects of the file object; without it, platforms that translate
# '\n' write an extra '\r' at the end of each row.
with open('polyFileEdgesRand.tsv', 'w+', newline='') as outfile:
    writer = csv.writer(outfile, delimiter='\t')
    writer.writerows(poly)

with open('polyTrainFileRand.tsv', 'w+', newline='') as outfile:
    writer = csv.writer(outfile, delimiter='\t')
    writer.writerows(polyTrain)

with open('polyTestFileRand.tsv', 'w+', newline='') as outfile:
    writer = csv.writer(outfile, delimiter='\t')
    writer.writerows(polyTest)

# with open('hypTestFile.tsv', 'w+') as outfile:
#     writer = csv.writer(outfile, delimiter='\t')
#     writer.writerows(hypTest)
#
# with open('hypTrainFile.tsv', 'w+') as outfile:
#     writer = csv.writer(outfile, delimiter='\t')
#     writer.writerows(hypTrain)

#file_path = datapath('randFileEdges.tsv')
print("POLY TO HYP")

# Train on the relations in randFileEdges.tsv, then evaluate the embedding
# against the files written above.
model = PoincareModel(PoincareRelations("randFileEdges.tsv"), negative=2)
model.train(epochs=100)
#print(model.kv.most_similar('pitch.n.02', topn=10))

test = LinkPredictionEvaluation("polyTrainFileRand.tsv", "polyTestFileRand.tsv", model.kv)
print(test.evaluate())

recon = ReconstructionEvaluation("polyFileEdgesRand.tsv", model.kv)
print(recon.evaluate())
import json from gensim.models.poincare import PoincareModel, PoincareRelations from gensim.test.utils import datapath # leaves and total nodes num = 1840 total = 8801 file_path = datapath('path of the original dataset for poincare') model = PoincareModel(PoincareRelations(file_path,delimiter=','), negative=2, size=32) model.train(epochs=10000,print_every=10) child2ParentDict = {} for each in range(total): r = model.kv.closest_parent(str(each)) if r is None: print(each) else: child2ParentDict[each] = r parent2ChildDict = {} for child in child2ParentDict: parent = child2ParentDict[child] if(parent in parent2ChildDict): parent2ChildDict[parent].append(child) else:
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath
from utils import Parameters
import pandas as pd

# Trains a Poincare embedding on the type relations and saves the model.

# NOTE(review): datapath() is imported from gensim's test utilities and is
# given an absolute path here - confirm it returns that path unchanged.
data_path = datapath("D:/PyCharm/PyCharm_Project/paper/data/type_relation.tsv")
type_embedding_path = "data/type_embedding"

# The relations file is read with GBK encoding; the embedding size comes from
# Parameters.type_embedding_dim.
model = PoincareModel(train_data=PoincareRelations(data_path, encoding="gbk"), size=Parameters.type_embedding_dim, negative=3)
model.train(epochs=50, print_every=5)

# Print the vector of one entry as a quick check, then save the model.
print(model.kv.word_vec("川菜"))
model.save(type_embedding_path)

# Reloading the saved model (disabled):
# poincareModel = PoincareModel.load("data/type_embedding")
# print(poincareModel.kv.word_vec('东北菜'))
def test_training(self):
    """Checks that training changes the vectors."""
    model = PoincareModel(self.data_large, burn_in=0, negative=3)
    vectors_before = np.copy(model.kv.syn0)

    model.train(epochs=2)

    unchanged = np.allclose(vectors_before, model.kv.syn0)
    self.assertFalse(unchanged)
def test_error_if_negative_more_than_population(self):
    """Checks that training raises ValueError when more negatives are requested than nodes remain."""
    oversampling_model = PoincareModel(self.data, negative=5)
    with self.assertRaises(ValueError):
        oversampling_model.train(epochs=1)
def run(mode, embedding, embedding_name, experiment_name=None, log=False, trial=False):
    """Load the embedding selected by ``embedding`` and run the action selected by ``mode``.

    Args:
        mode: One of ``"visualize_embedding_poincare"``,
            ``"visualize_embedding"``, ``'train_poincare'``,
            ``"train_word2vec"`` or ``"analysis"``; other values only load
            the embedding.
        embedding: One of ``"fasttext"``, ``"wiki2M"``, ``"wiki1M_subword"``,
            ``"own_w2v"``, ``"quick"`` or ``"poincare"``.  For any other
            value no model is loaded, so the visualisation modes cannot run.
        embedding_name: Not used by this function.
        experiment_name: Figure title / file stem used by the visualisation modes.
        log: Not used by this function.
        trial: Not used by this function.
    """
    if embedding == "fasttext":
        #model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M-subword.vec', binary=False)
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec')
    elif embedding == "wiki2M":
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec','vec')
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False)
        #model.save("crawl-300d-2M.bin")
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/wiki-news-300d-1M-subword.vec', binary=False)
    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load(
            'embeddings/own_embeddings_w2v')
    elif embedding == "quick":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == "poincare":
        model = PoincareModel.load(
            'embeddings/poincare_common_domains02_5_3_50')
        print(len(model.kv.vocab))
        # "optics" and "immunology" were previously fused into one word by a
        # missing comma.
        words = [
            "computer_science", "biology", "physics", "science", "virology",
            "life_science", "chemistry", "earth_science", "algebra",
            "economics", "optics", "immunology",
        ]
        for word in words:
            print("Current word: ", word)
            if word in model.kv.vocab:
                try:
                    print("Closest Parent: ", model.kv.closest_parent(word))
                    print("Closest Child ", model.kv.closest_child(word))
                    print("Descendants: ", model.kv.descendants(word))
                    print("Ancestors: ", model.kv.ancestors(word))
                    print("Hierarchy diff to Science: ",
                          model.kv.difference_in_hierarchy(word, "science"))
                    print('\n')
                except Exception as error:
                    # Best effort: report the failed lookup and move on to the
                    # next word, without swallowing KeyboardInterrupt/SystemExit.
                    print("Hierarchy lookup failed: ", error)
                    continue
            else:
                print("Word not in Vocab")

    if mode == "visualize_embedding_poincare":
        relations = set()
        filename_in = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "data/isas_1000.tsv")
        with open(filename_in, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            for line in reader:
                relations.add((line[0], line[1]))
        plot = poincare_2d_visualization(model, relations, experiment_name)
        py.image.save_as(plot, "vis/" + experiment_name + '.png')
        print("Starting visualization")
        #visualize_taxonomy(vectors, names)

    #todo own file for train
    if mode == "visualize_embedding":
        gold, relations = read_all_data()
        vectors = []
        names = []
        seen = set()  # mirrors `names` for constant-time membership tests
        candidates = ([relation1[1].replace(" ", "_") for relation1 in relations]
                      + [relation2[2].replace(" ", "_") for relation2 in relations])
        for relation in candidates:
            if relation not in seen:
                if relation not in model.wv:
                    print(relation)
                    continue
                vectors.append(model.wv[relation])
                names.append(relation)
                seen.add(relation)
        visualize_taxonomy(vectors, names, experiment_name)

    if mode == 'train_poincare':
        # gold,relations = read_all_data()
        # freq_science = [3,5]
        # for entry_science in freq_science:
        #     relations = './data/' + domain +'_crawl_' + str(entry_science) +'.tsv'
        #     #relations = './data/science_crawl_merge_10_3_02.tsv'
        #     poincare_rel = PoincareRelations(relations)
        #     dim = 50
        #     model = PoincareModel(poincare_rel, size = dim)
        #     print("Starting Training...")
        #     model.train(epochs=400)
        #     model.save("embeddings/embeddings_" + domain + "_crawl_poincare_" + str(entry_science) + "_" + str(dim))
        #     #model.save("embeddings/embeddings_science_crawl_merge_poincare_10_3_50_02")
        #     break
        relations = './data/poincare_common_domains.tsv'
        #relations = './data/science_crawl_merge_10_3_02.tsv'
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_domains_5_3" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, relations_s = read_all_data("science")
        gold_e, relations_e = read_all_data("environment")
        gold_f, relations_f = read_all_data("food")
        vocabulary = set([relation[2] for relation in gold_s] +
                         [relation[1] for relation in gold_s])
        vocabulary = vocabulary | set([relation[2] for relation in gold_f] +
                                      [relation[1] for relation in gold_f])
        vocabulary = vocabulary | set([relation[2] for relation in gold_e] +
                                      [relation[1] for relation in gold_e])
        documents = list(
            read_input(
                "/srv/data/5aly/data_text/wikipedia_utf8_filtered_20pageviews.csv",
                vocabulary))
        model = gensim.models.Word2Vec(size=300, window=5, min_count=5, workers=30)
        model.build_vocab(documents)
        #model.train(documents, total_examples = len(documents), epochs=10)
        model.train(documents, total_examples=model.corpus_count, epochs=30)
        model.save("embeddings/own_embeddings_w2v_all")

    elif mode == "analysis":
        gold, relations = read_all_data()
        voc_rel = set([relation[1] for relation in relations] +
                      [relation[2] for relation in relations])
        voc_gold = set([relation[1] for relation in gold] +
                       [relation[2] for relation in gold])
        print("Vokabeln in Gold: " + str(len(voc_gold)) +
              "Vokabeln in Taxonomy: " + str(len(voc_rel)))
def train():
    """Train a 5-dimensional Poincare model on ``../data/word_relation.csv`` for 500 epochs.

    Returns:
        The trained model.
    """
    from gensim.models.poincare import PoincareModel,PoincareRelations

    word_relations = PoincareRelations(
        file_path="../data/word_relation.csv",
        delimiter=',',
    )
    trained = PoincareModel(word_relations, negative=10, size=5)
    trained.train(epochs=500)
    return trained
def test_burn_in(self):
    """Checks that burn-in alone (zero training epochs) changes the vectors."""
    model = PoincareModel(self.data, burn_in=1, negative=3)
    initial_vectors = np.copy(model.kv.syn0)

    model.train(epochs=0)

    unchanged = np.allclose(model.kv.syn0, initial_vectors)
    self.assertFalse(unchanged)
('West Romance', 'Spanish'), ('West Romance', 'Portguese'), ('West Romance', 'Galician'), ('West Romance', 'Catalan'), ('West Romance', 'Provencal'), ('West Romance', 'Romansh'), ('Celtic', 'Breton'), ('Celtic', 'Welsh'), ('Celtic', 'Irish'), ('Baltic', 'Latvian'), ('Baltic', 'Lituanian'), ('Slavic', 'Russian'), ('Slavic', 'Polish'), ('Slavic', 'Ukrainian'), ('Slavic', 'Bulgagian'), ('Slavic', 'Czech'), ('Slavic', 'Slovakian'), ('Slavic', 'Croatian'), ('Slavic', 'Serbian'), ('Finno-Ugric', 'Finnish'), ('Finno-Ugric', 'Estonian'), ('Finno-Ugric', 'Hungarian'), ('Non-Indo-European', 'Uralic'), ('Romance', 'East-Romance'), ('Romance', 'West Romance'), ('Germanic', 'West Germanic'), ('Germanic', 'North Germanic'), ('Anglo-Norman', 'English'), ('Anglo-Norman', 'French'), ('West Germanic', 'Scots'), ('West Romance', 'Anglo-Norman')] model = PoincareModel(data, negative=2, size=2) model.train(epochs=50) print("Distance between English and French: ", model.kv.distance('English', 'French')) print("Distance between English and German: ", model.kv.distance('English', 'German')) print("Distance between English and Frisian: ", model.kv.distance('English', 'Frisian')) print("Distance between English and Welsh: ", model.kv.distance('English', 'Welsh')) print("Distance between English and Russian: ", model.kv.distance('English', 'Russian')) print("Distance between English and Hungarian: ", model.kv.distance('English', 'Hungarian')) print("Distance between English and Albanian: ", model.kv.distance('English', 'Albanian')) print("Distance between English and Greek: ",