Ejemplo n.º 1
0
def train(rels,
          lang=LANG,
          epochs=VALUE_DEFAULT_EPOCHS,
          epochs_load=0,
          size=VALUE_DEFAULT_SIZE,
          negative=VALUE_DEFAULT_NEGATIVE,
          memo=VALUE_DEFAULT_MEMO,
          burnin=None,
          reg=None,
          resume=False):
    """Train a Poincare model on ``rels``, optionally resuming from a saved one.

    Args:
        rels: Training relations handed to ``PoincareModel`` when a fresh
            model is created.
        lang: Forwarded to ``make_filename_model`` to build file names.
        epochs: Number of epochs to train during this call.
        epochs_load: Epoch count used to name the model file that is loaded
            when ``resume`` is true; it is added to ``epochs`` for the name
            of the saved file.
        size: Embedding size; also part of the file name.
        negative: Number of negative samples; also part of the file name.
        memo: Forwarded to ``make_filename_model``.
        burnin: Forwarded to ``make_filename_model`` only (see NOTE below).
        reg: Forwarded to ``make_filename_model`` only (see NOTE below).
        resume: When true, try to load a previously saved model and continue
            training it; fall back to a fresh model if loading fails.

    Returns:
        The trained ``PoincareModel``.
    """
    model = None
    if resume:
        filename = make_filename_model(lang, epochs_load, size, negative, memo, burnin, reg)
        try:
            model = PoincareModel.load(filename)
        except Exception:
            # Best effort: any load failure falls back to a fresh model, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print("file not found")
        else:
            print("resume {}".format(filename))
    else:
        print("first training")

    if model is None:
        # NOTE: ``burnin`` and ``reg`` only influence the file name; the model
        # itself is always built with burn_in=0 and regularization_coeff=0.
        model = PoincareModel(rels, burn_in=0, regularization_coeff=0, negative=negative, size=size)

    model.train(epochs=epochs, print_every=1500)
    model.save(make_filename_model(lang, epochs+epochs_load, size, negative, memo, burnin, reg))

    return model
Ejemplo n.º 2
0
 def test_persistence_separate_file(self):
     """Round-trip a trained model through save/load with ``sep_limit=1`` and compare both."""
     trained = PoincareModel(self.data, burn_in=0, negative=3)
     trained.train(epochs=1)
     # sep_limit=1 is passed so that arrays are written to separate files.
     trained.save(testfile(), sep_limit=1)
     self.models_equal(trained, PoincareModel.load(testfile()))
Ejemplo n.º 3
0
 def test_persistence_separate_file(self):
     """Round-trip a trained model through save/load with ``sep_limit=1`` and compare both."""
     trained = PoincareModel(self.data, burn_in=0, negative=3)
     trained.train(epochs=1)
     # sep_limit=1 is passed so that arrays are written to separate files.
     trained.save(testfile(), sep_limit=1)
     self.models_equal(trained, PoincareModel.load(testfile()))
Ejemplo n.º 4
0
def poincare(relations_file, key_epochs):
    """Train a 2-dimensional Poincare embedding from a JSON relations file.

    Args:
        relations_file: Path to a JSON file holding a sequence of records;
            element 0 and element 3 of each record form one relation.
        key_epochs: Number of training epochs.

    Returns:
        A ``(vec, model)`` tuple where ``vec`` maps every vocabulary term to
        its embedding vector and ``model`` is the trained ``PoincareModel``.
    """
    with open(relations_file) as handle:
        records = json.load(handle)

    # Keep only records whose element 3 is non-empty.
    relations = []
    for record in records:
        if record[3] != "":
            relations.append((record[0], record[3]))

    print(datetime.datetime.now(), "---poincare embedding Start", location())
    # workers=1 together with a fixed seed is used so that the same input
    # gives the same result (PYTHONHASHSEED is deliberately left alone).
    settings = dict(size=2, negative=8, workers=1, seed=1)
    model = PoincareModel(train_data=relations, **settings)
    model.train(epochs=key_epochs)
    print(datetime.datetime.now(), "---poincare embedding End", location())

    # term -> embedding coordinates
    vec = {term: model.kv.get_vector(term) for term in model.kv.vocab.keys()}
    print(datetime.datetime.now(), "---model.kv.vocab.keys End", location())

    # Plotting is disabled; the previous call was:
    # poincare_map = gensim.viz.poincare.poincare_2d_visualization(model=model,
    #                                                             tree=relations,
    #                                                             figure_title="tutorial",
    #                                                             show_node_labels=model.kv.vocab.keys())
    # offline.plot(poincare_map)

    return vec, model
Ejemplo n.º 5
0
 def test_wrong_gradients_raises_assertion(self):
     """Training with gradient checks must raise AssertionError when ``_loss_grad`` is stubbed with zeros."""
     model = PoincareModel(self.data, negative=3)
     # Replace the gradient function with a stub that always returns zeros.
     zero_gradients = np.zeros((2 + model.negative, model.size))
     model._loss_grad = Mock(return_value=zero_gradients)
     with self.assertRaises(AssertionError):
         model.train(epochs=1, batch_size=1, check_gradients_every=1)
Ejemplo n.º 6
0
 def test_burn_in_only_done_once(self):
     """A second zero-epoch ``train`` call must leave the vectors unchanged (no repeated burn-in)."""
     model = PoincareModel(self.data, negative=3, burn_in=1)
     model.train(epochs=0)
     vectors_after_first_call = np.copy(model.kv.syn0)
     model.train(epochs=0)
     unchanged = np.allclose(model.kv.syn0, vectors_after_first_call)
     self.assertTrue(unchanged)
Ejemplo n.º 7
0
 def test_burn_in_only_done_once(self):
     """A second zero-epoch ``train`` call must leave the vectors unchanged (no repeated burn-in)."""
     model = PoincareModel(self.data, negative=3, burn_in=1)
     model.train(epochs=0)
     vectors_after_first_call = np.copy(model.kv.syn0)
     model.train(epochs=0)
     unchanged = np.allclose(model.kv.syn0, vectors_after_first_call)
     self.assertTrue(unchanged)
Ejemplo n.º 8
0
 def test_gradients_check(self):
     """Training with ``check_gradients_every=1`` must complete without raising."""
     model = PoincareModel(self.data, negative=3)
     try:
         model.train(epochs=1, batch_size=1, check_gradients_every=1)
     except Exception as err:
         # Report any exception as a test failure with a descriptive message.
         message = 'Exception %s raised unexpectedly while training with gradient checking' % repr(err)
         self.fail(message)
Ejemplo n.º 9
0
 def test_persistence(self):
     """Round-trip a trained model through save/load and compare both."""
     trained = PoincareModel(self.data, burn_in=0, negative=3)
     trained.train(epochs=1)
     trained.save(testfile())
     self.models_equal(trained, PoincareModel.load(testfile()))
Ejemplo n.º 10
0
 def test_persistence(self):
     """Round-trip a trained model through save/load and compare both."""
     trained = PoincareModel(self.data, burn_in=0, negative=3)
     trained.train(epochs=1)
     trained.save(testfile())
     self.models_equal(trained, PoincareModel.load(testfile()))
Ejemplo n.º 11
0
 def test_gradients_check(self):
     """Training with ``check_gradients_every=1`` must complete without raising."""
     model = PoincareModel(self.data, negative=3)
     try:
         model.train(epochs=1, batch_size=1, check_gradients_every=1)
     except Exception as err:
         # Report any exception as a test failure with a descriptive message.
         message = 'Exception %s raised unexpectedly while training with gradient checking' % repr(err)
         self.fail(message)
Ejemplo n.º 12
0
def train_embeddings(
        input_path,
        delimiter,
        output_path,
        size=2,
        alpha=0.1,
        burn_in=10,
        burn_in_alpha=0.01,
        workers=1,
        negative=10,
        epochs=100,
        print_every=500,
        batch_size=10):
    """Train Poincare embeddings from an edge file and write them in word2vec format.

    Args:
        input_path: Path to the input file of edge relations.
        delimiter: Field delimiter used in the input file.
        output_path: Path the embedding vectors are written to.
        size: Embedding dimension.
        alpha: Learning rate.
        burn_in: Number of burn-in training rounds.
        burn_in_alpha: Learning rate used during burn-in.
        workers: Number of training threads.
        negative: Negative sample size.
        epochs: Number of training rounds.
        print_every: Interval for printing training info.
        batch_size: Number of samples per batch.

    Returns:
        None.
    """
    edges = PoincareRelations(file_path=input_path, delimiter=delimiter)

    model_options = dict(
        size=size,
        alpha=alpha,
        burn_in=burn_in,
        burn_in_alpha=burn_in_alpha,
        workers=workers,
        negative=negative,
    )
    model = PoincareModel(train_data=edges, **model_options)
    model.train(epochs=epochs, print_every=print_every, batch_size=batch_size)

    # Persist only the vectors, not the full model.
    model.kv.save_word2vec_format(output_path)
Ejemplo n.º 13
0
def train_run(args):
    """Run one training experiment described by ``args``.

    An experiment name is derived from the current time and the
    hyper-parameters in ``args``. A log folder and a params folder are created
    under ``args.log_folder`` / ``args.params_folder``, logging is directed to
    the log folder, a ``PoincareModel`` is trained, and the model plus the
    pickled ``args`` are stored in the params folder.

    Args:
        args: Object exposing the attributes read below (``train_dir``,
            ``prob_threshold``, ``reg_coef``, ``embed_dim``, ``learning_rate``,
            ``negs``, ``epochs``, ``burn_in``, ``burn_in_alpha``,
            ``init_range``, ``random_seed``, ``batch_size``, ``print_every``,
            ``log_folder``, ``params_folder``, ``trn_file``).

    Returns:
        None.
    """
    def ensure_folder(folder):
        # Create the folder (including parents) unless it already exists.
        if not os.path.exists(folder):
            os.makedirs(folder)

    name_parts = [
        'HB', 'time', str(datetime.now()),
        '_EXP', str(args.train_dir),
        '_prbt', str(args.prob_threshold),
        '_reg', str(args.reg_coef),
        '_dim', str(args.embed_dim),
        '_lr', str(args.learning_rate),
        '_neg', str(args.negs),
        '_epoc', str(args.epochs),
        '_burnin', str(args.burn_in),
    ]
    # ':' , '/' and ' ' are each replaced by '-' so the name can be used as a folder name.
    exp_name = ''.join(name_parts).translate(str.maketrans(":/ ", "---"))
    print(exp_name)

    # Training logs
    exp_log_folder = args.log_folder + exp_name + '/'
    ensure_folder(exp_log_folder)
    logging.basicConfig(filename=exp_log_folder + 'logging.txt', level=logging.INFO)

    # Model output
    exp_params_folder = args.params_folder + exp_name + '/'
    ensure_folder(exp_params_folder)

    trn_dataset = data_loader.get_data_list(args.train_dir + args.trn_file, args.prob_threshold)
    print("Number of training examples: ", len(trn_dataset))

    model = PoincareModel(
        train_data=trn_dataset,
        size=args.embed_dim,
        alpha=args.learning_rate,
        negative=args.negs,
        regularization_coeff=args.reg_coef,
        burn_in=args.burn_in,
        burn_in_alpha=args.burn_in_alpha,
        init_range=args.init_range,
        seed=args.random_seed,
    )
    model.train(
        epochs=args.epochs,
        batch_size=args.batch_size,
        print_every=args.print_every,
    )

    model.save(exp_params_folder + 'gensim_model.params')

    # Keep the arguments next to the model so the run can be identified later.
    with open(exp_params_folder + 'args_model.pkl', "wb") as args_file:
        pickle.dump(args, args_file)
Ejemplo n.º 14
0
    def test_reproducible(self):
        """Two independently built models trained with the same seed must end up with equal vectors."""
        def trained_vectors():
            # Build and train a fresh model, then hand back its vectors.
            model = PoincareModel(self.data_large, seed=1, negative=3, burn_in=1)
            model.train(epochs=2)
            return model.kv.syn0

        first = trained_vectors()
        second = trained_vectors()
        self.assertTrue(np.allclose(first, second))
Ejemplo n.º 15
0
    def test_reproducible(self):
        """Two independently built models trained with the same seed must end up with equal vectors."""
        def trained_vectors():
            # Build and train a fresh model, then hand back its vectors.
            model = PoincareModel(self.data_large, seed=1, negative=3, burn_in=1)
            model.train(epochs=2)
            return model.kv.syn0

        first = trained_vectors()
        second = trained_vectors()
        self.assertTrue(np.allclose(first, second))
def get_poincare_model(relations, emb_size, num_threads=1):
    """Train a Poincare model for 50 epochs and report the elapsed time.

    Args:
        relations: Sized collection of relations used as training data.
        emb_size: Embedding dimension.
        num_threads: Accepted but not used by this function.

    Returns:
        The trained ``PoincareModel``.
    """
    print('Learning Poincare embeddings with %d relations' % len(relations))
    model = PoincareModel(relations, size=emb_size, negative=2)

    started = datetime.now()
    model.train(epochs=50)
    elapsed = datetime.now() - started

    print('Training time: %s' % (elapsed,))
    return model
Ejemplo n.º 17
0
def embedding(namespace, emb_fname):
    """Train Poincare vectors on the part of ``go_graph`` that belongs to ``namespace``.

    Args:
        namespace: Value of the node attribute ``'namespace'`` to keep.
        emb_fname: File name the keyed vectors are saved to.

    Returns:
        The keyed vectors (``model.kv``) of the trained model.
    """
    # Work on a copy so the module-level graph is left intact.
    subgraph = go_graph.copy()
    for node, attributes in go_graph._node.items():
        if attributes['namespace'] == namespace:
            continue
        subgraph.remove_node(node)

    model = PoincareModel(train_data=subgraph.edges(), size=dim)
    model.train(epochs=num_epochs, print_every=500)

    vectors = model.kv
    vectors.save(emb_fname)
    return vectors
Ejemplo n.º 18
0
 def test_vector_dtype(self):
     """The vectors must have the requested dtype both before and after training."""
     expected_dtype = np.float32
     model = PoincareModel(self.data_large, dtype=expected_dtype, burn_in=0, negative=3)
     self.assertEqual(model.kv.syn0.dtype, expected_dtype)
     model.train(epochs=1)
     self.assertEqual(model.kv.syn0.dtype, expected_dtype)
Ejemplo n.º 19
0
 def test_train_after_load(self):
     """A model reloaded from disk must train to the same state as the original."""
     original = PoincareModel(self.data, burn_in=0, negative=3)
     original.train(epochs=1)
     original.save(testfile())
     reloaded = PoincareModel.load(testfile())
     # Continue training both copies by one more epoch, original first.
     for candidate in (original, reloaded):
         candidate.train(epochs=1)
     self.models_equal(original, reloaded)
Ejemplo n.º 20
0
 def test_train_after_load(self):
     """A model reloaded from disk must train to the same state as the original."""
     original = PoincareModel(self.data, burn_in=0, negative=3)
     original.train(epochs=1)
     original.save(testfile())
     reloaded = PoincareModel.load(testfile())
     # Continue training both copies by one more epoch, original first.
     for candidate in (original, reloaded):
         candidate.train(epochs=1)
     self.models_equal(original, reloaded)
Ejemplo n.º 21
0
def poincare_disk_model(relations, dimension = 2, workers = 1, negative_sample = 2, batch_number = 10):
    """Build and train a ``PoincareModel`` for 50 epochs with a batch size of 100000.

    Args:
        relations: Training relations.
        dimension: Embedding size.
        workers: Number of workers passed to the model.
        negative_sample: Number of negative samples.
        batch_number: Not used by the current code; it only appears in the
            disabled batch-size computation noted below.

    Returns:
        The trained ``PoincareModel``.
    """
    print("poincare ball model initialization")
    model_options = dict(negative=negative_sample, size=dimension, workers=workers)
    model = PoincareModel(relations, **model_options)

    print("start poincare ball model training")
    # Disabled alternative: batch = int(len(relations)/batch_number)
    model.train(epochs=50, print_every=1000, batch_size=100000)
    return model
def hello_world():
    """Train a tiny Poincare model on a toy hierarchy, print some queries and save it."""
    relations = [
        ('math', 'science'),
        ('cs', 'science'),
        ('ml', 'cs'),
        ('db', 'cs'),
        ('linalg', 'math'),
    ]
    model = PoincareModel(relations, size=8, negative=2)
    model.train(epochs=50)

    vectors = model.kv
    # Poincare distance between two entities.
    print(vectors.distance('ml', 'db'))
    # Norm as a position in the hierarchy: values range between 0 and 1 and
    # a lower value indicates a node higher in the hierarchy.
    # NOTE(review): the same norm is printed twice; possibly a different node
    # was intended for the second call.
    print(vectors.norm('ml'))
    print(vectors.norm('ml'))
    # The raw embedding vector.
    print(vectors.get_vector('ml'))

    model.save('test_embeddings.bin')
    vectors.save_word2vec_format('test_embeddings.w2v')
Ejemplo n.º 23
0
    def test_poincare(self):
        """Train a Poincare model on a 100-document citation graph and print its vector size."""
        doc_count = 100
        cits, _ = self.get_citation_graph(doc_count)

        model_options = dict(
            size=300,
            alpha=0.1,
            negative=10,
            workers=1,
            epsilon=1e-05,
            regularization_coeff=1.0,
            burn_in=10,
            burn_in_alpha=0.01,
            init_range=(-0.001, 0.001),
        )
        poincare_model = PoincareModel(cits, **model_options)
        poincare_model.train(epochs=2)

        print(poincare_model.kv.vector_size)
Ejemplo n.º 24
0
def poincare_train(hypertouple_dataset, size=2, burn_in=0, epochs = 5, print_freq = 100):
    """Build and train a Poincare embedding, printing (not raising) any error.

    Args:
        hypertouple_dataset (list): Training data passed to ``PoincareModel``.
        size (int): Embedding size.
        burn_in (int): Burn-in value passed to ``PoincareModel``.
        epochs (int): Number of epochs to train.
        print_freq (int): Value passed as ``print_every`` to ``train``.

    Returns:
        The ``PoincareModel``. It is ``None`` when construction failed, and
        the constructed but not fully trained model when training failed.
    """
    poincare_model = None
    model_options = dict(train_data=hypertouple_dataset, size=size, burn_in=burn_in)
    try:
        poincare_model = PoincareModel(**model_options)
        poincare_model.train(epochs=epochs, print_every=print_freq)
    except Exception as err:
        # Errors are reported on stdout and otherwise ignored.
        print(err)
    return poincare_model
Ejemplo n.º 25
0
    def test_training_multiple(self):
        """Further training must change the vectors, while a zero-epoch call must not."""
        model = PoincareModel(self.data_large, burn_in=0, negative=3)
        model.train(epochs=2)

        # One more epoch: vectors are expected to move.
        snapshot = np.copy(model.kv.syn0)
        model.train(epochs=1)
        self.assertFalse(np.allclose(snapshot, model.kv.syn0))

        # Zero epochs: vectors are expected to stay put.
        snapshot = np.copy(model.kv.syn0)
        model.train(epochs=0)
        self.assertTrue(np.allclose(snapshot, model.kv.syn0))
Ejemplo n.º 26
0
    def test_training_multiple(self):
        """Further training must change the vectors, while a zero-epoch call must not."""
        model = PoincareModel(self.data_large, burn_in=0, negative=3)
        model.train(epochs=2)

        # One more epoch: vectors are expected to move.
        snapshot = np.copy(model.kv.syn0)
        model.train(epochs=1)
        self.assertFalse(np.allclose(snapshot, model.kv.syn0))

        # Zero epochs: vectors are expected to stay put.
        snapshot = np.copy(model.kv.syn0)
        model.train(epochs=0)
        self.assertTrue(np.allclose(snapshot, model.kv.syn0))
def _collect_vocabulary(*gold_sets):
    """Return the set of lower-cased first and second terms of every relation given."""
    vocabulary = set()
    for gold in gold_sets:
        for relation in gold:
            vocabulary.add(relation[0].lower())
            vocabulary.add(relation[1].lower())
    return vocabulary


def run(mode, language):
    """Train and save an embedding model selected by ``mode``.

    Supported modes:
        * ``"train_poincare_custom"``: Poincare embeddings from
          ``data/<language>/poincare_common_and_domains_<language>.tsv``.
        * ``"train_poincare_wordnet"``: Poincare embeddings from the filtered
          WordNet noun closure (English only).
        * ``"train_word2vec"``: word2vec embeddings from the filtered
          Wikipedia dump.

    Any other mode does nothing.

    Args:
        mode: Name of the training mode.
        language: Language code used to locate data and name output files.

    Raises:
        AssertionError: If the custom relation file has 10 lines or fewer, or
            if the WordNet mode is requested for a language other than 'EN'.
    """
    if mode == "train_poincare_custom":
        # The gold data is still read as before; the vocabulary that used to
        # be derived from it was never used in this mode and was dropped.
        read_all_data(domain = "science", language = language)
        read_all_data(domain = "environment", language = language)
        read_all_data(domain = "food", language = language)
        relations = "data/" + language + "/poincare_common_and_domains_" + language + ".tsv"
        # Count lines with a context manager so the file handle is closed.
        with open(relations, 'r') as relations_file:
            num_relations = sum(1 for _ in relations_file)
        assert num_relations > 10, "Not enough relations to train embeddings. Aborting ..."
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size = dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_and_domains_5_3_" + language + "_" + str(dim))

    if mode == 'train_poincare_wordnet':
        assert language == 'EN', "Wordnet consists only of English nouns"

        gold_s, _ = read_all_data(domain = "science")
        gold_e, _ = read_all_data(domain = "environment")
        gold_f, _ = read_all_data(domain = "food")
        vocabulary = _collect_vocabulary(gold_s, gold_f, gold_e)

        preprocess_wordnet('data/EN/noun_closure.tsv', vocabulary)
        poincare_rel = PoincareRelations('data/EN/noun_closure_filtered.tsv')
        dim = 50
        model = PoincareModel(poincare_rel, size = dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/wordnet_filtered" + "_" + str(dim))

    if mode == "train_word2vec":
        gold_s, _ = read_all_data("science")
        gold_e, _ = read_all_data("environment")
        gold_f, _ = read_all_data("food")
        vocabulary = _collect_vocabulary(gold_s, gold_f, gold_e)

        documents = list(read_input("/data/EN/wikipedia_utf8_filtered_20pageviews.csv",vocabulary))
        model = gensim.models.Word2Vec(documents, size= 300, window = 10, min_count = 2, workers = 10)
        model.train(documents, total_examples=len(documents), epochs=30)
        print("Finished building word2vec model")
        model.save("embeddings/own_embeddings_w2v")
Ejemplo n.º 28
0
def normalize_diacritics_text(text_string):
    """Return ``text_string`` converted to Unicode normalization form NFC."""
    composed = unicodedata.normalize("NFC", text_string)
    return composed


# Input locations (placeholders to be replaced with real paths).
path = "/path/to/training_set.txt"  # e.g. poincare_dict.txt
path_lexical = "path/to/lexical.txt"  # e.g. file_lexical_fon.txt, in the HyperLex format
path_validation = "/file/to/validation_set.txt"  # e.g. poincare_embedding_validation.txt
relations_ = load_doc(path)

# Hyper-parameters
size = 15  # dimension of the embedding space
# NOTE(review): originally described as "constant of negative curvature", but
# the value is passed to PoincareModel as ``negative`` below -- confirm intent.
c = 15
epochs = 2000  # number of training epochs

# Build and train the model.
model = PoincareModel(relations_, size=size, negative=c)
model.train(epochs)

# Persist the full model and, separately, its embedding.
model.save('/path/to/model')
model.kv.save_word2vec_format("/path/to/embedding")

# Reload the model and the embedding.
model = PoincareModel.load("/path/to/model")
# NOTE(review): the result of this call is discarded -- confirm it has the intended effect.
model.kv.load_word2vec_format("/path/to/embedding")

all_relations = set(relations_)
# Extra class names are appended to the labels so they can be added to the graph.
extra_labels = ["girl_name", "boy_name", "mixed_name", "body_part", "benin_city"]
labels = list({relation[0] for relation in relations_}) + extra_labels

title = "Title Figure"
Ejemplo n.º 29
0
import pandas as pd
from gensim.models.poincare import PoincareModel, PoincareRelations, LexicalEntailmentEvaluation
from gensim.models.poincare import *
from gensim.test.utils import datapath
from gensim.models.poincare import PoincareKeyedVectors

# Directory holding the input edge list / mapping file and the output embeddings.
data_dir = '../../../Documents/kg_embeddings_data/data/'
relations = []

# Space-separated edge list without header; columns 0 and 1 are used below.
edgelist = pd.read_csv(data_dir + 'edgelist_has_indication.txt',
                       header=None,
                       sep=' ')
edgelist_data = edgelist.values

# Tab-separated mapping file without header: column 1 is used as the lookup
# key and column 0 as the mapped value.
data = pd.read_csv(data_dir + 'mapping_RDFGraph.txt', header=None, sep='\t')
data = data.values
mapping_dict = dict(zip(data[:, 1], data[:, 0]))

# Translate both ends of every edge through the mapping and keep only the
# part after the last '/'.
for item in edgelist_data:
    it1 = mapping_dict[item[0]].split('/')[-1]
    it2 = mapping_dict[item[1]].split('/')[-1]
    relations.append((it1, it2))

model = PoincareModel(relations)
model.train(
    epochs=50, batch_size=200
)  # according to paper, good representations could be learned after only 20 epochs

model.kv.save_word2vec_format(
    data_dir + 'poincare/embeddings_has_indication_poincare_minibatch_200.txt')
# The leftover ``pdb.set_trace()`` breakpoint was removed: ``pdb`` is not
# imported by this script, so the call would fail once the embeddings were saved.
Ejemplo n.º 30
0
def compute_doc_vecs(experiment,
                     data_dir='./data',
                     workers=None,
                     override=False,
                     dense_vector_size=300,
                     sparse_vector_size=500000,
                     gpu=None):
    """Compute and store document vectors for an experiment with every configured model.

    Each model writes its output below ``exp.models_dir``; a model is skipped
    when its output file already exists and ``override`` is false.

    Examples:

    python cli.py compute_doc_vecs wikisource --override=1 --gpu 0
    python cli.py compute_doc_vecs ocb --override=1 --gpu 1


    :param experiment: Experiment name (ocb or wikisource)
    :param data_dir: Path to data (for input and output)
    :param workers: Number of workers (falls back to ``env['workers']`` when not set)
    :param override: Override existing output
    :param dense_vector_size: Size of dense document vectors (avg word2vec, graph embeddings, ...)
    :param sparse_vector_size: Size of sparse document vectors (TF-IDF)
    :param gpu: CUDA device to expose via ``CUDA_VISIBLE_DEVICES``; ``None`` leaves the
        environment untouched. Device ``0`` is honoured.
    :return:
    """
    env = get_env()
    data_dir = Path(data_dir)

    logger.info(f'Experiment: {experiment}')

    exp = Experiment(name=experiment, env=env, data_dir=data_dir)

    exp.load_data()
    exp.filter_docs()

    models_dir = exp.models_dir
    common_kwargs = exp.get_common_kwargs()

    if not workers:
        workers = env['workers']

    logger.info(f'Using {workers} workers')

    # Compare against None: a plain truthiness test would silently ignore
    # device 0, which is a valid device index (see the examples above).
    if gpu is not None:
        logger.info(f'Using CUDA device={gpu}')
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)

    # TF-IDF
    out_fp = models_dir / 'tfidf.pickle'
    if override or not os.path.exists(out_fp):
        rs = TfIdfRecSys(vector_size=sparse_vector_size, **common_kwargs)
        rs.train(exp.texts)
        rs.save_to_disk(out_fp, override=override)

    # Doc2Vec
    out_fp = models_dir / 'doc2vec.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'doc2vec_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = Doc2VecRecSys(**common_kwargs, vector_size=dense_vector_size)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Avg GloVe
    out_fp = models_dir / 'avg_glove.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(w2v_model=exp.get_w2v_model('glove'),
                                          **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    # With custom GloVe embeddings
    out_fp = models_dir / 'avg_glove_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('glove_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    # Avg fastText (pretrained and custom, full and length-limited texts)
    out_fp = models_dir / 'avg_fasttext.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_512.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(512))
        rs.save_word2vec_format(out_fp, override=override)

    out_fp = models_dir / 'avg_fasttext_custom_4096.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = WeightedAvgWordVectorsRecSys(
            w2v_model=exp.get_w2v_model('fasttext_custom'), **common_kwargs)
        rs.train(exp.get_limited_texts(4096))
        rs.save_word2vec_format(out_fp, override=override)

    # Transformers
    # BERT standard pooled
    out_fp = models_dir / 'bert-base-cased.w2v.txt'
    if override or not os.path.exists(out_fp):
        rs = TransformerRecSys(model_name_or_path=env['bert_dir'] +
                               '/bert-base-cased',
                               **common_kwargs)
        rs.train(exp.texts)
        rs.save_word2vec_format(models_dir / 'bert-base-cased.w2v.txt',
                                override=override)

    # All "MEAN" transformers
    for tf_name in [
            'bert-base-cased', 'bert-large-cased', 'roberta-base',
            'roberta-large', 'legal-bert'
    ]:
        out_fp = models_dir / f'{tf_name}_mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            rs = TransformerRecSys(model_name_or_path=env['bert_dir'] + '/' +
                                   tf_name,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

    # Long former
    if transformers.__version__ == '2.0.0':

        from longformer.longformer import Longformer
        from transformers import RobertaTokenizer

        out_fp = models_dir / 'longformer-base-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] +
                                               '/longformer-base-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] +
                                                            '/roberta-base')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm,
                                   tokenizer=lf_tokenizer,
                                   max_length=4096,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)

        out_fp = models_dir / 'longformer-large-4096-mean.w2v.txt'
        if override or not os.path.exists(out_fp):
            lf_lm = Longformer.from_pretrained(env['bert_dir'] +
                                               '/longformer-large-4096')
            lf_tokenizer = RobertaTokenizer.from_pretrained(env['bert_dir'] +
                                                            '/roberta-large')
            lf_tokenizer.max_len = lf_lm.config.max_position_embeddings

            rs = TransformerRecSys(language_model=lf_lm,
                                   tokenizer=lf_tokenizer,
                                   max_length=4096,
                                   pooling_strategy='reduce_mean',
                                   **common_kwargs)
            rs.train(exp.texts)
            rs.save_word2vec_format(out_fp, override=override)
    else:
        # Wait for https://github.com/allenai/longformer/pull/14
        logger.warning('Cannot run LongFormer with transformers!=2.0.0')

    # Sentence transformer
    if LooseVersion(transformers.__version__) >= LooseVersion('2.8.0'):
        # See https://github.com/UKPLab/sentence-transformers/blob/master/requirements.txt#L1
        st_models = [
            'bert-base-nli-mean-tokens',
            'bert-large-nli-mean-tokens',
            'roberta-base-nli-mean-tokens',
            'roberta-large-nli-mean-tokens',
            'bert-base-nli-stsb-mean-tokens',
            'bert-large-nli-stsb-mean-tokens',
            'roberta-base-nli-stsb-mean-tokens',
            'roberta-large-nli-stsb-mean-tokens',
        ]
        st_dir = env['datasets_dir'] + '/sentence_transformers/'

        for st_model_name in st_models:
            out_fp = models_dir / f's{st_model_name}.w2v.txt'
            if override or not os.path.exists(out_fp):
                rs = SentenceTransformerRecSys(model_name_or_path=st_dir +
                                               st_model_name,
                                               **common_kwargs)
                rs.train(exp.texts)
                rs.save_word2vec_format(out_fp, override=override)
        #    break
    else:
        logger.warning(
            'Cannot run sentence-transformers with transformers==%s' %
            transformers.__version__)

    # Citation

    # DeepWalk
    out_fp = models_dir / 'deepwalk.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.DeepWalk',
                                  graph_model_kwargs=dict(
                                      dimensions=dense_vector_size,
                                      workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Diff2Vec
    """
    out_fp = models_dir / 'diff2vec.pickle'
    if override or not os.path.exists(out_fp):
        diff2vec = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.Diff2Vec',
            graph_model_kwargs=dict(dimensions=dense_vector_size, workers=workers),
            **common_kwargs
        )
        diff2vec.train(exp.cits)
        diff2vec.save_to_disk(out_fp, override=override)
    """

    # Walklets
    out_fp = models_dir / 'walklets.pickle'
    if override or not os.path.exists(out_fp):
        walklets_window_size = 5  # or 3
        walklets_dim = int(dense_vector_size /
                           walklets_window_size)  # must be int
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='karateclub.Walklets',
                                  graph_model_kwargs=dict(
                                      dimensions=walklets_dim,
                                      window_size=walklets_window_size,
                                      workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # Node2Vec
    out_fp = models_dir / 'node2vec.pickle'
    if override or not os.path.exists(out_fp):
        rs = GraphEmbeddingRecSys(include_seeds=exp.get_included_seeds(),
                                  graph_model_cls='node2vec.Node2Vec',
                                  graph_model_kwargs=dict(
                                      dimensions=dense_vector_size,
                                      workers=workers),
                                  **common_kwargs)
        rs.train(exp.cits)
        rs.save_to_disk(out_fp, override=override)

    # NodeSketch
    """
    out_fp = models_dir / 'nodesketch.pickle'
    if override or not os.path.exists(out_fp):
        nodesketch = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            graph_model_cls='karateclub.NodeSketch',
            graph_model_kwargs=dict(dimensions=dense_vector_size),
            **common_kwargs
        )
        nodesketch.train(exp.cits)
        nodesketch.save_to_disk(out_fp, override=override)
    """

    # BoostNE
    out_fp = models_dir / 'boostne.pickle'
    if override or not os.path.exists(out_fp):
        boostne_iters = 9  # 14
        boostne_dim = 30  # 20

        # The dimensions and iteration count must multiply out to the dense vector size.
        assert boostne_dim * (boostne_iters + 1) == dense_vector_size

        boostne = GraphEmbeddingRecSys(
            include_seeds=exp.get_included_seeds(),
            # vector_size=dense_vector_size,
            graph_model_cls='karateclub.BoostNE',
            graph_model_kwargs=dict(
                dimensions=boostne_dim,  # 8
                order=2,  # 2
                iterations=boostne_iters,  # 16
                alpha=0.01,
            ),
            # Take only embedding from last boosting
            # node_embedding_slice=slice(dense_vector_size * boostne_iters, dense_vector_size * (boostne_iters + 1)),
            **common_kwargs)
        boostne.train(exp.cits)
        boostne.save_to_disk(out_fp, override=override)

    # Poincare
    from gensim.models.poincare import PoincareModel
    out_fp = models_dir / 'poincare.w2v.txt'
    if override or not os.path.exists(out_fp):
        poincare_model = PoincareModel(
            exp.cits,
            size=300,
            alpha=0.1,
            negative=10,
            workers=1,
            epsilon=1e-05,
            regularization_coeff=1.0,
            burn_in=10,
            burn_in_alpha=0.01,
            init_range=(-0.001, 0.001),
        )
        poincare_model.train(epochs=50, )
        # init empty model
        poincare = KeyedVectors(vector_size=poincare_model.kv.vector_size)

        # ignore items not part of gold standard
        for doc_id in list(poincare_model.kv.vocab.keys()):
            if doc_id in exp.get_included_seeds():
                poincare.add(doc_id, poincare_model.kv.get_vector(doc_id))
        poincare.save_word2vec_format(out_fp)

    logger.info('Done')
Ejemplo n.º 31
0
#      writer = csv.writer(outfile, delimiter='\t')
#      writer.writerows(hyp)
# Write each edge collection to its own tab-separated file, in this order.
for target, rows in (
        ('polyFileEdgesRand.tsv', poly),
        ('polyTrainFileRand.tsv', polyTrain),
        ('polyTestFileRand.tsv', polyTest)):
    with open(target, 'w+') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerows(rows)

# with open('hypTestFile.tsv', 'w+') as outfile:
#      writer = csv.writer(outfile, delimiter='\t')
#      writer.writerows(hypTest)
#
# with open('hypTrainFile.tsv', 'w+') as outfile:
#      writer = csv.writer(outfile, delimiter='\t')
#      writer.writerows(hypTrain)

#file_path = datapath('randFileEdges.tsv')
print("POLY TO HYP")
# Train on the relations stored in randFileEdges.tsv.
training_relations = PoincareRelations("randFileEdges.tsv")
model = PoincareModel(training_relations, negative=2)
model.train(epochs=100)
#print(model.kv.most_similar('pitch.n.02', topn=10))

# Link prediction on the train/test split written above.
test = LinkPredictionEvaluation("polyTrainFileRand.tsv", "polyTestFileRand.tsv", model.kv)
print(test.evaluate())

# Reconstruction on the full edge file written above.
recon = ReconstructionEvaluation("polyFileEdgesRand.tsv", model.kv)
print(recon.evaluate())
Ejemplo n.º 32
0
import json

from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath

# Number of leaves and total number of nodes.
num = 1840
total = 8801

# Train a 32-dimensional Poincare embedding on the comma-delimited relations file.
file_path = datapath('path of the original dataset for poincare')
model = PoincareModel(
    PoincareRelations(file_path, delimiter=','),
    negative=2,
    size=32,
)
model.train(epochs=10000, print_every=10)

# Look up the closest parent of every node id in [0, total). Ids for which
# no parent is returned are printed instead of being stored.
child2ParentDict = {}
for each in range(total):
    r = model.kv.closest_parent(str(each))
    if r is not None:
        child2ParentDict[each] = r
    else:
        print(each)


parent2ChildDict = {}
for child in child2ParentDict:
    parent = child2ParentDict[child]
    if(parent in parent2ChildDict):
        parent2ChildDict[parent].append(child)
    else:
Ejemplo n.º 33
0
 def test_wrong_gradients_raises_assertion(self):
     """Check that training fails when the gradient check sees an all-zero mocked gradient."""
     model = PoincareModel(self.data, negative=3)
     # Replace the loss-gradient function with one that always returns zeros.
     zero_gradient = np.zeros((2 + model.negative, model.size))
     model._loss_grad = Mock(return_value=zero_gradient)
     with self.assertRaises(AssertionError):
         model.train(epochs=1, batch_size=1, check_gradients_every=1)
Ejemplo n.º 34
0
from gensim.models.poincare import PoincareModel, PoincareRelations
from gensim.test.utils import datapath
from utils import Parameters
import pandas as pd

# Location the trained model is saved to.
type_embedding_path = "data/type_embedding"
# Relations file the embedding is trained on; it is read with GBK encoding.
data_path = datapath("D:/PyCharm/PyCharm_Project/paper/data/type_relation.tsv")

model = PoincareModel(
    train_data=PoincareRelations(data_path, encoding="gbk"),
    negative=3,
    size=Parameters.type_embedding_dim,
)
model.train(print_every=5, epochs=50)

# Print the vector stored for one key, then persist the model.
print(model.kv.word_vec("川菜"))
model.save(type_embedding_path)

# poincareModel = PoincareModel.load("data/type_embedding")
# print(poincareModel.kv.word_vec('东北菜'))
Ejemplo n.º 35
0
 def test_vector_dtype(self):
     """Check that the vectors keep the requested dtype before and after one epoch of training."""
     expected_dtype = np.float32
     model = PoincareModel(self.data_large, dtype=expected_dtype, burn_in=0, negative=3)
     self.assertEqual(model.kv.syn0.dtype, expected_dtype)
     model.train(epochs=1)
     self.assertEqual(model.kv.syn0.dtype, expected_dtype)
Ejemplo n.º 36
0
 def test_training(self):
     """Check that two epochs of training change the vectors."""
     model = PoincareModel(self.data_large, burn_in=0, negative=3)
     # Snapshot the vectors so the comparison is against the pre-training state.
     vectors_before = np.copy(model.kv.syn0)
     model.train(epochs=2)
     unchanged = np.allclose(vectors_before, model.kv.syn0)
     self.assertFalse(unchanged)
Ejemplo n.º 37
0
 def test_error_if_negative_more_than_population(self):
     """Tests that an error is raised if the number of negatives to sample is more than the remaining nodes."""
     model = PoincareModel(self.data, negative=5)
     self.assertRaises(ValueError, model.train, epochs=1)
Ejemplo n.º 38
0
def run(mode,
        embedding,
        embedding_name,
        experiment_name=None,
        log=False,
        trial=False):
    """Load the requested embedding model and perform the requested action.

    Args:
        mode: Action to perform. Handled values are
            "visualize_embedding_poincare", "visualize_embedding",
            "train_poincare", "train_word2vec" and "analysis". Any other
            value performs no action.
        embedding: Model to load: "fasttext", "wiki2M", "wiki1M_subword",
            "own_w2v", "quick" or "poincare". Any other value loads nothing,
            in which case the visualisation modes fail because ``model`` is
            never assigned.
        embedding_name: Not used by this function.
        experiment_name: Label passed to the visualisation helpers; also used
            as the file name of the saved Poincare plot.
        log: Not used by this function.
        trial: Not used by this function.

    Returns:
        None. Results are printed, plotted, or saved to disk.
    """
    if embedding == "fasttext":
        #model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M-subword.vec', binary=False)
        model = gensim.models.FastText.load_fasttext_format('wiki.en.bin')
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec')
    elif embedding == "wiki2M":
        #model = gensim.models.FastText.load_fasttext_format('crawl-300d-2M.vec','vec')
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False)
        #model.save("crawl-300d-2M.bin")
    elif embedding == "wiki1M_subword":
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/wiki-news-300d-1M-subword.vec', binary=False)

    elif embedding == "own_w2v":
        model = gensim.models.KeyedVectors.load(
            'embeddings/own_embeddings_w2v')

    elif embedding == "quick":
        # Same vectors as "wiki2M", but only the first 50000 entries are read.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            'embeddings/crawl-300d-2M.vec', binary=False, limit=50000)
    elif embedding == "poincare":
        model = PoincareModel.load(
            'embeddings/poincare_common_domains02_5_3_50')
        print(len(model.kv.vocab))
        # Every entry needs its own comma: adjacent string literals are
        # silently concatenated ("optics" "immunology" -> "opticsimmunology").
        words = [
            "computer_science", "biology", "physics", "science", "virology",
            "life_science", "chemistry", "earth_science", "algebra",
            "economics", "optics",
            "immunology",
        ]
        for word in words:
            print("Current word: ", word)

            if word not in model.kv.vocab:
                print("Word not in Vocab")
                continue
            try:
                print("Closest Parent: ", model.kv.closest_parent(word))
                print("Closest Child ", model.kv.closest_child(word))
                print("Descendants: ", model.kv.descendants(word))
                print("Ancestors: ", model.kv.ancestors(word))
                print("Hierarchy diff to Science: ",
                      model.kv.difference_in_hierarchy(word, "science"))
                print('\n')
            except Exception:
                # Best effort: a failing lookup skips the rest of this word's
                # report. Unlike a bare ``except``, this still lets
                # KeyboardInterrupt / SystemExit propagate.
                continue

    if mode == "visualize_embedding_poincare":
        filename_in = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "data/isas_1000.tsv")
        # The first two columns of each row form one relation pair.
        with open(filename_in, 'r') as f:
            reader = csv.reader(f, delimiter='\t')
            relations = {(line[0], line[1]) for line in reader}
        plot = poincare_2d_visualization(model, relations, experiment_name)
        py.image.save_as(plot, "vis/" + experiment_name + '.png')
        print("Starting visualization")

        #visualize_taxonomy(vectors, names)

    # TODO: move training into its own file
    if mode == "visualize_embedding":
        _, relations = read_all_data()
        vectors = []
        names = []
        # Mirrors ``names`` so the duplicate check is O(1) per term.
        seen = set()
        terms = (
            [relation1[1].replace(" ", "_") for relation1 in relations] +
            [relation2[2].replace(" ", "_") for relation2 in relations])
        for relation in terms:
            if relation in seen:
                continue
            if relation not in model.wv:
                # Terms missing from the model are reported and left out.
                print(relation)
                continue
            vectors.append(model.wv[relation])
            names.append(relation)
            seen.add(relation)
        visualize_taxonomy(vectors, names, experiment_name)

    if mode == 'train_poincare':
        # gold,relations = read_all_data()
        # freq_science = [3,5]
        # for entry_science in freq_science:
        #     relations = './data/' + domain +'_crawl_' + str(entry_science) +'.tsv'
        #     #relations = './data/science_crawl_merge_10_3_02.tsv'
        #     poincare_rel = PoincareRelations(relations)
        #     dim = 50
        #     model = PoincareModel(poincare_rel, size = dim)
        #     print("Starting Training...")
        #     model.train(epochs=400)
        #     model.save("embeddings/embeddings_" + domain + "_crawl_poincare_" + str(entry_science) + "_" + str(dim))
        #     #model.save("embeddings/embeddings_science_crawl_merge_poincare_10_3_50_02")
        #     break
        relations = './data/poincare_common_domains.tsv'
        #relations = './data/science_crawl_merge_10_3_02.tsv'
        poincare_rel = PoincareRelations(relations)
        dim = 50
        model = PoincareModel(poincare_rel, size=dim)
        print("Starting Training...")
        model.train(epochs=400)
        model.save("embeddings/poincare_common_domains_5_3" + "_" + str(dim))

    if mode == "train_word2vec":
        # The vocabulary is the union of elements 1 and 2 of every gold
        # relation across the three domains.
        vocabulary = set()
        for domain in ("science", "environment", "food"):
            gold_domain, _ = read_all_data(domain)
            for relation in gold_domain:
                vocabulary.add(relation[2])
                vocabulary.add(relation[1])
        documents = list(
            read_input(
                "/srv/data/5aly/data_text/wikipedia_utf8_filtered_20pageviews.csv",
                vocabulary))
        model = gensim.models.Word2Vec(size=300,
                                       window=5,
                                       min_count=5,
                                       workers=30)
        model.build_vocab(documents)
        #model.train(documents, total_examples = len(documents), epochs=10)
        model.train(documents, total_examples=model.corpus_count, epochs=30)
        model.save("embeddings/own_embeddings_w2v_all")

    elif mode == "analysis":
        gold, relations = read_all_data()
        voc_rel = ({relation[1] for relation in relations} |
                   {relation[2] for relation in relations})
        voc_gold = ({relation[1] for relation in gold} |
                    {relation[2] for relation in gold})
        print("Vokabeln in Gold: " + str(len(voc_gold)) +
              "Vokabeln in Taxonomy: " + str(len(voc_rel)))
Ejemplo n.º 39
0
	def train():
		"""Train a 5-dimensional Poincare model on ../data/word_relation.csv for 500 epochs and return it."""
		from gensim.models.poincare import PoincareModel, PoincareRelations
		# The relations file is comma-delimited.
		word_relations = PoincareRelations(
			file_path="../data/word_relation.csv",
			delimiter=',',
		)
		poincare_model = PoincareModel(word_relations, negative=10, size=5)
		poincare_model.train(epochs=500)
		return poincare_model
Ejemplo n.º 40
0
 def test_burn_in(self):
     """Check that the vectors change when training runs with burn_in=1 and zero regular epochs."""
     model = PoincareModel(self.data, burn_in=1, negative=3)
     vectors_at_init = np.copy(model.kv.syn0)
     model.train(epochs=0)
     still_equal = np.allclose(model.kv.syn0, vectors_at_init)
     self.assertFalse(still_equal)
Ejemplo n.º 41
0
        ('West Romance', 'Spanish'), ('West Romance', 'Portguese'),
        ('West Romance', 'Galician'), ('West Romance', 'Catalan'),
        ('West Romance', 'Provencal'), ('West Romance', 'Romansh'),
        ('Celtic', 'Breton'), ('Celtic', 'Welsh'), ('Celtic', 'Irish'),
        ('Baltic', 'Latvian'), ('Baltic', 'Lituanian'), ('Slavic', 'Russian'),
        ('Slavic', 'Polish'), ('Slavic', 'Ukrainian'), ('Slavic', 'Bulgagian'),
        ('Slavic', 'Czech'), ('Slavic', 'Slovakian'), ('Slavic', 'Croatian'),
        ('Slavic', 'Serbian'), ('Finno-Ugric', 'Finnish'),
        ('Finno-Ugric', 'Estonian'), ('Finno-Ugric', 'Hungarian'),
        ('Non-Indo-European', 'Uralic'), ('Romance', 'East-Romance'),
        ('Romance', 'West Romance'), ('Germanic', 'West Germanic'),
        ('Germanic', 'North Germanic'), ('Anglo-Norman', 'English'),
        ('Anglo-Norman', 'French'), ('West Germanic', 'Scots'),
        ('West Romance', 'Anglo-Norman')]
# Train a 2-dimensional Poincare embedding on the relation pairs in `data`.
model = PoincareModel(data, negative=2, size=2)
model.train(epochs=50)

# Print the embedding distance between English and each listed language,
# in the order given here.
for label, language in (
        ("Distance between English and French: ", 'French'),
        ("Distance between English and German: ", 'German'),
        ("Distance between English and Frisian: ", 'Frisian'),
        ("Distance between English and Welsh: ", 'Welsh'),
        ("Distance between English and Russian: ", 'Russian'),
        ("Distance between English and Hungarian: ", 'Hungarian'),
        ("Distance between English and Albanian: ", 'Albanian'),
):
    print(label, model.kv.distance('English', language))
print("Distance between English and Greek: ",
Ejemplo n.º 42
0
 def test_error_if_negative_more_than_population(self):
     """Tests that an error is raised if the number of negatives to sample is more than the remaining nodes."""
     model = PoincareModel(self.data, negative=5)
     self.assertRaises(ValueError, model.train, epochs=1)