Example #1
    def __init__(self):
        # Corpus model
        vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
        self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
        # Model
        self.glove = Glove(no_components=args.no_components,
                           learning_rate=args.learning_rate)
Example #2
def train_glove_fashionrec(dimensionality, context, epochs):
    """ Train with Glove on IG corpora"""
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))
    fileName = "results/training/glove_fashion_epochs" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(
            context) + "_" + ".txt"
    corpus = readCorpus()
    lines = corpus.split("\n")
    linessplit = map(lambda x: x.split(" "), lines)
    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(linessplit, window=context)
    corpusModelFile = "trained/glove_fashion_epochs" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(
            context) + "_corpus" + ".model"
    corpus_model.save(corpusModelFile)
    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=int(epochs),
              no_threads=8,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time
    gloveModelFile = "trained/glove_fashion_epochs" + str(epochs) + "_d" + str(
        dimensionality) + "_c" + str(context) + "_vecs" + ".model"
    glove.save(gloveModelFile)
    notes = "Glove Fashion Data," + str(dimensionality) + " dim, " + str(
        context) + " context, " + str(
            epochs) + " epochs \n" + "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)
    gloveVecFile = "trained/glove_fashion_epochs" + str(epochs) + "_d" + str(
        dimensionality) + "_c" + str(context) + "_vecs" + ".vec"
    save_glove_bin_to_vec(glove, gloveVecFile)
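The helpers corpus_stats, readCorpus, save_to_file, and save_glove_bin_to_vec are project-specific and not shown. A minimal follow-up sketch, assuming a hypothetical output file name, of how the saved model can be loaded back and queried with glove_python's Glove.load and most_similar:

from glove import Glove

# Hypothetical file name following the pattern above (epochs=10, d=300, c=5).
glove = Glove.load("trained/glove_fashion_epochs10_d300_c5_vecs.model")
print(glove.most_similar("dress", number=5))  # "dress" is a hypothetical vocabulary word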
Example #3
def train(path, freq, window, dim, lr, epochs):
    lines = []
    dic = {}
    print("Start of train method")
    try:
        for f in os.listdir(path):
            with open(os.path.join(path, f), 'r') as fh:
                text = fh.read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            for word in text:
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
        print("Created Dictionary for frequencies of words.")
        for f in os.listdir(path):
            with open(os.path.join(path, f), 'r') as fh:
                text = fh.read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            text = [word for word in text if dic[word] > freq]
            lines.append(text)
        print(
            "Converted preprocessed text data in input format of array of array of words."
        )
        corpus = Corpus()
        corpus.fit(lines, window=window)
        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception as e:
        print("Error occurred while training the glove model:", e)
Example #4
def parse_Word2Vec(full_content):
    corpus = Corpus()
    corpus.fit(full_content, window=10)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    # Open file to write the results
    f2 = open('/home/ubuntu/corpus/results.txt', 'w')

    # Loop through all the article types in the file
    with open('/home/ubuntu/corpus/article_types.csv', 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            article_type = row[0]
            translator = str.maketrans({key: '' for key in string.punctuation})
            article_type_no_punctuation = article_type.translate(translator)
            wordnet = WordNetLemmatizer()
            article_type_clean = wordnet.lemmatize(article_type_no_punctuation)
            try:
                match = glove.most_similar(article_type_clean, number=10)
                matched_item = match[0][0]
                print(article_type_clean + ' -> ' + str(matched_item))
                f2.write(article_type + '\n')
                f2.write(str(matched_item) + '\n')
            except Exception:
                print('failed for: ' + article_type)
    f2.close()
Example #5
    def Myself_Model(self,
                     corpus_path,
                     save=None,
                     back_corpus=None,
                     epochs=10,
                     no_threads=8,
                     no_components=100,
                     learning_rate=0.05):
        """
        sd
        """

        self.get_data = self.read_corpus(corpus_path)
        corpus_model = Corpus()
        corpus_model.fit(self.get_data, window=10)
        # NOTE: the yield below makes this method a generator function.
        if back_corpus is not None:
            yield corpus_model

        #self.glove = Glove()
        self.glove = Glove(no_components=no_components,
                           learning_rate=learning_rate)
        self.glove.fit(corpus_model.matrix,
                       epochs=epochs,
                       no_threads=no_threads,
                       verbose=True)
        self.glove.add_dictionary(corpus_model.dictionary)

        if save is not None:
            #save = 'model/articles_glove.model'
            self.glove.save(save)

        self.model = self.glove
        return self.glove
Example #6
def glove_embed(data, embed_dim, window_size, epochs_, step_size):
    '''
    DESCRIPTION : Perform Global Vectors (GloVe) word embedding for the tokens in the data set

    INPUT:
        |--- data: list of tweets
        |--- embed_dim: [int] embedding dimension
        |--- window_size: [int] size of the window of tokens considered during training for each token
        |--- epochs_: [int] number of epochs for GloVe training
        |--- step_size: [float] learning rate for the SGD used in GloVe training

    OUTPUT:
        |--- embeddings: [np.array] matrix whose rows are the embedding vectors of the vocabulary tokens
        |--- vocab: [dict] dictionary with tokens as keys and the index of each token as values
        |--- glove: [Glove] GloVe model trained on the data
    '''
    sentences = get_tokens(data)

    model = Corpus()
    model.fit(sentences, window=window_size)

    glove = Glove(no_components=embed_dim, learning_rate=step_size)
    glove.fit(model.matrix, epochs=epochs_, no_threads=1, verbose=True)
    glove.add_dictionary(model.dictionary)

    embeddings = np.zeros((len(glove.dictionary), embed_dim))
    for w, id_ in glove.dictionary.items():
        embeddings[id_, :] = glove.word_vectors[id_]

    vocab = dict()
    for idx, word in enumerate(glove.dictionary):
        vocab[word.strip()] = idx

    return embeddings, vocab, glove
Example #7
def feature_extract(path_dataset):
    feature_extract_dataset = []
    speeches = read_csv(path_dataset, sep="|")
    speeches['Classe'] = speeches['Classe'].replace(1, 1)  # no-op: class 1 stays 1
    # Mixed speech from Estamira and her family
    speeches['Classe'] = speeches['Classe'].replace(0, -1)

    # For each utterance
    for indice, fala in enumerate(speeches.Fala):
        # initialization of the objects used to extract co-occurrence features
        dataset = Corpus()
        lsa = TruncatedSVD(n_components=1)

        tolkenizado = [simple_preprocess(str(fala), deacc=True)]

        quantas_palavras = shape(tolkenizado)[1]

        dataset.fit(tolkenizado, window=79)

        graph = Graph(dataset.matrix)
        values_lsa = lsa.fit_transform(dataset.matrix)

        values_mean = mean(values_lsa, axis=0)
        values_std = std(values_lsa, axis=0)

        feature_extract_dataset.append([
            average_clustering(G=graph),
            average_shortest_path_length(G=graph),
            speeches.comprimento[indice],
            values_mean.item(),
            values_std.item(), quantas_palavras
        ])

    return DataFrame(feature_extract_dataset), speeches['Classe'].values
Example #8
def build_model_glove(args):

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):

        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')

        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')

    glove = Glove(no_components=CONFIG['glove']['size'], learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
Example #9
def get_embeddings(prepared_input):
    corpus = Corpus()
    corpus.fit(prepared_input, window=10)
    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
Example #10
def train_glove(save_dir, size):
    print('START')
    f_corpus = get_full_corpus()
    corpus = Corpus()
    print('CREATE CORPUS')
    corpus.fit(f_corpus, window=10)
    word_dict = corpus.dictionary.keys()
    glove = Glove(no_components=size, learning_rate=0.05)
    print('START LEARNING')
    glove.fit(corpus.matrix, epochs=60, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    dict_in_bin = dict()
    print('START SAVE')
    for item in word_dict:
        word_indx = glove.dictionary[item]
        dict_in_bin[item] = glove.word_vectors[word_indx]
    with open(save_dir, "wb") as file:
        pickle.dump(dict_in_bin, file)
    print('COMMON TEST')
    while True:
        try:
            s = input("Enter a string: ")
            print(glove.most_similar(s, number=10))
            word_indx = glove.dictionary[s]
            print(glove.word_vectors[word_indx])
        except Exception:
            # skip unknown words; KeyboardInterrupt still exits the loop
            continue
Example #11
def train_and_save_model(data_dir,
                         model_name='LeGlove',
                         num_epochs=10,
                         parallel_threads=1):
    '''
    This function processes all the data into a training
    corpus and fits a GloVe model to this corpus. 

    Parameters:
        data_dir (string):          master directory containing all jurisdiction-level directories
        model_name (string):        name of model to be used for output
        num_epochs (int):           number of epochs for which to train model
        parallel_threads (int):     number of parallel threads to use for training

    The trained model is saved as "[model_name].model" into the current directory.
    '''

    corpus_model = Corpus()
    corpus_model.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)

    glove = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    glove.fit(corpus_model.matrix,
              epochs=num_epochs,
              no_threads=parallel_threads,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(model_name + '.model')
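CONTEXT_WINDOW, NUM_COMPONENTS, and LEARNING_RATE are module-level constants that are not shown above. A hypothetical invocation, assuming data_dir points at the jurisdiction-level directories and that read_corpus yields token lists:

from glove import Glove

train_and_save_model('data/jurisdictions/', model_name='LeGlove', num_epochs=10)
model = Glove.load('LeGlove.model')  # the docstring says the model is saved as "[model_name].model"
print(model.word_vectors[model.dictionary['court']])  # vector for a hypothetical vocabulary word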
Example #12
def build_glove_embeddings(corpus):
    """
    DESCRIPTION: 
             Applies the Glove python SGD algorithm given by glove_python library and build the
             word embeddings from our training set.
    INPUT:
            corpus: a list of lists where each sub-list represent a tweet. The outer list represents
                    the whole training dataset.
    OUTPUT: 
            words: python dictionary of the form (word, [vector of embeddings])
    """
    words = load_glove_embeddings_from_txt_file(
        MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    if words is not None:
        return words
    model = Corpus()
    model.fit(corpus, window=algorithm['options']['WE']['window_size'])

    glove = Glove(no_components=algorithm['options']['WE']['we_features'],
                  learning_rate=algorithm['options']['WE']['learning_rate'])
    print('\nFitting Glove Python Embeddings')
    glove.fit(model.matrix, epochs=algorithm['options']['WE']['epochs'])
    glove.add_dictionary(model.dictionary)

    words = {}
    for w, id_ in glove.dictionary.items():
        words[w] = np.array(glove.word_vectors[id_])

    store_embeddings_to_txt_file(words, MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    return words
Example #13
def generate_glove_corpus():
    global article_info_path, output_path

    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate sentences : Start')
    sentences = []
    for url, dict_info in article_info.items():
        sentence_header = dict_info.get('sentence_header', None)
        sentence_body = dict_info.get('sentence_body', None)

        if (sentence_header is None) or (sentence_body is None):
            continue

        words = []
        #for sentence in sentence_header + sentence_body:
        for sentence in sentence_header:
            for word in sentence.split(' '):
                words.append(word)

        sentences.append(words)
    write_log('GloVe Generate sentences : End')

    write_log('GloVe Generate corpus : Start')
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    write_log('GloVe Generate corpus : End')

    corpus.save(output_path)
Example #14
def main(args):

    ############################
    # corpus_model = Corpus()
    # corpus_model.fit(read_corpus(args.corpus))
    # corpus_model.save('corpus_select.model')

    ############################
    # corpus_model = Corpus().load('corpus_select.model')
    # print('Dict size: %s' % len(corpus_model.dictionary))
    # print('Collocations: %s' % corpus_model.matrix.nnz)

    # with open('global_vocab.pkl', 'wb') as handle:
    #     pickle.dump(corpus_model.dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)
    ############################

    # opening vocab to create the corpus object
    with open('global_vocab.pkl', 'rb') as f:
        vocab_dict = pickle.load(f)
    doc_model = Corpus(dictionary=vocab_dict)
    
    texts = list(read_corpus(args.corpus))
    
    #opening weight csv
    diff_bias = pd.read_csv(args.diff_bias, header=0)

    
    #col 2 is science/arts, col 3 is weapons/instruments

    total = {}
    # for i in range(10):
    for i in tqdm(range(len(texts))):
        doc = [texts[i]]
        doc_model.fit(doc)

        # we might not even need to save it, just put it into one matrix and save that

        coo = doc_model.matrix.todok()
        weight = diff_bias.iloc[i, 2]
        coo = {k:weight*v for k,v in coo.items()}
        total = Counter(coo) + Counter(total)


    def _dict_to_csr(term_dict):
        # Convert a {(row, col): weight} dict into a sparse matrix (returned in COO format).
        rows, cols = zip(*term_dict.keys())
        shape = (max(rows) + 1, max(cols) + 1)
        csr = sp.csr_matrix((list(term_dict.values()), (list(rows), list(cols))), shape=shape)
        return csr.tocoo()

    total = dict(total)
    total = _dict_to_csr(total)
    print(total.get_shape())
    
    with open('doc_matrices_weighted.pkl', 'wb') as handle:
        pickle.dump(total, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #15
    def initiate_model(self, input_corpus):
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)

        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)
Example #16
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    example: train_glove(target_group='words', glove_para=glove_para_word)
    after save the mode, u can use it by : glove_ana = Glove.load('glove_words.model')
    :param target_group: 'words' or 'chars'
    :param glove_para: glove_para_word = {'window_size':4, 'no_components':300, 'learning_rate':0.05, 'no_epochs':2, 'parallelism':4}
    :return:
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file,
                                 words_or_chars=target_group),
                     window=glove_para['window_size']
                     )  # avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')

    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(save_model_name)
Example #17
def test_fitting():
    """
    Verify that the square error diminishes with fitting
    """

    num_sentences = 5000
    seed = 10

    corpus = Corpus()

    corpus.fit(generate_training_corpus(num_sentences, vocabulary_size=50, seed=seed))

    # Check that the performance is poor without fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=0, no_threads=2)

    log_cooc_mat = corpus.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0

    # Check that it is good with fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=500, no_threads=2)

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0
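_reproduce_input_matrix is a test helper that is not shown. A plausible sketch, assuming glove_python's word_vectors and word_biases attributes: it rebuilds the model's dense approximation of the log co-occurrence matrix, log X_ij ≈ w_i · w_j + b_i + b_j.

import numpy as np

def _reproduce_input_matrix(model):
    # Dense reconstruction of the fitted log co-occurrence counts.
    dot = np.dot(model.word_vectors, model.word_vectors.T)
    return dot + model.word_biases[:, np.newaxis] + model.word_biases[np.newaxis, :]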
Example #18
def train_glove(src_filename, dim=100):
    corpus = Corpus()
    corpus.fit(get_lines(src_filename), window=10)
    glove = Glove(no_components=dim, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=100, no_threads=20, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save(DATA_DIR + 'glove.{}d.model'.format(dim))
Example #19
def main():
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
Example #20
def getWordEmbeddings(processed_text):
    corpus = Corpus()
    corpus.fit(processed_text, window=3)
    glove = Glove(no_components=500, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=300000, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print(glove.most_similar('price'))
Example #21
def glove_vectors(x,
                  embedding_size,
                  epochs=50,
                  lr=0.05,
                  alpha=0.75,
                  max_count=100,
                  tmp_loc='glove.w2vmodel'):
    # create dict ourselves so that the ids correspond to their location in the df, starting to count from first col downwards
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)
    # Creating a corpus object
    corpus = Corpus(dictionary=word_id_dict)
    # Training the corpus to generate the co-occurrence matrix which is used in GloVe.
    # Distance scaling: standard GloVe reduces the occurrence count based on how far a context word is from the focus word.
    # It should not be used here, since distance has no meaning for purely categorical variables.
    corpus.fit(df.values.tolist(),
               window=len(df.columns),
               distance_scaling=False)
    # alpha is the weighting of the loss, based on how likely a co-occurrence is (Xij); less likely = less weight.
    glove = Glove(no_components=embedding_size,
                  learning_rate=lr,
                  alpha=alpha,
                  max_count=max_count)
    glove.fit(
        corpus.matrix, epochs=epochs, no_threads=1, verbose=True
    )  # glove paper: 50 epochs for dimensionality <300, 100 otherwise
    glove.add_dictionary(corpus.dictionary)
    glove.save_word2vec_format(tmp_loc)

    model = KeyedVectors.load_word2vec_format(tmp_loc)
    if os.path.exists(tmp_loc):
        os.remove(tmp_loc)
    return model
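create_vocab_dict is not shown; it is assumed to map every categorical value to an integer id ordered by column. A hypothetical call on a small, made-up categorical data set:

# Hypothetical data: each row is a list of categorical values.
x = [['red', 'small', 'metal'],
     ['blue', 'large', 'wood'],
     ['red', 'large', 'metal']]
model = glove_vectors(x, embedding_size=8, epochs=20)
print(model['red'])  # 8-dimensional embedding of the category 'red'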
Example #22
def train_glove(corpus,
                vocabulary,
                zero_init_indices=0,
                rand_init_indices=1,
                embedding_dim=300):
    """Use glove to train on corpus to obtain embedding
    Here we use a python implementation of Glove, but the official glove implementation of C version
    is also highly recommended: https://github.com/stanfordnlp/GloVe/blob/master/demo.sh

    Args:
        corpus: list of tokenized texts, corpus to train on
        vocabulary: dict, a mapping of words to indices
        zero_init_indices: int or a list, the indices which use zero-initialization. These indices
                           usually represent the padding token.
        rand_init_indices: int or a list, the indices which use random initialization. These
                           indices usually represent other special tokens, such as the "unk" token.
        embedding_dim: int, dimensionality of embedding

    Returns: np.array, a word embedding matrix.

    """
    corpus_model = Corpus()
    corpus_model.fit(corpus, window=10)
    glove = Glove(no_components=embedding_dim, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    word_vectors = dict(
        (w, glove.word_vectors[glove.dictionary[w]]) for w in glove.dictionary)
    emb = filter_embeddings(word_vectors, embedding_dim, vocabulary,
                            zero_init_indices, rand_init_indices)
    return emb
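filter_embeddings is an external helper that is not shown. A simplified, hypothetical version of the alignment step it performs, covering only the basic index handling:

import numpy as np

def filter_embeddings(word_vectors, embedding_dim, vocabulary,
                      zero_init_indices=0, rand_init_indices=1):
    # Rows start randomly initialized, which already covers rand_init_indices.
    emb = np.random.normal(0, 0.05, (len(vocabulary), embedding_dim))
    for word, idx in vocabulary.items():
        if word in word_vectors:
            emb[idx] = word_vectors[word]
    emb[zero_init_indices] = 0.0  # zero out the padding row(s)
    return emb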
Example #23
def train_glove(path):
    import itertools
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove
    #import os
    #import struct
    sentences = list(itertools.islice(Text8Corpus(path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    glove2word2vec(file_name, file_name + '_modified')
    """
    command = 'python -m gensim.scripts.glove2word2vec -i ' +file_name+' -o '+file_name+'_modified'
    os.system(command)
    with open(file_name+'_modified', mode='rb') as file: # b is important -> binary
        fileContent = file.read()
        print 'Content',fileContent
    """
    print('Finished')
    return glove
Example #24
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    dictionary = load_glove_dictionary(exp_id, save_dir)
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    dict_path = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if os.path.exists(dict_path):
        corpus_model = Corpus.load(dict_path)
    else:
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus,
                         window=params['window'] * 2,
                         ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=100, learning_rate=params['alpha'])
    glove.fit(corpus_model.matrix,
              epochs=50,
              no_threads=params['workers'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
Example #25
def glove(eng_vect, L1_eng, hin_vect, L1_hin):
  eng_cor = Corpus()  # English corpus object
  hin_cor = Corpus()  # Hindi corpus object

  eng_cor.fit(eng_vect, window=10)  # fit the English vectors to compute the co-occurrence matrix
  hin_cor.fit(hin_vect, window=10)  # fit the Hindi vectors
  
  gloveE = Glove(no_components=4, learning_rate=0.04)  # Glove object to create the English embeddings
  gloveH = Glove(no_components=4, learning_rate=0.04)  # Glove object to create the Hindi embeddings

  gloveE.fit(eng_cor.matrix, epochs=400, no_threads=5, verbose=False)  # fitting the model for the English vectors
  gloveH.fit(hin_cor.matrix, epochs=400, no_threads=5, verbose=False)  # fitting the model for the Hindi vectors

  gloveE.add_dictionary(eng_cor.dictionary)  # attach the English word-to-id dictionary to the gloveE model
  gloveE.save('gloveE.model')  # saving the model

  gloveH.add_dictionary(hin_cor.dictionary)  # attach the Hindi word-to-id dictionary to the gloveH model
  gloveH.save('gloveH.model')

  # printing the embeddings for English words
  eng_emb = []
  for i in (eng_vect):
    for words in i:
      eng_emb.append(gloveE.word_vectors[gloveE.dictionary[words]])
  print(eng_emb[:50])

  # printing the embeddings for Hindi words
  hin_emb = []
  for i in (hin_vect):
    for words in i:
      hin_emb.append(gloveH.word_vectors[gloveH.dictionary[words]])
  print(hin_emb[:50])

  # printing the most similar words of the English words from L1
  count1 = 0
  for i in L1_eng:
    try:
      print(gloveE.most_similar(i))
      count1 = count1 + 1
    except Exception:
      pass
  # printing the most similar words of the Hindi words from L1
  for i in L1_hin:
    try:
      print(gloveH.most_similar(i))
      count1 = count1 + 1
    except Exception:
      pass
Example #26
def buildCorpus(data_path=None, context_window=5):
    # function that loads in Wikipedia data and fits the corpus model
    print('Fitting data...')

    # initialize and fit the corpus
    corpus = Corpus()
    corpus.fit(textGenerator(data_path), window=context_window)
    return corpus
Example #27
def build_glove_word_vectors(data_frame, vec_dim, vectorizer, window_size,
                             niter):
    corpus = Corpus(vectorizer.vocabulary_)
    corpus.fit(data_frame.post, window=window_size, ignore_missing=True)
    glove = Glove(no_components=vec_dim, learning_rate=0.01)
    glove.fit(corpus.matrix, epochs=niter, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    return glove
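The vectorizer argument is assumed to be a fitted scikit-learn CountVectorizer whose vocabulary_ supplies the word-to-id mapping; ignore_missing=True then skips tokens outside that vocabulary. A hypothetical call, assuming data_frame.post holds pre-tokenized posts:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(' '.join(tokens) for tokens in data_frame.post)
glove = build_glove_word_vectors(data_frame, vec_dim=100, vectorizer=vectorizer,
                                 window_size=10, niter=30)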
Example #28
    def trainShake2(self):
        corpus = Corpus()
        shakespeare_words = self.shakespeare_lines()
        # corpus.fit(shakespeare_corpus + sonnets_corpus, window=10)
        corpus.fit(shakespeare_words, window=10)
        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
        self.glove.add_dictionary(corpus.dictionary)
Example #29
def getGloveEmbedding(seqs, size=300, window=10, epochs=20):
    corpus = Corpus()
    corpus.fit(seqs, window=window)

    glove = Glove(no_components=size, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=epochs, verbose=True)

    return corpus.dictionary, glove.word_vectors
Example #30
def train_model(line):
    corpus = Corpus()
    corpus.fit(line)
    glove = Glove(no_components=5, learning_rate=0.05, random_state=0)
    glove.fit(corpus.matrix, epochs=10, no_threads=100, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
    return glove