Esempio n. 1
0
def train_glove_fashionrec(dimensionality, context, epochs):
    """Train GloVe on the IG fashion corpus and persist every artefact.

    Prints the corpus statistics, fits a co-occurrence ``Corpus`` with a
    window of ``context``, trains a ``dimensionality``-component ``Glove``
    model for ``epochs`` epochs (8 threads), then writes the corpus model,
    the GloVe model, a short training note and a ``.vec`` export. All output
    names embed the three hyper-parameters.
    """
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))

    # Every output path shares this "<epochs>_d<dim>_c<context>" tag.
    run_tag = "glove_fashion_epochs{}_d{}_c{}".format(
        epochs, dimensionality, context)
    fileName = "results/training/" + run_tag + "_.txt"
    corpusModelFile = "trained/" + run_tag + "_corpus.model"
    gloveModelFile = "trained/" + run_tag + "_vecs.model"
    gloveVecFile = "trained/" + run_tag + "_vecs.vec"

    # One token list per line of the raw corpus text.
    tokenized_lines = (row.split(" ") for row in readCorpus().split("\n"))

    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(tokenized_lines, window=context)
    corpus_model.save(corpusModelFile)

    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=int(epochs),
              no_threads=8,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    # The measured time covers corpus fitting, corpus saving and training.
    time_elapsed = datetime.now() - start_time
    glove.save(gloveModelFile)

    notes = "Glove Fashion Data,{} dim, {} context, {} epochs \nTraining time: {}".format(
        dimensionality, context, epochs, time_elapsed)
    save_to_file(fileName, notes)
    save_glove_bin_to_vec(glove, gloveVecFile)
def main():
    """Train a GloVe model from the saved AZ2 corpus and store it on disk.

    Loads the co-occurrence corpus from ``bioc-corpus-AZ2.model``, trains a
    100-component model for 10 epochs on 16 threads, attaches the corpus
    dictionary and saves the result to ``bioc-glove-AZ2.model``.
    """
    # Corpus.load returns the stored corpus, so no empty Corpus() is created
    # beforehand (the original built one and discarded it immediately).
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
Esempio n. 3
0
def glove_embed(data, embed_dim, window_size, epochs_, step_size):
    '''
    DESCRIPTION : Train Global Vectors (GloVe) word embeddings for the tokens in a data set

    INPUT:
        |--- data: input passed to get_tokens() to obtain the tokenized sentences
        |--- embed_dim: [int] embedding dimension
        |--- window_size: [int] size of the window of tokens considered during training for each token
        |--- epochs_: [int] number of epochs for GloVe training
        |--- step_size: [float] learning rate for GloVe training

    OUTPUT:
        |--- embeddings: [np.ndarray] array of shape (vocabulary size, embed_dim); row i holds the vector of the token with id i
        |--- vocab: [dict] stripped tokens as keys and the row index of each token in embeddings as values
        |--- glove: [Global Vectors Model] GloVe model trained on data
    '''
    sentences = get_tokens(data)

    model = Corpus()
    model.fit(sentences, window=window_size)

    glove = Glove(no_components=embed_dim, learning_rate=step_size)
    glove.fit(model.matrix, epochs=epochs_, no_threads=1, verbose=True)
    glove.add_dictionary(model.dictionary)

    embeddings = np.zeros((len(glove.dictionary), embed_dim))
    vocab = dict()
    for token, id_ in glove.dictionary.items():
        embeddings[id_, :] = glove.word_vectors[id_]
        # Index vocab with the model's own id rather than the enumeration
        # position, so a vocab entry always points at the matching row.
        vocab[token.strip()] = id_

    return embeddings, vocab, glove
Esempio n. 4
0
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    Fit a co-occurrence corpus from ``src_file`` and train/save a GloVe model.

    example: train_glove(target_group='words', glove_para=glove_para_word,
                         src_file=..., save_model_name='glove_words.model')
    after saving the model, it can be used by: glove_ana = Glove.load('glove_words.model')

    :param target_group: 'words' or 'chars'; forwarded to read_corpus() and
        used in the corpus model file name
    :param glove_para: dict of hyper-parameters, e.g.
        {'window_size':4, 'no_components':300, 'learning_rate':0.05, 'no_epochs':2, 'parallelism':4}
    :param src_file: corpus source forwarded to read_corpus()
    :param save_model_name: path the trained GloVe model is saved to
    :return: None
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file,
                                 words_or_chars=target_group),
                     window=glove_para['window_size']
                     )  #avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    # Function-call form prints the same text on Python 2 and is valid on
    # Python 3 (the original `print target_group` statement was not).
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')

    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(save_model_name)
def getWordEmbeddings(processed_text):
    """Train GloVe on ``processed_text`` and return the fitted model.

    Builds the co-occurrence corpus with a window of 3, trains a
    500-component model (learning rate 0.001, 300000 epochs, 4 threads) and
    prints the neighbours of the word 'price'.

    Returns:
        The trained ``Glove`` model with its dictionary attached. The
        original returned nothing, so existing callers are unaffected.
    """
    corpus = Corpus()
    corpus.fit(processed_text, window=3)
    glove = Glove(no_components=500, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=300000, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print(glove.most_similar('price'))
    return glove
def get_embeddings(prepared_input):
    """Train a 5-component GloVe model on ``prepared_input`` and save it.

    The co-occurrence corpus uses a window of 10; training runs for 30
    epochs on 4 threads and the model is written to ``glove.model``.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(prepared_input, window=10)

    model = Glove(no_components=5, learning_rate=0.05)
    model.fit(cooccurrences.matrix, epochs=30, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)
    model.save('glove.model')
Esempio n. 7
0
def train_glove(save_dir, size):
    """Train GloVe on the full corpus, pickle the vectors, then run a REPL.

    Args:
        save_dir: path of the file the ``{word: vector}`` dict is pickled to.
        size: number of GloVe components.

    After saving, words are read interactively from stdin and their nearest
    neighbours and vectors are printed. The loop ends on end-of-input or
    Ctrl-C.
    """
    print('START')
    f_corpus = get_full_corpus()
    corpus = Corpus()
    print('CREATE CORPUS')
    corpus.fit(f_corpus, window=10)
    glove = Glove(no_components=size, learning_rate=0.05)
    print('START LEARNING')
    glove.fit(corpus.matrix, epochs=60, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print('START SAVE')
    dict_in_bin = {
        word: glove.word_vectors[word_indx]
        for word, word_indx in glove.dictionary.items()
    }
    with open(save_dir, "wb") as out_file:
        pickle.dump(dict_in_bin, out_file)
    print('COMMON TEST')
    while True:
        try:
            s = input("Введите строку: ")
        except (EOFError, KeyboardInterrupt):
            # The original bare `except: continue` swallowed these as well,
            # which left no way to leave the loop.
            break
        try:
            print(glove.most_similar(s, number=10))
            word_indx = glove.dictionary[s]
            print(glove.word_vectors[word_indx])
        except Exception:
            # Best-effort lookup: a failed query (e.g. unknown word) just
            # prompts again, as before.
            continue
Esempio n. 8
0
def build_model_glove(args):
    """Build (or reload) the co-occurrence corpus and train a GloVe model.

    The corpus is rebuilt when ``args.corpus_model`` does not exist or is not
    newer than the most recently modified file in ``args.input``; otherwise
    it is loaded from disk. Hyper-parameters come from ``CONFIG['glove']``.

    Returns:
        The trained ``Glove`` model with the corpus dictionary attached.
    """
    corpus_is_stale = (
        not os.path.exists(args.corpus_model)
        or max(map(os.path.getmtime, args.input))
        >= os.path.getmtime(args.corpus_model)
    )

    if corpus_is_stale:
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)
    else:
        # Reuse the corpus stored on disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

    # Both paths report the same statistics.
    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model.
    logging.info('Training the GloVe model')
    settings = CONFIG['glove']
    glove = Glove(no_components=settings['size'],
                  learning_rate=settings['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=settings['epochs'],
              no_threads=args.workers,
              verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
Esempio n. 9
0
def build_glove_embeddings(corpus):
    """
    DESCRIPTION:
             Applies the Glove python SGD algorithm given by glove_python library and builds the
             word embeddings from our training set. Embeddings already stored in
             MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE are returned without retraining.
    INPUT:
            corpus: a list of lists where each sub-list represents a tweet. The outer list represents
                    the whole training dataset.
    OUTPUT:
            words: python dictionary of the form (word, [vector of embeddings])
    """
    words = load_glove_embeddings_from_txt_file(
        MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    # Identity test: `!= None` would dispatch to the object's own comparison.
    if words is not None:
        return words

    options = algorithm['options']['WE']
    model = Corpus()
    model.fit(corpus, window=options['window_size'])

    glove = Glove(no_components=options['we_features'],
                  learning_rate=options['learning_rate'])
    print('\nFitting Glove Python Embeddings')
    glove.fit(model.matrix, epochs=options['epochs'])
    glove.add_dictionary(model.dictionary)

    words = {
        w: np.array(glove.word_vectors[id_])
        for w, id_ in glove.dictionary.items()
    }

    # Cache the result so the next call can skip training.
    store_embeddings_to_txt_file(words, MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    return words
Esempio n. 10
0
def train_and_save_model(data_dir,
                         model_name='LeGlove',
                         num_epochs=10,
                         parallel_threads=1):
    '''
    Build a training corpus from the data under ``data_dir`` and fit a
    GloVe model to it.

    Parameters:
        data_dir (string):          master directory containing all jurisdiction-level directories
        model_name (string):        name of model to be used for output
        num_epochs (int):           number of epochs for which to train model
        parallel_threads (int):     number of parallel threads to use for training

    The trained model is saved as "[model_name].model" into the current directory.
    '''
    cooccurrences = Corpus()
    cooccurrences.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)

    model = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    model.fit(cooccurrences.matrix,
              epochs=num_epochs,
              no_threads=parallel_threads,
              verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    output_path = model_name + '.model'
    model.save(output_path)
Esempio n. 11
0
def train_glove(path):
    """Train a 300-component GloVe model on a Text8-format corpus file.

    Args:
        path: corpus file handed to gensim's ``Text8Corpus``.

    Returns:
        The trained ``Glove`` model. It is also saved to
        ``embeddings_models/model_glove_<TRAINING_SENTENCES>`` and passed
        through ``glove2word2vec`` to produce a ``..._modified`` file.
    """
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove

    # islice(..., None) sliced nothing off; materialise the sentences directly.
    sentences = list(Text8Corpus(path))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    # NOTE(review): glove2word2vec presumably expects a text-format vector
    # file; confirm that the file written by glove.save() is valid input.
    glove2word2vec(file_name, file_name + '_modified')
    # Function-call form works on both Python 2 and Python 3.
    print('Finished')
    return glove
Esempio n. 12
0
def train_glove(corpus,
                vocabulary,
                zero_init_indices=0,
                rand_init_indices=1,
                embedding_dim=300):
    """Train GloVe on a corpus and return an embedding matrix for a vocabulary.

    A python implementation of Glove is used here; the official C
    implementation is also highly recommended:
    https://github.com/stanfordnlp/GloVe/blob/master/demo.sh

    Args:
        corpus: list of tokenized texts, corpus to train on
        vocabulary: dict, a mapping of words to indices
        zero_init_indices: int or a list, the indices which use zero-initialization. These indices
                           usually represent padding token.
        rand_init_indices: int or a list, the indices which use random initialization. These
                           indices usually represent other special tokens, such as "unk" token.
        embedding_dim: int, dimensionality of embedding

    Returns: np.array, a word embedding matrix.

    """
    cooccurrences = Corpus()
    cooccurrences.fit(corpus, window=10)

    model = Glove(no_components=embedding_dim, learning_rate=0.05)
    model.fit(cooccurrences.matrix, epochs=10, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    # Map each trained word to its vector.
    word_vectors = {
        word: model.word_vectors[index]
        for word, index in model.dictionary.items()
    }
    return filter_embeddings(word_vectors, embedding_dim, vocabulary,
                             zero_init_indices, rand_init_indices)
Esempio n. 13
0
def build_model_glove(args):
    """Build (or reload) the co-occurrence corpus and train a GloVe model.

    The corpus is rebuilt when ``args.corpus_model`` is missing or not newer
    than the latest-modified file in ``args.input``; otherwise it is loaded
    from disk. Hyper-parameters are read from ``CONFIG['glove']``.

    Returns:
        The trained ``Glove`` model with the corpus dictionary attached.
    """
    from glove import Glove, Corpus

    rebuild = (
        not os.path.exists(args.corpus_model)
        or max(map(os.path.getmtime, args.input))
        >= os.path.getmtime(args.corpus_model)
    )

    if rebuild:
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)
    else:
        # Reuse the corpus stored on disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

    # Identical statistics are logged on either path.
    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model.
    logging.info('Training the GloVe model')
    glove_cfg = CONFIG['glove']
    glove = Glove(no_components=glove_cfg['size'],
                  learning_rate=glove_cfg['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=glove_cfg['epochs'],
              no_threads=args.workers,
              verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
def main():
    """Train a GloVe model from the saved AZ2 corpus and store it on disk.

    Loads the co-occurrence corpus from ``bioc-corpus-AZ2.model``, trains a
    100-component model for 10 epochs on 16 threads, attaches the corpus
    dictionary and saves the result to ``bioc-glove-AZ2.model``.
    """
    # Corpus.load returns the stored corpus; the empty Corpus() the original
    # created first was thrown away immediately.
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
Esempio n. 15
0
def trainGlove(path,
               no_components=100,
               learning_rate=0.05,
               epochs=100,
               no_threads=1,
               verbose=True,
               context_window=5,
               save_path='outputs/Glove'):
    """Train a GloVe model and store the vectors of the words of interest.

    Args:
        path: corpus location forwarded to ``buildCorpus``.
        no_components: embedding dimensionality.
        learning_rate: GloVe learning rate.
        epochs: number of training epochs.
        no_threads: number of training threads.
        verbose: forwarded to ``Glove.fit`` (previously ignored and always on).
        context_window: window size forwarded to ``buildCorpus``.
        save_path: currently unused; kept for interface compatibility.

    Side effects:
        Reads ``data/words.txt`` and saves a ``{word: vector}`` dict for the
        words that also occur in the corpus to ``DSMs/glove.npy``.
    """
    print('Training Glove Model...')
    glove = Glove(no_components=no_components, learning_rate=learning_rate)
    corpus = buildCorpus(path, context_window)
    # Honour the caller's verbosity instead of the hard-coded verbose=1.
    glove.fit(corpus.matrix,
              epochs=epochs,
              no_threads=no_threads,
              verbose=verbose)
    glove.add_dictionary(corpus.dictionary)

    # glove.save(save_path + '/glove.model')

    with open('data/words.txt', 'r') as f:
        # The last split element is dropped (empty when the file ends with a
        # newline).
        words = f.read().split('\n')[:-1]

    shared_words = list(set(words).intersection(corpus.dictionary))

    glove_dict = {
        word: glove.word_vectors[glove.dictionary[word], :]
        for word in shared_words
    }

    np.save('DSMs/glove.npy', glove_dict)
def train_glove(src_filename, dim=100):
    """Train a ``dim``-component GloVe model on the lines of ``src_filename``.

    Uses a window of 10, learning rate 0.001, 100 epochs and 20 threads; the
    model is saved as ``glove.<dim>d.model`` under ``DATA_DIR``.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(get_lines(src_filename), window=10)

    model = Glove(no_components=dim, learning_rate=0.001)
    model.fit(cooccurrences.matrix, epochs=100, no_threads=20, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    model_name = 'glove.{}d.model'.format(dim)
    model.save(DATA_DIR + model_name)
Esempio n. 17
0
class GloveEmbeding:
    """Thin wrapper around a GloVe model that embeds tokenized sentences."""

    def __init__(self, no_components=128):
        """Create an untrained model with ``no_components`` dimensions."""
        self.no_components = no_components
        self.model = Glove(no_components=self.no_components,
                           learning_rate=0.05)

    def train(self, lines):
        """Fit the model on ``lines``, a list of token lists."""
        corpus = Corpus()
        corpus.fit(lines, window=10)
        self.model.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
        self.model.add_dictionary(corpus.dictionary)

    def save(self, path):
        """Save the underlying GloVe model to ``path``."""
        self.model.save(path)

    def load(self, path):
        """Replace the wrapped model with the one stored at ``path``.

        The original implementation called ``self.load(path)`` and recursed
        until the interpreter's recursion limit was hit.
        """
        self.model = Glove.load(path)
        # Keep the embedding width in sync with the loaded vectors.
        self.no_components = self.model.word_vectors.shape[1]

    def embeddings(self, sentences, max_len):
        """Return a (len(sentences), max_len, no_components) array of vectors.

        ``sentences`` is a list of token lists. Unknown tokens are reported
        and left as zeros; tokens beyond ``max_len`` are dropped.
        """
        matrix = np.zeros((len(sentences), max_len, self.no_components))
        for nr, sentence in enumerate(sentences):
            for nr_tok, tok in enumerate(sentence):
                try:
                    emb_nr = self.model.dictionary[tok]

                    matrix[nr, nr_tok, :] = self.model.word_vectors[emb_nr]
                except KeyError:
                    # %-formatting avoids the str + bytes TypeError that
                    # tok.encode() caused on Python 3.
                    print('word "%s" not in dictionary' % (tok,))
                except IndexError:
                    # Token position is past max_len: truncate silently.
                    pass
        return matrix
Esempio n. 18
0
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    """Train a 100-component GloVe model restricted to a preloaded dictionary.

    A corpus model saved as ``glove_dict_<exp_id>.model`` in ``save_dir`` is
    reused when present; otherwise one is fitted on ``corpus`` with a window
    of ``2 * params['window']``, skipping words missing from the dictionary,
    and saved when ``save_dict`` is true.

    Returns:
        The trained ``Glove`` model (50 epochs, learning rate
        ``params['alpha']``, ``params['workers']`` threads).
    """
    dictionary = load_glove_dictionary(exp_id, save_dir)
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    dict_path = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if not os.path.exists(dict_path):
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus,
                         window=params['window'] * 2,
                         ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)
    else:
        corpus_model = Corpus.load(dict_path)

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    model = Glove(no_components=100, learning_rate=params['alpha'])
    model.fit(corpus_model.matrix,
              epochs=50,
              no_threads=params['workers'],
              verbose=True)
    model.add_dictionary(corpus_model.dictionary)
    return model
Esempio n. 19
0
class GloVeFilter(object):
    """Trains a GloVe model restricted to the target-side dialogue vocabulary.

    All paths and hyper-parameters are read from the module-level ``args``.
    """

    def __init__(self):
        # Corpus model seeded with the 'tgt' vocabulary's string-to-index map
        vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
        self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
        # Model
        self.glove = Glove(no_components=args.no_components,
                           learning_rate=args.learning_rate)

    def load_corpus_from_txt(self):
        """Fit the corpus model on ``args.data_path`` and save it."""
        print('Reading corpus statistics...')
        # Binary mode yields byte strings on Python 2 and 3, so .decode() is
        # always valid; the context manager closes the file, which the
        # original never did.
        with open(args.data_path, "rb") as corpus_file:
            texts = [
                l.strip().decode("utf8", "ignore").split(" ")
                for l in corpus_file
            ]
        self.corpus_model.fit(texts, window=args.window, ignore_missing=True)
        self.corpus_model.save(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def load_corpus_from_model(self):
        """Load a previously saved corpus model from ``args.corpus_model_path``."""
        print('Reading corpus statistics...')
        self.corpus_model = Corpus.load(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def train(self):
        """Train GloVe on the current corpus model and save it to ``args.model_path``."""
        print('Training the GloVe model...')
        self.glove.fit(self.corpus_model.matrix,
                       epochs=args.epochs,
                       verbose=True)
        self.glove.add_dictionary(self.corpus_model.dictionary)
        self.glove.save(args.model_path)
        print('Training finished')
Esempio n. 20
0
class MyGloVe:
    """GloVe-based text similarity using averaged 100-dimensional word vectors."""

    def initiate_model(self, input_corpus):
        """Fit the co-occurrence corpus and train GloVe for 200 epochs."""
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)

        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)

    def cosine_similarity(self, first_text, second_text):
        """Return 1 minus the cosine distance of the two texts' mean vectors."""
        vector_a = self.__average_feature_vector(first_text)
        vector_b = self.__average_feature_vector(second_text)
        distance = spatial.distance.cosine(vector_a, vector_b)
        return 1 - distance

    def __read_corpus(self, input_corpus):
        """Yield the entries of ``input_corpus`` one at a time."""
        for entry in input_corpus:
            yield entry

    def __average_feature_vector(self, text):
        """Average the vectors of the whitespace-separated words known to the model.

        Returns a float32 zero vector of length 100 when no word is known.
        """
        lookup = self.glove.dictionary
        known_ids = [lookup[token] for token in text.split() if token in lookup]

        total = numpy.zeros((100, ), dtype="float32")
        for word_id in known_ids:
            total = numpy.add(total, self.glove.word_vectors[word_id])

        if known_ids:
            total = numpy.divide(total, len(known_ids))
        return total
Esempio n. 21
0
def parse_Word2Vec(full_content):
    """Train GloVe on ``full_content`` and match article types to corpus words.

    For every article type in the first column of the tab-separated file
    ``/home/ubuntu/corpus/article_types.csv`` the punctuation is removed, the
    text is lemmatized and its most similar word in the trained model is
    looked up. The article type and the best match are written on
    consecutive lines of ``/home/ubuntu/corpus/results.txt``; failed
    look-ups are reported and skipped.
    """
    corpus = Corpus()
    corpus.fit(full_content, window=10)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    # Loop invariants: build the punctuation table and lemmatizer once.
    translator = str.maketrans({key: '' for key in string.punctuation})
    wordnet = WordNetLemmatizer()

    # Context managers close both files even when a row raises.
    with open('/home/ubuntu/corpus/results.txt', 'w') as f2, \
            open('/home/ubuntu/corpus/article_types.csv', 'r') as f:
        # Loop through all the article types in the file
        for row in csv.reader(f, delimiter="\t"):
            if not row:
                # Blank line in the CSV: nothing to match.
                continue
            article_type = row[0]
            article_type_clean = wordnet.lemmatize(
                article_type.translate(translator))
            try:
                match = glove.most_similar(article_type_clean, number=10)
                matched_item = match[0][0]
            except Exception:
                # Best-effort: report the failed look-up and carry on.
                print('failed for: ' + article_type)
                continue
            print(article_type_clean + ' -> ' + str(matched_item))
            f2.write(article_type + '\n')
            f2.write(str(matched_item) + '\n')
def glove_vectors(x,
                  embedding_size,
                  epochs=50,
                  lr=0.05,
                  alpha=0.75,
                  max_count=100,
                  tmp_loc='glove.w2vmodel'):
    """Train GloVe on tabular categorical data and return it as KeyedVectors.

    Args:
        x: data convertible to a ``pandas.DataFrame``; each row is treated as
            one "sentence" of categorical values.
        embedding_size: number of GloVe components.
        epochs: training epochs (glove paper: 50 epochs for dimensionality
            <300, 100 otherwise).
        lr: learning rate.
        alpha: weighting of the loss based on how likely a co-occurrence is
            (Xij); less likely = less weight.
        max_count: forwarded to ``Glove``.
        tmp_loc: temporary word2vec-format file used to hand the vectors to
            ``KeyedVectors``; it is removed before returning.

    Returns:
        The model loaded through ``KeyedVectors.load_word2vec_format``.
    """
    # create dict ourselves so that the ids correspond to their location in the df, starting to count from first col downwards
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)
    corpus = Corpus(dictionary=word_id_dict)
    # The window spans the whole row. Distance scaling is disabled because
    # distance has no meaning for purely categorical variables.
    corpus.fit(df.values.tolist(),
               window=len(df.columns),
               distance_scaling=False)
    glove = Glove(no_components=embedding_size,
                  learning_rate=lr,
                  alpha=alpha,
                  max_count=max_count)
    glove.fit(corpus.matrix, epochs=epochs, no_threads=1, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    try:
        glove.save_word2vec_format(tmp_loc)
        model = KeyedVectors.load_word2vec_format(tmp_loc)
    finally:
        # Remove the temporary file even when saving or loading fails.
        if os.path.exists(tmp_loc):
            os.remove(tmp_loc)
    return model
Esempio n. 23
0
def train(path, freq, window, dim, lr, epochs):
    """Train a GloVe model on every file in ``path`` and save it.

    Args:
        path: directory whose files are read as whitespace-separated text.
        freq: words occurring ``freq`` times or fewer across all files are
            dropped.
        window: co-occurrence window size.
        dim: number of GloVe components.
        lr: learning rate.
        epochs: number of training epochs.

    The model is written to ``glove.model``. Errors are reported on stdout
    rather than raised.
    """
    print("Start of train method")
    try:
        documents = []
        dic = {}
        # Read each file once (the original read every file twice and never
        # closed the handles). split() already treats newlines as
        # separators, so no substitution is needed first.
        for f in os.listdir(path):
            with open(os.path.join(path, f), 'r') as handle:
                words = handle.read().split()
            documents.append(words)
            for word in words:
                dic[word] = dic.get(word, 0) + 1
        print("Created Dictionary for frequencies of words.")
        lines = [[word for word in words if dic[word] > freq]
                 for words in documents]
        print(
            "Converted preprocessed text data in input format of array of array of words."
        )
        corpus = Corpus()
        corpus.fit(lines, window=window)
        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception as exc:
        # Still best-effort, but Ctrl-C is no longer swallowed and the actual
        # cause is shown.
        print("Error occured in training glove model")
        print(repr(exc))
Esempio n. 24
0
def train_model(line):
    """Train a 5-component GloVe model on ``line`` and save it.

    The corpus is fitted with the library's default window; training runs
    for 10 epochs with ``no_threads=100`` and a fixed random state of 0.

    Returns:
        The trained model, also written to ``glove.model``.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(line)

    model = Glove(no_components=5, learning_rate=0.05, random_state=0)
    model.fit(cooccurrences.matrix, epochs=10, no_threads=100, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)
    model.save('glove.model')
    return model
Esempio n. 25
0
def build_glove_word_vectors(data_frame, vec_dim, vectorizer, window_size,
                             niter):
    """Train GloVe on ``data_frame.post`` using the vectorizer's vocabulary.

    Words missing from ``vectorizer.vocabulary_`` are ignored while fitting
    the co-occurrence corpus.

    Returns:
        The trained ``vec_dim``-component model (``niter`` epochs, learning
        rate 0.01, 4 threads).
    """
    cooccurrences = Corpus(vectorizer.vocabulary_)
    cooccurrences.fit(data_frame.post,
                      window=window_size,
                      ignore_missing=True)

    model = Glove(no_components=vec_dim, learning_rate=0.01)
    model.fit(cooccurrences.matrix, epochs=niter, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)
    return model
    def glove_feat(df, feat, length):
        """Train a ``length``-component GloVe model on column ``feat`` of ``df``.

        Uses a window of 20, 10 epochs and 10 threads; returns the model.
        """
        cooccurrences = Corpus()
        cooccurrences.fit(df[feat], window=20)

        model = Glove(no_components=length, learning_rate=0.05)
        model.fit(cooccurrences.matrix,
                  epochs=10,
                  no_threads=10,
                  verbose=True)
        model.add_dictionary(cooccurrences.dictionary)
        return model
Esempio n. 27
0
def glove_embed(texts, victor_size):
    """Train a GloVe model on ``texts`` and save it.

    Args:
        texts: tokenized texts used to fit the co-occurrence corpus (window 5).
        victor_size: number of GloVe components.

    Returns:
        The trained model (20 epochs, 1 thread), also written to
        ``embed_model/glove.embed_model``.
    """
    corpus_model = Corpus()
    # The keyword was misspelled `ignore_mising`; this document's other
    # Corpus.fit calls all use `ignore_missing`.
    corpus_model.fit(texts, window=5, ignore_missing=False)

    glove = Glove(no_components=victor_size, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=20, no_threads=1, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('embed_model/glove.embed_model')
    return glove
Esempio n. 28
0
    def train_model(self,
                    data,
                    embeddings_size,
                    context_size=10,
                    learning_rate=0.05,
                    alpha=0.75,
                    max_count=100,
                    max_loss=10.0,
                    dictionary=None,
                    workers=5,
                    epochs=20,
                    seed=1111):
        """Fit (or load) the co-occurrence corpus, then train and save GloVe.

        ``data`` must be a non-empty list of sentences (lists of tokens). A
        corpus already stored at ``self.corpus_path`` is reused; otherwise
        one is fitted with a symmetric window of ``context_size`` and saved
        there. The trained model is written to ``self.model_path`` and kept
        in ``self.word_vectors``.
        """
        assert len(data) != 0
        print(f'The passed data consists of {len(data)} sentences.')

        print('Fitting the corpus with the given training sentences')
        corpus_timer = time.time()
        if not os.path.exists(self.corpus_path):
            # dataset represented as a list of lists
            corpus = Corpus(dictionary=dictionary)
            corpus.fit(data, window=context_size)
            corpus.save(self.corpus_path)
        else:
            print(f'Loading corpus {self.corpus_path} from disk')
            corpus = Corpus.load(self.corpus_path)
        corpus_seconds = time.time() - corpus_timer

        print('| Corpus fit time: {:5.2f}s |'.format(corpus_seconds))
        vocabulary_size = len(corpus.dictionary)
        print(
            f'The vocabulary size of the trained model is {vocabulary_size} unique tokens'
        )
        print(f'The number of collocations is {corpus.matrix.nnz}')

        print('Training GloVe model')
        training_timer = time.time()
        glove_model = Glove(
            no_components=embeddings_size,  # number of latent dimensions
            alpha=alpha,
            max_count=max_count,
            max_loss=max_loss,
            learning_rate=learning_rate,
            random_state=seed,
        )

        # train on the co-occurrence matrix, then attach the word dictionary
        glove_model.fit(corpus.matrix,
                        epochs=epochs,
                        no_threads=workers,
                        verbose=True)
        glove_model.add_dictionary(corpus.dictionary)
        training_seconds = time.time() - training_timer

        print('| GloVe model training time: {:5.2f}s |'.format(
            training_seconds))

        glove_model.save(self.model_path)
        self.word_vectors = glove_model
def generateModel(traces):
    """Turn packet traces into "sentences" and train the configured model.

    One line per trace is written to ``sentences.txt``. Each packet
    contributes a ``<length>_<direction>`` token. Whenever the direction
    changes, and at the end of a trace if any bytes were counted, an
    ``S<direction>-<bytes>`` token summarises the burst just finished.

    Depending on ``config.CLASSIFIER`` either a GloVe model (saved as
    ``mygloveModel``) or a Word2Vec model (saved as ``word2vecModel``) is
    trained on that file.
    """
    # The context manager closes the file even if a trace raises.
    with open("sentences.txt", 'w') as myFile:
        for trace in traces:
            tokens = []
            directionCursor = None
            dataCursor = 0
            for packet in trace.getPackets():
                direction = packet.getDirection()
                length = packet.getLength()
                if directionCursor is None:
                    directionCursor = direction

                if direction != directionCursor:
                    # Direction flipped: emit the burst marker and restart.
                    tokens.append('S' + str(directionCursor) + '-' +
                                  str(dataCursor))
                    directionCursor = direction
                    dataCursor = 0

                dataCursor += length
                tokens.append(str(length) + "_" + str(direction))

            if dataCursor > 0:
                tokens.append('S' + str(directionCursor) + '-' +
                              str(dataCursor))
            # Every token is preceded by a single space, exactly as the
            # original string concatenation produced.
            myFile.write("".join(" " + token for token in tokens))
            myFile.write("\n")

    if config.CLASSIFIER == config.GLOVE_CLASSIFIER:
        sentences = models.word2vec.LineSentence("sentences.txt")
        corpus = Corpus()

        corpus.fit(sentences, window=8)
        glove = Glove(no_components=300, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save("mygloveModel")

    elif config.CLASSIFIER == config.W2V_CLASSIFIER:
        with open("sentences.txt") as txt:
            has_content = len(txt.read()) > 0
        if has_content:
            # Function-call form works on both Python 2 and Python 3.
            print("in here")
            sentences = models.word2vec.LineSentence("sentences.txt")
            model = models.word2vec.Word2Vec(sentences,
                                             size=50,
                                             window=15,
                                             min_count=1,
                                             workers=4)
            model.save("word2vecModel")
Esempio n. 30
0
    def fit(self, sents):
        """Train a GloVe model on ``sents`` and store it in ``self.model``.

        Window size, component count, epochs and thread count are read from
        ``self.window_size``, ``self.n_features``, ``self.n_epochs`` and
        ``self.n_threads``. Returns ``self``.
        """
        cooccurrences = Corpus()
        cooccurrences.fit(sents, window=self.window_size)

        trained = Glove(no_components=self.n_features, learning_rate=0.05)
        trained.fit(cooccurrences.matrix,
                    epochs=self.n_epochs,
                    no_threads=self.n_threads,
                    verbose=True)
        trained.add_dictionary(cooccurrences.dictionary)

        self.model = trained
        return self
def generate_glove_text8(EMBEDDING_DIM, saveTo='models/glovetext8.model'):
    """Train GloVe on the text8 corpus at ``data/text8`` and save it.

    Uses ``EMBEDDING_DIM`` components, a window of 10, 30 epochs and 4
    threads; the model is written to ``saveTo``.
    """
    # Materialise every sentence of the corpus.
    sentences = list(word2vec.Text8Corpus('data/text8'))

    cooccurrences = Corpus()
    cooccurrences.fit(sentences, window=10)

    model = Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)
    model.fit(cooccurrences.matrix, epochs=30, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)
    model.save(saveTo)
    print('DONE! Saved to', saveTo)
Esempio n. 32
0
def glove(lines):
    """Fit a 5-component GloVe model on ``lines`` and save it to 'glove.model'.

    ``lines`` is handed to ``Corpus.fit`` with a window of 10 to build the
    co-occurrence matrix.  The ``Glove`` model is then trained on that matrix
    for 30 epochs on 4 threads with a learning rate of 0.05, and the corpus
    dictionary is attached before saving.
    """
    # Co-occurrence statistics are the training input for GloVe.
    cooccurrences = Corpus()
    cooccurrences.fit(lines, window=10)

    # Embedding model; no_components is the vector size.
    model = Glove(no_components=5, learning_rate=0.05)
    model.fit(cooccurrences.matrix, epochs=30, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    model.save('glove.model')
Esempio n. 33
0
 def pretrain(self,data_src):
     """Train and save a GloVe model unless 'glove.model' already exists.

     When no file named "glove.model" is found (relative path), ``data_src``
     is run through ``DataClean`` with two regex replacement rules and with
     ``html_clean`` and ``split_words`` enabled.  A co-occurrence corpus is
     built from the result with ``self.window``, and a ``Glove`` model is
     fitted using ``self.num_features``, ``self.learning_rate`` and
     ``self.epochs``.  The model is then saved to "glove.model".
     """
     if os.path.isfile("glove.model"):
         # A model is already on disk; nothing to do.
         return

     cleaning_rules = [
         ["[^a-z]"," "],  # only letters
         [" [ ]+", " "],  # remove extra spaces
     ]
     cleaner = DataClean(cleaning_rules,html_clean=True,split_words=True)
     cleaned = cleaner.fit(data_src).transform(data_src)

     cooccurrences = Corpus()
     cooccurrences.fit(cleaned,window=self.window)

     model = Glove(no_components=self.num_features,
                   learning_rate=self.learning_rate)
     model.fit(cooccurrences.matrix,epochs=self.epochs,verbose=True)
     model.add_dictionary(cooccurrences.dictionary)
     model.save("glove.model")
Esempio n. 34
0
    def run_glove(self):
        """Train GloVe on ``self.get_sentences()`` and return words and vectors.

        The sentences (the original example suggests a list of token lists,
        e.g. ``[["hi", "good", "to"], ["see", "u"]]``) are fitted into a
        co-occurrence ``Corpus`` with a window of 10.  A 200-component
        ``Glove`` model is trained on it for 5 epochs on 10 threads.

        Returns:
            A pair ``(unique_words, processed_vectors200)``: the words of the
            model's inverse dictionary and, in the same order, one separate
            list of vector components per word.
        """
        sentences = self.get_sentences()

        # Single-argument parenthesised prints behave the same on Python 2 and 3.
        print('\n' + '-'*80)
        print("Fitting words into corpus")
        corpus = Corpus()
        corpus.fit(sentences, window=10)

        print("Running Glove")
        glove = Glove(no_components=200, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
        glove.add_dictionary(corpus.dictionary)

        print("Fitting words and vectors into unique_words and vectors200")
        unique_words = []
        vectors200 = []

        length1 = len(glove.inverse_dictionary)
        for cnt1, word_id in enumerate(glove.inverse_dictionary, 1):
            unique_words.append(glove.inverse_dictionary[word_id])
            vectors200.append(glove.word_vectors[word_id])

            sys.stdout.write("\rStatus: %s / %s"%(cnt1, length1))
            sys.stdout.flush()

        print('\n' + "Processing vectors200")
        processed_vectors200 = []

        length2 = len(vectors200)
        for cnt2, vector in enumerate(vectors200, 1):
            # Build a fresh list for every word.  Reusing one accumulator list
            # made every entry alias the same object, which ended up holding
            # the components of all words concatenated.
            processed_vectors200.append(list(vector))

            sys.stdout.write("\rStatus: %s / %s"%(cnt2, length2))
            sys.stdout.flush()

        return unique_words, processed_vectors200
def train_glove(sentences):
    """Train a 300-dimensional GloVe model on ``sentences`` and return it.

    A co-occurrence corpus is built with a context window of 5.  The model is
    fitted for 30 epochs on 8 threads with a learning rate of 0.05, and the
    corpus dictionary is attached.  A start message and the elapsed time are
    printed.
    """
    print('training glove model...')
    started = time()

    cooccurrences = Corpus()
    cooccurrences.fit(sentences, window=5)  # context window size

    model = Glove(
        no_components=300,   # word vector dimensionality
        learning_rate=0.05,
    )
    model.fit(cooccurrences.matrix, epochs=30, no_threads=8, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    print('took %0.5fs.' % (time() - started))
    return model
def build_glove_embeddings(training, testing, args):

    ''' Trains a GloVe model on the texts of both datasets and vectorises them

    @Arguments:
        training: iterable of (txt, lbl) pairs; only txt is used for fitting
        testing:  iterable of (txt, lbl) pairs; only txt is used for fitting
        args: options object providing
            vecsize:      number of components of the glove model
            learningRate: the model's learning rate
            window:       window size used when counting word co-occurrence
            epochs:       number of epochs to train on
            parallelism:  number of threads to use
            verbose:      whether training output should be printed
            pca:          forwarded as use_pca to glove.transform_paragraph

        NOTE(review): training and testing are iterated here and then passed
        on to to_sklearn_format, so they presumably must be re-iterable
        (not one-shot generators) -- confirm against the callers.

    @Return:
        (fromTraining, fromTesting): the results of to_sklearn_format for
        the training and the testing set respectively
    '''

    # initialize model
    glove = Glove(no_components=args.vecsize, learning_rate=args.learningRate)

    # Generator expressions replace the tuple-parameter lambdas, which are
    # Python-2-only syntax (removed by PEP 3113); the laziness is preserved.
    txtSource = chain((txt for txt, _lbl in training),
                      (txt for txt, _lbl in testing))

    # read in the data to train on
    corpus_model = Corpus()
    corpus_model.fit((preprocess.tokenize(txt) for txt in txtSource),
                     window=args.window)

    # fit the model using the given parameters
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs=args.epochs,
              no_threads=args.parallelism, verbose=args.verbose)

    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    def transformer(words):
        # Map a tokenised text to a single paragraph vector.
        return glove.transform_paragraph(words, use_pca=args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)

    return fromTraining, fromTesting
Esempio n. 37
0
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

# Fresh model, used for training only if no cached model can be loaded.
glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except Exception:
    # Any load failure falls back to training from the corpus.  Catching
    # Exception instead of using a bare except lets KeyboardInterrupt and
    # SystemExit propagate rather than silently starting a training run.
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")

# Convert input text
print("Vectorizing input sentences...")
X = vectify(texts, previous_message, glove.dictionary, max_sentence_length, contextual)
# Label is 1 where the class string equals u'1', otherwise 0.
y = np.array([x == u'1' for x in classes]).astype(np.int32)

# NOTE(review): hard-coded cap of 207458 samples; its origin is not visible
# in this part of the file -- confirm before changing the dataset.
X, y, texts = X[:207458], y[:207458], texts[:207458]

def print_accurate_forwards(net, history):
    """Print forwarding metrics for ``net`` on its validation split.

    The module-level ``X`` and ``y`` are split with ``net.train_split`` and
    the validation part is classified with ``net.predict``.  Three numbers
    are printed:

    * accurately forwarded: mean of "label equals prediction and prediction
      is 1", divided by the mean validation label;
    * false positives: mean of "prediction differs from label and label is
      0", divided by the mean validation label;
    * valid forwards: the first number divided by the sum of both.

    ``history`` is accepted but not used.
    """
    _X_train, X_valid, _y_train, y_valid = net.train_split(X, y, net)
    predicted = net.predict(X_valid)

    hits = [
        actual == guess and guess == 1
        for actual, guess in zip(y_valid, predicted)
    ]
    acc_fwd = np.mean(hits) / np.mean(y_valid)

    false_alarms = [
        guess != actual and actual == 0
        for guess, actual in zip(predicted, y_valid)
    ]
    fls_pos = np.mean(false_alarms) / (np.mean(y_valid))

    valid_share = acc_fwd / (acc_fwd + fls_pos)
    report = [
        'Accurately forwarded: {:.4f}'.format(acc_fwd),
        ', False Positives: {:.4f}'.format(fls_pos),
        ', Valid forwards: {:.4f}'.format(valid_share),
    ]
    print(''.join(report))
Esempio n. 38
0
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)

        print('Training the GloVe model')

        glove = Glove(no_components=100, learning_rate=0.05)
        glove.fit(corpus_model.matrix, epochs=int(args.train),
                  no_threads=args.parallelism, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)

        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
        if not args.train:
            print('Loading pre-trained GloVe model')
            glove = Glove.load('glove.model')

        print('Querying for %s' % args.query)
        pprint.pprint(glove.most_similar(args.query, number=10))
Esempio n. 39
0
# Build the MLP variant created with argument 1000 and evaluate it with
# train_test on x, y and folds.
mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

# Print the accuracies of the four MLP variants side by side.
print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

#3CNN
#Glove Vectors from reviews
# Whitespace-tokenise every review in data.data.
c = [review.split() for review in data.data]

# Co-occurrence statistics over the tokenised reviews (window of 10).
corpus = Corpus()
corpus.fit(c, window=10)

# Train 100-component GloVe vectors on the co-occurrence matrix.
glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

glv.add_dictionary(corpus.dictionary)

# NOTE(review): this is the dictionary attached via add_dictionary above, not
# the trained vectors -- confirm the downstream code expects that mapping.
embeddings_index = glv.dictionary

BASE_DIR = ''
# With BASE_DIR empty this evaluates to '/glove.6B/'.
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
Esempio n. 40
0
@author: dannl
'''
from glove import Glove
from glove import Corpus
import time

# Input co-occurrence corpus and output model locations.
cooc_file='/home/dannl/tmp/newstech/glove/word.cooc'
model_file='/home/dannl/tmp/newstech/glove/glove.model'

# Wall-clock start, reported at the end of the script.
oldtime=time.time()
# get a cooccurrence matrix
corpus_cooc = Corpus.load(cooc_file)

# get a model
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_cooc.matrix, epochs=5,no_threads=4, verbose=True)
glove.add_dictionary(corpus_cooc.dictionary)
glove.save(model_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)

# Parenthesised like the prints above, so the script parses on Python 3 too;
# with a single argument the output is unchanged on Python 2.
print('time cost:%.2f'%(time.time()-oldtime))