Example #1
def load_w2v(corpus, dictionary):
    '''
    Return the trained Word2Vec model
    Train a model if model doesn't exist yet
    :param corpus:
    :param dictionary:
    :return:
    '''
    if not os.path.isfile(W2V_MODEL_PATH):
        num_features = 300  # Word vector dimensionality
        min_word_count = 5  # Minimum word count
        num_workers = 5  # Number of threads to run in parallel
        window = 5  # Context window size
        downsampling = 1e-5  # Downsample setting for frequent words
        print("Training the word2vec model!")
        sents = get_review_sentences()
        # Initialize and train the model (this will take some time)
        model = models.Word2Vec(sents,
                                workers=num_workers,
                                size=num_features,
                                min_count=min_word_count,
                                window=window,
                                sample=downsampling)

        # If you don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        model.init_sims(replace=True)

        # It can be helpful to create a meaningful model name and
        # save the model for later use. You can load it later using Word2Vec.load()
        model.save(W2V_MODEL_PATH)
        print('Word2vec model created!')

    print('Loading word2vec model')
    w2v = models.Word2Vec.load(W2V_MODEL_PATH)
    print('Loading word2vec model completed!')
    return w2v
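
A minimal usage sketch for the helper above, assuming W2V_MODEL_PATH points at the saved model and that 'good' is in its vocabulary (in gensim 1.x–3.x the vectors live under model.wv):

from gensim import models

w2v = models.Word2Vec.load(W2V_MODEL_PATH)   # same path the helper saves to
print(w2v.wv.most_similar('good', topn=5))   # nearest neighbours by cosine similarity
vec = w2v.wv['good']                          # raw 300-dimensional vector for one word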
Example #2
def embeddings(ENG_sentences,ESP_sentences,multilingual_data):
    """ 
    Using the word2vec implementation   
    - Initialize a model
    Parameters:
    - (sg=0), CBOW is used. Otherwise (sg=1), skip-gram is employed
    - size is the dimensionality of the feature vectors.
    - window is the maximum distance between the current and predicted word within a sentence.
    - min_count => ignore all words with total frequency lower than this.
    
    """
    # Persist a model to disk
    
    fname1=os.path.join(os.getcwd(),'word_vectors_ita.txt')
    # fname2=os.path.join(os.getcwd(),'word_vectors_esp.txt')
    fname3=os.path.join(os.getcwd(),'word_vectors_mul_ita_eng.txt')
    
    # Word2Vec
    print "Saving Italian model"
    model_eng = models.Word2Vec(ENG_sentences, size=300, window=5, min_count=5, workers=4)
    model_eng.save(fname1)
    # print "Saving Spanish model"
    # model_esp = models.Word2Vec(ESP_sentences, size=300, window=5, min_count=5, workers=4)
    # model_esp.save(fname2)
    # 
    model_mul = models.Word2Vec(multilingual_data, size=300, window=5, min_count=5, workers=4)
    print "Saving multi-lingual model It+Es"
    model_mul.save(fname3)
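
As the docstring above notes, the sg flag selects the training architecture; a small sketch on a hypothetical toy corpus showing both variants:

from gensim import models

toy_sentences = [["the", "cat", "sat"], ["the", "dog", "barked"]]                   # hypothetical toy corpus
cbow = models.Word2Vec(toy_sentences, sg=0, size=300, window=5, min_count=1)        # CBOW (the default)
skipgram = models.Word2Vec(toy_sentences, sg=1, size=300, window=5, min_count=1)    # skip-gram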
Example #3
    def __init__(self, sentances, ldaModel, flag, filename, size, window,
                 mincount):

        self.sentances = sentances
        self.ldaModel = ldaModel
        self.filename = filename
        self.size = size
        self.window = window
        self.mincount = mincount

        #word = self.sentances[0][0]
        self.permuteSentances = []

        self.dictionary = gensim.corpora.Dictionary.load(
            '/Users/loaner/Documents/Renncode_2016/SKPN/py-server/comTragDict')

        self.dictID = self.dictionary.token2id

        if flag:
            print "starting variation"
            print "There are %d many sentances" % len(self.sentances)
            self.sentanceVariation(self.sentances)

            random.shuffle(self.permuteSentances)
            x = self.permuteSentances

            self.topic2vec = models.Word2Vec(self.permuteSentances,
                                             size=self.size,
                                             window=self.window,
                                             min_count=self.mincount,
                                             workers=2)
            self.topic2vec.save(self.filename)

            random.shuffle(self.sentances)
            self.word2vec = models.Word2Vec(self.sentances,
                                            size=self.size,
                                            window=self.window,
                                            min_count=3,
                                            workers=2)
            self.word2vec.save(self.filename + "_w2v")

        #self.word2vec = models.Word2Vec.load('Shrew_word2vecClass')
        self.topic2vec = models.Word2Vec.load(self.filename)
        #self.word2vec = models.Word2Vec.load(self.filename + "_w2v")

        self.ldaTopics = []
        for x in range(0, 50):
            self.ldaTopics.append(self.ldaModel.show_topic(x))

        self.topicVecs = []
        for x in range(0, 50):
            self.topicVecs.append(
                self.topic2vec.most_similar(positive=["u" + str(x)]))

        topics = self.topicVecs
        lda = self.ldaTopics
        print self.filename + " is finished"
Example #4
def train_high_memory():
    cnn_files = os.listdir(CNN_TOKENS_PATH)
    dm_files = os.listdir(DM_TOKENS_PATH)

    tokens = []

    for file in cnn_files:
        file_object = open(os.path.join(CNN_TOKENS_PATH, file),
                           errors="ignore")
        for line in file_object:
            if line and line != "\n" and line != "@highlight\n":
                tokens.append(line.split(' '))
        # Report once per file, not once per line
        print("Read: CNN " + file)

    for file in dm_files:
        file_object = open(os.path.join(DM_TOKENS_PATH, file), errors="ignore")
        for line in file_object:
            if line and line != "\n" and line != "@highlight\n":
                tokens.append(line.split(' '))
        # Report once per file, not once per line
        print("Read: DM " + file)

    model = models.Word2Vec(tokens,
                            size=VECTOR_LENGTH,
                            window=WINDOW,
                            min_count=MIN_COUNT)
    model.wv.save('vectors/cnn_dm_vectors_' + str(VECTOR_LENGTH) + "_" +
                  str(WINDOW) + "_" + str(MIN_COUNT))
Example #5
def main():
    with open("../09/tokens_81.txt", "r") as f:
        sentences = [
            sentence.replace("\n", "").split(" ")
            for sentence in f.readlines()
        ]
    model = models.Word2Vec(sentences,
                            sg=1,
                            size=300,
                            window=5,
                            min_count=5,
                            workers=4)
    model.save("word2vec_model")
    # q86
    print("===q86===")
    United_States = model.wv["United_States"]
    print(United_States)
    # q87
    print("===q87===")
    print(model.wv.similarity("United_States", "U.S"))
    # q88
    print("===q88===")
    print(model.wv.most_similar_cosmul(positive=["England"], topn=10))
    # q89
    print("===q89===")
    print(
        model.wv.most_similar_cosmul(positive=["Spain", "Athens"],
                                     negative=["Madrid"],
                                     topn=10))
Example #6
 def train(self):
     logging.info(' train tfidf model ... ')
     self.tfidf = models.TfidfModel(self.corpus, normalize=True)
     logging.info(' train word2vec model ... ')
     self.w2v = models.Word2Vec(min_count=2,
                                window=2,
                                size=300,
                                sample=6e-5,
                                alpha=0.03,
                                min_alpha=0.0007,
                                negative=15,
                                workers=4,
                                iter=7)
     # The vocabulary must be built before train() can be called.
     self.w2v.build_vocab(self.data)
     self.w2v.train(self.data,
                    total_examples=self.w2v.corpus_count,
                    epochs=15,
                    report_delay=1)
     logging.info(' train fasttext model ... ')
     self.fast = models.FastText(self.data,
                                 size=300,
                                 window=3,
                                 min_count=1,
                                 iter=10,
                                 min_n=3,
                                 max_n=6,
                                 word_ngrams=2)
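
FastText's min_n/max_n character n-grams let it compose vectors for words never seen during training; a small sketch (toy data, older gensim size= keyword) of that out-of-vocabulary behaviour:

from gensim import models

ft = models.FastText([["gradient", "descent"], ["gradient", "boosting"]],
                     size=50, min_count=1, min_n=3, max_n=6)
print(ft.wv["gradients"])   # composed from character n-grams even though "gradients" was never seen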
Example #7
def transform_corpus(dictionary, corpus, documents):
    tfidf = models.TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus=corpus_tfidf, id2word=dictionary)

    # Set values for various parameters
    num_features = 300  # Word vector dimensionality
    min_word_count = 5  # Minimum word count
    num_workers = 6  # Number of threads to run in parallel
    context = 5  # Context window size
    downsampling = 1e-3  # Downsample setting for frequent words

    model = models.Word2Vec(documents,
                            workers=num_workers,
                            size=num_features,
                            min_count=min_word_count,
                            window=context,
                            sample=downsampling)
    model.train(documents, total_examples=len(documents), epochs=10)
    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    lsi.save('../NLP_DocumentSimilarity/corpus/model.lsi')
    model.save('../NLP_DocumentSimilarity/corpus/w2v.model')

    return lsi, model
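
A follow-up sketch, reusing the lsi, tfidf, corpus_tfidf and dictionary objects from the function above, showing one common next step: a similarity index over the LSI space for querying documents:

from gensim import similarities

index = similarities.MatrixSimilarity(lsi[corpus_tfidf])              # index every document in LSI space
query_bow = dictionary.doc2bow("word vectors for documents".lower().split())
sims = index[lsi[tfidf[query_bow]]]                                    # cosine similarity of the query to each document
print(sorted(enumerate(sims), key=lambda item: -item[1])[:5])          # five most similar documents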
Example #8
def corpus2wordVSM(corpus_file_name,
                   embeddings_file_name=r"embeddings.w2v",
                   txt_file_name="sent_line_math_corpus.txt",
                   feature_vec_size=100,
                   window_size=5,
                   minimum_count=5,
                   num_of_virtual_cores=4,
                   skipGram=0):
    math_corpus_file = codecs.open(corpus_file_name, "r", "utf-8")
    math_corpus_text = math_corpus_file.read().lower()
    math_corpus_file.close()
    math_corpus_sanitized_sentence_tokenized = nltk.tokenize.sent_tokenize(
        math_corpus_text)
    sent_line_math_corpus_file = open(txt_file_name, "a")
    for sentence in math_corpus_sanitized_sentence_tokenized:
        sent_line_math_corpus_file.write("\n")
        word_tokens = nltk.tokenize.word_tokenize(sentence)
        for word in word_tokens:
            # if isinstance(word, unicode):
            sent_line_math_corpus_file.write(word.lower() + " ")
    sent_line_math_corpus_file.close()
    sentences = models.word2vec.LineSentence(txt_file_name)
    model = models.Word2Vec(sentences,
                            size=feature_vec_size,
                            window=window_size,
                            min_count=minimum_count,
                            workers=num_of_virtual_cores,
                            sg=skipGram)
    #if the parameter sg=0 (default) is changed to sg=1,
    #the model will be skip-gram as opposed to CBOW

    model.save(embeddings_file_name)
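
LineSentence streams the intermediate file lazily, one whitespace-tokenized sentence per line, so the corpus never has to fit in memory; a short sketch, assuming embeddings.w2v was produced by the call above and "theorem" is in the vocabulary:

from gensim import models

sentences = models.word2vec.LineSentence("sent_line_math_corpus.txt")  # lazily re-stream the corpus
model = models.Word2Vec.load("embeddings.w2v")
print(model.wv.most_similar("theorem", topn=10))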
Example #9
def convert_text_to_vector(entire_text, date, period):
    # 1. Train model using word2vec
    # 2. Save model in file
    # 3. Write the top 20 most similar words for each word in CHECK_WORDS into file
    if period == "five":
        postfix = cs.postfix_models_five
        path = cs.path_models_five
    else:
        postfix = cs.postfix_models_decade
        path = cs.path_models_decade
    print "Inside word2vec"
    print "No of lines: ", len(entire_text)
    print "Start training.."
    model = models.Word2Vec(entire_text,
                            size=100,
                            window=10,
                            min_count=5,
                            workers=cores)
    print "Done training.."
    model.save(os.path.join(path, date + postfix))
    print "Saved.."
    print "MODEL:\n", model
    for term in CHECK_WORDS:
        print term.upper(), ":\n"
        for t, p in model.most_similar(positive=term.split(' '), topn=20):
            print t, ":", str(round(p, 3))
        print ""
Example #10
def train(domain, shouldTrain, setNumber):
    '''
    The function trains a model on the training data and then tests the model's accuracy on the testing data.
    Since training is time consuming, we save the model and load it later for further testing
    '''
    print "\n=== Set : %s ===\n" % str(setNumber)

    # Train a model based on training data
    if shouldTrain == True:
        sentences = models.word2vec.LineSentence(domain + '/train' +
                                                 str(setNumber) + '.txt')
        model = models.Word2Vec(sentences=sentences,
                                min_count=1,
                                workers=4,
                                hs=1,
                                window=window_size,
                                iter=10)
        model.save(domain + '/model' + str(setNumber) + '.txt')
    else:
        # OR load a model
        model = models.Word2Vec.load(domain + '/model' + str(setNumber) +
                                     '.txt')

    print "Training : COMPLETE!"

    # Evaluate model on test data
    plans = open(domain + '/test' + str(setNumber) + '.txt').read().split("\n")
    list_of_actions = [[unicode(actn, "utf-8") for actn in plan_i.split()]
                       for plan_i in plans]
    actions = model.vocab.keys()
    return [x for x in list_of_actions if len(x) > window_size * 2], actions
Example #11
def train_word2vec(apps, num_topics):

    train_set = []
    for app in apps:
        for v in get_versions(app):
            train_set.append(preproccess(''.join(raw_desc(app, v))))

    # train a word2vec model
    word2vec = models.Word2Vec(train_set)

    # initialize a k-means object and use it to extract centroids
    kmeans_clustering = KMeans(num_topics)
    idx = kmeans_clustering.fit_predict(word2vec.syn0)

    # create a word / index (cluster number) dictionary
    word_centroid_map = dict(zip(word2vec.index2word, idx))

    # one cluster for one topic
    topics = [[] for i in xrange(num_topics)]
    for word, index in word_centroid_map.iteritems():
        topics[index].append(word)

    # store topics
    pickle_dump(topics, WORD2VEC)

    # print each topic
    print_word2vec()
Example #12
	def _build_word2vec(self, logger):
		""" build word2vec for words"""
		if not self._split_words:
			logger.error("no split words, skip")
		else:
			self._word_model = models.Word2Vec(self._split_sentences, min_count=5)
			self._word_model.save(os.path.join(self.basepath, "data", "word_model"))
Example #13
def main():
    """
	Main application execution. 
	"""
    db = MongoClient('localhost', 27017).test
    fillMongo(db)
    sentences = mongoDocumentsSplitted(db)
    w2v_model = models.Word2Vec(sentences, workers=4)
    w2v_model.save("word2vec.bin")

    d2v_model = models.Doc2Vec(mongoDocuments2Sentences(db), workers=4)
    d2v_model.save("doc2vec.bin")

    random_records = db.deneme.aggregate([{"$sample": {"size": 10}}])

    infer_vectors = []
    vectors = []
    for record in random_records:
        vectors.append(record["text"])
        infer_vectors.append(
            np.array(
                d2v_model.infer_vector(record['text'].split(),
                                       alpha=0.025,
                                       min_alpha=0.025,
                                       steps=20)).reshape(-1, 1))

    for i in range(len(infer_vectors) - 1):
        print("vector1: ", vectors[i])
        print("vector2: ", vectors[i + 1])
        print("cosine: ",
              cosine_similarity(infer_vectors[i],
                                infer_vectors[i +
                                              1]))  # Print out = ~0.00795774
Example #14
def learnLDA(collection):
    print "learning model"

    key_word = []

    articles = collection.find()
    for article in articles:
        key_word.append(splitWord(article['article_title']))

    # lda
    global dictionary
    dictionary = corpora.Dictionary(key_word)
    corpus = [dictionary.doc2bow(sentence) for sentence in key_word]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    global lda
    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                   id2word=dictionary,
                                   alpha='auto',
                                   num_topics=50)

    # word2vec
    global word2vec
    word2vec = models.Word2Vec(key_word, min_count=1)

    if not os.path.exists(board):
        os.makedirs(board)
    dictionary.save(board + '/' + board + '_dict.model')
    lda.save(board + '/' + board + '_lda.model')
    word2vec.save(board + '/' + board + '_word2vec.model')

    print "learning finish"
Example #15
    def construct_word_model(self, min_count=1, size=500, workers=1):
        sentence_list = self.load_data_sentences()
        #sentence_list = sentence_tokenize(sentence_list)
        sentence_stream = []
        for sentence in sentence_list:
            sentence_stream.append(word_tokenize(sentence))
        #print sentence_stream[-1], len(sentence_stream)
        transformer = gensim.models.Phrases(sentence_stream)
        #print sentence_list[0], len(transformer[sentence_list]), len(sentence_list)
        #print transformer[sentence_stream][-1]
        #raw_input('stop')
        self.model = models.Word2Vec(sorted_vocab=1,
                                     min_count=min_count,
                                     size=size,
                                     workers=workers,
                                     iter=10,
                                     window=10)
        # build_vocab scans the corpus itself, so a separate scan_vocab call is not needed
        self.model.build_vocab(transformer[sentence_stream])
        self.model.train(transformer[sentence_stream])

        print('Finished training')
        print('Vocab size: %d\n' % len(self.model.vocab))
        #for fname in os.listdir(self.datadir):
        #    with open(os.path.join(datadir, fname)) as file:
        #        for line in file:
        #            self.model.train(line)
        return
Example #16
def get_model(multilingual_data):

    # Persist a model to disk
    fname = "wikiDS/word2Vec.mdl"
    vocabfname = "wikiDS/vocab.pkl"
    """ 
    Using the word2vec implementation   
    - Initialize a model
    Parameters:
    - (sg=0), CBOW is used. Otherwise (sg=1), skip-gram is employed
    - size is the dimensionality of the feature vectors.
    - window is the maximum distance between the current and predicted word within a sentence.
    - min_count => ignore all words with total frequency lower than this.

    """

    model = models.Word2Vec(multilingual_data,
                            size=128,
                            window=5,
                            min_count=5,
                            workers=4)
    model.save(fname)

    vocab = list(model.vocab.keys())
    vocabfile = codecs.open(vocabfname, "w", "utf-8")
    pickle.dump(vocab, vocabfile)
    vocab_len = len(vocab)
    print("Vocab length is ", vocab_len)
    test_model(model)
    return model
Example #17
def wordVecSrlBigrams(cases, save=True, load=False):
    if load:
        return models.Word2Vec.load('models/word2vec/srlBigramModel')
    else:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        sentences = []
        for cas in cases:
            for srlSentence in cas.srlSentences:
                newSentence = []
                for clause in srlSentence:
                    for role, text in clause.iteritems():
                        newSentence.append(str((role, text)))
                print newSentence
                bigramSentence = [
                    b for b in zip(newSentence[:-1], newSentence[1:])
                ]
                print bigramSentence
                sentenceList = []
                for w1, w2 in bigramSentence:
                    sentenceList.append(w1 + '_' + w2)
                sentences.append(sentenceList)
        model = models.Word2Vec(sentences, min_count=10, size=200, workers=4)
        #print model.most_similar(positive=["('V', 'conclude')"], negative=[], topn=50)
        if save:
            model.save('models/word2vec/srlBigramModel')
            print 'model saved'
Example #18
    def corpus2wordVSM(self,
                       corpus_file_name,
                       embeddings_file_name=r"embeddings.w2v",
                       feature_vec_size=100,
                       window_size=5,
                       minimum_count=5,
                       num_of_virtual_cores=4,
                       skipGram=0):
        corpus_file = codecs.open(corpus_file_name, "r", "utf-8")
        corpus_text = corpus_file.read().lower()
        corpus_file.close()
        corpus_sentence_tokenized = nltk.tokenize.sent_tokenize(corpus_text)

        #making sure the file is new when created below
        if os.path.isfile("sent_line_corpus.txt"):
            os.remove(r"sent_line_corpus.txt")
        sent_line_corpus_file = codecs.open(r"sent_line_corpus.txt", "a",
                                            "utf-8")
        for sentence in corpus_sentence_tokenized:
            sent_line_corpus_file.write("\n")
            word_tokens = nltk.tokenize.word_tokenize(sentence)
            for word in word_tokens:
                # if isinstance(word, unicode): #deprecated
                sent_line_corpus_file.write(word.lower() + " ")
        sent_line_corpus_file.close()
        sentences = models.word2vec.LineSentence(r"sent_line_corpus.txt")
        model = models.Word2Vec(sentences,
                                size=feature_vec_size,
                                window=window_size,
                                min_count=minimum_count,
                                workers=num_of_virtual_cores,
                                sg=skipGram)
        #if the parameter sg=0 (default) is changed to sg=1,
        #the model will be skip-gram as opposed to CBOW
        model.save(embeddings_file_name)
Example #19
 def fit(self, X, y=None):
     """
     Fit the model according to the given training data.
     Calls gensim.models.Word2Vec
     """
     self.gensim_model = models.Word2Vec(sentences=X,
                                         size=self.size,
                                         alpha=self.alpha,
                                         window=self.window,
                                         min_count=self.min_count,
                                         max_vocab_size=self.max_vocab_size,
                                         sample=self.sample,
                                         seed=self.seed,
                                         workers=self.workers,
                                         min_alpha=self.min_alpha,
                                         sg=self.sg,
                                         hs=self.hs,
                                         negative=self.negative,
                                         cbow_mean=self.cbow_mean,
                                         hashfxn=self.hashfxn,
                                         iter=self.iter,
                                         null_word=self.null_word,
                                         trim_rule=self.trim_rule,
                                         sorted_vocab=self.sorted_vocab,
                                         batch_words=self.batch_words)
     return self
Example #20
 def corpus2wordVSM_CBOW(self,
                         corpus_file_name,
                         embeddings_file_name=r"embeddings.w2v",
                         feature_vec_size=100,
                         window_size=5,
                         minimum_count=5,
                         num_of_virtual_cores=4):
     math_corpus_file = codecs.open(corpus_file_name, "r", "utf-8")
     math_corpus_text = math_corpus_file.read().lower()
     math_corpus_file.close()
     math_corpus_sanitized_sentence_tokenized = tokenize.sent_tokenize(
         math_corpus_text)
     sent_line_math_corpus_file = open(r"sent_line_math_corpus.txt", "a")
     for sentence in math_corpus_sanitized_sentence_tokenized:
         sent_line_math_corpus_file.write("\n")
         word_tokens = tokenize.word_tokenize(sentence)
         for word in word_tokens:
             # if isinstance(word, unicode):
             sent_line_math_corpus_file.write(word.lower() + " ")
     sent_line_math_corpus_file.close()
     sentences = models.word2vec.LineSentence(r"sent_line_math_corpus.txt")
     model = models.Word2Vec(sentences,
                             size=feature_vec_size,
                             window=window_size,
                             min_count=minimum_count,
                             workers=num_of_virtual_cores)
     model.save(embeddings_file_name)
Example #21
def model_trainer(layer):
    '''
    Train a word2vec model on a given layer
    Return the trained model
    '''
    tokenized_corpus = [word_tokenize(sentence) for sentence in layer]
    return models.Word2Vec(tokenized_corpus, min_count=1, size=100, workers=8)
Example #22
    def train_model(self):
        """
        Train the model
        :return:
        """
        sentence = CorpusIterator(self.corpus_name)

        if not os.path.exists(self.result_name):
            model = models.Word2Vec(sentence,
                                    min_count=self.min_count,
                                    size=self.size,
                                    window=self.window)
        elif self.retrain:
            logging.info("模型已存在, 再次训练")
            model = models.Word2Vec.load(self.result_name)
        else:
            logging.error("模型存在,禁止再次训练")
            return

        model.train(sentence,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        model.save(self.result_name)

        self.model = model
Example #23
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    input_fname = ''
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    #unsupervised data
    directories = [
        'smiley_tweets_nl_full',
        'smiley_tweets_de_full',
        'smiley_tweets_it_full',
    ]
    files = ['filtered_en_balanced580M.gz']

    listing = []
    for dir in directories:
        listing.append(map(lambda x: os.path.join(dir, x), os.listdir(dir)))

    sentences = MySentences(listings=listing, gzpFiles=files)
    model = models.Word2Vec(sentences,
                            size=52,
                            window=5,
                            min_count=10,
                            workers=16,
                            sg=1,
                            sample=1e-5,
                            hs=1)
    model.save_word2vec_format(
        'embeddings/smiley_tweets_embedding_multilingual{}'.format(
            input_fname),
        binary=False)
Example #24
def prepare_data_files(path_num, path_token, check_list, long_path_list,
                       buggy_states_list, output_root, w_size, w_window,
                       w_workers, label):
    with open(os.path.join(output_root, "check_list.json"), "a") as file_:
        json.dump(check_list, file_)
    word2vec_model = g.Word2Vec(long_path_list,
                                size=w_size,
                                window=w_window,
                                workers=w_workers)
    word2vec_model.wv.save_word2vec_format(os.path.join(
        output_root, "word2vec.txt"),
                                           binary=False)
    diction = get_word2vec_list(os.path.join(output_root, "word2vec.txt"))
    for i in range(len(long_path_list)):
        long_path_list[i] = vector_replace(long_path_list[i], diction)
    long_path_list_np = transfor_2_np(long_path_list, path_num, path_token,
                                      w_size)
    np.save(os.path.join(output_root, label + "_input.npy"), long_path_list_np)
    buggy_states_list = transfor_2_np(buggy_states_list, 0, path_num, 1)
    if label == "test":
        buggy_states_list = np.ones(buggy_states_list.shape)
    np.save(os.path.join(output_root, label + "_label.npy"), buggy_states_list)
    output_list = buggy_states_list.reshape([path_num, 1])
    output_list = output_list.repeat(w_size, axis=1)
    np.save(os.path.join(output_root, label + "_output.npy"), output_list)
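
The plain-text file written by save_word2vec_format can also be read back with gensim's own loader rather than a custom parser; a small sketch, assuming the same output_root directory and a hypothetical token from long_path_list:

import os
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format(os.path.join(output_root, "word2vec.txt"), binary=False)
vec = wv["some_path_token"]          # "some_path_token" is a hypothetical entry from long_path_list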
Example #25
def word2vec(data, window, min_count, size, iterations):
    model = genmod.Word2Vec(data,
                            window=window,
                            min_count=min_count,
                            size=size,
                            iter=iterations)
    return model.wv
Example #26
    def trainer_w2v(self, path):
        """
        corpus=['教授', '长江', '学者', '优秀成果', '集中', '呈现'], ['', '生物质', '发电', '燃料', '供应链', '运营', '模式']
        :param path:
        :return:
        """
        print('train w2v')
        corpus = get_corpus(path, w2v=True)
        w2v = models.Word2Vec(min_count=2,
                              window=3,
                              size=300,
                              sample=6e-5,
                              alpha=0.03,
                              min_alpha=0.0007,
                              negative=15,
                              workers=4,
                              iter=10,
                              max_vocab_size=50000)
        w2v.build_vocab(corpus)
        w2v.train(corpus,
                  total_examples=w2v.corpus_count,
                  epochs=15,
                  report_delay=1)

        return w2v
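
The build_vocab/train two-step above is equivalent to passing the corpus straight to the constructor, but it lets you inspect the vocabulary or resume training with extra epochs; a minimal sketch on a hypothetical toy corpus:

from gensim import models

toy_corpus = [["gradient", "descent"], ["stochastic", "gradient"]]   # hypothetical toy corpus
w2v = models.Word2Vec(min_count=1, size=300)
w2v.build_vocab(toy_corpus)                                          # scan the corpus and build the vocabulary
print(len(w2v.wv.vocab))                                             # inspect the vocabulary before training
w2v.train(toy_corpus, total_examples=w2v.corpus_count, epochs=5)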
Example #27
def load_save_word2vec_model(line_words, model_filename):
    # Model parameters
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    negative = 3  # best: use hierarchical softmax (negative sampling favours common words) rather than negative sampling (which favours rare words).
    iter = 20

    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words],
                                size=feature_size,
                                window=content_window,
                                iter=iter,
                                min_count=freq_min_count,
                                negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
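
The Phrases step above merges tokens that frequently co-occur into single bigram tokens before Word2Vec sees them; a small sketch (toy sentences, hypothetical threshold) of the transformation:

from gensim import models

sents = [["new", "york", "city"]] * 30 + [["old", "york", "road"]]
bigram = models.Phrases(sents, min_count=5, threshold=2.0)   # detect frequently co-occurring pairs
print(bigram[["new", "york", "city"]])                        # e.g. ['new_york', 'city'] once the pair scores above the threshold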
Example #28
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    input_fname = ''
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    #unsupervised data
    directory = 'smiley_tweets_full'
    listing = map(lambda x: os.path.join(directory, x), os.listdir(directory))

    sentences = MySentences(files=listing)
    model = models.Word2Vec(sentences,
                            size=52,
                            window=5,
                            min_count=10,
                            workers=16,
                            sg=1,
                            sample=1e-5,
                            hs=1)
    model.save_word2vec_format(
        'embeddings/smiley_tweets_embedding_multilingual{}'.format(
            input_fname),
        binary=False)
Example #29
def _gen_embedding(ndim, alignment=False):
    print "Generating %d-dim word embedding ..." %ndim
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if alignment:
            # the i-th characters in the poem, used to boost Dui Zhang
            i_characters = [[sentence[j] for sentence in poem['sentences']] for j in range(len(poem['sentences'][0]))]
            for characters in i_characters:
                ch_lists.append(filter(lambda ch: ch in ch2int, characters))
        if 0 == (idx+1)%10000:
            print "[Word2Vec] %d/%d poems have been processed." %(idx+1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size = ndim, min_count = 5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx,:] = model.wv[ch]
    if alignment:
        model.save(_w2v_with_alignment_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_with_alignment_path, embedding)
        print "Word embedding is saved."
    else:
        model.save(_w2v_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_path, embedding)
        print "Word embedding is saved."
Example #30
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    input_fname = ''
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    #supervised data
    train = "semeval/task-B-train-plus-dev.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    train16 = "semeval/task-A-train-2016.tsv.gz"
    dev2016 = "semeval/task-A-dev-2016.tsv.gz"
    devtest2016 = "semeval/task-A-devtest-2016.tsv.gz"
    test2016 = "semeval/SemEval2016-task4-test.subtask-A.txt.gz"

    #unsupervised data
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    files = [(train, 3), (dev, 3), (train16, 2), (dev2016, 2),
             (devtest2016, 2), (test2016, 2), (smiley_pos, 0)]
    sentences = MySentences(files=files)
    model = models.Word2Vec(sentences,
                            size=52,
                            window=5,
                            min_count=5,
                            workers=7,
                            sg=1,
                            sample=1e-5,
                            hs=1)
    model.save_word2vec_format(
        'embeddings/smiley_tweets_embedding_final{}'.format(input_fname),
        binary=False)