def load_w2v(corpus, dictionary):
    '''
    Return the trained Word2Vec model.
    Train and save a model first if none exists yet.
    :param corpus:
    :param dictionary:
    :return:
    '''
    if not os.path.isfile(W2V_MODEL_PATH):
        num_features = 300    # Word vector dimensionality
        min_word_count = 5    # Minimum word count
        num_workers = 5       # Number of threads to run in parallel
        window = 5            # Context window size
        downsampling = 1e-5   # Downsample setting for frequent words
        print("Training the word2vec model!")
        sents = get_review_sentences()
        # Initialize and train the model (this will take some time)
        model = models.Word2Vec(sents, workers=num_workers,
                                size=num_features, min_count=min_word_count,
                                window=window, sample=downsampling)
        # If you don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        model.init_sims(replace=True)
        # It can be helpful to create a meaningful model name and
        # save the model for later use. You can load it later using Word2Vec.load()
        model.save(W2V_MODEL_PATH)
        print('Word2vec model created!')
    print('Loading word2vec model')
    w2v = models.Word2Vec.load(W2V_MODEL_PATH)
    print('Loading word2vec model accomplished!')
    return w2v
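# Hedged usage sketch for load_w2v above (not from the original source): assumes
# gensim < 4.0 (size=/iter= style API) and that W2V_MODEL_PATH, get_review_sentences,
# corpus and dictionary exist as in the snippet; the query word 'good' is illustrative.
def _demo_load_w2v(corpus, dictionary):
    w2v = load_w2v(corpus, dictionary)
    # Nearest neighbours by cosine similarity in the trained vector space.
    print(w2v.wv.most_similar(positive=['good'], topn=5))
    # Each in-vocabulary word maps to a 300-dim vector (num_features above).
    print(w2v.wv['good'].shape)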
def embeddings(ENG_sentences, ESP_sentences, multilingual_data):
    """
    Using the word2vec implementation - Initialize a model
    Parameters:
    - (sg=0), CBOW is used. Otherwise (sg=1), skip-gram is employed.
    - size is the dimensionality of the feature vectors.
    - window is the maximum distance between the current and predicted word within a sentence.
    - min_count => ignore all words with total frequency lower than this.
    """
    # Persist the models to disk
    fname1 = os.path.join(os.getcwd(), 'word_vectors_ita.txt')
    # fname2 = os.path.join(os.getcwd(), 'word_vectors_esp.txt')
    fname3 = os.path.join(os.getcwd(), 'word_vectors_mul_ita_eng.txt')

    # Word2Vec
    print("Saving Italian model")
    model_eng = models.Word2Vec(ENG_sentences, size=300, window=5, min_count=5, workers=4)
    model_eng.save(fname1)

    # print("Saving Spanish model")
    # model_esp = models.Word2Vec(ESP_sentences, size=300, window=5, min_count=5, workers=4)
    # model_esp.save(fname2)

    print("Saving multi-lingual model It+Es")
    # This line was commented out in the original, leaving model_mul undefined
    # at the save() call below; it has to run for the save to work.
    model_mul = models.Word2Vec(multilingual_data, size=300, window=5, min_count=5, workers=4)
    model_mul.save(fname3)
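# Minimal sketch of the sg switch described in the docstring above; the toy corpus
# and sizes are assumptions for illustration only (gensim < 4.0 API).
def _demo_sg_switch():
    from gensim import models
    toy_sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]] * 50
    cbow = models.Word2Vec(toy_sentences, sg=0, size=32, window=5, min_count=1)
    skipgram = models.Word2Vec(toy_sentences, sg=1, size=32, window=5, min_count=1)
    return cbow, skipgram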
def __init__(self, sentances, ldaModel, flag, filename, size, window, mincount):
    self.sentances = sentances
    self.ldaModel = ldaModel
    self.filename = filename
    self.size = size
    self.window = window
    self.mincount = mincount
    self.permuteSentances = []
    self.dictionary = gensim.corpora.Dictionary.load(
        '/Users/loaner/Documents/Renncode_2016/SKPN/py-server/comTragDict')
    self.dictID = self.dictionary.token2id
    if flag:
        print("starting variation")
        print("There are %d many sentances" % len(self.sentances))
        self.sentanceVariation(self.sentances)
        random.shuffle(self.permuteSentances)
        self.topic2vec = models.Word2Vec(self.permuteSentances, size=self.size,
                                         window=self.window,
                                         min_count=self.mincount, workers=2)
        self.topic2vec.save(self.filename)
        random.shuffle(self.sentances)
        self.word2vec = models.Word2Vec(self.sentances, size=self.size,
                                        window=self.window, min_count=3,
                                        workers=2)
        # Save the word2vec model under its own name; the original saved
        # topic2vec twice and never persisted word2vec.
        self.word2vec.save(self.filename + "_w2v")
    self.topic2vec = models.Word2Vec.load(self.filename)
    #self.word2vec = models.Word2Vec.load(self.filename + "_w2v")
    self.ldaTopics = []
    for x in range(0, 50):
        self.ldaTopics.append(self.ldaModel.show_topic(x))
    self.topicVecs = []
    for x in range(0, 50):
        self.topicVecs.append(
            self.topic2vec.wv.most_similar(positive=["u" + str(x)]))
    print(self.filename + " is finished")
def train_high_memory():
    cnn_files = os.listdir(CNN_TOKENS_PATH)
    dm_files = os.listdir(DM_TOKENS_PATH)
    tokens = []
    for fname in cnn_files:
        with open(os.path.join(CNN_TOKENS_PATH, fname), errors="ignore") as file_object:
            for line in file_object:
                if line and line != "\n" and line != "@highlight\n":
                    # Strip the trailing newline so it doesn't stick to the last token.
                    tokens.append(line.rstrip("\n").split(' '))
        print("Read: CNN " + fname)
    for fname in dm_files:
        with open(os.path.join(DM_TOKENS_PATH, fname), errors="ignore") as file_object:
            for line in file_object:
                if line and line != "\n" and line != "@highlight\n":
                    tokens.append(line.rstrip("\n").split(' '))
        print("Read: DM " + fname)
    model = models.Word2Vec(tokens, size=VECTOR_LENGTH, window=WINDOW,
                            min_count=MIN_COUNT)
    model.wv.save('vectors/cnn_dm_vectors_' + str(VECTOR_LENGTH) + "_" +
                  str(WINDOW) + "_" + str(MIN_COUNT))
def main():
    with open("../09/tokens_81.txt", "r") as f:
        sentences = [
            sentence.replace("\n", "").split(" ") for sentence in f.readlines()
        ]
    model = models.Word2Vec(sentences, sg=1, size=300, window=5, min_count=5,
                            workers=4)
    model.save("word2vec_model")

    # q86
    print("===q86===")
    United_States = model.wv["United_States"]
    print(United_States)

    # q87
    print("===q87===")
    print(model.wv.similarity("United_States", "U.S"))

    # q88
    print("===q88===")
    print(model.wv.most_similar_cosmul(positive=["England"], topn=10))

    # q89
    print("===q89===")
    print(model.wv.most_similar_cosmul(positive=["Spain", "Athens"],
                                       negative=["Madrid"], topn=10))
def train(self):
    logging.info(' train tfidf model ... ')
    self.tfidf = models.TfidfModel(self.corpus, normalize=True)

    logging.info(' train word2vec model ... ')
    self.w2v = models.Word2Vec(min_count=2, window=2, size=300, sample=6e-5,
                               alpha=0.03, min_alpha=0.0007, negative=15,
                               workers=4, iter=7)
    # train() needs a vocabulary; without this call it raises a RuntimeError.
    self.w2v.build_vocab(self.data)
    self.w2v.train(self.data, total_examples=self.w2v.corpus_count,
                   epochs=15, report_delay=1)

    logging.info(' train fasttext model ... ')
    self.fast = models.FastText(self.data, size=300, window=3, min_count=1,
                                iter=10, min_n=3, max_n=6, word_ngrams=2)
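# Side note on the FastText model trained above: unlike Word2Vec, FastText builds
# vectors from character n-grams (min_n/max_n), so it can compose a vector even
# for a word never seen in training. Hedged sketch; 'unseenword' is illustrative.
def _demo_fasttext_oov(fast_model):
    # Succeeds as long as some character n-grams of the query overlap the
    # n-grams learned during training.
    return fast_model.wv['unseenword']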
def transform_corpus(dictionary, corpus, documents):
    tfidf = models.TfidfModel(corpus, id2word=dictionary)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus=corpus_tfidf, id2word=dictionary)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 5    # Minimum word count
    num_workers = 6       # Number of threads to run in parallel
    context = 5           # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Build the vocabulary and train explicitly; passing `documents` to the
    # constructor as well would train the model a second time.
    model = models.Word2Vec(workers=num_workers, size=num_features,
                            min_count=min_word_count, window=context,
                            sample=downsampling)
    model.build_vocab(documents)
    model.train(documents, total_examples=len(documents), epochs=10)

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    lsi.save('../NLP_DocumentSimilarity/corpus/model.lsi')
    model.save('../NLP_DocumentSimilarity/corpus/w2v.model')
    return lsi, model
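# Sketch of what init_sims(replace=True), used above, implies downstream (an
# illustration, not the original author's code): the raw vectors are overwritten
# with their L2-normalised versions, so similarity queries still work but further
# training is no longer meaningful. The query word 'word' is illustrative.
def _demo_init_sims(model):
    import numpy as np
    model.init_sims(replace=True)
    if 'word' in model.wv:
        print(np.linalg.norm(model.wv['word']))  # ~1.0 after replace=True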
def corpus2wordVSM(corpus_file_name, embeddings_file_name=r"embeddings.w2v",
                   txt_file_name="sent_line_math_corpus.txt",
                   feature_vec_size=100, window_size=5, minimum_count=5,
                   num_of_virtual_cores=4, skipGram=0):
    math_corpus_file = codecs.open(corpus_file_name, "r", "utf-8")
    math_corpus_text = math_corpus_file.read().lower()
    math_corpus_file.close()
    math_corpus_sanitized_sentence_tokenized = nltk.tokenize.sent_tokenize(
        math_corpus_text)
    # Start from an empty file; appending to a leftover copy (mode "a" in the
    # original) would duplicate the corpus across runs.
    sent_line_math_corpus_file = open(txt_file_name, "w")
    for sentence in math_corpus_sanitized_sentence_tokenized:
        sent_line_math_corpus_file.write("\n")
        word_tokens = nltk.tokenize.word_tokenize(sentence)
        for word in word_tokens:
            sent_line_math_corpus_file.write(word.lower() + " ")
    sent_line_math_corpus_file.close()
    sentences = models.word2vec.LineSentence(txt_file_name)
    model = models.Word2Vec(sentences, size=feature_vec_size,
                            window=window_size, min_count=minimum_count,
                            workers=num_of_virtual_cores, sg=skipGram)
    # If the parameter sg=0 (default) is changed to sg=1,
    # the model will be skip-gram as opposed to CBOW.
    model.save(embeddings_file_name)
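# LineSentence, used above, streams the intermediate file lazily: each line becomes
# one whitespace-split token list, so the corpus never has to fit in memory.
# Hedged sketch; txt_file_name is the file written by corpus2wordVSM above.
def _demo_linesentence(txt_file_name):
    from gensim import models
    sentences = models.word2vec.LineSentence(txt_file_name)
    for i, sent in enumerate(sentences):
        print(sent)  # one tokenised sentence per line of the file
        if i >= 2:
            break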
def convert_text_to_vector(entire_text, date, period):
    # 1. Train model using word2vec
    # 2. Save model in file
    # 3. Write the top 20 most similar words for each word in CHECK_WORDS into file
    if period == "five":
        postfix = cs.postfix_models_five
        path = cs.path_models_five
    else:
        postfix = cs.postfix_models_decade
        path = cs.path_models_decade

    print("Inside word2vec")
    print("No of lines: ", len(entire_text))
    print("Start training..")
    model = models.Word2Vec(entire_text, size=100, window=10, min_count=5,
                            workers=cores)
    print("Done training..")
    model.save(os.path.join(path, date + postfix))
    print("Saved..")
    print("MODEL:\n", model)
    for term in CHECK_WORDS:
        print(term.upper(), ":\n")
        for t, p in model.wv.most_similar(positive=term.split(' '), topn=20):
            print(t, ":", str(round(p, 3)))
        print("")
def train(domain, shouldTrain, setNumber):
    '''
    Train a model on the training data and then test the model's accuracy
    on the testing data. Since training is time consuming, we save the model
    and load it later for further testing.
    '''
    print("\n=== Set : %s ===\n" % str(setNumber))
    # Train a model based on training data
    if shouldTrain:
        sentences = models.word2vec.LineSentence(domain + '/train' +
                                                 str(setNumber) + '.txt')
        model = models.Word2Vec(sentences=sentences, min_count=1, workers=4,
                                hs=1, window=window_size, iter=10)
        model.save(domain + '/model' + str(setNumber) + '.txt')
    else:
        # OR load a saved model
        model = models.Word2Vec.load(domain + '/model' + str(setNumber) + '.txt')
    print("Training : COMPLETE!")

    # Evaluate model on test data
    plans = open(domain + '/test' + str(setNumber) + '.txt').read().split("\n")
    list_of_actions = [plan_i.split() for plan_i in plans]
    actions = list(model.wv.vocab.keys())
    return [x for x in list_of_actions if len(x) > window_size * 2], actions
def train_word2vec(apps, num_topics):
    train_set = []
    for app in apps:
        for v in get_versions(app):
            train_set.append(preproccess(''.join(raw_desc(app, v))))

    # train a word2vec model
    word2vec = models.Word2Vec(train_set)

    # initialize a k-means object and use it to extract centroids
    kmeans_clustering = KMeans(num_topics)
    idx = kmeans_clustering.fit_predict(word2vec.wv.syn0)

    # create a word / index (cluster number) dictionary
    word_centroid_map = dict(zip(word2vec.wv.index2word, idx))

    # one cluster for one topic
    topics = [[] for i in range(num_topics)]
    for word, index in word_centroid_map.items():
        topics[index].append(word)

    # store topics
    pickle_dump(topics, WORD2VEC)

    # print each topic
    print_word2vec()
def _build_word2vec(self, logger):
    """ build word2vec for words """
    if not self._split_words:
        logger.error("no split words, skip")
    else:
        self._word_model = models.Word2Vec(self._split_sentences, min_count=5)
        self._word_model.save(os.path.join(self.basepath, "data", "word_model"))
def main():
    """ Main application execution. """
    db = MongoClient('localhost', 27017).test
    fillMongo(db)
    sentences = mongoDocumentsSplitted(db)

    w2v_model = models.Word2Vec(sentences, workers=4)
    w2v_model.save("word2vec.bin")

    d2v_model = models.Doc2Vec(mongoDocuments2Sentences(db), workers=4)
    d2v_model.save("doc2vec.bin")

    random_records = db.deneme.aggregate([{"$sample": {"size": 10}}])

    infer_vectors = []
    vectors = []
    for record in random_records:
        vectors.append(record["text"])
        # cosine_similarity expects row vectors of shape (1, n_features),
        # so reshape to (1, -1) rather than (-1, 1) as in the original.
        infer_vectors.append(
            np.array(
                d2v_model.infer_vector(record['text'].split(),
                                       alpha=0.025,
                                       min_alpha=0.025,
                                       steps=20)).reshape(1, -1))

    for i in range(len(infer_vectors) - 1):
        print("vector1: ", vectors[i])
        print("vector2: ", vectors[i + 1])
        print("cosine: ",
              cosine_similarity(infer_vectors[i], infer_vectors[i + 1])[0][0])
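# Shape sketch for sklearn's cosine_similarity as used above (an illustration, not
# the original author's code): it expects row vectors of shape (n_samples,
# n_features), hence reshape(1, -1) on each inferred vector. text_a/text_b are
# illustrative inputs.
def _demo_cosine(d2v_model, text_a, text_b):
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    va = np.array(d2v_model.infer_vector(text_a.split())).reshape(1, -1)
    vb = np.array(d2v_model.infer_vector(text_b.split())).reshape(1, -1)
    return cosine_similarity(va, vb)[0][0]  # scalar similarity in [-1, 1]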
def learnLDA(collection):
    print("learning model")
    key_word = []
    articles = collection.find()
    for article in articles:
        key_word.append(splitWord(article['article_title']))

    # lda
    global dictionary
    dictionary = corpora.Dictionary(key_word)
    corpus = [dictionary.doc2bow(sentence) for sentence in key_word]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    global lda
    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                                   alpha='auto', num_topics=50)

    # word2vec
    global word2vec
    word2vec = models.Word2Vec(key_word, min_count=1)

    if not os.path.exists(board):
        os.makedirs(board)
    dictionary.save(board + '/' + board + '_dict.model')
    lda.save(board + '/' + board + '_lda.model')
    word2vec.save(board + '/' + board + '_word2vec.model')
    print("learning finished")
def construct_word_model(self, min_count=1, size=500, workers=1):
    sentence_list = self.load_data_sentences()
    sentence_stream = []
    for sentence in sentence_list:
        sentence_stream.append(word_tokenize(sentence))
    transformer = gensim.models.Phrases(sentence_stream)
    self.model = models.Word2Vec(sorted_vocab=1, min_count=min_count,
                                 size=size, workers=workers, iter=10,
                                 window=10)
    # build_vocab scans the corpus itself; the original's separate scan_vocab
    # call beforehand would count the corpus twice.
    self.model.build_vocab(transformer[sentence_stream])
    self.model.train(transformer[sentence_stream],
                     total_examples=self.model.corpus_count,
                     epochs=self.model.iter)
    print('Finished training')
    print('Vocab size: %d\n' % len(self.model.wv.vocab))
    return
def get_model(multilingual_data):
    """
    Using the word2vec implementation - Initialize a model
    Parameters:
    - (sg=0), CBOW is used. Otherwise (sg=1), skip-gram is employed.
    - size is the dimensionality of the feature vectors.
    - window is the maximum distance between the current and predicted word within a sentence.
    - min_count => ignore all words with total frequency lower than this.
    """
    # Persist a model to disk
    fname = "wikiDS/word2Vec.mdl"
    vocabfname = "wikiDS/vocab.pkl"
    model = models.Word2Vec(multilingual_data, size=128, window=5, min_count=5,
                            workers=4)
    model.save(fname)

    vocab = list(model.wv.vocab.keys())
    # pickle needs a binary file handle, not a codecs text wrapper.
    with open(vocabfname, "wb") as vocabfile:
        pickle.dump(vocab, vocabfile)
    vocab_len = len(vocab)
    print("Vocab length is ", vocab_len)
    test_model(model)
    return model
def wordVecSrlBigrams(cases, save=True, load=False):
    if load:
        return models.Word2Vec.load('models/word2vec/srlBigramModel')
    else:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        sentences = []
        for cas in cases:
            for srlSentence in cas.srlSentences:
                newSentence = []
                for clause in srlSentence:
                    for role, text in clause.items():
                        newSentence.append(str((role, text)))
                print(newSentence)
                bigramSentence = [
                    b for b in zip(newSentence[:-1], newSentence[1:])
                ]
                print(bigramSentence)
                sentenceList = []
                for w1, w2 in bigramSentence:
                    sentenceList.append(w1 + '_' + w2)
                sentences.append(sentenceList)
        model = models.Word2Vec(sentences, min_count=10, size=200, workers=4)
        #print(model.wv.most_similar(positive=["('V', 'conclude')"], negative=[], topn=50))
        if save:
            model.save('models/word2vec/srlBigramModel')
            print('model saved')
        # Return the freshly trained model, mirroring the load branch above.
        return model
def corpus2wordVSM(self, corpus_file_name,
                   embeddings_file_name=r"embeddings.w2v",
                   feature_vec_size=100, window_size=5, minimum_count=5,
                   num_of_virtual_cores=4, skipGram=0):
    corpus_file = codecs.open(corpus_file_name, "r", "utf-8")
    corpus_text = corpus_file.read().lower()
    corpus_file.close()
    corpus_sentence_tokenized = nltk.tokenize.sent_tokenize(corpus_text)
    # making sure the file is new when created below
    if os.path.isfile("sent_line_corpus.txt"):
        os.remove(r"sent_line_corpus.txt")
    sent_line_corpus_file = codecs.open(r"sent_line_corpus.txt", "a", "utf-8")
    for sentence in corpus_sentence_tokenized:
        sent_line_corpus_file.write("\n")
        word_tokens = nltk.tokenize.word_tokenize(sentence)
        for word in word_tokens:
            sent_line_corpus_file.write(word.lower() + " ")
    sent_line_corpus_file.close()
    sentences = models.word2vec.LineSentence(r"sent_line_corpus.txt")
    # Pass window_size, minimum_count and num_of_virtual_cores through instead
    # of hard-coding 5/5/4, so the keyword arguments actually take effect.
    model = models.Word2Vec(sentences, size=feature_vec_size,
                            window=window_size, min_count=minimum_count,
                            workers=num_of_virtual_cores, sg=skipGram)
    # If the parameter sg=0 (default) is changed to sg=1,
    # the model will be skip-gram as opposed to CBOW.
    model.save(embeddings_file_name)
def fit(self, X, y=None):
    """
    Fit the model according to the given training data.
    Calls gensim.models.Word2Vec.
    """
    self.gensim_model = models.Word2Vec(
        sentences=X, size=self.size, alpha=self.alpha, window=self.window,
        min_count=self.min_count, max_vocab_size=self.max_vocab_size,
        sample=self.sample, seed=self.seed, workers=self.workers,
        min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
        negative=self.negative, cbow_mean=self.cbow_mean,
        hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word,
        trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab,
        batch_words=self.batch_words)
    return self
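# Hypothetical usage of the sklearn-style fit() wrapper above; `estimator_cls`
# stands in for whichever class defines it (the name is an assumption), and the
# toy corpus is illustrative. Assumes the class constructor has sensible defaults.
def _demo_fit(estimator_cls):
    toy_sentences = [["hello", "world"], ["hello", "gensim"]] * 100
    est = estimator_cls().fit(toy_sentences)
    return est.gensim_model.wv.most_similar("hello", topn=2)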
def corpus2wordVSM_CBOW(self, corpus_file_name,
                        embeddings_file_name=r"embeddings.w2v",
                        feature_vec_size=100, window_size=5, minimum_count=5,
                        num_of_virtual_cores=4):
    math_corpus_file = codecs.open(corpus_file_name, "r", "utf-8")
    math_corpus_text = math_corpus_file.read().lower()
    math_corpus_file.close()
    math_corpus_sanitized_sentence_tokenized = tokenize.sent_tokenize(
        math_corpus_text)
    # Start from an empty file; appending to a leftover copy (mode "a" in the
    # original) would duplicate the corpus across runs.
    sent_line_math_corpus_file = open(r"sent_line_math_corpus.txt", "w")
    for sentence in math_corpus_sanitized_sentence_tokenized:
        sent_line_math_corpus_file.write("\n")
        word_tokens = tokenize.word_tokenize(sentence)
        for word in word_tokens:
            sent_line_math_corpus_file.write(word.lower() + " ")
    sent_line_math_corpus_file.close()
    sentences = models.word2vec.LineSentence(r"sent_line_math_corpus.txt")
    # Use the keyword parameters instead of the hard-coded 100/5/5/4.
    model = models.Word2Vec(sentences, size=feature_vec_size,
                            window=window_size, min_count=minimum_count,
                            workers=num_of_virtual_cores)
    model.save(embeddings_file_name)
def model_trainer(layer):
    '''
    Train a word2vec model on a given layer.
    Return the trained model.
    '''
    tokenized_corpus = [word_tokenize(sentence) for sentence in layer]
    return models.Word2Vec(tokenized_corpus, min_count=1, size=100, workers=8)
def train_model(self):
    """
    Train the model.
    :return:
    """
    sentence = CorpusIterator(self.corpus_name)
    if not os.path.exists(self.result_name):
        # Build a fresh model: the vocabulary is built here, and training
        # happens once, in the shared train() call below. (Passing `sentence`
        # to the constructor as well would train the model a second time.)
        model = models.Word2Vec(min_count=self.min_count, size=self.size,
                                window=self.window)
        model.build_vocab(sentence)
    elif self.retrain:
        logging.info("Model already exists; training it again")
        model = models.Word2Vec.load(self.result_name)
    else:
        logging.error("Model exists; retraining is disabled")
        return
    model.train(sentence, total_examples=model.corpus_count, epochs=model.iter)
    model.save(self.result_name)
    self.model = model
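# Sketch of an alternative retrain branch (an assumption, not the original
# author's code): gensim also supports growing the vocabulary of a loaded model
# with build_vocab(update=True) before the extra train() pass, which matters
# when the new corpus contains words the saved model has never seen.
def _demo_retrain(result_name, more_sentences):
    from gensim import models
    model = models.Word2Vec.load(result_name)
    model.build_vocab(more_sentences, update=True)  # register any new words first
    model.train(more_sentences, total_examples=model.corpus_count,
                epochs=model.iter)
    return model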
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    input_fname = ''
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    # unsupervised data
    directories = [
        'smiley_tweets_nl_full',
        'smiley_tweets_de_full',
        'smiley_tweets_it_full',
    ]
    files = ['filtered_en_balanced580M.gz']
    listing = []
    for directory in directories:
        listing.append([os.path.join(directory, x) for x in os.listdir(directory)])
    sentences = MySentences(listings=listing, gzpFiles=files)

    model = models.Word2Vec(sentences, size=52, window=5, min_count=10,
                            workers=16, sg=1, sample=1e-5, hs=1)
    model.wv.save_word2vec_format(
        'embeddings/smiley_tweets_embedding_multilingual{}'.format(input_fname),
        binary=False)
def prepare_data_files(path_num, path_token, check_list, long_path_list,
                       buggy_states_list, output_root, w_size, w_window,
                       w_workers, label):
    with open(os.path.join(output_root, "check_list.json"), "a") as file_:
        json.dump(check_list, file_)
    word2vec_model = g.Word2Vec(long_path_list, size=w_size, window=w_window,
                                workers=w_workers)
    word2vec_model.wv.save_word2vec_format(
        os.path.join(output_root, "word2vec.txt"), binary=False)
    diction = get_word2vec_list(os.path.join(output_root, "word2vec.txt"))
    for i in range(len(long_path_list)):
        long_path_list[i] = vector_replace(long_path_list[i], diction)
    long_path_list_np = transfor_2_np(long_path_list, path_num, path_token,
                                      w_size)
    np.save(os.path.join(output_root, label + "_input.npy"), long_path_list_np)
    buggy_states_list = transfor_2_np(buggy_states_list, 0, path_num, 1)
    if label == "test":
        buggy_states_list = np.ones(buggy_states_list.shape)
    np.save(os.path.join(output_root, label + "_label.npy"), buggy_states_list)
    output_list = buggy_states_list.reshape([path_num, 1])
    output_list = output_list.repeat(w_size, axis=1)
    np.save(os.path.join(output_root, label + "_output.npy"), output_list)
def word2vec(data, window, min_count, size, iterations):
    # Train on the `data` argument; the original referenced an undefined
    # `train_data` name here.
    model = genmod.Word2Vec(data, window=window, min_count=min_count,
                            size=size, iter=iterations)
    return model.wv
def trainer_w2v(self, path):
    """
    Expected corpus format, e.g.:
        corpus = [['教授', '长江', '学者', '优秀成果', '集中', '呈现'],
                  ['', '生物质', '发电', '燃料', '供应链', '运营', '模式']]
    :param path:
    :return:
    """
    print('train w2v')
    corpus = get_corpus(path, w2v=True)
    w2v = models.Word2Vec(min_count=2, window=3, size=300, sample=6e-5,
                          alpha=0.03, min_alpha=0.0007, negative=15,
                          workers=4, iter=10, max_vocab_size=50000)
    w2v.build_vocab(corpus)
    w2v.train(corpus, total_examples=w2v.corpus_count, epochs=15,
              report_delay=1)
    return w2v
def load_save_word2vec_model(line_words, model_filename):
    # Model parameters
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    # best: hierarchical softmax (favors rare words) rather than
    # negative sampling (favors frequent words)
    negative = 3
    iter_num = 20

    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.wv.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words],
                                size=feature_size, window=content_window,
                                iter=iter_num, min_count=freq_min_count,
                                negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
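# Sketch of what the Phrases pre-pass above does (an illustration, not the
# original author's code): frequently co-occurring pairs are merged into single
# tokens before Word2Vec sees them. The toy corpus and the low threshold are
# assumptions chosen so the toy pair actually scores high enough to merge.
def _demo_phrases():
    from gensim import models
    toy = [["new", "york", "is", "big"], ["new", "york", "at", "night"]] * 30
    bigram = models.Phrases(toy, min_count=1, threshold=0.1)
    # Pairs scoring above the threshold come out joined, e.g. 'new_york'.
    print(bigram[["new", "york", "is", "big"]])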
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    input_fname = ''
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    # unsupervised data
    directory = 'smiley_tweets_full'
    listing = [os.path.join(directory, x) for x in os.listdir(directory)]
    sentences = MySentences(files=listing)

    model = models.Word2Vec(sentences, size=52, window=5, min_count=10,
                            workers=16, sg=1, sample=1e-5, hs=1)
    model.wv.save_word2vec_format(
        'embeddings/smiley_tweets_embedding_multilingual{}'.format(input_fname),
        binary=False)
def _gen_embedding(ndim, alignment=False):
    print("Generating %d-dim word embedding ..." % ndim)
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append([ch for ch in sentence if ch in ch2int])
        if alignment:
            # the i-th characters in the poem, used to boost Dui Zhang
            i_characters = [[sentence[j] for sentence in poem['sentences']]
                            for j in range(len(poem['sentences'][0]))]
            for characters in i_characters:
                ch_lists.append([ch for ch in characters if ch in ch2int])
        if 0 == (idx + 1) % 10000:
            print("[Word2Vec] %d/%d poems have been processed." %
                  (idx + 1, len(quatrains)))
    print("Hold on. This may take some time ...")
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx, :] = model.wv[ch]
    if alignment:
        model.save(_w2v_with_alignment_model_path)
        print("Word2Vec model is saved.")
        np.save(_w2v_with_alignment_path, embedding)
        print("Word embedding is saved.")
    else:
        model.save(_w2v_model_path)
        print("Word2Vec model is saved.")
        np.save(_w2v_path, embedding)
        print("Word embedding is saved.")
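# Sketch of consuming the embedding matrix saved by _gen_embedding above: rows
# follow the int2ch/ch2int vocabulary order, so lookup is a plain index into the
# (VOCAB_SIZE, ndim) array. npy_path stands in for _w2v_path or its alignment
# variant; the function name is illustrative.
def _demo_lookup_embedding(npy_path, ch2int, ch):
    import numpy as np
    embedding = np.load(npy_path)   # shape (VOCAB_SIZE, ndim)
    return embedding[ch2int[ch]]    # the ndim-dim vector for character `ch`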
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    input_fname = ''
    if len(sys.argv) > 1:
        input_fname = sys.argv[1]

    # supervised data
    train = "semeval/task-B-train-plus-dev.tsv.gz"
    dev = "semeval/twitter-test-gold-B.downloaded.tsv.gz"
    train16 = "semeval/task-A-train-2016.tsv.gz"
    dev2016 = "semeval/task-A-dev-2016.tsv.gz"
    devtest2016 = "semeval/task-A-devtest-2016.tsv.gz"
    test2016 = "semeval/SemEval2016-task4-test.subtask-A.txt.gz"

    # unsupervised data
    smiley_pos = 'semeval/smiley_tweets_{}.gz'.format(input_fname)

    files = [(train, 3), (dev, 3), (train16, 2), (dev2016, 2),
             (devtest2016, 2), (test2016, 2), (smiley_pos, 0)]
    sentences = MySentences(files=files)

    model = models.Word2Vec(sentences, size=52, window=5, min_count=5,
                            workers=7, sg=1, sample=1e-5, hs=1)
    model.wv.save_word2vec_format(
        'embeddings/smiley_tweets_embedding_final{}'.format(input_fname),
        binary=False)