import re
from collections import OrderedDict

import numpy
from gensim.models import Word2Vec

# NOTE: the project-local names used below (TextCleaner, stopwordsDictFromFile,
# conf, utilities, log) are assumed to be imported at module level elsewhere
# in this repo.


def cleanText(input_list):
    # declare and init textCleaner
    text_cleaner = TextCleaner(rm_punct=True, rm_digits=True,
                               rm_hashtags=False)
    # clean each text: normalize slashes, drop 1-3 character words,
    # then apply the TextCleaner regexes
    cleaned_list = []
    for text in input_list:
        if str(text) != "nan":
            text = text.replace(" /", "/")
            text = re.sub(r'\b\w{1,3}\b', '', text)
            cleaned_list.append(text_cleaner.regex_applier(text))
    # remove stopwords
    no_stopwords_sentences = []
    stopwords_dict = stopwordsDictFromFile(
        conf.ConfigSectionMap('STOPWORDS_FILES'))
    for text in cleaned_list:
        if str(text) != "nan":
            words_no_stopwords = []
            words = utilities.convertSentenceToListOfWords(text)
            for word in words:
                # dict.has_key() is Python 2 only; use `in` instead.
                # The stopword dict is assumed to be keyed by UTF-8-encoded
                # lowercase words, matching the original lookup.
                if word.lower().encode("utf-8") not in stopwords_dict:
                    words_no_stopwords.append(word)
            no_stopwords_sentences.append(
                utilities.concatWords(words_no_stopwords))
    return no_stopwords_sentences
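
# Usage sketch for cleanText (hypothetical input; the exact output depends on
# the configured TextCleaner regexes and STOPWORDS_FILES, so the call is shown
# commented out rather than as a runnable test):
#
#     raw_tweets = ["Just landed in Roma! #nofilter", float("nan")]
#     cleaned = cleanText(raw_tweets)
#     # punctuation, digits, tokens of <= 3 characters and stopwords are gone;
#     # "nan" entries (e.g. empty pandas cells) are skipped entirely.
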
def computeWord2Tweets(self):
    """Compute, for each word vector, the list of tweets that contain
    the corresponding word."""
    self.vec2tweets = {}
    self.vec2word = {}
    self.word2tweet = {}
    # one empty bucket per vocabulary word
    for key in self.model.wv.vocab.keys():
        # index through model.wv: model[key] is deprecated in gensim
        vec = self.model.wv[key]
        self.vec2tweets[utilities.convertListOfNumToString(vec)] = []
        self.word2tweet[key] = []
    # scan every cleaned tweet and attach the original tweet to each of
    # its words and to the word's vector
    for i in range(len(self.cleaned_tweets)):
        for word in utilities.convertSentenceToListOfWords(
                self.cleaned_tweets[i]):
            try:
                vec = self.model.wv[word]
                self.vec2tweets[utilities.convertListOfNumToString(
                    vec)].append(self.tweets[i])
                self.vec2word[utilities.convertListOfNumToString(
                    vec)] = word
                self.word2tweet[word].append(self.tweets[i])
            except KeyError:
                # word is not in the W2V vocabulary
                pass
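
# Usage sketch (hypothetical): this method is assumed to live on a class that
# holds a trained `model`, the raw `tweets` and their `cleaned_tweets`. Once
# the indexes are built, they can be queried directly:
#
#     analyzer.computeWord2Tweets()
#     analyzer.word2tweet["roma"]   # -> every tweet whose cleaned text
#                                   #    contains the word "roma"
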
def getEmbeddedWords(sentences, model=None):
    log.info("getting Embedded Words from input")
    # load the trained W2V model if none is given
    if model is None:
        model = Word2Vec.load(conf.get('MAIN', 'path_pickle_w2v_model'))
    list_of_sentences = []
    embedded_words = []
    dict_index2word_tweet = OrderedDict()
    dict_word2index_tweet = OrderedDict()
    row_index = 0
    # from list of sentences to list of lists of words
    for text in sentences:
        if str(text) != "nan":
            list_of_sentences.append(
                utilities.convertSentenceToListOfWords(text))
    for sentence in list_of_sentences:
        for word in sentence:
            try:
                # embed each distinct word only once
                if word not in dict_word2index_tweet:
                    embedded_words.append(model.wv[word])
                    dict_index2word_tweet[row_index] = word
                    dict_word2index_tweet[word] = row_index
                    row_index += 1
            # we get here if the W2V model does not have the word
            except KeyError:
                pass
    return (numpy.array(embedded_words), dict_index2word_tweet,
            dict_word2index_tweet)
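
# Usage sketch: embed the cleaned tweets and map between rows and words.
# `cleaned` is the hypothetical output of cleanText():
#
#     matrix, idx2word, word2idx = getEmbeddedWords(cleaned)
#     matrix.shape                         # (n_unique_known_words, W2V size)
#     word2idx[idx2word[0]] == 0           # the two dicts are inverses
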
def trainW2Vmodel(corpus, new_model=False, identifier=""):
    log.info("Training W2V Model")
    # convert list of sentences to list of lists of words
    list_corpus = []
    for text in corpus:
        if str(text) != "nan":
            list_corpus.append(utilities.convertSentenceToListOfWords(text))
    corpus = list_corpus
    # count the total number of words
    flat_list = [item for sublist in corpus for item in sublist]
    num_words = len(flat_list)
    debugging = conf.get('MAIN', 'debugging')
    if debugging == 'True':
        n_epoch = 10000
    else:
        # scale the number of epochs inversely with corpus size
        n_epoch = int(10**9 / num_words)
    # note: size/iter are the gensim < 4.0 parameter names
    # (renamed vector_size/epochs in gensim 4.0)
    model = Word2Vec(size=conf.getint('W2V', 'size'),
                     min_count=conf.getint('W2V', 'min_count'),
                     sg=conf.getint('W2V', 'sg'),
                     window=conf.getint('W2V', 'window'),
                     iter=n_epoch,
                     alpha=conf.getfloat('W2V', 'alpha'),
                     workers=conf.getint('W2V', 'workers'))
    model.build_vocab(corpus, progress_per=conf.getint('W2V', 'progress_per'))
    # train the model over the corpus (this may take several minutes)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
    if not new_model:
        model.save(conf.get('MAIN', 'path_pickle_w2v_model'))
        log.info("Model trained")
    else:
        filename = (conf.get('MAIN', 'path_pickle_w2v_model_incr_fold') +
                    "word2vec_" + str(identifier) + ".pickle")
        model.save(filename)
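
# Usage sketch: train on the cleaned corpus and persist the model. With the
# defaults the pickle at path_pickle_w2v_model is overwritten; with
# new_model=True a separate per-identifier pickle is written instead.
# (`cleaned` is the hypothetical output of cleanText().)
#
#     cleaned = cleanText(raw_tweets)
#     trainW2Vmodel(cleaned)                                  # default model
#     trainW2Vmodel(cleaned, new_model=True, identifier="fold3")
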