def phraseBuilder(df_sentences):
    # creating bigram Gensim Phrases:
    bigram = phrases.Phrases(delimiter='_')
    # convert df to list of sentences:
    sentences = []
    for index, row in df_sentences.iterrows():
        print(row['Feedback'])
        trans = row['Feedback']
        trans = trans.encode('ascii', 'ignore').decode()  # drop non-ASCII characters
        sentence = [
            word for word in word_tokenize(cleanTranscript(trans))
            if word.isalpha()
        ]
        sentences.append(sentence)
        bigram.add_vocab([sentence])
    finalTrans = []
    # creating trigram Gensim Phrases
    trigram = phrases.Phrases(bigram[sentences], delimiter='_')
    for sen in trigram[bigram[sentences]]:
        trigramSen = ' '.join(w for w in sen)
        finalTrans.append(trigramSen)
    # assign interaction_id to these trigram Phrases that we will chunk to get useful context
    finalTransSeries = pd.Series(finalTrans)
    df_sentences = df_sentences.drop('Feedback', axis=1)
    df_sentences['finalTrans'] = finalTransSeries.values
    return df_sentences
def clean_text(df_input,
               col='content',
               remove_unusual=False,
               remove_stopwords=False,
               toRemove=[],
               remove_numbers=False,
               stem_words=False,
               lemmatize=False,
               nGram=False):
    # Clean mails
    if remove_stopwords:
        toRemove.extend(stopWords())
    usual_words = []
    if remove_unusual:
        usual_words = usualWords()

    ## Clean content of mails
    # tokenization and lemmatization/stemming
    df_input[col] = df_input[col].map(
        lambda x: text_to_words(x, remove_numbers, stem_words, lemmatize))
    # removing stopwords and unusual words
    df_input[col] = df_input[col].map(lambda x: remove_words(
        x, remove_unusual, remove_stopwords, toRemove, usual_words))
    # bigrams and trigrams
    if nGram:
        phrase = phrases.Phrases(df_input[col], min_count=30, threshold=300)
        bigram = phrases.Phraser(phrase)
        trigram = phrases.Phrases(bigram[df_input[col]])
        df_input[col] = [trigram[bigram[sent]] for sent in df_input[col]]
    print("data cleaned")
    return df_input
def phraseBuilder(df_sentences):
    # creating bigram Gensim Phrases:
    bigram = phrases.Phrases(delimiter='-')
    # convert df to list of sentences:
    sentences = []
    for index, row in df_sentences.iterrows():
        sentence = [word for word in word_tokenize(row['phrase'])]
        sentences.append(sentence)
        bigram.add_vocab([sentence])
    finalTrans = []
    # creating trigram Gensim Phrases
    trigram = phrases.Phrases(bigram[sentences], delimiter='-')
    for sen in trigram[bigram[sentences]]:
        trigramSen = ' '.join(w for w in sen)
        finalTrans.append(trigramSen)
    # assign interaction_id to these trigram Phrases that we will chunk to get useful context
    finalTransSeries = pd.Series(finalTrans)
    df_sentences = df_sentences.drop('phrase', axis=1)
    df_sentences['finalTrans'] = finalTransSeries.values
    return df_sentences
def __init__(self, corpus_dir=None):
    super().__init__(corpus_dir=corpus_dir)
    self.sentences = self.get_sentences()
    model_save_path = os.path.join(
        self.save_dir, "%s_model.pkl" % self.__class__.__name__)
    try:
        with open(model_save_path, 'rb') as f:
            self._bigrams, self._trigrams = pickle.load(f)
    except FileNotFoundError:
        self._bigrams = gen_phrases.Phrases(self.sentences)
        self._trigrams = gen_phrases.Phrases(self._bigrams[self.sentences])
        with open(model_save_path, 'wb') as f:
            pickle.dump((self._bigrams, self._trigrams), f)
def main(args):
    """Reads a csv data file containing sentences, tokenizes them and uses
    them to train a word2vec model"""
    data = pd.read_csv(args['data_csv'], index_col=0)
    # tokenize and preprocess sentences
    sentences = [
        stripword(row.translate(translator).lower()).split(' ')
        for row in data['Sentence']
    ]
    # create bigrams to capture word combinations (e.g. New_York)
    bigram_transformer = phrases.Phrases(sentences)
    bigram = phrases.Phraser(bigram_transformer)
    # train word2vec model according to the hyperparameters chosen
    currentmodel = Word2Vec(bigram[sentences],
                            workers=-1,
                            sg=0,
                            size=args['model_size'],
                            min_count=5,
                            window=args['window_size'],
                            sample=1e-3)
    currentmodel.init_sims(replace=True)
    currentmodel.save("app/word2vec/word2vec_retrained")
    print('Saved as app/word2vec/word2vec_retrained')
def get_collocations(text, verbose=True, bigram_freq=True):
    if verbose:
        print('Word Tokenization...')
    tokens = [t.split() for t in text]
    if verbose:
        print('Making Bigramer Model...')
    bigramer = phrases.Phrases(tokens)  # train model with default settings
    if bigram_freq:
        if verbose:
            print('Making Bigramer list...')
        bigram_counter = list()
        bigram_list = list(bigramer.vocab.items())
        for key, value in bigram_list:
            str_key = key.decode()
            if len(str_key.split("_")) > 1:
                bigram_counter.append(tuple([str_key, value]))
        bigram_df = pd.DataFrame(bigram_counter, columns=['bigrams', 'count'])
    if bigram_freq:
        res_dict = {'bigramer': bigramer, 'bigram_freq': bigram_df}
    else:
        res_dict = {'bigramer': bigramer, 'bigram_freq': None}
    return res_dict
def ngram_counts(self, clean_sentences):
    '''
    threshold is for the PMI score and min_count is for word counts;
    to avoid missing people who are mentioned only rarely, set min_count=1
    '''
    phrase = phrases.Phrases(clean_sentences,
                             min_count=1,
                             threshold=2,
                             delimiter=' ')
    #len(phrase.vocab.keys())
    bigrams = phrases.Phrases(phrase[clean_sentences],
                              min_count=1,
                              threshold=2,
                              delimiter=' ')
    #len(bigrams.vocab.keys())
    trigrams = phrases.Phrases(bigrams[clean_sentences],
                               min_count=1,
                               threshold=2,
                               delimiter=' ')
    #len(trigrams.vocab.keys())
    #unigram_count_dict={}
    #bigram_count_dict={}
    #trigram_count_dict={}
    #ngram_count_dict=trigrams.vocab
    """
    i=0
    for k in trigrams.vocab.keys():
        if i%100000==0:
            print i,"done"
        if len(k.split("_"))>=4:
            quadgram_count_dict[k]=trigrams.vocab[k]
        elif len(k.split("_"))==3:
            trigram_count_dict[k]=trigrams.vocab[k]
        elif len(k.split("_"))==2:
            bigram_count_dict[k]=trigrams.vocab[k]
        else:
            unigram_count_dict[k]=trigrams.vocab[k]
        i=i+1
    """
    os.chdir(self.inter_data_path)
    pickle.dump(trigrams.vocab, open("ngram_count_dict.pkl", "wb"))
def train_phrase():
    sentence_stream = list()
    for doc in documentList:
        wordlist = doc.split(" ")
        sentence_stream.append(wordlist)
    ps = phrase.Phrases(sentence_stream)
    bigram = phrase.Phraser(ps)
    return bigram
def phrase_detection(df):
    """
    Given the emails dataframe, form bigrams based on the text in the "Body" field
    """
    sentences = [text.split() for text in df["Body"]]
    phrases_ = phrases.Phrases(sentences,
                               min_count=params.bigrams_min_count,
                               threshold=params.bigrams_threshold)
    bigram = phrases.Phraser(phrases_)
    # for phr, score in phrases_.export_phrases(sentences):
    #     print(u'{0} {1}'.format(phr, score))
    return bigram
def make_bigram(dirpaths):
    sentences = corpora(dirpaths, loop_or_not=False)
    print('Start phrasing:')
    phrase = phrases.Phrases(sentences,
                             max_vocab_size=DICTLENGTH,
                             min_count=1,
                             threshold=5,
                             common_terms={'of', 'and', 'the', 'with'})
    bigram = phrases.Phraser(phrase)
    bigram.save(SAVED_BIGRAM_PATH)
    print('Bigram phraser saved.')
def train():
    choo_choo_train = word2vec.LineSentence(INPUT_FILE)
    bigram = phrases.Phrases(
        sentences=choo_choo_train,
        min_count=50,
        threshold=10.0,
    )
    trigram = phrases.Phrases(
        sentences=bigram[choo_choo_train],
        min_count=50,
        threshold=10.0,
    )
    # apply bigram detection first, then trigram detection, before training word2vec
    model = word2vec.Word2Vec(
        sentences=trigram[bigram[choo_choo_train]],
        min_count=100,
        size=100,
        workers=4,
    )
    model.init_sims(replace=True)
    model.save(OUTPUT_FILE)
def __init__(self, model_path, create=False, corpus=None, bigrams=True):
    """
    Initializes the rewriter, given a particular Word2Vec corpus.
    A good example corpus is the Wikipedia Text8Corpus. You only need the
    corpus if you are recreating the model from scratch.

    If ``create == True``, this generates a new Word2Vec model (which takes
    a really long time to build.) If ``False``, this loads an existing model
    we already saved.

    :param str model_path: where to store the model files. This file needn't
        exist, but its parent folder should.
    :param bool create: True to create a new Word2Vec model, False to use the
        one stored at ``model_path``.
    :param Iterable corpus: only needed if ``create=True``. Defines a corpus
        for Word2Vec to learn from.
    :param bool bigrams: only needed if ``create=True``. If True, takes some
        more time to build a model that supports bigrams (e.g. `new_york`).
        Otherwise, it'll only support one-word searches. ``bigrams=True``
        makes this slower but more complete.
    """
    self.model_path = model_path

    # TODO: add logic around defaulting to creating or not
    if create:
        # generate a new Word2Vec model... takes a while!
        # TODO optimize parameters
        transformed_corpus = None
        if bigrams:
            # TODO save the phraser somewhere... but that requires
            # even more arguments.
            # the Phrases class lets you generate bigrams, but the
            # Phraser class is a more compact version of the same
            # TODO making the phrases takes forever, making the phraser
            # takes forever, turning it into a list takes forever... this
            # is really annoying. is there any way to speed it up?
            bigram_generator = phrases.Phraser(phrases.Phrases(corpus))
            # weird bug where the bigram generator won't work unless
            # it's turned into a list first. if you try to do it straight,
            # it'll give you total gibberish. FIXME
            bigram_corpus = list(bigram_generator[corpus])
            transformed_corpus = bigram_corpus
        else:
            # no bigrams, same old corpus
            transformed_corpus = corpus

        self.model = word2vec.Word2Vec(transformed_corpus, workers=8)
        self.model.save(self.model_path)
    else:
        self.model = word2vec.Word2Vec.load(self.model_path)
def addSentence(self, sentence):
    try:
        #f=open("w2v_"+self.name,"r")
        model = gensim.models.KeyedVectors.load_word2vec_format("w2v_" + self.name)
        weights = model.syn0
    except FileNotFoundError:
        print(len(sentence))
        ph = phrases.Phrases(sentence)
        bigram_transformer = phrases.Phraser(ph)
        trigram = phrases.Phrases(bigram_transformer[sentence])
        ngram = phrases.Phrases(trigram[sentence])
        #ngram=phrases.Phrases(trigram[bigram_transformer[sentence]])
        model = Word2Vec(ngram[trigram[bigram_transformer[sentence]]],
                         size=40000,
                         window=5,
                         min_count=1,
                         workers=4,
                         sg=0,
                         iter=80)
        model.wv.save_word2vec_format("w2v_" + self.name)
        #print(sentence[1:10])
        #print("Fresh :",model["fresh"])
        #print("ताजा :",model["ताजा"])
        weights = model.wv.syn0
        #print(weights)
    np.save(open("embed" + self.name + ".txt", 'wb'), weights)
    vocab = dict([(k, v.index) for k, v in model.vocab.items()])
    with open("vocab" + self.name + ".txt", 'w', encoding='utf-8') as f:
        f.write(json.dumps(vocab))
    with open("vocab" + self.name + ".txt", 'r', encoding='utf-8') as f:
        data = json.loads(f.read())
    self.word2index = data
    self.index2word = dict([(v, k) for k, v in data.items()])
    self.n_words = len(model.vocab)
    print(self.name + ":", self.n_words)
def build_phrases(doc_list):
    # creating bigram Gensim Phrases:
    bigram = phrases.Phrases(delimiter='-')
    bigram_phrases = []
    for doc in doc_list:
        for sen in doc:
            sen = sen.replace('\n', '')
            sentence = [word for word in sen.split()]
            #print(sentence)
            bigram_phrases.append(sentence)
            # feed the whole tokenized sentence to the bigram model
            bigram.add_vocab([sentence])
    #bigram.add_vocab(bigram_phrases)
    trigram_phrases = []
    # creating trigram Gensim Phrases from the bigram-transformed sentences
    trigram = phrases.Phrases(bigram[bigram_phrases], delimiter='-')
    for sen in trigram[bigram[bigram_phrases]]:
        trigramSen = ' '.join(w for w in sen)
        trigram_phrases.append(trigramSen)
    return trigram_phrases
def build_model(corpus_path, detect_phrase=False):
    startTime = time.time()
    sentences = word2vec.LineSentence(corpus_path)
    if detect_phrase:
        bigram_transformer = phrases.Phrases(sentences)
        model = word2vec.Word2Vec(bigram_transformer[sentences],
                                  size=100,
                                  alpha=0.025,
                                  window=5,
                                  min_count=5,
                                  sample=1e-5,
                                  workers=4,
                                  sg=1)
    else:
        model = word2vec.Word2Vec(sentences,
                                  size=100,
                                  alpha=0.025,
                                  window=5,
                                  min_count=5,
                                  sample=1e-5,
                                  workers=4,
                                  sg=1)
    # no more training
    model.init_sims(replace=True)
    durationTime = time.time() - startTime
    sys.stderr.write("duration time = %f\n" % durationTime)
    return model
def run_on_all_books(books, bootstrap=True):
    """Runs word2vec training on data.

    Args:
        books: dictionary of titles to text (str)
        bootstrap: whether to bootstrap sample from the sentences
    """
    # Combine all text into a list of sentences
    print("Getting sentences...")
    all_sentences = []
    for title, book in books.items():
        all_sentences.extend(get_sentences(book))

    # Create model
    bigrams = phrases.Phrases(all_sentences,
                              min_count=5,
                              delimiter=b' ',
                              common_terms=stopwords)

    # Create vocabulary of bigrams
    print("Creating vocabulary...")
    vocab = [w for sent in bigrams[all_sentences] for w in sent]
    vocab = [w for w, count in Counter(vocab).most_common() if count >= 5]

    # Save vocab
    with codecs.open(os.path.join(args.output_dir, 'vocab.txt'),
                     'w', encoding='utf-8') as f:
        f.write('\n'.join(vocab))

    # Run word2vec model
    for run_idx in range(args.num_runs):
        print("Run #%d" % run_idx)
        if bootstrap:
            # sample sentence indices with replacement (np.random.choice needs a 1-d input)
            sampled_idx = np.random.choice(len(all_sentences),
                                           len(all_sentences),
                                           replace=True)
            data = bigrams[[all_sentences[i] for i in sampled_idx]]
        else:
            data = bigrams[all_sentences]
        model = word2vec.Word2Vec(data,
                                  size=args.dim,
                                  window=args.window,
                                  sg=1,
                                  min_count=5,
                                  workers=10)
        model.wv.save(os.path.join(args.output_dir, str(run_idx) + '.wv'))
def create_phrases_model():
    MyUtils.init_logging("Encode_Common.log")
    logging.info("Starting preparation of phrases...")
    docs_percent_touse = 1  #0.5.
    chunk_size = 10**5
    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    all_docwords = []
    for doc_file in doc_filenames:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            indices = list(
                sorted(
                    numpy.random.choice(len_c,
                                        int(docs_percent_touse * len_c),
                                        replace=False)))
            selected_rows = docs_chunk.iloc[indices]
            for tupl in selected_rows.itertuples():
                word_ls = ast.literal_eval(tupl.words)
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Reading in the documents' words. Chunk processed...")
    logging.info("Completed: reading in a set of documents' words"
                 )  # @ time = " + str(round(time1 - start, 3)))
    logging.info("Number of documents to use in the Phrases model: %s",
                 str(len(all_docwords)))
    del doc_filenames
    del doc_files
    collect()

    phrases_model = phrases.Phrases(sentences=all_docwords,
                                    min_count=20,
                                    threshold=300,
                                    delimiter=b'_',
                                    max_vocab_size=30 * 10**6)
    #phraser_model = phrases.Phraser(phrases_model)
    #time2 = time();
    logging.info(
        "Phrases model created")  #@ time = " + str(round(time2 - start, 3)))
    logging.info("Memory size in MBs = %s",
                 str(mem.asizeof(phrases_model) // 2**20))
    phrases_model.save(F.PHRASES_MODEL)
    return phrases_model
def explore_phrase2vec(min_freq, phrases_threshold):
    MyUtils.init_logging("Explore_Phrase2Vec.log")
    words_lls = []
    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    doc_files = [open(doc_filename, "r") for doc_filename in doc_filenames]
    all_docwords = []
    chunk_size = 10**5
    for doc_file in doc_filenames:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            #indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            #selected_rows = docs_chunk.iloc[indices]
            for tupl in docs_chunk.itertuples():
                #words = tupl.words.replace("'",'"')
                #logging.info(words)
                #word_ls = json.loads(words)#ast.literal_eval(tupl.words)
                word_ls = eval(tupl.words, {'__builtins__': {}})
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Added chunk from file %s to documents list...", doc_file)
    logging.info("Number of documents: %s", len(all_docwords))

    phrases_model = phrases.Phrases(sentences=all_docwords,
                                    min_count=min_freq,
                                    threshold=phrases_threshold,
                                    delimiter=b'_')
    #logging.info("***The Phrases model's frequency vocabulary: %s", str(phrases_model.vocab))
    phrases_vocab = phrases_model.vocab
    del phrases_model
    collect()

    sorted_vocabulary = sorted(list(phrases_vocab.items()),
                               key=lambda tpl: tpl[1],
                               reverse=True)
    # keep entries whose *key* contains the phrase delimiter, i.e. multi-word phrases
    phrases_sorted_vocabulary = list(
        filter(lambda tpl: '_' in str(tpl[0]), sorted_vocabulary))
    individual_words_sorted_vocabulary = list(
        filter(lambda tpl: not ('_' in str(tpl[0])), sorted_vocabulary))
    logging.info("***The vocabulary of phrases, ordered by frequency : %s ",
                 phrases_sorted_vocabulary)
    logging.info("***The vocabulary of words, ordered by frequency : %s ",
                 individual_words_sorted_vocabulary)
    #phrases_model.save("Exploration_phrasesModel_mincount"+ str(min_freq) + "_T"+str(phrases_threshold) + ".model")
    for i in range(len(words_lls) // 4):
        print(str(phrases_model[words_lls[i]]))
def word_modeling(tokens):
    from gensim.corpora import Dictionary
    from gensim.models import phrases, LdaModel

    bigram = phrases.Phraser(phrases.Phrases(tokens, min_count=2))
    for i, ts in enumerate(tokens):
        for btoken in bigram[ts]:
            if '_' in btoken and btoken not in tokens[i]:
                tokens[i].append(btoken)

    token_dict = Dictionary(tokens)
    corpus = [token_dict.doc2bow(t) for t in tokens]
    _ = token_dict[0]
    model = LdaModel(corpus=corpus,
                     id2word=token_dict.id2token,
                     chunksize=len(tokens),
                     alpha="auto",
                     eta="auto",
                     iterations=400,
                     num_topics=20,
                     passes=20,
                     eval_every=None)
    pprint.pprint(model.top_topics(corpus))
def cleanDocs(posts):
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    clean_docs = []
    bigram_docs = []
    for post in posts:
        stop_free = " ".join(
            [i for i in post.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        digit_free = [
            word for word in punc_free.split()
            if not word.isdigit() and len(word) > 2
        ]
        normalized = " ".join(lemma.lemmatize(word) for word in digit_free)
        # keep nouns and verbs (POS tags starting with 'N' or 'VB')
        nouns = [
            word[0] for word in nltk.pos_tag(normalized.split())
            if word[1].startswith('N') or word[1].startswith('VB')
        ]
        clean_docs.append(nouns)
    bigram_transformer = phrases.Phrases(clean_docs)
    for doc in bigram_transformer[clean_docs]:
        bigram_docs.append(doc)
    return bigram_docs
def word2vecmodel_gensim(text):
    # each element of the corpus should be a list of tokens
    corpus = text.values.tolist()
    bigrams = phrases.Phrases(corpus)
    # train word2vec on the bigram-transformed corpus
    model = gensim.models.Word2Vec(bigrams[corpus], min_count=10, size=100)
    return model
def main():
    # uncomment the file that we want to assign topic + sentiment to
    df_csv = pd.read_csv("dataset/phase1/non_replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase1/replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase1/replied_to.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase2/non_replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase2/replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase2/replied_to.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase3/non_replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase3/replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase3/replied_to.csv", encoding='ISO-8859-1')
    df_csv.head()
    textList = df_csv.values.tolist()
    print(len(textList))

    text = ""

    # ------- Loading dataset files -------
    # Uncomment the block corresponding to the phase that we want to assign topics and sentiment to
    # Uncomment the entire phase block if in the training phase. If in the assigning phase,
    # uncomment only the file we want to assign to.

    # Phase 1
    # with open("dataset/phase1/non_replies.csv", encoding='ISO-8859-1') as csvfile:
    #     text = csvfile.read()  # uncomment for either training or assigning phase
    # with open("dataset/phase1/replies.csv", encoding='ISO-8859-1') as csvfile:
    #     # text += csvfile.read()  # uncomment for training phase
    #     text = csvfile.read()  # uncomment for assigning phase
    # with open("dataset/phase1/replied_to.csv", encoding='ISO-8859-1') as csvfile:
    #     # text += csvfile.read()  # uncomment for training phase
    #     text = csvfile.read()  # uncomment for assigning phase

    # Phase 2
    # with open("dataset/phase2/non_replies.csv", encoding='ISO-8859-1') as csvfile:
    #     text = csvfile.read()  # uncomment for either training or assigning phase
    # with open("dataset/phase2/replies.csv", encoding='ISO-8859-1') as csvfile:
    #     # text += csvfile.read()  # uncomment for training phase
    #     text = csvfile.read()  # uncomment for assigning phase
    # with open("dataset/phase2/replied_to.csv", encoding='ISO-8859-1') as csvfile:
    #     # text += csvfile.read()  # uncomment for training phase
    #     text = csvfile.read()  # uncomment for assigning phase

    # Phase 3
    # with open("dataset/phase3/non_replies.csv", encoding='ISO-8859-1') as csvfile:
    #     text = csvfile.read()  # uncomment for either training or assigning phase
    # with open("dataset/phase3/replies.csv", encoding='ISO-8859-1') as csvfile:
    #     # text += csvfile.read()  # uncomment for training phase
    #     text = csvfile.read()  # uncomment for assigning phase
    # with open("dataset/phase3/replied_to.csv", encoding='ISO-8859-1') as csvfile:
    #     # text += csvfile.read()  # uncomment for training phase
    #     text = csvfile.read()  # uncomment for assigning phase

    # ------- Generating corpus -------
    nlp = spacy.load("en_core_web_sm")
    my_stop_words = [
        'https', 'co', 'from', 'text', 'subject', 're', 'edu', 'use', 'RT',
        'make', 'jerusalemembassy', 'jerusalem', 'Jerusalem', 'amp',
        'JerusalemEmbassy', 'usembassyjerusalem'
    ]
    for stopword in my_stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True
    nlp.max_length = 1547045
    doc = nlp(text)

    texts, article = [], []
    for w in doc:
        if (w.text != '\n' and not w.is_stop and not w.is_punct
                and not w.like_num and not w.like_url and w.is_ascii
                and not w.is_left_punct and not w.is_right_punct
                and w.lang_ == 'en' and w.is_alpha):
            article.append(w.lemma_)
        if w.text == '\n':
            texts.append(article)
            article = []
    texts = [x for x in texts if x != []]

    bigram = phrases.Phrases(texts)
    texts = [bigram[line] for line in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # ------- create LDA model (if in training phase) -------
    mallet_path = os.path.join('C:\\', 'new-mallet', 'mallet-2.0.8', 'bin',
                               'mallet.bat')
    # ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=dictionary)
    # ldamallet.save("phaseOne_full_model")
    # ldamallet.save("phaseTwo_full_model")
    # ldamallet.save("phaseThree_full_model")

    # ------- Loading LDA model (if in assigning phase) -------
    ldamallet = gensim.models.wrappers.LdaMallet.load(
        "LDAmodels\\phaseOne_full_model")
    # ldamallet = gensim.models.wrappers.LdaMallet.load("LDAmodels\\phaseTwo_full_model")
    # ldamallet = gensim.models.wrappers.LdaMallet.load("LDAmodels\\phaseThree_full_model")

    # ------- Assigning topics to each text -------
    def format_topics_sentences(ldamodel, corpus, texts):
        # Init output
        sent_topics_df = pd.DataFrame()

        # Get main topic in each document
        for i, row in enumerate(ldamodel[corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            dom_topic = ""
            perc_contrib = ""
            keywords = ""
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0 or j == 1:  # => top 2 dominant topics
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    dom_topic += str(topic_num)
                    perc_contrib += str(round(prop_topic, 4))
                    keywords += topic_keywords
                else:
                    sent_topics_df = sent_topics_df.append(pd.Series(
                        [dom_topic, perc_contrib, keywords]),
                                                           ignore_index=True)
                    break
        sent_topics_df.columns = [
            'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'
        ]

        # Add original text to the end of the output
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    # ------- Assigning sentiment to each text -------
    def getSentiment():
        analyzer = SentimentIntensityAnalyzer()
        sentimentResults = []
        for text in textList:
            for tweet in text:
                tweet = str(tweet)
                vs = analyzer.polarity_scores(tweet)
                if vs['compound'] > 0.1:
                    sentimentResults.append("positive")
                elif vs['compound'] < -0.1:
                    sentimentResults.append("negative")
                else:
                    sentimentResults.append("neutral")
        return sentimentResults

    # ------- Calling topic and sentiment assignment functions -------
    df_topic_sents_keywords = pd.DataFrame()
    df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet,
                                                      corpus=corpus,
                                                      texts=textList)
    sentimentColumn = pd.Series(getSentiment())

    # ------- Adding sentiment column to full dataframe -------
    df_topic_sents_keywords = pd.concat(
        [df_topic_sents_keywords, sentimentColumn], axis=1)

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = [
        'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',
        'Text', 'Sentiment'
    ]
def save(self, kind, bigrams=True):
    print('Initializing split word phraser')
    stream = self.stream('sentences', 'list')
    split_word_model = phrases.Phrases(self.stream('sentences', 'list'))
    # first, reunite words that shouldn't be split;
    # remove all bigrams that don't merge into a real word
    split_word_phraser = phrases.Phraser(split_word_model)
    for word_tuple in list(split_word_phraser.phrasegrams.keys()):
        if not word_tuple[0] + word_tuple[1] in nlp.vocab:
            del split_word_phraser.phrasegrams[word_tuple]
    # we don't want the merged words to have a delimiter in them
    split_word_phraser.delimiter = b''

    if bigrams is True:
        print('Initializing bigram phraser')
        # now we actually look for bigrams
        stream = self.stream('sentences', 'list')
        bigram_model = phrases.Phrases(split_word_phraser[stream])
        # this phraser will catch bigrams that are very unique but less common
        bigram_model.min_count = 20
        bigram_model.threshold = 90
        bigram_phraser_threshold = phrases.Phraser(bigram_model)
        # this one will catch bigrams that are less unique but very common
        bigram_model.min_count = 70
        bigram_model.threshold = 60
        bigram_phraser_count = phrases.Phraser(bigram_model)

    if kind == 'documents':
        save_path = self.save_dir.joinpath('line_documents.txt')
    elif kind == 'sentences':
        sp.call(['rm -rf {}/line_sentences'.format(self.save_dir.name)],
                shell=True)
        save_dir = self.save_dir.joinpath('line_sentences')
        save_dir.mkdir(exist_ok=True)

    for i, tokenized_text in enumerate(self.stream('documents', 'spacy')):
        print('Writing {} in line-{} format'.format(self.raw_paths[i].name,
                                                    kind))
        if kind == 'sentences':
            save_path = save_dir.joinpath(self.raw_paths[i].name + '.txt')
        if kind == 'documents':
            document_tokens = []
        with save_path.open('a') as save_file:
            for sentence in tokenized_text.sents:
                sentence_tokens = []
                for token in sentence:
                    if token.pos_ in ['PROPN', 'NUM']:
                        sentence_tokens.append(token.pos_)
                    elif token.is_alpha and token.is_ascii and not token.is_oov:
                        sentence_tokens.append(token.text)
                sentence_tokens = split_word_phraser[sentence_tokens]
                if bigrams is True:
                    sentence_tokens = bigram_phraser_threshold[sentence_tokens]
                    sentence_tokens = bigram_phraser_count[sentence_tokens]
                if kind == 'sentences':
                    sentence_string = ' '.join(sentence_tokens)
                    if len(sentence_string) > 0:
                        save_file.write(sentence_string + '\n')
                if kind == 'documents':
                    document_tokens += sentence_tokens
            if kind == 'documents':
                document_string = ' '.join(document_tokens)
                save_file.write(document_string + '\n')
def train(self, data_iterator, **kwargs):
    # Train the phraser from gensim
    self.phraser = gensim_phrases.Phraser(
        gensim_phrases.Phrases(data_iterator, **kwargs))
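# For reference, a minimal standalone sketch of the same Phrases -> Phraser pattern
# used in the snippets above. The toy corpus and the query sentence below are invented
# for illustration; the exact merges depend on gensim's min_count/threshold scoring.
from gensim.models import phrases

# each "sentence" is a list of tokens
toy_sentences = [
    ["new", "york", "is", "big"],
    ["i", "love", "new", "york"],
    ["new", "york", "city"],
]

# learn collocation statistics, then freeze them into a lightweight Phraser
bigram_model = phrases.Phrases(toy_sentences, min_count=1, threshold=1)
bigram_phraser = phrases.Phraser(bigram_model)

# applying the phraser joins detected collocations with the delimiter (default '_')
print(bigram_phraser[["she", "moved", "to", "new", "york"]])
# expected output along the lines of: ['she', 'moved', 'to', 'new_york']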
    return porterStemmer.stem(text)
    return porterStemmer.stem_sentence(text)


def getTokenizedSentences(text):
    sentences = splitToSentences(text)
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentence = [
            token for token in tokenize(sentence, lower=True)
        ]
        tokenized_sentences.append(tokenized_sentence)
    return tokenized_sentences


bigram_phrases = phrases.Phrases(min_count=1, threshold=1)


def getbigramTokenizedSentences(text):
    """ note that this is an ongoing implementation, so the order of articles
    might change the results"""
    tokenized_sentences = getTokenizedSentences(text)
    bigram_phrases.add_vocab(tokenized_sentences)
    bigram_sentences = [
        bigram_phrases[sentence] for sentence in tokenized_sentences
    ]
    return bigram_sentences


def getWordAndBigrams2Freq(text):
    tokenized_sentences = getTokenizedSentences(text)
    bigram_phrases.add_vocab(tokenized_sentences)
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import phrases

df = pd.read_csv("dataset", header=1, names=["A", "B", "C", "D"])

########## use same preprocessing as specific dataset ##########

bigrams = phrases.Phrases(sentences)

epoch_list = [50]
size_list = [10]
for x in epoch_list:
    for y in size_list:
        vec_size = y
        max_epochs = x
        mode = 0
        model = Word2Vec(size=vec_size, sg=mode, iter=max_epochs)
        model.build_vocab(bigrams[sentences])
        model.train(bigrams[sentences],
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        model.save("10vector50epoch_new_tweet_word2vec_model")
print(data.head())
for text in tqdm(data['body']):
    if type(text) is str:
        text = text.lower()  #lowercase
        tokens = [
            tokenize.word_tokenize(t) for t in tokenize.sent_tokenize(text)
        ]
        sentences_2010.extend(tokens)

#train models
print('training (unigram) model')
model1a = Word2Vec(sentences_2010)  #default model, min_counts = 5
model1a.save("models/ED_4cat_snapshot1_lowercase.model")

print('training bigram model')
bigrams = phrases.Phrases(sentences_2010)
model1b = Word2Vec(bigrams[sentences_2010])  #default model, min_counts = 5
model1b.save("models/ED_4cat_snapshot1_bigrams_lowercase.model")

#%%
#GENERATE MODELS FOR SECOND ED SNAPSHOT LOWERCASE (2020)
sentences_2020_lower = []
data = pd.read_csv('ED_data/ED_data_2020.csv')
print(data.head())
for text in tqdm(data['body']):
    text = text.lower()  #make lowercase
    tokens = [tokenize.word_tokenize(t) for t in tokenize.sent_tokenize(text)]
    sentences_2020_lower.extend(tokens)

print('training first 2020 model')