def fit(n_topics, min_occur, iterations, passes, texts, wc):
    """ Fit an LDA model of n topics to texts. """
    bigrams = models.Phrases(texts, min_count=3)
    trigrams = models.Phrases(bigrams[texts], min_count=3)
    texts = [[w for w in t if (len(w) > 1 and wc[w] > min_occur)] for t in texts]
    texts = [text
             + [x for x in bigrams[text] if '_' in x]
             + [x for x in trigrams[text] if '_' in x]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    bows = [dictionary.doc2bow(text) for text in texts]
    lda = LdaModel(bows, id2word=dictionary, num_topics=n_topics,
                   alpha='auto', eta='auto',
                   iterations=iterations, passes=passes, eval_every=1)
    coherence_model_lda = CoherenceModel(model=lda, texts=texts,
                                         dictionary=dictionary, coherence='c_v')
    coherence = coherence_model_lda.get_coherence()
    return Namespace(**locals())
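# Minimal usage sketch for fit() above (assumptions: `models`/`corpora` come from gensim,
# LdaModel/CoherenceModel from gensim.models, Namespace from argparse; the toy corpus and
# parameter values below are purely illustrative).
from collections import Counter

docs = [["machine", "learning", "model"], ["deep", "learning", "network"]] * 10
wc = Counter(tok for doc in docs for tok in doc)
result = fit(n_topics=2, min_occur=0, iterations=50, passes=5, texts=docs, wc=wc)
print(result.coherence)           # c_v coherence computed inside fit()
print(result.lda.show_topics())   # the fitted LdaModel is returned in the Namespace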
def text_pre_processing(csvFile, columnNumberForText):
    # import data-set
    # colNum becomes an index, which should start at 0, and columns in spreadsheets
    # start at 1, so subtract 1 from columnNumberForText
    documents = importColumnFromCSV(fileName=csvFile, colNum=int(columnNumberForText) - 1, header=True)
    print "imported documents..."

    # phrase detection model training
    abstracts = []  # list of abstracts containing a list of words
    for line in documents:
        # tokenize abstract
        tokens = nltk.word_tokenize(remove_non_ascii(line))
        abstracts.append(tokens)

    # create bigram and trigram phrase models
    bigram = models.Phrases(abstracts)
    trigram = models.Phrases(bigram[abstracts])
    print "built bigram and trigram phrase detection models..."

    # text pre-processing tools
    stops = get_stopwords('en')  # stronger stopwords
    STOPS = list(' '.join(str(e).title() for e in stops).split())  # uppercase stopwords
    noNum = re.compile(r'[^a-zA-Z ]')  # number and punctuation remover

    # function that cleans the text
    def clean(text):
        clean_text = noNum.sub(' ', text)  # remove numbers and punctuation
        tokens = nltk.word_tokenize(clean_text)  # tokenize text
        filtered_words = [w for w in tokens if w not in stops]  # filter out lowercase stopwords
        double_filtered_words = [w for w in filtered_words if w not in STOPS]  # filter out uppercase stopwords
        trigrams = trigram[bigram[double_filtered_words]]  # apply the bigram and trigram models to the filtered words
        trigrams_str = ' '.join(str(x) for x in trigrams)  # stringify clean and filtered tokens
        return trigrams_str

    results = []  # create list for storing clean abstracts

    # figure out path for the text corpus
    rawFilePathBase = os.path.basename(csvFile)
    rawFileName = os.path.splitext(rawFilePathBase)[0]
    corpusPath = "../../data/" + rawFileName + "_textCorpus.txt"

    # write list of clean text documents to text corpus file
    with open(corpusPath, 'w') as f:
        print 'Cleaned up text corpus file has been created at ', corpusPath, ' ...'
        f.truncate()  # if file is not empty, remove everything inside the file
        for abstract in documents:
            text = clean(abstract)  # clean each abstract, one at a time
            f.write(text + '\n')  # write clean abstract to desired text corpus file
            results.append(text)  # append clean abstracts to list

    return results, corpusPath  # return a list of clean abstracts and the corpus path
def build_phrases(filename='../data/bigrams'):
    """
    This script finds bi-grams in our corpus and stores the results to disk.
    """
    bigram = models.Phrases(MyText())
    bigram.save(filename)
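# Companion sketch (not part of the original): once build_phrases() has been run, the
# saved collocation model can be reloaded and applied to new token lists; the path and
# example sentence below are illustrative only.
bigram = models.Phrases.load('../data/bigrams')
print(bigram[['new', 'york', 'is', 'a', 'city']])  # e.g. ['new_york', 'is', 'a', 'city']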
def fit(self, X, y=None):
    """ Fit the model according to the given training data. """
    self.gensim_model = models.Phrases(
        sentences=X, min_count=self.min_count, threshold=self.threshold,
        max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
        progress_per=self.progress_per)
    return self
def fit(self, X, y=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : iterable of list of str
        Sequence of sentences to be used for training the model.

    Returns
    -------
    :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
        The trained model.

    """
    self.gensim_model = models.Phrases(
        sentences=X, min_count=self.min_count, threshold=self.threshold,
        max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
        progress_per=self.progress_per, scoring=self.scoring,
        connector_words=self.connector_words,
    )
    self.phraser = FrozenPhrases(self.gensim_model)
    return self
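# For context, a minimal standalone sketch of what the fit() above wraps, using the plain
# gensim 4 Phrases API directly (toy sentences and parameter values are illustrative):
from gensim.models.phrases import Phrases, FrozenPhrases, ENGLISH_CONNECTOR_WORDS

sents = [["the", "new", "york", "times"], ["new", "york", "city"], ["bank", "of", "america"]] * 5
phrases_model = Phrases(sents, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)
frozen = FrozenPhrases(phrases_model)    # the lightweight object the wrapper keeps as self.phraser
print(frozen[["new", "york", "city"]])   # e.g. ['new_york', 'city']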
def load_save_word2vec_model(line_words, model_filename):
    # model parameters
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    negative = 3  # best: use hierarchical softmax for sampling (negative sampling favours common words) rather than negative sampling (which favours rare words)
    iter = 20

    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size,
                                window=content_window, iter=iter, min_count=freq_min_count,
                                negative=negative, workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
def partial_fit(self, X): """Train model over a potentially incomplete set of sentences. This method can be used in two ways: 1. On an unfitted model in which case the model is initialized and trained on `X`. 2. On an already fitted model in which case the X sentences are **added** to the vocabulary. Parameters ---------- X : iterable of list of str Sequence of sentences to be used for training the model. Returns ------- :class:`~gensim.sklearn_api.phrases.PhrasesTransformer` The trained model. """ if self.gensim_model is None: self.gensim_model = models.Phrases( sentences=X, min_count=self.min_count, threshold=self.threshold, max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring, common_terms=self.common_terms) self.gensim_model.add_vocab(X) self.phraser = Phraser(self.gensim_model) return self
def partial_fit(self, X):
    if self.gensim_model is None:
        self.gensim_model = models.Phrases(
            sentences=X, min_count=self.min_count, threshold=self.threshold,
            max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
            progress_per=self.progress_per)

    self.gensim_model.add_vocab(X)
    return self
def train_topic(
    self,
    num_topics,
    no_below=1,
    no_above=0.9,
    keep_n=None,
    keep_tokens=None,
    remove_most_freq_n=None,
    bad_tokens=None,
    model="ldamulticore",
    bigrams=True,
    **kwargs,
):
    """
    no_below (int|None): Keep tokens which are contained in at least no_below documents.
    no_above (float|None): Keep tokens which are contained in no more than no_above
        documents (fraction of total corpus size, not an absolute number).
    keep_n (int|None): Keep only the first keep_n most frequent tokens.
    keep_tokens (iterable of str): Iterable of tokens that must stay in the dictionary after filtering.
    remove_most_freq_n (int|None): Remove the n most frequent tokens.
    model ('ldamulticore'|'lda'|'ldamallet')
    """
    if bigrams is True:
        phrases = models.Phrases(self.tokenlists, delimiter=b" ")
        phraser = models.phrases.Phraser(phrases)
        self.tokenlists = [phraser[tl] for tl in self.tokenlists]

    dictionary = corpora.Dictionary(self.tokenlists)
    if remove_most_freq_n:
        dictionary.filter_n_most_frequent(remove_most_freq_n)
    dictionary.filter_extremes(
        no_below=no_below, no_above=no_above, keep_n=keep_n, keep_tokens=keep_tokens
    )
    bows = [dictionary.doc2bow(tl) for tl in self.tokenlists]
    if bad_tokens:
        # filter_tokens expects ids, so map the token strings through token2id
        dictionary.filter_tokens(
            bad_ids=[dictionary.token2id[tok] for tok in bad_tokens]
        )
    self.bows = bows
    self.dictionary = dictionary

    if model == "ldamulticore":
        self.model = models.LdaMulticore(
            bows, num_topics=num_topics, id2word=dictionary, **kwargs
        )
    if model == "lda":
        self.model = models.LdaModel(
            bows, num_topics=num_topics, id2word=dictionary, **kwargs
        )
    if model == "ldamallet":
        raise ValueError("mallet is not yet implemented")
def transform(self, data):
    """Transform training data."""

    # For gensim we need to tokenize the data and filter out stopwords
    self.tokens = [clean_text(doc, stopwords_) for doc in data]

    # bigrams
    if self.bigrams:
        bigram = models.Phrases(
            self.tokens, min_count=5,
            threshold=100)  # higher threshold, fewer phrases
        bigram_mod = models.phrases.Phraser(bigram)
        self.tokens = make_bigrams(self.tokens, bigram_mod)

    # trigrams
    if self.trigrams:
        bigram = models.Phrases(self.tokens, min_count=5, threshold=100)
        bigram_mod = models.phrases.Phraser(bigram)
        trigram = models.Phrases(bigram[self.tokens], threshold=100)
        trigram_mod = models.phrases.Phraser(trigram)
        self.tokens = make_trigrams(self.tokens, bigram_mod, trigram_mod)

    # lemmatization
    if self.lemmatization:
        # Initialize spacy 'en_core_web_sm' model, keeping only the tagger component (for efficiency)
        spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

        # Do lemmatization keeping only noun, adj, vb, adv
        self.tokens = do_lemmatization(
            spacy_nlp=spacy_nlp, texts=self.tokens,
            allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Again remove stopwords after doing lemmatization
        self.tokens = [[token for token in doc if token not in stopwords_]
                       for doc in self.tokens]

    # Build a Dictionary - association word to numeric id
    self.dictionary = corpora.Dictionary(self.tokens)

    # Transform the collection of texts to a numerical form [(word_id, count), ...]
    self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]

    # tf-idf vectorizer
    if self.tf_idf:
        self._tfidf_model = models.TfidfModel(self.corpus, id2word=self.dictionary)
        self.corpus = self._tfidf_model[self.corpus]
def collocs(texts):
    prev = texts
    bigram = models.Phrases(texts)
    texts = [bigram[x] for x in texts]  # materialise as a list so the equality check below works
    if prev == texts:
        return texts
    else:
        return collocs(texts)
def __init__(self, pdf_file):
    self.extractor = pdf_extractor.PdfExtractor(pdf_file)

    # text scraping from pdf
    self.raw_corpus, self.pages, self.headers = self.extractor.get_corpus_pages_headers()
    # bpbpbp: pages -> docid
    # bpbpbp: headings -> headings

    # get stop words
    nltk.download("stopwords")
    stoplist = set(stopwords.words('english'))

    # process raw_corpus (i.e. list of strings)
    tokenizer = RegexpTokenizer(
        '\w[\w-]*|\d(?:\d|,\d|\.\d)*')  # any word with hyphens or number with decimals/commas
    t_corpus = [tokenizer.tokenize(s.lower()) for s in self.raw_corpus]  # tokenized corpus

    # find bigrams (phrases of 2 words)
    bigram_ct = models.Phrases(t_corpus, common_terms=stoplist)
    # for all bigrams in t_corpus, combine into 1 word
    t_corpus = bigram_ct[t_corpus]

    # filter out stopwords from t_corpus
    st_corpus = [[w for w in t_txt if w not in stoplist]
                 for t_txt in t_corpus]  # stop_word_token_corpus

    # # grams
    # ct_ngrams = set((g[1], g[0]) for g in bigram_ct.export_phrases(t_corpus))
    # ct_ngrams = sorted(list(ct_ngrams))
    # print(len(ct_ngrams), "grams with common terms found")
    # # highest scores
    # print(ct_ngrams[-20:])

    # Count word frequencies
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in st_corpus:
        for token in text:
            frequency[token] += 1

    # Only keep words that appear more than once
    processed_corpus = [[token for token in text if frequency[token] > 1]
                        for text in st_corpus]

    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
    tfidf = models.TfidfModel(bow_corpus)

    self.id_to_score_corpus = tfidf[bow_corpus]  # bpbpbp: vector
    self.processed_corpus = processed_corpus  # bpbpbp: tokens
def bigram(self, threshold=10):
    '''
    Optional. Create bigrams.
    '''
    # Collocation detector trained on the data
    phrases = models.Phrases(self.tokenised, threshold=threshold)
    bigram = models.phrases.Phraser(phrases)
    self.tokenised = bigram[self.tokenised]
    return self
def __init__(self, lang, tokenizer=None, load=True):
    self.lang = lang
    self.tokenizer = tokenizer or Tokenizer(lang)
    dirname = join(nlp_data, lang)
    dict_fname = join(dirname, DICTIONARY_FNAME)
    phrase_fname = join(dirname, PHRASES_FNAME)
    if load and exists(phrase_fname):
        self.phrases = gmodels.Phrases.load(phrase_fname)
    else:
        self.phrases = gmodels.Phrases()
    if load and exists(dict_fname):
        self.dictionary = corpora.Dictionary.load(dict_fname)
    else:
        self.dictionary = corpora.Dictionary()
def make_ngram(tokenised_corpus, n_gram=2, threshold=10):
    """Extract n-grams from a tokenised corpus.

    Args:
        tokenised_corpus (list): List of tokenised documents
        n_gram (int): maximum length of n-grams. Defaults to 2 (bigrams)
        threshold (int): min number of n-gram occurrences before inclusion

    Returns:
        ngrammed_corpus (list)
    """
    tokenised = tokenised_corpus.copy()
    t = 1
    # Loop while the n-gram length is less than or equal to our target
    while t < n_gram:
        phrases = models.Phrases(tokenised, threshold=threshold)
        bigram = models.phrases.Phraser(phrases)
        tokenised = bigram[tokenised]
        t += 1
    return list(tokenised)
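# Brief usage sketch for make_ngram() above, on a toy tokenised corpus (data and
# threshold values are illustrative only).
corpus = [["new", "york", "city"], ["new", "york", "times"], ["los", "angeles"]] * 10
trigrammed = make_ngram(corpus, n_gram=3, threshold=1)
print(trigrammed[0])  # tokens such as 'new_york' appear once the pair clears the threshold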
def make_bigrams(self):
    texts = self.df[self.column_with_text]
    bigram = models.Phrases(texts, min_count=3, threshold=5)
    bigram_mod = models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in texts]
def depunct(tokens):
    """Remove punctuation from the tokens."""
    return [token.translate(None, string.punctuation) for token in tokens]


data = pd.read_csv('data/calvin.csv')
documents = data['quote'].tolist()
stoplist = stopwords.words('english')

sent = [d.lower().split() for d in documents]
texts = [
    depunct(
        [word for word in document.lower().split() if word not in stoplist])
    for document in documents
]

bigrams_model = models.Phrases(texts)
bigrams = list(bigrams_model[texts])
trigrams_model = models.Phrases(bigrams)
trigrams = list(trigrams_model[bigrams])

sent.extend(trigrams)
sent.extend(bigrams)

model = models.Word2Vec()
model.build_vocab(sent)
model.train(sent)

chain = ['calvin', 'tiger', 'hobbes', 'mom']
pprint.pprint([
    k for k, v in model.most_similar(positive=chain, negative=[], topn=50)
    if '_' in k
])
def train(texts, tokentype='lemma', allowed_pos=["NOUN", "ADJ", "VERB", "PROPN"], out_path=None):
    '''Run gensim phrase detection, remove empty documents, keep dates. Returns lists of tokens.

    Parameters
    ----------
    texts : list
        Assuming that texts have _already_ been preprocessed using text_to_x
        (i.e. tokenization, lemmatization & feature selection)

    tokentype : str
        Either "token" or "lemma"

    allowed_pos : list
        uPOS that will be kept in texts. Use stanza tags.

    out_path : str (optional)
        path to a directory, where results will be saved (in a child directory).
    '''
    # convert to a nice format
    # keep only "meaningful" POS
    # (i.e. noun, propnoun, adj, verb, adverb)
    texts_filter = []
    for doc in texts:
        allowed_keys = [key for key, value in doc['upos'].items() if value in allowed_pos]
        texts_filter.append([word for key, word in doc[tokentype].items() if key in allowed_keys])

    # initialize phrase detection
    phrases = models.Phrases(texts_filter, delimiter=b" ")
    # find phrases
    phraser = models.phrases.Phraser(phrases)
    # extract texts with phrases detected
    phrase_list = [phraser[tl] for tl in texts_filter]

    # missing any data?
    assert len(texts_filter) == len(phrase_list)

    # put together IDs and documents
    phrase_doc = []
    for (i, tweet) in enumerate(phrase_list):
        d = dict()
        d['id'] = i
        d['text'] = tweet
        phrase_doc.append(d)

    # remove empty tweets
    phrase_doc = [doc for doc in phrase_doc if doc['text']]

    # if saving enabled
    if out_path:
        # check if the file extension is specified
        if out_path.endswith('.ndjson'):
            pass
        # add it automatically if not
        else:
            print("Adding file extension (.ndjson)")
            out_path = out_path + '.ndjson'

        # export it
        with open(out_path, 'w') as f:
            ndjson.dump(phrase_doc, f)

    return phrase_doc
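# Hypothetical usage of train() above; the input format (per-document dicts with 'upos'
# and 'lemma' maps keyed by token position) mirrors what the docstring says text_to_x
# produces, but the example data here is invented purely for illustration.
docs = [
    {"upos": {0: "NOUN", 1: "VERB", 2: "NOUN"},
     "lemma": {0: "dog", 1: "chase", 2: "cat"}},
]
phrase_docs = train(docs, tokentype="lemma")
print(phrase_docs[0]["text"])  # ['dog', 'chase', 'cat'] (no phrases in a single tiny doc)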
# Load the pandas dataframe with all info on bills and their text.
bill_df = pd.load('fullbilldetails_dataframe')
documents = bill_df['full_cleaned_text']
aa = bill_df['full_cleaned_text']

for i in range(len(documents)):
    documents[i] = documents[i].replace('-', ' ')
    documents[i] = documents[i].replace('ab ', ' ')
    documents[i] = documents[i].replace('_', ' ')

# playing with bigrams
doclist = []
for doc in documents:
    doclist.append(doc.split())

bigram = models.Phrases(doclist)
trigram = models.Phrases(bigram[doclist])

stoplist = stop_word_list()

"""
------------------------------------------------------------
Testing bigrams/trigrams. Bigrams worked best.
0 trigrams added with frequency greater than 1.
------------------------------------------------------------
"""

#texts = [[word for word in document.lower().split() if (word not in stoplist and len(word)>2)]
#         for document in documents]
#texts = [[word for word in document if (word not in stoplist and len(word)>2)]
#         for document in doclist]
allowed_pos = [
    'ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'
]  # only words having these parts of speech will be included in the dataset

for document in data:  # loop over the documents one by one
    text = []
    doc = nlp(document)  # return a spacy document object for processing
    for w in doc:  # loop over single tokens of the document object
        # eliminate stop words, punctuation, numbers and words whose POS is not in the
        # list above, using the features provided by spacy
        if (not w.is_stop) and (not w.is_punct) and (not w.like_num) and (w.pos_ in allowed_pos):
            text.append(w.lemma_)  # take only the lemma of the word
    texts.append(text)  # append the documents one by one to the 'texts' list created earlier

# create a bigram model using the Phrases class of gensim and fit it to the dataset
bigram = models.Phrases(texts, min_count=1, threshold=1)
# transform the texts, creating the actual bigrams
texts = [bigram[data] for data in texts]
print(texts)

# create a bag-of-words dictionary using the Dictionary class of gensim
dictionary = corpora.Dictionary(texts)
# create the bag-of-words corpus from the Dictionary
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

# run gensim's LdaModel for topic modelling with the bag-of-words corpus, the dictionary,
# the number of topics and the number of passes the model should run
lda = models.ldamodel.LdaModel(
    corpus=corpus, id2word=dictionary, num_topics=20, passes=1000
)
    return texts_out


#Remove Stopwords
stoplist = stopwords.read().splitlines()
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in tweetList
]
tweetList = list(sent_to_words(texts))

# Build the bigram and trigram models
bigram = models.Phrases(tweetList, min_count=5, threshold=100)  # higher threshold fewer phrases.
trigram = models.Phrases(bigram[tweetList], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)

texts = make_bigrams(texts)

nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
texts = lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
from gensim import models, matutils
import numpy as np  # array handling
import os, warnings

model_filename = 'D:/MICCAI/MICCAI Corpora/2017/Bigram2017.model'
input_filename = 'D:/MICCAI/MICCAI Corpora/2018/Bigrams2018.txt'
newModel_filename = 'D:/MICCAI/MICCAI Corpora/2018/Bigram2018.model'

# ignore unicode warnings
# (they don't cause any problems, just ugly output from this code)
warnings.filterwarnings('ignore', '.*Unicode.*')

# load existing model from file
print('loading model...')
model = models.Word2Vec.load(model_filename)
print('- done')

new_sentences = models.word2vec.LineSentence(input_filename)
model.build_vocab(new_sentences, update=True)
# model.train(new_sentences)
bigram_transformer = models.Phrases(new_sentences)
model.train(bigram_transformer[new_sentences],
            total_examples=model.corpus_count, epochs=model.iter)
model.save(newModel_filename)

# bye
print('all done, thank you!')
# Get the number of reviews based on the dataframe column size
num_patents = exampleData["PatentAbstract"].size

# Initialize an empty list to hold the clean reviews
clean_abstracts = []

# Loop over each review; create an index i that goes from 0 to the length
# of the patent list
for i in xrange(0, num_patents):
    # Call our function for each one, and add the result to the list
    patent = patent_to_words(exampleData["PatentAbstract"][i])
    array = patent.split()
    clean_abstracts.append(array)

# Identify bigrams using gensim's Phrases function
bigram = models.Phrases(clean_abstracts)

final_abstracts = []
for i in xrange(0, num_patents):
    sent = clean_abstracts[i]
    temp_bigram = bigram[sent]
    final_abstracts.append(temp_bigram)

# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(final_abstracts)

# convert tokenized documents into a document-term matrix (bag-of-words)
corpus = [dictionary.doc2bow(text) for text in final_abstracts]

# TF-IDF
def col1(texts):
    prev = texts
    bigram = models.Phrases(texts)
    texts = [bigram[x] for x in texts]  # materialise as a list rather than a lazy map
    return texts
    os.path.join(path_save_data, 'Recomended_Gram_Tab.csv'))

#
NotRecommended_gramtab = gram_table(
    data=documents[df["Recommended IND"].astype(int) == 1],
    gram=[1, 2, 3], length=20)
NotRecommended_gramtab.to_csv(
    os.path.join(path_save_data, 'NotRecomended_Gram_Tab.csv'))

##########################################################
# Creating Bigrams and Trigrams Models, higher threshold fewer phrases
##########################################################
print('Creating Bigrams Model...')

# Build the bigram models
t1 = time.time()
bigram = models.Phrases(texts, min_count=1, threshold=1)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = models.phrases.Phraser(bigram)
print('Time used: %s seconds' % (time.time() - t1))

# See example
print(bigram_mod[texts[9]])

texts_bigram = bigram_mod[texts]

#####################
# save
with open(os.path.join(path_save_data, 'Preprocessed Review Text Bigram.txt'), 'w') as f:
    for text in texts_bigram:
        for item in text:
            f.write("%s " % item)
        f.write("\n")
        doc = nlp((" ".join(doc)), disable=['ner', 'tagger', 'textcat'])
        texts_out.append([tok.lemma_ for tok in doc if tok.lemma_ != '-PRON-'])
    return pd.Series(texts_out)


text_preprocess(reviews_df.reviews.iloc[10:15])

# Commented out IPython magic to ensure Python compatibility.
# %time train_corpus = text_preprocess(reviews_df.reviews)
train_corpus = text_preprocess(reviews_df.reviews)

# create ngrams
ngram_phraser = models.Phrases(train_corpus, threshold=1)
ngram = models.phrases.Phraser(ngram_phraser)

# apply n-gram model to corpus
texts_1 = [ngram[token] for token in train_corpus]

# adding it to dataframe
texts_1 = [' '.join(text) for text in texts_1]
reviews_df['ngram'] = texts_1
reviews_df.head()


def createLabelsFromReviewPoints(df):
    # this function creates a new column which will be our classification label, like low/medium/high
    df['class'] = df.apply(lambda row: label_reviews(row), axis=1)
    return df
#Tokenising
tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
english_stopwords = get_stopwords('en')
english_stopwords.append('reuters')
english_stopwords.append('said')

token_content = []
processed_content = []
for article in content:
    tokens = tokenizer.tokenize(article.lower())
    token_content.append(tokens)
    stopped_tokens = [i for i in tokens if i not in english_stopwords]
    processed_content.append(stopped_tokens)

# Creating a bigram model
bigram = models.Phrases(token_content, min_count=5, threshold=100)
bigram_mod = models.phrases.Phraser(bigram)
bigram_content = [bigram_mod[i] for i in processed_content]

# lemmatisation
nlp = spacy.load('en', disable=['parser', 'ner'])
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
lemmatised_content = []
for each_article in bigram_content:
    try:
        doc = nlp(" ".join(each_article))
        lemmatised_content.append([tokens1.lemma_ for tokens1 in doc if tokens1.pos_ in allowed_postags])
    except:
        print(each_article)
    result = ''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "  # remove stopwords
    words = pseg.lcut(content)
    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and flag[0] in [u'n', u'f', u'a', u'z']):
            # remove stopwords and words with other POS tags, e.g. anything that is not a noun/verb etc.
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "  # remove stopwords
    return result


input = []
for sentence in sentences:
    sentence = delNOTNeedWords(sentence, stopwords)
    input.append(jieba.lcut(sentence))

bigram_transformer = models.Phrases(input)
model = models.Word2Vec(bigram_transformer[input], size=feature_size, window=content_window,
                        min_count=freq_min_count, negative=negative, iter=iter,
                        workers=multiprocessing.cpu_count())
# print model.index2word
model.save(save_filename)

f = model.most_similar([u'奥迪'])
for k in f:
    print k[0].encode('utf-8'), k[1]

"""
model.most_similar(positive=['woman', 'king'], negative=['man'])
"""
num_row = 0
for row in data:
    doc_pos = engin.pos(row[0])
    doc_pos = doc_pos_tokenizer(doc_pos)
    doc_info = (row[0], doc_pos)
    doc_list.append(doc_info)
    doc_dic.append(doc_pos)
    num_row += 1
    print("Number of nouns in sentence {}: ".format(num_row) + str(len(doc_pos)))

print("Total number of sentences: " + str(len(doc_dic)))
print("head : " + "< " + row_data[0] + " >")
print('\n' + "=" * 75)

# n-gram model: https://wikidocs.net/21692
bigram = models.Phrases(doc_dic, min_count=5, threshold=100)
trigram = models.Phrases(bigram[doc_dic], threshold=100)
bigram_mod = models.phrases.Phraser(bigram)
trigram_mod = models.phrases.Phraser(trigram)

doc_dic_bi_tri = []
for i in range(0, len(doc_dic)):
    doc_dic_bi_tri.append(trigram_mod[bigram_mod[doc_dic[i]]])

dictionary = corpora.Dictionary(doc_dic_bi_tri)
dictionary.save('dictionary.dict')

corpus = [
    dictionary.doc2bow(a_doc_dic_bi_tri) for a_doc_dic_bi_tri in doc_dic_bi_tri
]
print("Reading data") data = pd.read_csv( "/Users/dmitrys/Yandex.Disk.localized/top_russian_music/comments/union_superreduced_comments.csv" ) print(data.shape) data = data[~data.text_bow.isnull()] print("Cleaning data") data = data[data['comment_len'] > 40] texts = data.text_bow.apply(literal_eval) print(data.shape) print("Bigrams") # Build the bigram models bigram = models.Phrases(texts, min_count=3, threshold=5) # higher threshold fewer phrases. # Faster way to get a sentence clubbed as a bigram bigram_mod = models.phrases.Phraser(bigram) def make_bigrams(texts): return [bigram_mod[doc] for doc in texts] texts = make_bigrams(texts) print("Corpora") dictionary = corpora.Dictionary(texts) # составляем словарь из терминов print('Размер словаря до фильтрации: {}'.format(len(dictionary)))