def main():
    # -------------------------------------------------------------------------------
    # Parameters

    # the script will most likely work if we swap the TEXTS variable
    # with any iterable of text (where one element represents a document,
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset='train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built into spacy; we can always
    # expand this set for the problem that we are working on,
    # here we include python's built-in string punctuation marks
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs of the later sections
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')
    # -------------------------------------------------------------------------------

    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH, texts=TEXTS, parser=NLP, stopwords=STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text as opposed to loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
def _phrase_detection_(fpath=fpathroot + fpathappend, passes=2, returnmodels=True, threshold=10.):
    """
    This function does phrase modeling. The user specifies the number of passes.
    Each pass detects longer phrases: after pass n, phrases of up to 2**n words
    can be detected. Returns the list of models by default. Also saves the models
    and the intermediate phrased sentences for each pass.
    """
    generpath = fpath + '_sent_gram_0.txt'
    ngram = list()
    for it in range(passes):
        gen = LineSentence(generpath)
        gram = Phrases(gen, threshold=threshold)
        ngram.append(gram)
        modelpath = fpath + 'phrase_model_gram_' + str(it + 1)
        generpath = fpath + 'sent_gram_' + str(it + 1) + '.txt'
        gram.save(modelpath)
        # Write the phrased sentences produced by this pass
        with codecs.open(generpath, 'w', encoding='utf_8') as f:
            for sent in gen:
                new_sent = u' '.join(gram[sent])
                f.write(new_sent + '\n')
    if returnmodels:
        return ngram
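# --- Usage sketch (not part of the original snippet) ---------------------------
# A minimal, hypothetical example of driving the multi-pass phrase detection
# above. It assumes a whitespace-tokenized sentence file named
# '<fpath>_sent_gram_0.txt' already exists on disk; 'corpus/' is an illustrative
# prefix, not a path from the original code.
ngram_models = _phrase_detection_(fpath='corpus/', passes=2, threshold=10.)

# Applying the saved models in training order turns unigrams into bigrams,
# then pairs of adjacent tokens into phrases of up to four words.
sentence = "new york stock exchange opened higher".split()
for gram in ngram_models:
    sentence = gram[sentence]
print(sentence)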
def build_trigrams(self): self.bigram_model = Phrases([ doc.split(" ") for doc in self.lemmatized_sentence_corpus( self.df_prod['reviews'].values) ], min_count=2) bigram_sentences = [] for unigram_sentence in self.lemmatized_sentence_corpus( self.df_prod['reviews'].values): bigram_sentences.append(' '.join( self.bigram_model[unigram_sentence.split(" ")])) self.trigram_model = Phrases( [doc.split(" ") for doc in bigram_sentences], min_count=2) self.trigrams_doc = [] for doc in self.df_prod['reviews'].values: parsed_doc = self.wp.nlp(doc) bigram_doc = ' '.join( self.bigram_model[(token.lemma_ for token in parsed_doc if self.keep_token(token))]) trigram_doc = ' '.join( self.trigram_model[(token for token in bigram_doc.split(" "))]) self.trigrams_doc.append( self.trigram_model[(token for token in bigram_doc.split(" "))])
def process_tokens(self, lemmatize=True, lower=True, phrases=True): tokens = [ [ token for token in raw_token # TODO: Add like_num option? if (token.pos_ in self.keep_pos) and (not token.is_stop) and ( token.is_alpha) ] for raw_token in tqdm( self.docs, total=self.n_docs, desc="Processing tokens") ] if lemmatize: tokens = [[token.lemma_ for token in doc] for doc in tokens] else: tokens = [[token.text for token in doc] for doc in tokens] if lower: tokens = [[token.lower() for token in doc] for doc in tokens] if phrases: # TODO: Add n-gram pattern matching with spacy bigrams = Phrases(tokens, delimiter=b"_", min_count=2) trigrams = Phrases(bigrams[tokens], delimiter=b"_", min_count=2) # extract bigrams and trigrams tokens = [bigrams[doc] for doc in tokens] tokens = [trigrams[doc] for doc in tokens] return tokens
def trigramGenerator(self): corpusStream = self.sentenceStream() biGramPhrases = Phrases(corpusStream, min_count=self.bigramMinCount, threshold=self.thresholdBigram) bigram = Phraser(biGramPhrases) inputStream = self.sentenceStream() bigramSentenceList = (bigram[sentence] for sentence in inputStream) triGramPhrases = Phrases(bigramSentenceList, min_count=self.trigramMinCount, threshold=self.thresholdTrigram) trigram = Phraser(triGramPhrases) inputStream = self.sentenceStream() bigramSentenceList = (bigram[sentence] for sentence in inputStream) trigramSentenceList = (trigram[sentence] for sentence in bigramSentenceList) trigramList = set() for trigramSentence in trigramSentenceList: for item in trigramSentence: if "_" in item: trigramList.add(item) print("Number of Unique Trigrams = ", len(trigramList)) for item in sorted(trigramList): if not os.path.exists(self.trainingLocation): os.makedirs(self.trainingLocation) with open( os.path.join(self.trainingLocation, "TC-phrases-bi-tri.txt"), "a") as outFile: outFile.write(item + "\n")
def visualize_lda_model(): data = preprocess_to_lemmatization() stopwords_verbs = [ 'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see', 'want', 'come', 'take', 'use', 'would', 'can' ] stopwords_other = [ 'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also', 'copyright', 'something' ] my_stopwords = stopwords.words( 'english') + stopwords_verbs + stopwords_other data['tokens'] = data['tokens_sentences_lemmatized'].map( lambda sentences: list(chain.from_iterable(sentences))) data['tokens'] = data['tokens'].map(lambda tokens: [ token.lower() for token in tokens if token.isalpha() and token.lower() not in my_stopwords and len(token) > 1 ]) tokens = data['tokens'].tolist() bigram_model = Phrases(tokens) trigram_model = Phrases(bigram_model[tokens], min_count=1) tokens = list(trigram_model[bigram_model[tokens]]) dictionary_LDA = corpora.Dictionary(tokens) dictionary_LDA.filter_extremes(no_below=3) corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens] np.random.seed(123456) num_topics = 20 lda_model = models.LdaModel(corpus, num_topics=num_topics, \ id2word=dictionary_LDA, \ passes=4, alpha=[0.01]*num_topics, \ eta=[0.01]*len(dictionary_LDA.keys())) lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA) pyLDAvis.enable_notebook() return pyLDAvis.display(lda_viz)
def getTopics(jobs_): bigram_model = Phrases.load('data/bigram_model_all') trigram_model = Phrases.load('data/trigram_model_all') trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict') lda = LdaMulticore.load('data/lda_model_all') topic_names = {0:u'Risk Management Bank', 1:u'Big Data Report', 2:u'Automotive SAP', 3:u'Microsoft Java Scrum', 4:u'Medical Consultant', 5:u'Java Engineer', 6:u'Computer Vision Developer', 7:u'Data Analyst', 8:u'BI SAP BW', 9:u'IOT Reporting R', 10:u'Global Project Presentation', 11:u'Cloud Engineer IOT', 12:u'Industry 4.0', 13:u'Risk Consulting', 14:u'Machine Learning Data Science'} topics_ = [] for job_ in jobs_: if job_ is not None: #print(job_[0]) topics_.append(lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, job_[1], job_[0]))
def _preprocess(self, text, min_tok_len=1):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    lemm_stemm = lambda tok: WordNetLemmatizer().lemmatize(tok, pos='v')
    result = []

    # remove proper nouns
    tagged_sent = pos_tag(text.split())
    noProper = [word for word, pos in tagged_sent if pos != 'NNP']
    noProper = ' '.join(noProper)

    for token in simple_preprocess(noProper):
        if len(token) > min_tok_len and token not in stop_words:
            result.append(lemm_stemm(token))

    # Build the bigram and trigram models. Phrases expects an iterable of
    # sentences (lists of tokens), so the single tokenized document is wrapped
    # in a list; a higher threshold yields fewer phrases.
    bigram = Phrases([result], min_count=5, threshold=10)
    trigram = Phrases([bigram[result]], threshold=10)

    # Phraser gives a faster, frozen view of the model for transforming sentences
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    result = trigram_mod[bigram_mod[result]]
    return [result]
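# --- Alternative pattern (sketch, not from the original code) ------------------
# The method above retrains Phrases on every single document. A common
# alternative is to fit the phrase models once over the whole tokenized corpus
# and reuse the frozen Phraser objects per document; `corpus` below is purely
# illustrative data.
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess

corpus = [
    "the quick brown fox jumps over the lazy dog",
    "machine learning with gensim is fun",
]  # illustrative documents
corpus_tokens = [simple_preprocess(doc) for doc in corpus]

bigram_mod = Phraser(Phrases(corpus_tokens, min_count=5, threshold=10))
trigram_mod = Phraser(Phrases(bigram_mod[corpus_tokens], threshold=10))

def to_trigrams(tokens):
    """Apply the frozen phrase models to one tokenized document."""
    return trigram_mod[bigram_mod[tokens]]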
def train_bigram(unigram_txt_filepath, bigram_model_filepath, savebigram, bigram_txt_filepath): print('reading unigram text file.....') unigram_txt = LineSentence(unigram_txt_filepath) print('training bigram model.....') bigram_model = Phrases(unigram_txt) print('saving bigram model.....') bigram_model.save(bigram_model_filepath) # load the finished model from disk # bigram_model = Phrases.load(bigram_model_filepath) if savebigram: print('saving bigram processed text file....') with codecs.open(bigram_txt_filepath, 'w', encoding='utf_8') as f: i = 0 for unigram_sentence in tqdm(unigram_txt): bigram_sentence = u' '.join(bigram_model[unigram_sentence]) f.write(bigram_sentence + '\n') i = i + 1 if (i % 10000 == 0): print('Bigram Processed ' + str(i) + ' articles')
def bigrams(corpus, output_prefix): print("----- Bigram -----") if os.path.exists(output_prefix + "_bigram_phrases"): bigram_phrases = Phrases.load(output_prefix + "_bigram_phrases") print("Loaded bigram phrases") else: bigram_phrases = Phrases(corpus, min_count=CONFIG["bigram_phrase_min_count"], threshold=CONFIG["bigram_phrase_threshold"], progress_per=CONFIG["bigram_phrase_progress_per"], delimiter=CONFIG["bigram_phrase_delimiter"]) bigram_phrases.save(output_prefix + "_bigram_phrases") bigram_transformer = Phraser(bigram_phrases) dct = Dictionary(bigram_transformer[corpus]) dct.save(output_prefix + "_dictionary_bigram") print("Training tf-idf from bigrams") bow_corpus = [dct.doc2bow(line) for line in bigram_transformer[corpus]] tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc') tfidf.save(output_prefix + "_tfidf_bigram") print("Training word2vec model with bigrams (may be unnecessary if trigrams work as expected)") start_time = time() bigram_model = gensim.models.Word2Vec(bigram_transformer[corpus], size=CONFIG['vector_size'], window=CONFIG['window_size'], min_count=CONFIG['min_count'], workers=CONFIG['worker_count'], sg=CONFIG['sg'], negative=CONFIG['negative_size'], alpha=CONFIG['alpha'], min_alpha = CONFIG['min_alpha'], iter=CONFIG['train_epoch']) bigram_model.save(output_prefix + "_bigram") print("Time :", format_time(time() - start_time)) return bigram_model
def main(): input = LineSentence('cleaned_judgments2') bigram = Phrases(input) trigram = Phrases(bigram[input]) model = Word2Vec(trigram[bigram[input]], sg=0, size=300, window=5, min_count=3, workers=8) model.save('model2')
def train_phrases(paths, out='data/bigram_model.phrases', **kwargs): """ Train a bigram phrase model on a list of files. """ n = 0 for path in paths: print('Counting lines for {0}...'.format(path)) n += sum(1 for line in open(path, 'r')) print('Processing {0} lines...'.format(n)) # Change to use less memory. Default is 40m. max_vocab_size = 40000000 print('Training bigrams...') bigram = Phrases(_phrase_doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.) print('Saving...') bigram.save(out) print('Some examples:') docs = [ ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper'], ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'], ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'], ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'], ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington'] ] for r in bigram[docs]: print(r)
def main(): get_args() def sentences(): return chain.from_iterable( (read_slice(data) for data in read_corpus())) bigram = Phrases(sentences(), min_count=1, threshold=1, delimiter=b' ') bigram_phraser = Phraser(bigram) bigrammed = map(lambda x: bigram_phraser[x], sentences()) trigram = Phrases(bigrammed, min_count=1, threshold=1, delimiter=b' ') trigram_phraser = Phraser(trigram) only_trigrams = {b' '.join(trigram_tuple): score for (trigram_tuple, score) in \ trigram_phraser.phrasegrams.items() if b' '.join(trigram_tuple).count(b' ') == 2} for key, value in sorted(only_trigrams.items(), key=lambda item: item[1], reverse=True)[:10]: print(key, value) scores = list(only_trigrams.values()) print(""" Unique trigrams: {unique} Mean score:{mean} Max score:{max} Min score:{min} """.format(unique=len(only_trigrams), mean=mean(scores) if len(scores) != 0 else 0, max=max(scores) if len(scores) != 0 else 0, min=min(scores) if len(scores) != 0 else 0))
def get_top_phrases(documents): documents_split = [doc.split() for doc in documents] remove_from_stop_words = [ "would", "what", "which", "who", "whom", "when", "where", "why", "how", "could" ] words_to_remove = [ 'yeah', 'okay', 'like', 'oh', 'also', 'and', 'so', 'hey', 'hello' ] custom_stopwords = [ sw for sw in stopwords.words('english') if sw not in remove_from_stop_words and sw not in words_to_remove ] bigram = Phrases(documents_split, min_count=1, delimiter=b' ', common_terms=custom_stopwords) trigram = Phrases(bigram[documents_split], min_count=1, delimiter=b' ', common_terms=custom_stopwords) cnt = Counter([ t for sent in documents_split for t in trigram[bigram[sent]] if t.count(' ') >= 1 ]) return cnt.most_common()
def get_test_reviews():
    doc_reviews = {}
    sent_reviews = {}
    num_docs = 0
    num_words = 0
    apk_path = os.path.join("..", "data", "raw")
    apk_lst_path = os.path.join(apk_path, "package_names.txt")
    # load phrase models
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
    with open(apk_lst_path) as fin:
        apk_lst = [apk_name.strip() for apk_name in fin.readlines()]
    for apk_name in apk_lst:
        file = os.path.join(apk_path, "mongodb", apk_name, "review.txt")
        with open(file) as fin:
            reviews_sent = []
            reviews_doc = []
            for line in fin.readlines():
                words_sents, wc = extractSentenceWords(line)
                reviews_sent.append(words_sents)
                reviews_doc.append(list(itertools.chain.from_iterable(words_sents)))
                num_docs += 1
                num_words += wc
            sent_reviews[apk_name] = trigram[bigram[reviews_sent]]
            doc_reviews[apk_name] = trigram[bigram[reviews_doc]]
    logging.info("Read %d docs, %d words!" % (num_docs, num_words))
    return sent_reviews, doc_reviews
def fetch_document_bigrams(self, document_lemmas, number_of_bigrams=100): """ Given a number of lemmas identifying a document, it calculates N bigrams found in that document, where N=number_of_bigrams. """ if not self.include_bigrams: return [] bigram = Phrases() bigram.add_vocab([document_lemmas]) bigram_counter = Counter() for key in bigram.vocab.keys(): if key not in STOPWORDS_BYTES: if len(key.split("_")) > 1: bigram_counter[key] += bigram.vocab[key] bigram_iterators = [ repeat(bigram, bigram_count) for bigram, bigram_count in bigram_counter.most_common(number_of_bigrams) ] found_bigrams = list(chain(*bigram_iterators)) known_bigrams = [bigram for bigram in found_bigrams if bigram in self.top_bigrams] return known_bigrams
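# --- Standalone illustration (not from the original code) ----------------------
# The counting trick used above: train Phrases on the lemmas and read raw
# candidate-bigram counts straight out of its vocab. In recent gensim the vocab
# keys are plain strings; older releases used bytes, hence the defensive decode.
from collections import Counter
from gensim.models import Phrases

document_lemmas = ['machine', 'learning', 'is', 'fun',
                   'machine', 'learning', 'also', 'works']
bigram = Phrases([document_lemmas], min_count=1, threshold=0.1)

bigram_counter = Counter()
for key, count in bigram.vocab.items():
    key = key.decode('utf-8') if isinstance(key, bytes) else key
    if '_' in key:  # delimiter-joined keys are candidate bigrams
        bigram_counter[key] += count
print(bigram_counter.most_common(5))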
def main(): logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) in_dir, model_out = sys.argv[1:] sentences = Corpus(in_dir) phrases = Phrases(sentences) phrases.save(model_out)
def load_model(): bigram = Phrases.load(os.path.join("..", "model", "bigram.model")) trigram = Phrases.load(os.path.join("..", "model", "trigram.model")) wv_model = Word2Vec.load( os.path.join("..", "model", "appreviews_word2vec.model")) logging.info("Load word2vec model finished") return bigram, trigram, wv_model
def tokeniseAll(posts, stopWords, urduNames):
    '''Function to tokenise all comments in the file, including n-grams

    Parameters
    ---------------------------------------
    posts: a list of comments (e.g. a pandas DataFrame column turned into a list)
    stopWords: a list of stopwords
    urduNames: a list of common Urdu names'''
    n_grams = 3
    tokenized_corp = []
    for doc in posts:
        tokenized_corp.append(createToken(doc, stopWords, urduNames))

    # Add n-grams
    bigram = Phrases(tokenized_corp, min_count=5, threshold=10)
    trigram = Phrases(bigram[tokenized_corp], threshold=10)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    if n_grams > 1:
        for i, doc in enumerate(tokenized_corp):
            tokenized_corp[i] = bigram_mod[doc]
            if n_grams > 2:
                tokenized_corp[i] = trigram_mod[bigram_mod[doc]]
    return tokenized_corp
def add_lda_topics(data): # LDA Model lda = models.LdaModel.load('classifiers/lda_model/lda_model') # get tokens from text corpus = data['text'].to_list() data['sentences'] = data.text.map(sent_tokenize) data['tokens_sentences'] = data['sentences'].map( lambda sentences: [word_tokenize(sentence) for sentence in sentences]) data['POS_tokens'] = data['tokens_sentences'].map( lambda tokens_sentences: [pos_tag(tokens) for tokens in tokens_sentences]) data['tokens_sentences_lemmatized'] = data['POS_tokens'].map( lambda list_tokens_POS: [[ lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1])) if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS ] for tokens_POS in list_tokens_POS]) stopwords_custom = [ '[', ']', 'RT', '#', '@', ',', '.', '!', 'http', 'https' ] my_stopwords = list(spacy_stopwords) + stopwords_custom data['tokens'] = data['tokens_sentences_lemmatized'].map( lambda sentences: list(chain.from_iterable(sentences))) data['tokens'] = data['tokens'].map(lambda tokens: [ token.lower() for token in tokens if token.isalpha() and token.lower() not in my_stopwords and len(token) > 1 ]) tokens = data['tokens'].tolist() bigram_model = Phrases(tokens) trigram_model = Phrases(bigram_model[tokens], min_count=1) tokens = list(trigram_model[bigram_model[tokens]]) # create new_corpus dictionary_LDA = corpora.Dictionary(tokens) unseen_corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens] # run model on new_corpus np.random.seed(123456) num_topics = 20 lda_model = models.LdaModel(unseen_corpus, num_topics=num_topics, \ id2word=dictionary_LDA, \ passes=4, alpha=[0.01]*num_topics, \ eta=[0.01]*len(dictionary_LDA.keys())) # get document topic and append to df topics = [lda_model[unseen_corpus[i]] for i in range(len(data))] # like TF-IDF, create a matrix of topic weighting, with documents as rows and topics as columns document_topic = \ pd.concat([topics_document_to_dataframe(topics_document, num_topics=num_topics) for topics_document in topics]).reset_index(drop=True).fillna(0) data = pd.concat([data, document_topic], axis=1, sort=False) data = data.drop([ 'sentences', 'tokens_sentences', 'POS_tokens', 'tokens_sentences_lemmatized', 'tokens' ], 1) return data
def visulaizer_of_gensim(content_list): stop_words = stopwords.words('english') data_words = list(sent_to_words(content_list)) bigram = Phrases(data_words, min_count=5, threshold=100) trigram = Phrases(bigram[data_words], threshold=100) bigram_mod = Phraser(bigram) trigram_mod = Phraser(trigram) data_words_nostops = remove_stopwords(data_words, stop_words) data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod) data_words_trigrams = make_trigrams(data_words_bigrams, bigram_mod, trigram_mod) data_lemmatized = lemmatization( data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) id2word = corpora.Dictionary(data_lemmatized) texts = data_lemmatized corpus = [id2word.doc2bow(text) for text in texts] lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=20, random_state=100, update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True) vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word) return vis
def get_trigrams(self):
    """
    Builds unigram, bigram, and trigram models respectively.
    Writes the text of each model to a separate file.
    """
    unigram_sentences = LineSentence(self.unigram_sentences_filepath)
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(self.bigram_model_filepath)
    bigram_model = Phrases.load(self.bigram_model_filepath)
    with open(self.bigram_sentences_filepath, 'w', encoding="utf-8") as f:
        for unigram_sentence in unigram_sentences:
            # join the phrased tokens back into one whitespace-separated sentence per line
            bigram_sent = " ".join(bigram_model[unigram_sentence])
            f.write(bigram_sent + '\n')
    bigram_sentences = LineSentence(self.bigram_sentences_filepath)
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(self.trigram_model_filepath)
    trigram_model = Phrases.load(self.trigram_model_filepath)
    with open(self.trigram_sentences_filepath, 'w', encoding="utf-8") as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = " ".join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')
    trigram_sentences = LineSentence(self.trigram_sentences_filepath)
    with open(self.trigram_articles_filepath, 'w', encoding="utf-8") as f:
        for parsed_article in self.line_article("../data/article_texts"):
            unigram_article = [token.lemma_ for token in self.nlp(parsed_article)
                               if not self.punct_space(token)]
            bigram_article = bigram_model[unigram_article]
            trigram_article = trigram_model[bigram_article]
            trigram_article = [term for term in trigram_article
                               if term not in STOP_WORDS]
            trigram_article = " ".join(trigram_article)
            f.write(trigram_article + '\n')
def add_n_grams(self, n=2, min_count=1): logging.info('Performing normalization.') logging.debug('n=' + str(n)) logging.debug('min_count=' + str(min_count)) logging.info('Adding 2-grams') bigram = Phrases(self.corpus, min_count=min_count, delimiter=b' ') if n == 3: logging.info('Adding 3-grams') trigram = Phrases(bigram[self.corpus], min_count=1, delimiter=b' ') for document in range(self.N): self.corpus[document] = [ n_gram for n_gram in trigram[bigram[self.corpus[document]]] if n_gram.count(' ') < n ] elif n == 2: for document in range(self.N): self.corpus[document] = [ n_gram for n_gram in bigram[self.corpus[document]] if n_gram.count(' ') < n ] else: logging.warning('Invalid parameter! Skipping n-grams...') return
def train_with_trigrams(self): trigram_model = Phrases.load(self.trigram_model_filepath) bigram_model = Phrases.load(self.bigram_model_filepath) for doc, id in self.es_docs(): unigrams = text_cleaner.clean_tokens(doc) bigrams = bigram_model[unigrams] trigrams = trigram_model[bigrams] trigrams = text_cleaner.filter_terms(trigrams) td = TaggedDocument(trigrams, [id]) self.taggeddoc.append(td) print('Data Loading finished') print(len(self.taggeddoc), type(self.taggeddoc)) model = gensim.models.Doc2Vec(self.taggeddoc, dm=0, iter=1, window=15, seed=1337, min_count=5, workers=4, alpha=0.025, size=200, min_alpha=0.025) for epoch in range(200): if epoch % 20 == 0: print('Now training epoch %s' % epoch) model.train(self.taggeddoc, total_examples=model.corpus_count, epochs=model.iter) model.alpha -= 0.002 # decrease the learning rate model.min_alpha = model.alpha # fix the learning rate, no decay model.save(self.model_file) model.save_word2vec_format(self.model_file + '.word2vec')
def generate_bow(corpus_filename, category, use_bigrams, no_above, no_below):
    if not os.path.exists('./data/%s' % category):
        os.makedirs('./data/%s' % category)

    tokens = [utils.tokenize(line)
              for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                     open('./data/corpus-labels.csv'))
              if category in label]
    print 'First token', tokens[1]

    category_filename = corpus_filename.replace('corpus', 'category')

    # Each category gets its own dictionary and its own corpus, but uses the same
    # bigram model that was computed on all the abstracts
    if use_bigrams:
        if not os.path.exists('./data/%s/bigram.bin' % category):
            bigram = Phrases(utils.tokenize(line)
                             for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                                    open('./data/corpus-labels.csv'))
                             if category in label)
            Phrases.save(bigram, './data/%s/bigram.bin' % category)
        else:
            bigram = Phrases.load('./data/%s/bigram.bin' % category)

        tokens = [bigram[token] for token in tokens]
        print 'First bigram token', tokens[1]

    # Make the dictionary, a collection of statistics about all tokens in the corpus.
    # This is the mapping from words to their ids. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # prune very rare and very common words
    # (no_above=0.05, no_below=10 yielded good results)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s/%s.dict' % (category, category_filename))

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                   open('./data/corpus-labels.csv')):
                # assume there's one document per line, tokens separated by whitespace
                if category in label:
                    yield dictionary.doc2bow(utils.tokenize(line))
                else:
                    pass

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s/%s.mm' % (category, category_filename),
                               arxiv_bow)  # store to disk, for later use
def get_ngrams(cls, tokens: List[List[str]], n: int = 2, min_count: int = 3, delimiter: str = b' ', stop: Optional[List[str]] = None) -> List[List[str]]: """Add up to tri-grams to a list of tokens. Args: tokens: The list of paragraph tokens from which to search for ngrams. n: Optional, either '2' or '3'; Up to bigrams or trigrams. The default is to add up to bigrams. min_count: Optional; The minimum amount of occurances for an ngram to be added. The default is to add ngrams that occur at least 3 times. delimiter: Optional; The byte string to separate words in an n-gram. The default is to separate words in an n-gram with a space. stop: Optional; A list of stop words. Returns: A list of sentence tokens plus ngrams. """ # Break down the list of paragraph tokens into a list of sentences tokens tokens = [token for paragraph in tokens for token in paragraph] sentences = [ list(token) for delimiter, token in groupby( tokens, lambda token: re.match(cls.is_sentence_delimiter, token)) if not delimiter ] amt_sentences = len(sentences) # Find the bigrams bigram = Phrases(sentences, min_count=min_count, delimiter=delimiter, common_terms=stop) if n == 3: # Find the trigrams trigram = Phrases(bigram[sentences], min_count=1, delimiter=delimiter, common_terms=stop) for sentence in range(amt_sentences): sentences[sentence] = [ n_gram for n_gram in trigram[bigram[sentences[sentence]]] ] else: for sentence in range(amt_sentences): sentences[sentence] = [ n_gram for n_gram in bigram[sentences[sentence]] ] return sentences
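# --- Connector-word behaviour the method above relies on (sketch) --------------
# get_ngrams passes a stop-word list via `common_terms`, which is the pre-4.0
# gensim API (bytes delimiter, common_terms=). The equivalent call in gensim 4.x,
# shown here, uses `connector_words` and a plain-string delimiter; the toy
# sentences are illustrative only.
from gensim.models import Phrases

sentences = [['bank', 'of', 'america', 'reported', 'earnings']] * 10
bigram = Phrases(sentences, min_count=1, threshold=0.1,
                 delimiter=' ', connector_words=frozenset(['of', 'the', 'a']))

# 'of' does not block the phrase; the three tokens should come back joined
# as a single 'bank of america' token.
print(bigram[['bank', 'of', 'america', 'is', 'large']])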
def create_ngram_models(documents): bigram = Phrases(documents, min_count=5, threshold=100) trigram = Phrases(bigram[documents], threshold=100) bigram_model = Phraser(bigram) trigram_model = Phraser(trigram) return bigram_model, trigram_model
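# --- Usage sketch for create_ngram_models (illustrative data) ------------------
# `documents` must be a list of token lists. Note that with min_count=5 and
# threshold=100, a toy corpus this small will simply pass through unchanged;
# real phrase detection needs a sizeable corpus.
documents = [doc.split() for doc in [
    "new york city is crowded",
    "she moved to new york city last year",
]]
bigram_model, trigram_model = create_ngram_models(documents)

# Phraser objects are applied one tokenized sentence at a time.
phrased_docs = [trigram_model[bigram_model[doc]] for doc in documents]
print(phrased_docs)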
def get_bigram_model(): model_exists = os.path.exists(bigram_model_filepath) if model_exists: bigram_model = Phrases.load(bigram_model_filepath) else: unigram_sentences = get_unigram_sentences() bigram_model = Phrases(unigram_sentences) bigram_model.save(bigram_model_filepath) return bigram_model
def learnMultiword(ret): print("Learning multiword expressions") bigram = Phrases(ret) bigram.save("phrase_all.model") print("Sanity checking multiword expressions") test = "i like donald trump and hate muslims , go hillary , i like jesus , jesus , against , abortion " sent = test.split(" ") print(bigram[sent]) return bigram[ret]
def n_gram(df): """ Trigram model of word probabilities (old-fashioned TM) @param df: DataFrame with a 'token' column """ token = df.token.tolist() bigram_model = Phrases(token) trigram_model = Phrases(bigram_model[token], min_count=1) token_list = list(trigram_model[bigram_model[token]]) return token_list
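# --- Hypothetical usage of n_gram() --------------------------------------------
# The 'token' column is assumed to hold pre-tokenized documents (lists of
# strings). With Phrases' default min_count/threshold, a frame this small will
# come back unchanged; this only illustrates the expected input shape.
import pandas as pd

df = pd.DataFrame({'token': [
    ['new', 'york', 'city', 'subway'],
    ['new', 'york', 'city', 'marathon'],
]})
token_list = n_gram(df)
print(token_list)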
class OverkillTokenizer(Tokenizer): def __init__(self, lemmatize=True, n_jobs=1, bigram=None, trigram=None, min_count=5, threshold=10.): self.lemmatize = lemmatize self.n_jobs = n_jobs self.bigram = bigram self.trigram = trigram self.min_count = min_count self.threshold = threshold def tokenize(self, docs): if self.lemmatize: lem = WordNetLemmatizer() #print('RAKE tokenizing...') pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs) for i, tdoc in enumerate(pre_tdocs): for t in tdoc: if t.startswith('one'): print(t) print(i) #print('Additional Tokenizing docs...') if self.n_jobs == 1: tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)] else: tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True) #print('Training bigram...') if self.bigram is None: self.bigram = Phrases(tdocs, min_count=self.min_count, threshold=self.threshold, delimiter=b' ') else: self.bigram.add_vocab(tdocs) #print('Training trigram...') if self.trigram is None: self.trigram = Phrases(self.bigram[tdocs], min_count=self.min_count, threshold=self.threshold, delimiter=b' ') else: self.trigram.add_vocab(self.bigram[tdocs]) return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Default parameters; lower max_vocab_size to use less memory (default is 40m).
    # Note: dict.update() returns None, so build the defaults first and then
    # merge the caller's overrides into them.
    defaults = {'max_vocab_size': 40000000, 'threshold': 8.}
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **kwargs)

    print('Saving...')
    bigram.save(out)
def phrases(): print('Loading phrases model...') bigram = Phrases.load('data/nyt/bigram_model.phrases') print('Creating listener...') address = ('localhost', 6001) with Listener(address, authkey=b'password') as listener: while True: with listener.accept() as conn: print('connection accepted from {0}'.format(listener.last_accepted)) while True: try: msg = conn.recv() conn.send(bigram[msg]) except (EOFError, ConnectionResetError): break
def __init__(self, remote): global _phrases global _phrases_conn self.remote = remote if not remote and _phrases is None: print('Loading phrases model...') # Trained on 100-200k NYT articles _phrases = Phrases.load('data/nyt/bigram_model.phrases') print('Done loading phrases') elif _phrases_conn is None: print('Connecting to phrases process...') address = ('localhost', 6001) _phrases_conn = Client(address, authkey=b'password') print('Done connecting to phrases') self.conn = _phrases_conn
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels): phmodel = Phrases.load(phrasemodel) w2vmodel = word2vec.Word2Vec.load(w2vmodelfile) inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()} for i, tweet in enumerate(tweets): # get the neut/pos/neg hashtags neut = KEYWORDS_NEUT[inv_topics[targets[i]]] pos = KEYWORDS_POS[inv_topics[targets[i]]] neg = KEYWORDS_NEG[inv_topics[targets[i]]] tokenised_tweet = tokenize(tweet.lower()) words = filterStopwords(tokenised_tweet) neutcnt, poscnt, negcnt = 0, 0, 0 neutsc, possc, negsc = 0.0, 0.0, 0.0 # transform, as earlier, with the phrase model for token in phmodel[words]: try: neutsim = w2vmodel.similarity(neut, token) neutcnt += 1 neutsc += neutsim except KeyError: neutsim = 0 try: possim = w2vmodel.similarity(pos, token) possc += possim poscnt += 1 except KeyError: possim = 0 try: negsim = w2vmodel.similarity(neg, token) negsc += negsim negcnt += 1 except KeyError: negsim = 0 #print targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim neutsc_tweet = neutsc/neutcnt possc_tweet = possc/poscnt negsc_tweet = negsc/negcnt print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet, "\t", negsc_tweet)
def extractFeaturesW2V(w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model", useDev = False): if useDev == False: tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2) tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2) else: tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2) tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2) tweets_train.extend(tweets_origdev) targets_train.extend(targets_origdev) labels_train.extend(labels_origdev) tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2) phmodel = Phrases.load(phrasemodel) w2vmodel = word2vec.Word2Vec.load(w2vmodel) features_train_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_train, targets_train, labels_train) features_dev_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_dev, targets_dev, labels_dev) return features_train_w2v, labels_train, features_dev_w2v, labels_dev
def extractFeaturesMulti(features=["auto_false", "bow", "targetInTweet", "emoticons", "affect", "w2v", "bow_phrase"] , automodel="model.ckpt", w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model", useDev=True): if useDev==False: tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2) tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2) else: tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2) tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2) tweets_train.extend(tweets_origdev) targets_train.extend(targets_origdev) labels_train.extend(labels_origdev) tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2) features_final = [] if features.__contains__("bow"): features_final = extractFeatureVocab(tweets_train) features_train = extractFeaturesBOW(tweets_train, targets_train, features_final) features_dev = extractFeaturesBOW(tweets_dev, targets_dev, features_final) elif features.__contains__("targetInTweet"): features_train = extractFeaturesCrossTweetTarget(tweets_train, targets_train) features_dev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev) features_final.append("targetInTweet") if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"): if features.__contains__("bow_phrase"): features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True) features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True) features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True) elif features.__contains__("bow_phrase_anon"): features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True, anon_targets=True) features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True, anon_targets=True) features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True, anon_targets=True) features_final.extend(features_vocab) if features.__contains__("auto_added"): useph=False if "phrase" in automodel: useph=True features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "added", usephrasemodel=useph) elif features.__contains__("auto_true"): useph=False if "phrase" in automodel: useph=True features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "true", usephrasemodel=useph) elif features.__contains__("auto_false"): useph=False if "phrase" in automodel: useph=True features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "false", usephrasemodel=useph) targetInTweetTrain = [] targetInTweetDev = [] if features.__contains__("targetInTweet") and features.__contains__("bow"): targetInTweetTrain = extractFeaturesCrossTweetTarget(tweets_train, targets_train) targetInTweetDev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev) features_final.append("targetInTweet") if features.__contains__("emoticons"): emoticons_train, emoticons_vocab = 
extractEmoticons(tweets_train) emoticons_dev, emoticons_vocab = extractEmoticons(tweets_dev) for emo in emoticons_vocab: features_final.append("Emoticon_" + emo) if features.__contains__("affect"): affect_train, affect_vocab = getAffect(tweets_train) affect_dev, affect_vocab = getAffect(tweets_dev) for aff in affect_vocab: features_final.append("WNaffect_" + aff) if features.__contains__("hash"): phmodel = Phrases.load(phrasemodel) w2vmodel = word2vec.Word2Vec.load(w2vmodel) features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_train, targets_train, labels_train) features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_dev, targets_dev, labels_dev) elif features.__contains__("w2v_hash"): # this contains hash phmodel = Phrases.load(phrasemodel) w2vmodel = word2vec.Word2Vec.load(w2vmodel) features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_train, targets_train, labels_train) features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_dev, targets_dev, labels_dev) # combine features for i, featvec in enumerate(features_train):#features_train_auto) if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"): features_train[i] = np.append(features_train[i], features_train_auto[i]) # numpy append works as extend works for python lists if features.__contains__("targetInTweet") and features.__contains__("bow"): features_train[i] = np.append(features_train[i], targetInTweetTrain[i]) if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"): features_train[i] = np.append(features_train[i], features_train_phrbow[i]) if features.__contains__("emoticons"): features_train[i] = np.append(features_train[i], emoticons_train[i]) if features.__contains__("affect"): features_train[i] = np.append(features_train[i], affect_train[i]) if features.__contains__("w2v_hash") or features.__contains__("hash"): features_train[i] = np.append(features_train[i], features_train_w2v[i]) for i, featvec in enumerate(features_dev):#features_dev_auto): if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"): features_dev[i] = np.append(features_dev[i], features_dev_auto[i]) if features.__contains__("targetInTweet") and features.__contains__("bow"): features_dev[i] = np.append(features_dev[i], targetInTweetDev[i]) if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"): features_dev[i] = np.append(features_dev[i], features_dev_phrbow[i]) if features.__contains__("emoticons"): features_dev[i] = np.append(features_dev[i], emoticons_dev[i]) if features.__contains__("affect"): features_dev[i] = np.append(features_dev[i], affect_dev[i]) if features.__contains__("w2v_hash") or features.__contains__("hash"): features_dev[i] = np.append(features_dev[i], features_dev_w2v[i]) return features_train, labels_train, features_dev, labels_dev, features_final
w2v_model = Word2Vec.load(model_filepath) # C binary format except IndexError: print("using default model") current_dir = os.path.dirname(__file__) model_filepath = os.path.join(current_dir, 'model_sentences_raw_words_trigrams_min_count_50_size_200_downsampling_0.001.bin') w2v_model = Word2Vec.load(model_filepath) # C binary format print("using model from " + model_filepath) bigrams_model_name = 'bigrams_model_nyt_sentences_5.5M_5.bin' trigrams_model_name = "trigrams_model_nyt_sentences_5.5M_5.bin" ngrams_models = { "bigrams": bigrams_model_name, "trigrams": trigrams_model_name } which_ngrams_model = "trigrams" ngrams_model = Phrases.load(ngrams_models[which_ngrams_model]) print("finish loading w2v" + str(datetime.now())) print("loading w2v took " + str((datetime.now() - start).seconds) + " seconds") @w2v_api.route("/") def hello(): return json.dumps({"loaded": True}) @w2v_api.route("/similarize/<word>") def similarize(word): try: try: similar_words = cached_synonyms[word] except KeyError:
extra_testing_mat[row - N_TRAINING, zips2id[line[0]]] = 1 for cuisine in line[3:]: if cuisine in cuisine2id: extra_testing_mat[row - N_TRAINING, cuisine2id[cuisine]] = 1 with open("extra_testing_matrix.pyobject", "wb") as f: pickle.dump(extra_testing_mat, f) finish = time() print("Complete!") print("Running time: %.2f seconds" % (finish - start,)) print() # BIGRAMS & TRIGRAMS print("Creating n-gram corpus from training corpus...") start = time() phrases = Phrases(min_count=3, threshold=10.0) with open("training_corpus.txt", "rt") as f: for line in f: phrases.add_vocab([line.rstrip().split()]) _ = f.seek(0) with open("bigram_training_corpus.txt", "wt") as g: for line in f: word_list = phrases[line.rstrip().split()] g.write(" ".join(word_list) + "\n") phrases = Phrases(min_count=3, threshold=10.0) with open("bigram_training_corpus.txt", "rt") as f: for line in f: phrases.add_vocab([line.rstrip().split()]) _ = f.seek(0) with open("trigram_training_corpus.txt", "wt") as g: for line in f:
class TopicModel(object): ''' This module preprocesses a corpus of documents and runs Latent Dirichlet Allocation (LDA) on a corpus of documents. Parameters ---------- num_topics: int, default: 100 input parameter to LDA min_word_count: int, default: 20 if a token has fewer than min_word_count occurences in the entire corpus, then it will be pruned from the processed corpus top_most_common_words: int, default: 10 prune tokens that are within the top_most_common_words throughout the entire corpus min_doc_length: int, default: 40 if the number of tokens within a processed document is less than min_doc_length, then the document is excluded max_doc_length: int, default: 1000 if the number of tokens within a processed document is greater than max_doc_length, then the document is excluded random_state: default: None the random seed for the Gensim LDA object Attributes ---------- bigramizer: the trained Gensim bigramizer tokens: list of list of strings dictionary: mapping from id to token corpus: bag of words vectorization of the tokens lda: the Gensim LDA object dominant_topic_ids: list of dominant topic ids, in decreasing order of dominance ''' def __init__(self, num_topics=100, min_word_count=20, top_most_common_words=10, min_doc_length=40, max_doc_length=1000, random_state=None): self.num_topics = num_topics self.min_word_count = min_word_count self.top_most_common_words = top_most_common_words assert max_doc_length > min_doc_length, \ "max_doc_length must be greater than min_doc_length" self.min_doc_length = min_doc_length self.max_doc_length = max_doc_length self.random_state = random_state # natural language processing self.stop_words = self.getEnglishStopWords() self.bigramizer = Phrases() def fit(self, documents): ''' parameters: documents: list of strings, each represents a document ''' # tokens, dictionary, corpus for LDA self.tokens = self.preProcessCorpus(documents) self.dictionary = corpora.Dictionary(self.tokens) self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens] self.lda = self.getLDA(dictionary=self.dictionary, corpus=self.corpus, num_topics=self.num_topics, random_state=self.random_state) self.num_dominant_topics=min(10, self.num_topics) self.dominant_topic_ids = self.getDominantTopics(self.corpus, self.lda, self.num_dominant_topics) def __str__(self): description = ("topic model:\n\ttoken length = {0:,d}\n\tdictionary length = {1:,d}" "\n\tnum_topics = {2:,d}\n\tmin_word_count = {3:,d}" "\n\ttop_most_common_words = {4:,d}\n\tmin_doc_length = {5:,d}" "\n\tmax_doc_length = {6:,d}") return description.format(len(self.tokens), len(self.dictionary), self.num_topics, self.min_word_count, self.top_most_common_words, self.min_doc_length, self.max_doc_length) @staticmethod def getEnglishStopWords(): ''' returns a set of stop words for NLP pre-processing from nltk.corpus.stopwords() Also, some words and letters are added to the set, such as "please", "sincerely", "u", etc... 
''' stop_words = set(stopwords.words("english")) stop_words.add('please') stop_words.add('would') stop_words.add('use') stop_words.add('also') stop_words.add('thank') stop_words.add('sincerely') stop_words.add('regards') stop_words.add('hi') stop_words.add('hello') stop_words.add('greetings') stop_words.add('hey') stop_words.add('attachment') stop_words.add('attached') stop_words.add('attached_file') stop_words.add('see') stop_words.add('file') stop_words.add('comment') for item in 'abcdefghijklmnopqrstuvwxyz': stop_words.add(item) return stop_words @staticmethod def getFrequencies(tokens): """ input: tokens, a list of list of tokens output: a collections.Counter() object that contains token counts """ frequencies = Counter() for row in tokens: frequencies.update(row) return frequencies @staticmethod def getLowFreqWords(frequencies, countCutOff): """ input: frequencies: a collections.Counter() object countCutOff: the minimum frequency below which tokens are added to the set of low frequency tokens """ lowFreqTokens = set() for token, freq in frequencies.iteritems(): if freq <= countCutOff: lowFreqTokens.add(token) return lowFreqTokens def preProcessCorpus(self, documents, min_word_count=None, top_most_common_words=None, min_doc_length=None, max_doc_length=None): ''' this function pre-processes the documents and converts them into a list of list of tokens input: documents: a list of strings (each string represents a document) min_word_count: if the frequency count of a token in the corpus is less than min_word_count then it is pruned top_most_common_words: if the frequency count of a token in the corpus exceeds top_most_common_words then it is pruned min_doc_length: if the number of tokens within a processed document is less than min_doc_length, then the document is excluded max_doc_length: if the number of tokens within a processed document is greater than max_doc_length, then the document is excluded output: a list of list of tokens ''' if min_word_count is None: min_word_count = self.min_word_count if top_most_common_words is None: top_most_common_words = self.top_most_common_words if min_doc_length is None: min_doc_length = self.min_doc_length if max_doc_length is None: max_doc_length = self.max_doc_length tokens = [tokenizer(document) for document in documents] # exclude comments that are longer than max_doc_length tokens = [tkn for tkn in tokens if len(tkn) < max_doc_length] # train Gensim Phrases model for bigrams self.bigramizer.add_vocab(tokens) # apply Gensim Phrases model to generate bigrams tokens = [self.bigramizer[tkn] for tkn in tokens] # exclude stop words tokens = [[t for t in tkn if t not in self.stop_words] for tkn in tokens] # exclude tokens that are shorter than min_doc_length tokens = [tkn for tkn in tokens if len(tkn) > min_doc_length] # calculate token frequencies to exclude low and high frequency tokens freqs = self.getFrequencies(tokens) low_freq_tokens = set(x[0] for x in freqs.iteritems() if x[1] < min_word_count) high_freq_tokens = [word[0] for word in freqs.most_common(top_most_common_words)] tokens = [[t for t in tkn if t not in low_freq_tokens] for tkn in tokens] tokens = [[t for t in tkn if t not in high_freq_tokens] for tkn in tokens] print '\nnumber of low frequency tokens pruned = {:,d}'\ .format(len(low_freq_tokens)) print 'min_word_count = {:d}, top_most_common_words = {:,d}'\ .format(min_word_count, top_most_common_words) print 'number of high frequency tokens pruned = {:,d}'\ .format(len(high_freq_tokens)) print 'tokens = {:,d} 
rows'.format(len(tokens)) print 'text pre-processing is complete\n' return tokens def getLDA(self, dictionary=None, corpus=None, num_topics=None, random_state=None): # get LDA for dictionary_all and corpus_all print 'computing LDA...' if dictionary is None: dictionary = self.dictionary if corpus is None: corpus = self.corpus if num_topics is None: num_topics = self.num_topics lda = models.ldamodel.LdaModel(corpus=corpus, alpha='auto', id2word=dictionary, num_topics=num_topics, random_state=random_state) return lda def getDominantTopics(self, corpus, lda, num_dominant_topics=None): print 'computing dominant topics...' if corpus is None: corpus = self.corpus if lda is None: lda = self.lda if num_dominant_topics is None: num_dominant_topics = self.num_dominant_topics # get topic weight matrix using lda.inference # the matrix has dimensions (num documents) x (num topics) inference = lda.inference(corpus) inference = inference[0] # the inference is a tuple, need the first term num_topics = lda.num_topics # find dominant topics across documents (vertical sum) column_sum_of_weights = np.sum(inference, axis=0) sorted_weight_indices = np.argsort(column_sum_of_weights) idx = np.arange(num_topics - num_dominant_topics, num_topics) dominant_topic_ids = sorted_weight_indices[idx] # the dominant_topic_ids store the ids in descending order of dominance dominant_topic_ids = dominant_topic_ids[::-1] # convert from numpy array to list and return return dominant_topic_ids.tolist()
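# --- Usage sketch for TopicModel (not part of the original class) --------------
# Assumes `documents` is a list of raw text strings and that the `tokenizer`
# helper used by preProcessCorpus is importable in this environment.
tm = TopicModel(num_topics=25, min_word_count=10, random_state=42)
tm.fit(documents)

print(tm)                      # summary of corpus and model settings
print(tm.dominant_topic_ids)   # topic ids in decreasing order of dominance
for topic_id in tm.dominant_topic_ids[:3]:
    print(tm.lda.show_topic(topic_id, topn=8))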
def phrases(): p = Phrases(sentences=process_corpus('/Users/valeriyischenko/local/projects/lingua_hack/Text')) p.save('../wiki/text_phrase_model_p3')
if not 0<=k<=len(seq): for e in seq: yield e else: numbersPicked = 0 for i,number in enumerate(seq): prob = (k-numbersPicked)/(len(seq)-i) if random.random() < prob: yield number numbersPicked += 1 f = open("tokenizer.pk", "rb") tokenizer = pickle.load(f) f.close() bigram = Phrases.load('bigrams.pk') trigram = Phrases.load('trigrams.pk') ngram = Phrases.load('ngrams.pk') print 'SemEval data' for semeval_file in semeval_files: print 'File', semeval_file with open(semeval_file, 'r') as f: st = [] for line in f: st += [line.strip()] text = read_visit_sem(st) text = [nltk.word_tokenize(s.lower()) for s in tokenizer.tokenize(text)] text = ngram[trigram[bigram[text]]] for sent in text: print '->', ' '.join(sent)
from gensim.models import Phrases from nytnlp.keywords import rake from textblob import Blobber from textblob_aptagger import PerceptronTagger blob = Blobber(pos_tagger=PerceptronTagger()) stops = stopwords.words('english') lem = WordNetLemmatizer() dash_map = {ord(p): ' ' for p in '—-'} punct_map = {ord(p): '' for p in string.punctuation + '“”—’‘'} # Trained on 100-200k NYT articles bigram = Phrases.load('data/bigram_model.phrases') def clean_doc(doc): doc = doc.lower() doc = doc.replace('\'s ', ' ') doc = doc.translate(dash_map) doc = doc.translate(punct_map) return doc def keyword_tokenize(doc): """ Tokenizes a document so that only keywords and phrases are returned. Keywords are returned as lemmas. """ doc = clean_doc(doc)
ct += 1 if ct % 50000 == 0: print ct if line.strip() == '</VISIT>': text = read_visit(st) text = tokenizer.tokenize(text) for sent in text: yield nltk.word_tokenize(sent.lower()) st = [] elif line.strip() != '<VISIT>': st += [line.strip()] except IOError: pass f = open("tokenizer.pk", "rb") tokenizer = pickle.load(f) f.close() print 'BIGRAMS' bigram = Phrases(next_note(tokenizer), delimiter='') bigram.save('bigrams.pk') print 'TRIGRAMS' trigram = Phrases(bigram[next_note(tokenizer)], delimiter='') trigram.save('trigrams.pk') print '4GRAMS' ngram = Phrases(trigram[next_note(tokenizer)], delimiter='') ngram.save('ngrams.pk')
import string import bz2 import nltk from collections import Counter from gensim.models import Phrases from gensim.models import Word2Vec from nltk.corpus import stopwords sentences = [] bigram = Phrases() with bz2.BZ2File('./2009.csv.bz2') as file_: for i, line in enumerate(file_): sentence = [word for word in nltk.word_tokenize(line.decode("utf-8").lower()) if word not in string.punctuation] sentences.append(sentence) bigram.add_vocab([sentence]) bigram_model = Word2Vec(bigram[sentences]) bigram_model_counter = Counter() bigram_model.save('ok.w2v') for key in bigram_model.vocab.keys(): if key not in stopwords.words("english"): if len(key.split("_")) > 1: bigram_model_counter[key] += bigram_model.vocab[key].count for key, counts in bigram_model_counter.most_common(50): print('{0: <20} {1}'.format(key.encode("utf-8"), counts))
docs = [ ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper'], ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'], ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'], ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'], ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington'] ] # Change to use less memory. Default is 40m. max_vocab_size = 40000000 # Train up to trigrams. print('Training bigrams...') bigram = Phrases(doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.) print('Saving...') bigram.save('bigram_model.phrases') print('Training trigrams...') trigram = Phrases(bigram[doc_stream(paths, n)], max_vocab_size=max_vocab_size, threshold=10.) print('Saving...') trigram.save('trigram_model.phrases') print('Done.') #print('Loading bigrams...') #bigram = Phrases.load('bigram_model.phrases')
import string import re import numpy as np from numpy import prod, dot from gensim.models import Doc2Vec, Phrases root = settings.root_path big_file_dir = os.path.expanduser('~')+'/model/corpra/' if sys.platform=='darwin': root = root.replace(os.path.expanduser('~'), os.path.expanduser('~')+'/Dropbox') ######################################################################## # Find nearest neighbors in product space ####################################################################### model = Doc2Vec.load(root+"model/movie_space/idf_reddit") bigram = Phrases.load(big_file_dir+'movies_bigram_large.p','rb') book_data = pickle.load( open(root+"model/movie_space/book_meta_data.p", "rb" ) ) title2asin = pickle.load( open(root+"model/movie_space/title2asin.p", "rb" ) ) def get_similar(query_book, pos_words, neg_words, topn=100): try: pos_vecs = [] all_query_words = [] for book in query_book: if book in title2asin: print "\tFound book: ", title2asin[book] all_query_words.append(title2asin[book]) pos_vecs.append(model.docvecs[title2asin[book]]) for word in bigram[pos_words.replace(',', ' ').lower().split()]: if word in model:
from gensim.models import Phrases from gensim.models import Word2Vec from gensim.utils import lemmatize from gensim.parsing.preprocessing import STOPWORDS from nltk.corpus import stopwords from collections import Counter print("Reading input file 'input/audits_with_content.csv'") with open('input/audits_with_content.csv', 'r') as f: reader = csv.reader(f) raw_documents = list(reader) print("Prepare documents") documents = [doc[2] for doc in raw_documents if doc[2] != ''] sentences = [] bigram = Phrases() for document in documents: raw_text = document.lower() tokens = lemmatize(raw_text, stopwords=STOPWORDS) sentences.append(tokens) bigram.add_vocab([tokens]) bigram_counter = Counter() for key in bigram.vocab.keys(): if key not in stopwords.words("english"): if len(key.split("_")) > 1: bigram_counter[key] += bigram.vocab[key] for key, counts in bigram_counter.most_common(200): print '{0: <20} {1}'.format(key.encode("utf-8"), counts)
def __init__(self): super().__init__(multithreaded=False) print('Loading phrases model...') self.bigram = Phrases.load('data/bigram_model.phrases')