def main(args):
    """Train the initial bigram phrase model and save it.

    The corpus at ``args.infile`` is read through ``LineSentence`` and wrapped
    in ``TextNormalizer`` (configured by ``args.keep_mixedcase``,
    ``args.keep_digits`` and ``args.keep_punc``) before a ``Phrases`` model is
    fitted on it.
    """
    raw_lines = LineSentence(args.infile)
    normalized = TextNormalizer(
        raw_lines,
        args.keep_mixedcase,
        args.keep_digits,
        args.keep_punc,
    )

    # Build the initial bigram phrase model.
    bigram_model = Phrases(normalized, min_count=5, threshold=10)

    # ``args.outdir`` is used as a plain string prefix: no path separator is
    # inserted between it and the file name.
    target = "%sphrase.model" % (args.outdir)
    bigram_model.save(target)
def build_trigram_model(self, sentences, bigram):
    """Train a trigram ``Phrases`` model on bigram-merged sentences and save it.

    :param sentences: iterable of tokenised sentences.
    :param bigram: object indexed as ``bigram[sentences]`` to obtain the
        bigram-merged sentence stream.
    :return: the trained ``Phrases`` model.
    """
    # The parenthesised single-argument form prints the same text under
    # Python 2 and, unlike the print statement, is valid Python 3.
    print("In Trigram Model")
    trigram = Phrases(bigram[sentences])
    # ``self.models`` is used as a string prefix; no separator is inserted.
    dest = self.models + 'trigram_model'
    trigram.save(dest)
    return trigram
def testSaveLoadCustomScorer(self):
    """Round-trip a Phrases model built with a custom scorer through save/load."""
    pickle_name = "test_phrases_testSaveLoadCustomScorer_temp_save.pkl"
    try:
        original = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        original.save(pickle_name)
        restored = Phrases.load(pickle_name)

        probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        scores = [score for _, score in restored.export_phrases(probe)]

        assert all(scores)  # all scores 1
        assert len(scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
    finally:
        # Always remove the temporary pickle, even when an assertion fails.
        if os.path.exists(pickle_name):
            os.remove(pickle_name)
def load_shit(file_paths, save_path):
    """Train a ``Phrases`` model over a list of ndjson files and save it.

    The first file initialises the model; every later file is fed to the same
    model through ``add_vocab``.

    :param file_paths: sequence of paths to ndjson files (``len()`` is taken
        to report progress).
    :param save_path: destination passed to ``Phrases.save``.
    :raises ValueError: if ``file_paths`` is empty, because there is no model
        to save (previously this surfaced as an unbound-name error).
    """
    phrases = None
    total = len(file_paths)

    for i, path in enumerate(file_paths):
        if phrases is None:
            print('[info] initializing phrase model')
        elif i % 1000 == 0:
            # Report every 1000th file. The original ``if i % 1000:`` was
            # inverted and printed on every file *except* multiples of 1000.
            progress = (i / total) * 100
            print('[info] processed {}% files'.format(round(progress, 1)))

        with open(path) as f:
            reader = extract_text(ndjson.reader(f))
            # The reader is handed over inside the ``with`` block, while the
            # file is still open.
            if phrases is None:
                # initialize phrase model
                phrases = Phrases(reader, delimiter=b" ")
            else:
                # show the model new data
                phrases.add_vocab(reader)

    if phrases is None:
        raise ValueError('file_paths is empty; no phrase model to save')

    # Save once all files are processed. ``save`` writes ``save_path`` itself,
    # so the former ``open(save_path, 'w')`` only truncated an unused handle.
    phrases.save(save_path)
def testSaveLoadNoScoring(self):
    """Round-trip a Phrases model whose ``scoring`` attribute was removed.

    Deleting the attribute before saving is meant to ensure backwards
    compatibility with old versions of Phrases that had no scoring parameter.
    """
    pickle_name = "test_phrases_testSaveLoadNoScoring_temp_save.pkl"
    try:
        model = Phrases(self.sentences, min_count=1, threshold=1)
        del model.scoring
        model.save(pickle_name)
        reloaded = Phrases.load(pickle_name)

        probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        rounded = set()
        for _, score in reloaded.export_phrases(probe):
            rounded.add(round(score, 3))

        expected = set([
            5.167,  # score for graph minors
            3.444,  # score for human interface
        ])
        assert rounded == expected
    finally:
        # Always remove the temporary pickle, even when an assertion fails.
        if os.path.exists(pickle_name):
            os.remove(pickle_name)
def build_phrase_model():
    """Train, save and return the trigram phrase model for the reviews.

    A bigram model is fitted on the module-level ``review_df.review_pp1``,
    frozen into a ``Phraser``, and its output is used to fit the trigram
    ``Phrases`` model, which is saved to ``PHRASE_MODEL_LOC``.
    """
    # ``review_df`` is only read here, so no ``global`` declaration is needed.
    reviews = review_df.review_pp1

    bigram_phraser = Phraser(Phrases(reviews, min_count=1, threshold=1))

    # Trigram phrase model; per the original note it is fed back into the
    # phrases for MWETokenizer.
    trigram_model = Phrases(bigram_phraser[reviews])
    trigram_model.save(PHRASE_MODEL_LOC)
    return trigram_model
def _so_long_sentences(text, url_re, punct_re, quote_re, split_re):
    """Return the token lists of every sentence in *text* longer than six words."""
    cleaned = url_re.sub(" ", text.lower())
    cleaned = punct_re.sub(".", cleaned)
    cleaned = quote_re.sub(".", cleaned)
    return [
        tokens
        for tokens in (sentence.split() for sentence in split_re.split(cleaned))
        if len(tokens) > 6
    ]


def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """Train a ``Phrases`` model from batches pulled off a shared queue.

    :param g_DataQueue: global queue holding the data read from the database;
        each item is a batch of rows whose fields 0 and 1 are text or ``None``.
    :param g_FinishRead: flag (``.value``) telling whether the database has
        been read completely.
    :param savePath: where the trained phrase learner is saved.
    :param priorPhrasePath: where the previous learner was saved, or ``None``
        on the first training pass.
    :return: None
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if priorPhrasePath is None:
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))

    # The patterns are loop-invariant, so compile them once instead of once
    # per batch.
    url_re = re.compile(
        "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
    )  # URL
    # NOTE(review): the original comment said these characters are replaced
    # with a space, but the code substitutes "."; that behaviour is kept.
    punct_re = re.compile("[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")
    # Replaced with "."; mainly handles possessives, i.e. the various cases on
    # either side of a single quote.
    quote_re = re.compile("'[.?;!]")
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    split_re = re.compile(r"\.[^a-z0-9]|[?!;]")

    # NOTE(review): ``get()`` blocks; if the producer finishes between the
    # loop test and the call, this may wait forever -- confirm with the caller.
    while g_FinishRead.value == 0 or (not g_DataQueue.empty()):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)

        # Collect the words of every sufficiently long sentence.
        words = []
        for t in data:
            for text in (t[0], t[1]):
                if text is not None:
                    words.extend(
                        _so_long_sentences(text, url_re, punct_re, quote_re, split_re)
                    )
        del data
        gc.collect()

        # Train the phrase model.
        if priorPhraser is None:
            # First training pass.
            phrase.add_vocab(words)
        else:
            # Already trained once: look for phrases made of more words.
            phrase.add_vocab(priorPhraser[words])
        del words
        # The original line was a bare ``gc.collect`` (no call), a no-op.
        gc.collect()

    phrase.save(savePath)
def extract_phrases(app_files, bigram_min, trigram_min):
    """Fit bigram and trigram ``Phrases`` models on the app files and save both.

    :param app_files: passed to ``build_input`` to obtain the nested input.
    :param bigram_min: ``min_count`` for the bigram model.
    :param trigram_min: ``min_count`` for the trigram model.
    """
    nested = build_input(app_files)
    # Flatten the list by one level.
    flat = list(itertools.chain.from_iterable(nested))

    bigram_model = Phrases(flat, threshold=6, min_count=bigram_min)
    trigram_model = Phrases(bigram_model[flat], threshold=4, min_count=trigram_min)

    # NOTE(review): ``app`` is not defined in this function; presumably a
    # module-level name -- confirm against the rest of the file.
    bigram_model.save('model/%s_bigram_model.pkl' % (app))
    trigram_model.save('model/%s_trigram_model.pkl' % (app))
def train_phraser(sentence_stream, stopword_list, threshold, model_path, save_prefix):
    """Fit a ``Phrases`` model, save it and its ``Phraser``, and return the latter.

    :param sentence_stream: sentences the model is trained on.
    :param stopword_list: passed to ``Phrases`` as ``common_terms``.
    :param threshold: passed to ``Phrases`` as ``threshold``.
    :param model_path: directory both files are written to.
    :param save_prefix: prefix of the two file names.
    :return: the ``Phraser`` built from the trained model.
    """
    phrases_file = os.path.join(model_path, '{}_phrases.bin'.format(save_prefix))
    trained = Phrases(sentence_stream, common_terms=stopword_list, threshold=threshold)
    trained.save(phrases_file)

    frozen = Phraser(trained)
    phraser_file = os.path.join(model_path, '{}_phraser.bin'.format(save_prefix))
    frozen.save(phraser_file)
    return frozen
def extract_phrases(app_files, bigram_min, trigram_min):
    """Fit bigram and trigram ``Phrases`` models and write them under ``model``.

    :param app_files: passed to ``build_input`` to obtain the nested input.
    :param bigram_min: ``min_count`` for the bigram model.
    :param trigram_min: ``min_count`` for the trigram model.
    """
    # Flatten the nested input by one level.
    flat = list(itertools.chain.from_iterable(build_input(app_files)))

    bigram_model = Phrases(flat, threshold=5, min_count=bigram_min)
    trigram_model = Phrases(bigram_model[flat], threshold=3, min_count=trigram_min)

    # Write both models.
    bigram_model.save(os.path.join("model", "bigram.model"))
    trigram_model.save(os.path.join("model", "trigram.model"))
def test_save_load_with_connector_words(self):
    """Check that ``connector_words`` survives a save/load round trip."""
    expected_connectors = frozenset({'of'})
    model = Phrases(
        self.sentences,
        min_count=1,
        threshold=1,
        connector_words=expected_connectors,
    )
    with temporary_file("test.pkl") as fpath:
        model.save(fpath)
        restored = Phrases.load(fpath)
        assert restored.connector_words == expected_connectors
def generating_bigrams(final_df):
    """Train a bigram ``Phrases`` model on ``final_df['features']`` and export it.

    The model is saved to ``"bigrams"`` and every detected phrase with its
    score is appended to ``bigrams.txt``, one record per line.

    :param final_df: mapping/frame whose ``'features'`` entry yields
        whitespace-separated strings.
    :return: the trained ``Phrases`` model.
    """
    tokenized_rows = [row.split() for row in final_df['features']]

    bigram_transformer = Phrases(tokenized_rows, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=4)

    # ``with`` closes the file even if exporting raises. The trailing newline
    # keeps records apart; they were previously written back to back with
    # nothing separating one score from the next phrase.
    with open("bigrams.txt", 'a') as fd:
        for phrase, score in bigram_transformer.export_phrases(tokenized_rows):
            fd.write(u'{0} {1}\n'.format(phrase, score))

    return bigram_transformer
def make_phraser(infile):
    """Train the phraser object and save it to ``../models/phraser``.

    :param infile: path to the xml file with the wikipedia dump; it is handed
        to ``file_yielder``, whose items are split on whitespace.
    :return: always ``0``.
    """
    token_stream = (entry.split() for entry in file_yielder(infile))
    trained = Phrases(tqdm(token_stream, desc="Phrase-finding"))

    frozen = Phraser(trained)
    frozen.save("../models/phraser")
    return 0
class GramFacade:
    """Train, persist, load and apply bigram/trigram phrase models.

    Models live in ``model_dir`` under the module-level ``*_FILENAME``
    constants; paths are built by joining with ``'/'``.
    """

    def __init__(self, model_dir, min_count_bigrams=8, min_count_trigrams=7):
        self.model_dir = model_dir
        self.min_count_bigrams = min_count_bigrams
        self.min_count_trigrams = min_count_trigrams

    def _artifact(self, filename):
        """Return the path of *filename* inside the model directory."""
        return self.model_dir + '/' + filename

    def load_models(self):
        """Load the saved bigram and trigram ``Phraser`` objects."""
        self.bigrams_phraser = Phraser.load(self._artifact(BIGRAMS_PHRASER_FILENAME))
        self.trigrams_phraser = Phraser.load(self._artifact(TRIGRAMS_PHRASER_FILENAME))

    def load_phrases(self):
        """Load the saved bigram and trigram ``Phrases`` objects."""
        self.bigrams_phrases = Phrases.load(self._artifact(BIGRAMS_PHRASES_FILENAME))
        self.trigrams_phrases = Phrases.load(self._artifact(TRIGRAMS_PHRASES_FILENAME))

    def export_bigrams(self, docs):
        """Return the bigram phraser's output for every document in *docs*."""
        phraser = self.bigrams_phraser
        return [phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        """Return the trigram phraser's output for every item in *bigrams*."""
        phraser = self.trigrams_phraser
        return [phraser[item] for item in bigrams]

    def phrase(self, doc):
        """Apply the bigram phraser, then the trigram phraser, to *doc*."""
        return self.trigrams_phraser[self.bigrams_phraser[doc]]

    def create_model(self, doc_list):
        """Train both phrase levels on *doc_list* and save all four objects."""
        self.bigrams_phrases = Phrases(doc_list, min_count=self.min_count_bigrams)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)

        # The trigram level is trained on the bigram-merged documents.
        merged_docs = self.bigrams_phraser[doc_list]
        self.trigrams_phrases = Phrases(merged_docs, min_count=self.min_count_trigrams)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)

        self.bigrams_phraser.save(self._artifact(BIGRAMS_PHRASER_FILENAME))
        self.trigrams_phraser.save(self._artifact(TRIGRAMS_PHRASER_FILENAME))
        self.bigrams_phrases.save(self._artifact(BIGRAMS_PHRASES_FILENAME))
        self.trigrams_phrases.save(self._artifact(TRIGRAMS_PHRASES_FILENAME))

    def words_not_in_vocab(self, tok_doc, threshold):
        """Return the tokens whose trigram-vocab count is below *threshold*.

        Tokens are looked up by their encoded (bytes) form.
        """
        vocab = self.trigrams_phrases.vocab
        rare = set()
        for token in tok_doc:
            if vocab[str.encode(token)] < threshold:
                rare.add(token)
        return rare
def testSaveLoadCustomScorer(self):
    """Round-trip a Phrases model built with a custom scorer through save/load."""
    probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    with temporary_file("test.pkl") as fpath:
        trained = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        trained.save(fpath)
        reloaded = Phrases.load(fpath)

        scores = [score for _, score in reloaded.export_phrases(probe)]

        assert all(scores)  # all scores 1
        assert len(scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
def build_vocab():
    """Build the bigram-aware vocabulary from the train and test comments.

    Steps: tokenise all comments into a normalised text file, train and save
    a bigram ``Phrases`` model on it, rewrite the comments with bigrams
    merged, build a ``Dictionary`` from the result, filter and save it, then
    append the resulting ``VOCAB_SIZE`` to ``src/config.py``.
    """
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_commnets.txt')

    # The old check compared the full path against the bare names returned by
    # ``os.listdir`` and then swallowed OSError from ``mkdir``; ``makedirs``
    # with ``exist_ok`` expresses the same intent directly.
    os.makedirs(config.PROCESSED_PATH, exist_ok=True)

    train_df = read_file(train_path)
    test_df = read_file(test_path)
    print('tokenizing vocab file')
    texts = np.concatenate([train_df.comment_text.fillna('N/A').values,
                            test_df.comment_text.fillna('N/A').values])

    # Written as UTF-8 explicitly, matching the other output file below, so
    # the content does not depend on the platform's locale encoding.
    with open(normalized_text_path, 'w', encoding='utf_8') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')
    gc.collect()

    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)

    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')

    commnets = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(commnets)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    # NOTE(review): '<pad>' is added after the vocabulary was saved, so it is
    # counted in VOCAB_SIZE but absent from VOCAB_PATH -- confirm intended.
    bigram_dict.add_documents([['<pad>']])

    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        # Newlines on both sides keep the appended assignment on its own line;
        # without them a second run produced
        # "VOCAB_SIZE = 10VOCAB_SIZE = 12" and broke config.py.
        f.write('\nVOCAB_SIZE = {}\n'.format(len(bigram_dict)))

    print('time passed: {} minutes'.format((time.time() - start) / 60))
def trainPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """Train a ``Phrases`` model from word batches pulled off a shared queue.

    :param g_DataQueue: queue whose items are passed to ``add_vocab``.
    :param g_FinishRead: flag; the loop runs while ``.value == 0`` or the
        queue is not empty.
    :param savePath: where the trained model is saved.
    :param priorPhrasePath: path of a previously trained model, or ``None``
        on the first training pass.
    """
    phrase = Phrases(None, min_count=15, threshold=10, max_vocab_size=40000000)

    priorPhraser = None
    if priorPhrasePath is not None:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))

    while g_FinishRead.value == 0 or not g_DataQueue.empty():
        words = g_DataQueue.get()
        if priorPhraser is not None:
            # Already trained once: look for phrases made of more words.
            phrase.add_vocab(priorPhraser[words])
        else:
            # First training pass.
            phrase.add_vocab(words)
        del words
        gc.collect()

    phrase.save(savePath)
def testSaveLoad(self):
    """Round-trip a Phrases model through save/load and compare its scores."""
    probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    expected = set([
        5.167,  # score for graph minors
        3.444,  # score for human interface
    ])
    with temporary_file("test.pkl") as fpath:
        trained = Phrases(self.sentences, min_count=1, threshold=1)
        trained.save(fpath)
        reloaded = Phrases.load(fpath)

        rounded = set()
        for _, score in reloaded.export_phrases(probe):
            rounded.add(round(score, 3))

        assert rounded == expected
def get_bigrams(df, bigram_model_filepath, TRAIN):
    """Add a ``"bigrams"`` column to *df* using a trained or loaded phraser.

    :param df: frame with a ``"processed_text"`` column of token lists;
        modified in place.
    :param bigram_model_filepath: where the ``Phraser`` is saved or loaded.
    :param TRAIN: truthy to train and save a new model, falsy to load one.
    """
    if not TRAIN:
        # Load the trained model from disk.
        phraser = Phraser.load(bigram_model_filepath)
    else:
        # Train the phrase model on the processed sentences (a list of lists
        # of strings), then wrap it in a Phraser, which the original comment
        # describes as optimised for speed and memory use.
        unigram_sentences = df["processed_text"].tolist()
        phraser = Phraser(Phrases(unigram_sentences))
        # Save the model for future use.
        phraser.save(bigram_model_filepath)

    # First-order transformed data.
    df["bigrams"] = df["processed_text"].map(lambda tokens: phraser[tokens])
def get_trigrams(df, trigram_model_filepath, TRAIN):
    """Add a ``"trigrams"`` column to *df* using a trained or loaded phraser.

    :param df: frame with a ``"bigrams"`` column of token lists; modified in
        place.
    :param trigram_model_filepath: where the ``Phraser`` is saved or loaded.
    :param TRAIN: truthy to train and save a new model, falsy to load one.
    """
    if not TRAIN:
        # Load the trained model from disk.
        phraser = Phraser.load(trigram_model_filepath)
    else:
        # Train the phrase model on the bigram sentences (a list of lists of
        # strings), then wrap it in a Phraser, which the original comment
        # describes as optimised for speed and memory use.
        bigram_sentences = df["bigrams"].tolist()
        phraser = Phraser(Phrases(bigram_sentences))
        # Save the model for future use.
        phraser.save(trigram_model_filepath)

    # Second-order transformed data.
    df["trigrams"] = df["bigrams"].map(lambda tokens: phraser[tokens])
def testSaveLoad(self):
    """Test saving and loading a Phrases object."""
    probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    expected = set([
        5.167,  # score for graph minors
        3.444,  # score for human interface
    ])
    with temporary_file("test.pkl") as fpath:
        trained = Phrases(self.sentences, min_count=1, threshold=1)
        trained.save(fpath)
        reloaded = Phrases.load(fpath)

        found = reloaded.find_phrases(probe)
        rounded = set()
        for score in found.values():
            rounded.add(round(score, 3))

        assert rounded == expected
def testSaveLoadCustomScorer(self):
    """Save and reload a Phrases object that uses a custom scorer."""
    tmp_file = "test_phrases_testSaveLoadCustomScorer_temp_save.pkl"
    sample = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    try:
        trained = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        trained.save(tmp_file)
        reloaded = Phrases.load(tmp_file)

        collected = []
        for _, value in reloaded.export_phrases(sample):
            collected.append(value)

        assert all(collected)  # all scores 1
        assert len(collected) == 3  # 'graph minors' and 'survey human' and 'interface system'
    finally:
        # Clean up the pickle regardless of the outcome.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
def create_dictionary(texts, dest_file: str, build_bigram, working_directory=DIR):
    """Create a bigram-aware dictionary from *texts* and save it to *dest_file*.

    :param texts: re-iterable collection of text lines; it is traversed once
        to train the phrase model (when requested) and once to build the
        dictionary, so a one-shot generator will not work.
    :param dest_file: path to save the dictionary to.
    :param build_bigram: truthy to train and save a new phrases object, falsy
        to load the already saved bigram model.
    :param working_directory: directory where the bigram model file is saved
        to / loaded from.
    :return: the filtered, compactified dictionary.
    """
    stoplist = stopwords.words('english')
    bigram_file = working_directory + '/bigram_model.phrase'

    if build_bigram:
        # Train on lower-cased tokens: the model is applied to
        # ``line.lower().split()`` below, so phrases learnt from tokens that
        # contain upper-case characters could never match.
        bigram = Phrases([tweet.lower().split() for tweet in texts])
        bigram.save(bigram_file)
    else:
        bigram = Phrases.load(bigram_file)
    phraser = Phraser(bigram)

    # Build dictionary from the bigram-merged, lower-cased lines.
    dictionary = corpora.Dictionary(phraser[line.lower().split()] for line in texts)

    # Remove stop words and words that appear only once.
    stop_ids = [
        dictionary.token2id[stopword]
        for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1
    ]
    dictionary.filter_tokens(stop_ids + once_ids)

    # NOTE(review): ``no_below`` is normally an absolute document count; 0.3
    # looks like it was meant as a fraction -- confirm. Value kept as is.
    dictionary.filter_extremes(no_below=0.3, no_above=0.85)
    # Remove gaps in the id sequence after words that were removed.
    dictionary.compactify()

    dictionary.save(dest_file)
    print(dictionary)
    print(dictionary.token2id)
    return dictionary
def testSaveLoadCustomScorer(self):
    """Test saving and loading a Phrases object with a custom scorer."""
    probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    with temporary_file("test.pkl") as fpath:
        trained = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        trained.save(fpath)
        reloaded = Phrases.load(fpath)

        found = reloaded.find_phrases(probe)
        scores = list(found.values())

        for value in scores:
            assert value == 1
        # 'graph minors' and 'survey human' and 'interface system'
        assert len(scores) == 3
def testSaveLoad(self):
    """Round-trip a Phrases object through a pickle in the working directory."""
    pickle_name = "test_phrases_testSaveLoad_temp_save.pkl"
    expected = set([
        5.167,  # score for graph minors
        3.444,  # score for human interface
    ])
    try:
        trained = Phrases(self.sentences, min_count=1, threshold=1)
        trained.save(pickle_name)
        reloaded = Phrases.load(pickle_name)

        probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        rounded = set()
        for _, score in reloaded.export_phrases(probe):
            rounded.add(round(score, 3))

        assert rounded == expected
    finally:
        # Always remove the temporary pickle, even when an assertion fails.
        if os.path.exists(pickle_name):
            os.remove(pickle_name)
def extract_phrases(reviews_sents, reviews_docs, save=False):
    """Train bigram/trigram phrase models and apply them to the review docs.

    :param reviews_sents: sentences the two ``Phrases`` models are trained on.
    :param reviews_docs: documents the trained models are applied to.
    :param save: when true, write the digit-free phrases to
        ``../data/phrase/phrases_3_app_review`` (one per line, words joined by
        ``_``) and save both models under ``../model``.
    :return: ``trigram[bigram[reviews_docs]]``.
    """
    logging.info("Extracting phrases...")
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)

    if save:
        # Keyed by phrase so duplicates across the two models collapse.
        ph_dic = {}
        for phrase, score in bigram.export_phrases(reviews_sents):
            ph_dic[phrase] = score
        for phrase, score in trigram.export_phrases(bigram[reviews_sents]):
            ph_dic[phrase] = score

        # Phrases are handled as bytes throughout (split/join/write below),
        # so the digit filter must be a bytes pattern too; the former str
        # pattern raises TypeError on bytes under Python 3.
        has_digit = re.compile(br'\d+')
        with open('../data/phrase/phrases_%d_%s' % (3, 'app_review'), 'wb') as fout:
            for phrase in ph_dic:
                if has_digit.search(phrase):  # remove digits
                    continue
                fout.write(b"_".join(phrase.split(b' ')) + b'\n')

        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

    return trigram[bigram[reviews_docs]]
def build_phrase_models(content, base_path, settings):
    """Build and save one phrase model per n-gram level above unigrams.

    :param content: iterable of documents exposing ``tokenized_text``.
    :param base_path: prefix of the saved files; level ``n`` is written to
        ``"<base_path>.<n>"``.
    :param settings: mapping whose ``'level'`` entry is the highest n-gram
        level to build.
    """
    top_level = int(settings['level'])

    # According to the tee() docs this may be inefficient in terms of memory,
    # but every level needs two passes over the stream: one to train the
    # model and one to transform the content for the next level.
    stream = chain.from_iterable(doc.tokenized_text for doc in content)
    train_stream, apply_stream = tee(stream, 2)

    for level in range(2, top_level + 1):
        model = Phrases(train_stream)
        path = "%s.%s" % (base_path, level)  # save path as n-gram level
        logger.info("Phrase processor: Saving %s", path)
        model.save(path)
        # TODO: gensim complains about not using Phraser(phrases)
        stream = model[apply_stream]  # tokenize phrases in content stream
        train_stream, apply_stream = tee(stream, 2)
def testSaveLoadCustomScorer(self):
    """Save and reload a Phrases object that uses a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer).save(fpath)
        reloaded = Phrases.load(fpath)

        sample = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        collected = []
        for _, value in reloaded.export_phrases(sample):
            collected.append(value)

        assert all(collected)  # all scores 1
        # 'graph minors' and 'survey human' and 'interface system'
        assert len(collected) == 3
def testSaveLoadNoScoring(self):
    """Save and reload a Phrases object that has no ``scoring`` attribute.

    This should ensure backwards compatibility with old versions of Phrases.
    """
    tmp_file = "test_phrases_testSaveLoadNoScoring_temp_save.pkl"
    sample = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    try:
        trained = Phrases(self.sentences, min_count=1, threshold=1)
        del trained.scoring
        trained.save(tmp_file)
        reloaded = Phrases.load(tmp_file)

        observed = set()
        for _, value in reloaded.export_phrases(sample):
            observed.add(round(value, 3))

        assert observed == set([
            5.167,  # score for graph minors
            3.444,  # score for human interface
        ])
    finally:
        # Clean up the pickle regardless of the outcome.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
def get_trigram_model(self, recalculate=False, from_scratch=True):
    """Return the trigram ``Phraser``, building it if needed.

    The model is (re)built when no file exists at
    ``self.paths.trigram_model_filepath`` or when *recalculate* is true;
    otherwise it is loaded from that file.

    :param recalculate: force a rebuild even if a saved model exists.
    :param from_scratch: must be true for a build to be allowed.
    :return: the trigram ``Phraser``.
    :raises ValueError: if a build is required but *from_scratch* is false.
    """
    if not os.path.isfile(self.paths.trigram_model_filepath) or recalculate:
        if not from_scratch:
            raise ValueError(
                'No trigram model file exists but from_scratch is False')

        print('Building tri-gram model...')
        bigram_sentences = LineSentence(self.paths.bigram_sentences_filepath)
        trigram_model = Phrases(bigram_sentences)
        trigram_model = Phraser(trigram_model)

        print('Writing model...')
        trigram_model.save(self.paths.trigram_model_filepath)
    else:
        print('Loading tri-gram model...')
        # The file holds a Phraser (see the save above), so load it with the
        # matching class rather than ``Phrases.load``.
        trigram_model = Phraser.load(self.paths.trigram_model_filepath)

    print('Done!')
    return trigram_model
def train_ngrams_models(self, sent_tokens):
    """Train bigrams, trigrams, dictionary and tf-idf, and cache them on disk.

    :param sent_tokens: tokenised sentences of the concatenated overall
        dataframe; iterated several times, so it must be re-iterable.
    """
    import logging

    bigrams = Phrases(sentences=sent_tokens, min_count=1, threshold=1)
    trigrams = Phrases(sentences=bigrams[sent_tokens], min_count=1, threshold=1)
    sent_tokens_transformed = trigrams[bigrams[sent_tokens]]

    d = corpora.Dictionary(sent_tokens_transformed)
    # A distinct loop name avoids shadowing the ``sent_tokens`` parameter.
    bow_corpus = [d.doc2bow(tokens) for tokens in sent_tokens_transformed]
    tfidf = TfidfModel(corpus=bow_corpus, id2word=d)

    try:
        bigrams.save('slm/app/cached_models/bigrams.gensim')
        trigrams.save('slm/app/cached_models/trigrams.gensim')
        d.save('slm/app/cached_models/dictionary.dict')
        tfidf.save('slm/app/cached_models/tfidf.gensim')
    except Exception:
        # Caching stays best-effort, but the failure is now reported instead
        # of vanishing, and KeyboardInterrupt/SystemExit are no longer
        # swallowed as they were by the bare ``except``.
        logging.getLogger(__name__).warning(
            "could not save cached n-gram models", exc_info=True)
def testSaveLoad(self):
    """Save and reload a Phrases object, then compare the exported scores."""
    tmp_file = "test_phrases_testSaveLoad_temp_save.pkl"
    sample = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    try:
        Phrases(self.sentences, min_count=1, threshold=1).save(tmp_file)
        reloaded = Phrases.load(tmp_file)

        observed = set()
        for _, value in reloaded.export_phrases(sample):
            observed.add(round(value, 3))

        assert observed == set([
            5.167,  # score for graph minors
            3.444,  # score for human interface
        ])
    finally:
        # Clean up the pickle regardless of the outcome.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
def get_bigram_model(self, recalculate=False, from_scratch=True):
    """Return the bigram ``Phraser``, building it if needed.

    The model is (re)built when no file exists at
    ``self.paths.bigram_model_filepath`` or when *recalculate* is true;
    otherwise it is loaded from that file.

    :param recalculate: force a rebuild even if a saved model exists.
    :param from_scratch: must be true for a build to be allowed.
    :return: the bigram ``Phraser``.
    :raises ValueError: if a build is required but *from_scratch* is false.
    """
    if not os.path.isfile(self.paths.bigram_model_filepath) or recalculate:
        if not from_scratch:
            raise ValueError(
                'No bigram model file exists but from_scratch is False')

        print('Building bi-gram model...')
        unigram_sentences = LineSentence(self.paths.unigram_sentences_filepath)
        # TODO look into supplying stop words here for better phrases
        bigram_model = Phrases(unigram_sentences)
        bigram_model = Phraser(bigram_model)

        print('Writing model...')
        bigram_model.save(self.paths.bigram_model_filepath)
    else:
        print('Loading bi-gram model...')
        # The file holds a Phraser (see the save above), so load it with the
        # matching class rather than ``Phrases.load``.
        bigram_model = Phraser.load(self.paths.bigram_model_filepath)

    print('Done!')
    return bigram_model
def fit(self, sentencesPath):
    """Train (or load) one phrase model per path in ``self.savePhraserPaths``.

    If every model file already exists and ``self.file_overwrite`` is false,
    the saved models are loaded. Otherwise each level is trained in turn on
    the sentence file, with the phrasers of the lower levels applied through
    ``TxtIter``, and saved to its path.

    :param sentencesPath: path of the text file, which should hold one
        sentence per line.
    :return: ``True`` when existing models were loaded; ``None`` after
        training.
    :raises FileNotFoundError: if the directory of any save path is missing.
    """
    # Every target directory must exist before anything is loaded or trained.
    for path in self.savePhraserPaths:
        if not os.path.exists(os.path.dirname(path)):
            raise FileNotFoundError(os.path.dirname(path) + " not exist")

    all_saved = all(os.path.exists(path) for path in self.savePhraserPaths)

    self.phrasers = []
    if all_saved and self.file_overwrite == False:
        logging.info("models are already exist, will read it")
        for path in self.savePhraserPaths:
            self.phrasers.append(Phraser(Phrases.load(path)))
        return True

    c = 2
    for path in self.savePhraserPaths:
        logging.info("getting %d-gram phrase......" % c)
        c += 1
        # ``with`` closes the sentence file once the model has been built;
        # previously one handle was leaked per n-gram level.
        with codecs.open(sentencesPath, mode="r", encoding="utf-8") as handle:
            phrase = Phrases(sentences=TxtIter(sentences=handle,
                                               ngrams=self.phrasers),
                             min_count=self.min_count,
                             threshold=self.threshold,
                             max_vocab_size=self.max_vocab_size,
                             delimiter=self.delimiter,
                             scoring=self.scoring)
        phrase.save(path)
        phraser = Phraser(phrase)
        self.phrasers.append(phraser)
        del phrase
from gensim.models.phrases import Phrases
from gensim.models.word2vec import LineSentence

# Script: train a bigram model and then a trigram model on the cleaned
# corpus, saving each one next to the data.
sentence_stream=LineSentence('./data/text_cleaned.txt')

# First pass: bigram model trained directly on the sentence stream.
bigram = Phrases(sentence_stream,threshold=50.0)
bigram.save('./data/bigram.dat')

# Second pass: trained on the bigram-transformed stream.
trigram = Phrases(bigram[sentence_stream],threshold=50.0)
trigram.save('./data/trigram.dat')
class GramFacade:
    """Train, persist, load and inspect NPMI-scored bigram/trigram models.

    Models live in ``model_dir`` under the module-level ``*_FILENAME``
    constants; paths are built by joining with ``'/'``.
    """

    def __init__(self, model_dir, bigrams_threshold=0.88, trigrams_threshold=0.88):
        self.model_dir = model_dir
        self.bigrams_threshold = bigrams_threshold
        self.trigrams_threshold = trigrams_threshold

    def _artifact(self, filename):
        """Return the path of *filename* inside the model directory."""
        return self.model_dir + '/' + filename

    def load_models(self):
        """Load the saved bigram and trigram ``Phraser`` objects."""
        self.bigrams_phraser = Phraser.load(self._artifact(BIGRAMS_PHRASER_FILENAME))
        self.trigrams_phraser = Phraser.load(self._artifact(TRIGRAMS_PHRASER_FILENAME))

    def load_phrases(self):
        """Load the saved bigram and trigram ``Phrases`` objects."""
        self.bigrams_phrases = Phrases.load(self._artifact(BIGRAMS_PHRASES_FILENAME))
        self.trigrams_phrases = Phrases.load(self._artifact(TRIGRAMS_PHRASES_FILENAME))

    def export_bigrams(self, docs):
        """Return the bigram phraser's output for every document in *docs*."""
        phraser = self.bigrams_phraser
        return [phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        """Return the trigram phraser's output for every item in *bigrams*."""
        phraser = self.trigrams_phraser
        return [phraser[item] for item in bigrams]

    def phrase(self, doc):
        """Apply the bigram phraser, then the trigram phraser, to *doc*."""
        return self.trigrams_phraser[self.bigrams_phraser[doc]]

    def create_model(self, doc_list):
        """Train both phrase levels on *doc_list* and save all four objects."""
        self.bigrams_phrases = Phrases(doc_list,
                                       scoring='npmi',
                                       threshold=self.bigrams_threshold)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)

        # The trigram level is trained on the bigram-merged documents.
        merged_docs = self.bigrams_phraser[doc_list]
        self.trigrams_phrases = Phrases(merged_docs,
                                        scoring='npmi',
                                        threshold=self.trigrams_threshold)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)

        self.bigrams_phraser.save(self._artifact(BIGRAMS_PHRASER_FILENAME))
        self.trigrams_phraser.save(self._artifact(TRIGRAMS_PHRASER_FILENAME))
        self.bigrams_phrases.save(self._artifact(BIGRAMS_PHRASES_FILENAME))
        self.trigrams_phrases.save(self._artifact(TRIGRAMS_PHRASES_FILENAME))

    def words_not_in_vocab(self, tok_doc, threshold):
        """Return the tokens whose trigram-vocab count is below *threshold*.

        Tokens are looked up by their encoded (bytes) form.
        """
        vocab = self.trigrams_phrases.vocab
        rare = set()
        for token in tok_doc:
            if vocab[str.encode(token)] < threshold:
                rare.add(token)
        return rare

    def retrieve_grams(self):
        """Return the trigram phraser's phrases, highest score first.

        Each entry is a dict with the ``_``-joined, UTF-8 decoded phrase
        (``"gram"``) plus its ``"count"`` and ``"score"``, taken from the
        first two elements of the phrasegram value.
        """
        entries = [
            {
                "gram": b'_'.join(parts).decode("utf-8"),
                "count": stats[0],
                "score": stats[1],
            }
            for parts, stats in self.trigrams_phraser.phrasegrams.items()
        ]
        entries.sort(key=lambda entry: entry["score"], reverse=True)
        return entries
# Script: feed the abstracts of a JSON-lines file (path given as the first
# command-line argument) to a Phrases model and save it.
seg = Segmenter()
# vocab = Dictionary()
phrases = Phrases()
text_path = sys.argv[1]


def get_data(text_path):
    """Yield the ``'abstract'`` field of every non-blank JSON line in *text_path*."""
    # ``with`` closes the file when the generator finishes or is discarded;
    # the bare ``open()`` in the loop header never closed it.
    with open(text_path, "r") as handle:
        for line in handle:
            line = line.strip()
            if line:
                data = json.loads(line)
                yield data['abstract']


for ind, text in enumerate(get_data(text_path)):
    segments = seg(text, segment_len=1, segment_overlap=0)
    phrases.add_vocab(segments)
    # vocab.add_documents(segments, prune_at=2000000)
    # Report every 10000th abstract. The original ``if ind % 10000:`` was
    # inverted, and the ``break`` that followed the print stopped the loop
    # after the first abstracts, so the saved model saw almost no data.
    if ind % 10000 == 0:
        print(f"\rProcessed:{ind}", end="")

# vocab.filter_extremes(no_below=5, no_above=0.5, keep_n=2000000)
# vocab.save("academic.dict")
phrases.save("academic.phrases")
def build_bigram_model(self, sentences, count):
    """Train a bigram ``Phrases`` model on *sentences* and save it.

    :param sentences: iterable of tokenised sentences.
    :param count: passed to ``Phrases`` as ``min_count``.
    :return: the trained ``Phrases`` model.
    """
    # The parenthesised single-argument form prints the same text under
    # Python 2 and, unlike the print statement, is valid Python 3.
    print("In Bigram Model")
    bigram = Phrases(sentences, min_count=count)
    # ``self.models`` is used as a string prefix; no separator is inserted.
    dest = self.models + 'bigram_model'
    bigram.save(dest)
    return bigram
class PmiPhraseDetector(object):
    """Detection using Pointwise Mutual Information (PMI).

    On construction the detector loads a saved ``Phrases`` model if one
    exists at the computed file path; otherwise it builds, prunes and saves
    a new one from the given sentences.
    """

    def __init__(self, sentences, filename=None):
        """Load the phrase model from disk or build it from *sentences*.

        :param sentences: sentences used when a model has to be built.
        :param filename: model file name inside ``<package parent>/data/models``;
            defaults to a name derived from threshold, decay, dataset and
            tokenizer.
        """
        # model parameters
        self.sentences = sentences
        self.dataset = "CASEREPORT"
        self.tokenizer = "RAW"
        self.prune_stopwords = stopwords("pubmed")
        self.phrases = None
        self.threshold = 250
        self.decay = 2
        self.bigram_iter = 3

        # data file path: two directory levels above this file, in data/models
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        if filename is None:
            filename = "PHRASE_%s_%s_%s_%s" % (self.threshold, self.decay, self.dataset, self.tokenizer, )
        self.filepath = os.path.join(models_folder, filename)

        # does identical model already exists?
        model_exists = os.path.isfile(self.filepath)
        if model_exists:
            logging.info("LOADING - loading phrase data..")
            self.phrases = Phrases.load(self.filepath)
        else:
            logging.info("CREATE - creating phrase data..")
            self.build()

    def build(self):
        """Train the phrase model, prune it and save it to ``self.filepath``."""
        self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold)

        # run additional merge rounds
        # NOTE(review): each round retrains on the raw ``self.sentences`` and
        # replaces the previous model, so only the last (lowest) threshold
        # takes effect; if real merge rounds were intended the input should
        # presumably be ``self.phrases[self.sentences]`` -- confirm.
        for i in range(2, self.bigram_iter + 1):
            self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold*(1.0/self.decay)**(i-1))

        # prune phrases
        self.prune()

        # save model to file
        self.save()

    def save(self):
        """Save the phrase model to ``self.filepath``."""
        self.phrases.save(self.filepath)

    def prune(self, min_reduce=1):
        """
        Remove phrases beginning or ending with a stopword.
        Also removes phrases appearing less frequently than a threshold.

        :param min_reduce: frequency threshold
        """
        # Materialise the keys first so entries can be deleted while looping.
        multiword_phrases = [phrase for phrase in self.phrases.vocab if "_" in phrase]
        for phrase in multiword_phrases:
            words = phrase.split("_")
            first_word, last_word = words[0], words[-1]
            if first_word in self.prune_stopwords or last_word in self.prune_stopwords:
                del self.phrases.vocab[phrase]
        prune_vocab(self.phrases.vocab, min_reduce)

    def detect(self, sentence):
        """Return *sentence* transformed by the phrase model."""
        return self.phrases[sentence]

    def print_phrases(self, threshold=100):
        """Print every multi-word phrase whose count exceeds *threshold*."""
        for word in self.phrases.vocab:
            if "_" in word and self.phrases.vocab[word] > threshold:
                # One pre-formatted argument prints the same "word count" text
                # under Python 2 and is valid Python 3, unlike the former
                # print statement.
                print("%s %s" % (word, self.phrases.vocab[word]))