def testScoringDefault(self):
    """Check the default scorer (Mikolov et al. word2vec paper) with common terms.

    The rounded scores exported for the probe sentence must equal the values
    recomputed by hand from the trained model's vocabulary counts.
    """
    model = Phrases(self.sentences, min_count=1, threshold=1,
                    common_terms=self.common_terms)
    probe = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    seen_scores = {round(score, 3) for _, score in model.export_phrases(probe)}

    min_count = float(model.min_count)
    len_vocab = float(len(model.vocab))
    # Look the counts up in a fixed order, after the vocabulary size is taken.
    counts = {
        key: float(model.vocab[key])
        for key in (b"graph", b"data", b"data_and_graph",
                    b"human", b"interface", b"human_interface")
    }

    expected = {
        # score for data and graph
        round((counts[b"data_and_graph"] - min_count)
              / counts[b"data"] / counts[b"graph"] * len_vocab, 3),
        # score for human interface
        round((counts[b"human_interface"] - min_count)
              / counts[b"human"] / counts[b"interface"] * len_vocab, 3),
    }
    assert seen_scores == expected
def testScoringDefault(self):
    """Verify default (word2vec-paper) scoring when common terms are configured.

    Exported scores for one sentence are compared, after rounding to three
    decimals, with scores derived directly from the model's vocabulary.
    """
    detector = Phrases(self.sentences, min_count=1, threshold=1,
                       common_terms=self.common_terms)
    sentence = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']

    seen_scores = set()
    for _, value in detector.export_phrases([sentence]):
        seen_scores.add(round(value, 3))

    vocab = detector.vocab
    min_count = float(detector.min_count)
    len_vocab = float(len(vocab))
    graph, data, data_and_graph, human, interface, human_interface = (
        float(vocab[token])
        for token in (b"graph", b"data", b"data_and_graph",
                      b"human", b"interface", b"human_interface")
    )

    # score for data and graph
    data_graph_score = round(
        (data_and_graph - min_count) / data / graph * len_vocab, 3)
    # score for human interface
    human_interface_score = round(
        (human_interface - min_count) / human / interface * len_vocab, 3)

    assert seen_scores == {data_graph_score, human_interface_score}
def extract_phrases(filename, min_count):
    """Detect bigram and trigram phrases in *filename* and write them to disk.

    The input is loaded with ``build_input`` and flattened into one list of
    sentences. A bigram model and a trigram model (trained on the
    bigram-transformed sentences) are built, and every exported phrase that
    contains no digit is written, one per line with spaces replaced by
    underscores, to ``data/phrases_<min_count>_<basename of filename>``.

    :param filename: path handed to ``build_input``; its basename is used in
        the output file name.
    :param min_count: ``min_count`` for the bigram model and part of the
        output file name.
    """
    rst = build_input(filename)
    gen = list(itertools.chain.from_iterable(rst))
    bigram = Phrases(gen, threshold=5, min_count=min_count)
    trigram = Phrases(bigram[gen], threshold=2, min_count=2)

    # The dict de-duplicates phrases while keeping first-seen order; trigram
    # scores overwrite bigram scores for identical phrases.
    ph_dic = {}
    ph_dic.update(bigram.export_phrases(gen))
    ph_dic.update(trigram.export_phrases(bigram[gen]))

    # Phrases are handled as bytes below (split/join with bytes literals), so
    # the digit filter must be a bytes pattern: a str pattern raises
    # TypeError on Python 3.
    has_digit = re.compile(br'\d')

    # write
    out_path = 'data/phrases_%d_%s' % (min_count, os.path.basename(filename))
    with open(out_path, 'wb') as fout:
        for phrase in ph_dic:
            if has_digit.search(phrase):  # remove digits
                continue
            fout.write(phrase.replace(b' ', b'_') + b'\n')
def testMultipleBigramsSingleEntry(self):
    """One input sentence may yield more than one bigram."""
    model = Phrases(self.sentences, min_count=1, threshold=1)
    probe = [['graph', 'minors', 'survey', 'human', 'interface']]

    seen_bigrams = {phrase for phrase, _ in model.export_phrases(probe)}

    expected = {b'graph minors', b'human interface'}
    assert seen_bigrams == expected
def testMultipleBigramsSingleEntry(self):
    """A single sentence should be able to produce several bigrams."""
    detector = Phrases(self.sentences, min_count=1, threshold=1)
    sentence = ['graph', 'minors', 'survey', 'human', 'interface']

    exported = detector.export_phrases([sentence])
    seen_bigrams = set(phrase for phrase, _score in exported)

    assert seen_bigrams == {b'graph minors', b'human interface'}
def testExportPhrases(self):
    """Check which bigrams export_phrases reports for the training sentences."""
    model = Phrases(sentences, min_count=1, threshold=1)

    seen_bigrams = {phrase for phrase, _ in model.export_phrases(sentences)}

    expected = {b'response time', b'graph minors', b'human interface'}
    assert seen_bigrams == expected
def extract_phrases(reviews_sents, reviews_docs, save=False):
    """Train bigram/trigram phrase models and apply them to *reviews_docs*.

    A bigram model is trained on *reviews_sents* and a trigram model on the
    bigram-transformed sentences. When *save* is true, every exported phrase
    without a digit is written (spaces replaced by underscores, one per line)
    to ``../data/phrase/phrases_3_app_review`` and both models are saved
    under ``../model/``.

    :param reviews_sents: sentences used to train both phrase models.
    :param reviews_docs: documents to transform with the trained models.
    :param save: whether to write the phrase list and the models to disk.
    :return: ``trigram[bigram[reviews_docs]]``.
    """
    logging.info("Extracting phrases...")
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)

    if save:
        # The dict de-duplicates phrases; trigram scores overwrite bigram
        # scores for identical phrases.
        ph_dic = {}
        ph_dic.update(bigram.export_phrases(reviews_sents))
        ph_dic.update(trigram.export_phrases(bigram[reviews_sents]))

        # Phrases are handled as bytes below, so the digit filter must be a
        # bytes pattern: a str pattern raises TypeError on Python 3.
        has_digit = re.compile(br'\d')

        out_path = '../data/phrase/phrases_%d_%s' % (3, 'app_review')
        with open(out_path, 'wb') as fout:
            for phrase in ph_dic:
                if has_digit.search(phrase):  # remove digits
                    continue
                fout.write(phrase.replace(b' ', b'_') + b'\n')

        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

    return trigram[bigram[reviews_docs]]
def testExportPhrases(self):
    """Verify the set of bigrams exported for the training sentences."""
    detector = Phrases(sentences, min_count=1, threshold=1)

    seen_bigrams = set()
    for bigram_bytes, _score in detector.export_phrases(sentences):
        seen_bigrams.add(bigram_bytes)

    assert seen_bigrams == {
        b'response time',
        b'graph minors',
        b'human interface',
    }
def testCustomScorer(self):
    """Check that a user-supplied scoring function is used for export."""
    model = Phrases(self.sentences, min_count=1, threshold=.001,
                    scoring=dumb_scorer)
    probe = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]

    seen_scores = [score for _, score in model.export_phrases(probe)]

    assert all(seen_scores)  # all scores 1
    # 'graph minors' and 'survey human' and 'interface system'
    assert len(seen_scores) == 3
def testMultipleBigramsSingleEntry(self):
    """One sentence may yield several phrases when common terms are set."""
    model = Phrases(self.sentences, min_count=1, threshold=1,
                    common_terms=self.common_terms)
    probe = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]

    seen_bigrams = {phrase for phrase, _ in model.export_phrases(probe)}

    expected = {b'data and graph', b'human interface'}
    assert seen_bigrams == expected
def show_phrases(corpus, threshold=1000, shown=1000):
    """Train a phrase detector on *corpus* and print the first detected phrases.

    :param corpus: raw corpus handed to ``tokenize_sentences``.
    :param threshold: ``threshold`` passed to ``Phrases``.
    :param shown: maximum number of phrases to print.
    """
    # Training the multi-word expression detector
    tokenized_sentences = tokenize_sentences(corpus)
    phrases = Phrases(tokenized_sentences, threshold=threshold)

    exported = phrases.export_phrases(tokenized_sentences)
    for index, (phrase, score) in enumerate(exported):
        # Stop once `shown` phrases were printed; the previous `i > shown`
        # check let one extra phrase through.
        if index >= shown:
            break
        print("Expression : {0}, score = {1}".format(
            phrase.decode('utf-8'), score))
def generating_bigrams(final_df):
    """Train a bigram model on ``final_df['features']`` and log its phrases.

    Each row of the ``features`` column is whitespace-tokenised, a ``Phrases``
    model is trained and saved as ``bigrams``, and every exported phrase is
    appended with its score to ``bigrams.txt``, one per line.

    :param final_df: object indexable by ``'features'`` yielding strings.
    :return: the trained ``Phrases`` model.
    """
    eligibility_criteria = final_df['features']
    bigrams_input = [each_row.split() for each_row in eligibility_criteria]

    bigram_transformer = Phrases(bigrams_input, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=4)

    # The context manager closes the file even if exporting raises.
    with open("bigrams.txt", 'a', encoding="utf-8") as fd:
        for phrase, score in bigram_transformer.export_phrases(bigrams_input):
            # Phrases may come back as bytes; decode so the file holds the
            # text rather than a b'...' repr.
            if isinstance(phrase, bytes):
                phrase = phrase.decode("utf-8")
            # One record per line; previously records ran together.
            fd.write(u'{0} {1}\n'.format(phrase, score))

    return bigram_transformer
def test_create_and_decode_phrases(self):
    """Train an NPMI-scored Phrases model on the NYT corruption CSV.

    Returns whatever ``export_phrases()`` produces for the trained model.
    """
    frame = pd.read_csv('text_analytics/tests/NYT.Corruption')
    settings = dict(
        min_count=100,
        threshold=0.70,
        scoring="npmi",
        max_vocab_size=100000000,
        delimiter="_",
    )
    model = Phrases(sentences=read_clean(frame), **settings)
    return model.export_phrases()
def testCustomScorer(self):
    """Check that a custom scoring function is applied with common terms set."""
    model = Phrases(self.sentences, min_count=1, threshold=.001,
                    scoring=dumb_scorer, common_terms=self.common_terms)
    probe = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]

    seen_scores = [score for _, score in model.export_phrases(probe)]

    assert all(seen_scores)  # all scores 1
    assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
def testScoringNpmi(self):
    """Check normalized pointwise mutual information (NPMI) scoring."""
    model = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
    probe = [['graph', 'minors', 'survey', 'human', 'interface']]

    seen_scores = {round(score, 3) for _, score in model.export_phrases(probe)}

    expected = {
        .882,  # score for graph minors
        .714,  # score for human interface
    }
    assert seen_scores == expected
def testScoringDefault(self):
    """Check the default scorer, taken from the Mikolov word2vec paper."""
    model = Phrases(self.sentences, min_count=1, threshold=1)
    probe = [['graph', 'minors', 'survey', 'human', 'interface']]

    seen_scores = {round(score, 3) for _, score in model.export_phrases(probe)}

    expected = {
        5.167,  # score for graph minors
        3.444,  # score for human interface
    }
    assert seen_scores == expected
def testScoringDefault(self):
    """Verify default (Mikolov word2vec paper) scores for one probe sentence."""
    detector = Phrases(self.sentences, min_count=1, threshold=1)
    sentence = ['graph', 'minors', 'survey', 'human', 'interface']

    exported = detector.export_phrases([sentence])
    seen_scores = set(round(value, 3) for _phrase, value in exported)

    graph_minors_score = 5.167
    human_interface_score = 3.444
    assert seen_scores == {graph_minors_score, human_interface_score}
def testScoringNpmi(self):
    """Check NPMI scoring on a model trained from the module-level sentences."""
    model = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')
    probe = [['graph', 'minors', 'survey', 'human', 'interface']]

    seen_scores = {round(score, 3) for _, score in model.export_phrases(probe)}

    expected = {
        .882,  # score for graph minors
        .714,  # score for human interface
    }
    assert seen_scores == expected
def testExportPhrases(self):
    """Check the phrases exported when common terms are configured."""
    model = Phrases(self.sentences, min_count=1, threshold=1,
                    common_terms=self.common_terms)

    seen_bigrams = {phrase for phrase, _ in model.export_phrases(self.sentences)}

    expected = {
        b'human interface',
        b'graph of trees',
        b'data and graph',
        b'lack of interest',
    }
    assert seen_bigrams == expected
def test_export_phrases(self):
    """Check the phrases exported by a bigram model and a stacked trigram model."""
    settings = dict(min_count=1, threshold=1, delimiter=' ')
    bigram = Phrases(self.sentences, **settings)
    trigram = Phrases(bigram[self.sentences], **settings)

    seen_bigrams = set(bigram.export_phrases().keys())
    seen_trigrams = set(trigram.export_phrases().keys())

    expected_bigrams = {
        'human interface',
        'response time',
        'graph minors',
        'minors survey',
    }
    expected_trigrams = {
        'human interface',
        'graph minors survey',
    }
    assert seen_bigrams == expected_bigrams
    assert seen_trigrams == expected_trigrams
def testScoringNpmi(self):
    """Check NPMI scoring when common terms are configured."""
    model = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi',
                    common_terms=self.common_terms)
    probe = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]

    seen_scores = {round(score, 3) for _, score in model.export_phrases(probe)}

    expected = {
        .74,  # score for data and graph
        .894,  # score for human interface
    }
    assert seen_scores == expected
def salient_bigrams(phrases: Phrases):
    """Finds the most salient bigrams

    For every slice returned by ``read_corpus()`` the slice is added to the
    vocabulary of ``phrases``. Every slice up to and including the current one
    is then re-scored with ``export_phrases``, and the ten highest-scoring
    (phrase, score) pairs plus summary counts are written with ``output``.

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    # `current` rather than `slice`, to avoid shadowing the builtin.
    for current in read_corpus():
        phrases.add_vocab(read_slice(current))

        # evaluate all previous corpus slices
        found = set()
        total_bigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in phrases.export_phrases(
                    read_slice(previous_slice)):
                found.add((phrase, score))
                total_bigrams_encountered += 1
            if previous_slice == current:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no bigrams found?
        if not found:
            output(current, "")

        # log the top ten bigrams
        for phrase, score in found[:10]:
            output(current,
                   "{phrase}, {score}".format(phrase=phrase, score=score))

        # log the total counts
        # Each statistic falls back to 0 on its own when nothing was found.
        # Previously the fallback wrapped the whole formatted message, so a
        # bare 0 was logged instead of the summary.
        # NOTE(review): median/max/min are (phrase, score) tuples, not bare
        # scores - confirm this is the intended output.
        output(
            current, """
Total bigrams: {total}
Unique bigrams: {unique}
Median score:{median}
Max score:{max}
Min score:{min}
""".format(total=total_bigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2] if found else 0,
           max=found[0] if found else 0,
           min=found[-1] if found else 0))

        # will log a time if command line args were enabled
        Timer.try_to_time()
def testMultipleBigramsSingleEntry(self):
    """A single sentence should produce several phrases with common terms set."""
    detector = Phrases(self.sentences, min_count=1, threshold=1,
                       common_terms=self.common_terms)
    sentence = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']

    exported = detector.export_phrases([sentence])
    seen_bigrams = set(phrase for phrase, _score in exported)

    assert seen_bigrams == {b'data and graph', b'human interface'}
def test_export_phrases(self):
    """Check the keys returned by export_phrases() for a bigram model."""
    model = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')

    seen_bigrams = set(model.export_phrases().keys())

    expected = {
        'and graph',
        'data and',
        'graph of',
        'graph survey',
        'human interface',
        'lack of',
        'of interest',
        'of trees',
    }
    assert seen_bigrams == expected
def testExportPhrases(self):
    """Verify the exported phrases for a model trained with common terms."""
    detector = Phrases(self.sentences, min_count=1, threshold=1,
                       common_terms=self.common_terms)

    seen_bigrams = set()
    for phrase_bytes, _score in detector.export_phrases(self.sentences):
        seen_bigrams.add(phrase_bytes)

    assert seen_bigrams == {
        b'human interface',
        b'graph of trees',
        b'data and graph',
        b'lack of interest',
    }
def testExportPhrases(self):
    """Check that export_phrases reports 'response time' and 'graph minors'."""
    model = Phrases(sentences, min_count=1, threshold=1)

    # with this setting we should get response_time and graph_minors
    first, second = b'response time', b'graph minors'
    pending = {first, second}
    for phrase, _ in model.export_phrases(sentences):
        pending.discard(phrase)
        if not pending:
            # both phrases seen; no need to consume the rest
            break

    self.assertTrue(first not in pending)
    self.assertTrue(second not in pending)
def testCustomScorer(self):
    """Verify export with a custom scoring function and common terms."""
    detector = Phrases(self.sentences, min_count=1, threshold=.001,
                       scoring=dumb_scorer, common_terms=self.common_terms)
    sentence = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']

    seen_scores = []
    for _phrase, value in detector.export_phrases([sentence]):
        seen_scores.append(value)

    assert all(seen_scores)  # all scores 1
    assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
def testExportPhrases(self):
    """Verify that both expected bigrams show up in export_phrases output."""
    detector = Phrases(sentences, min_count=1, threshold=1)

    # with this setting we should get response_time and graph_minors
    seen = {b'response time': False, b'graph minors': False}
    for phrase, _score in detector.export_phrases(sentences):
        if phrase in seen:
            seen[phrase] = True
        if all(seen.values()):
            # stop early once both phrases were found
            break

    self.assertTrue(seen[b'response time'])
    self.assertTrue(seen[b'graph minors'])
def testCustomScorer(self):
    """Verify export when a custom scoring function is supplied."""
    detector = Phrases(self.sentences, min_count=1, threshold=.001,
                       scoring=dumb_scorer)
    sentence = ['graph', 'minors', 'survey', 'human', 'interface', 'system']

    seen_scores = []
    for _phrase, value in detector.export_phrases([sentence]):
        seen_scores.append(value)

    assert all(seen_scores)  # all scores 1
    # 'graph minors' and 'survey human' and 'interface system'
    assert len(seen_scores) == 3
def testScoringNpmi(self):
    """Verify NPMI scores for one probe sentence with common terms set."""
    detector = Phrases(self.sentences, min_count=1, threshold=.5,
                       scoring='npmi', common_terms=self.common_terms)
    sentence = ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']

    exported = detector.export_phrases([sentence])
    seen_scores = set(round(value, 3) for _phrase, value in exported)

    data_and_graph_score = .74
    human_interface_score = .894
    assert seen_scores == {data_and_graph_score, human_interface_score}
# Train the trigram model on the bigram-transformed sentences and save it.
tri_phrase = Phrases(bi_sentances_tokens,
                     min_count=3,
                     threshold=10,
                     scoring='default')
tri_grapm = Phraser(tri_phrase)
tri_phrase.save(trigram_model_save_path)
# This path holds the trigram model; the message used to say "bigram".
print('trigram model saved at {}'.format(trigram_model_save_path))
#%%
print('run one test:')
sent = list(jieba.cut("我们今天来谈一谈区块链和人工智能的结合"))
print('origin split: \n {}'.format(sent))
print('updated split: \n {}'.format(tri_grapm[bi_grapm[sent]]))
#%%
## export all phrases to excel for review
bi_phrases_b = bi_phrase.export_phrases(sentances_tokens)
tri_phrases_b = tri_phrase.export_phrases(bi_sentances_tokens)
# De-duplicate the (phrase, score) pairs and decode phrases from bytes.
detected_bi_phrases = list({(p.decode('utf-8'), v) for (p, v) in bi_phrases_b})
detected_tri_phrases = list(
    {(p.decode('utf-8'), v) for (p, v) in tri_phrases_b})
detected_tri_phrases = [(p.replace('_', ' '), v)
                        for (p, v) in detected_tri_phrases
                        if "_" in p]  ## only keep real trigrams
#%%
## increase in vocabulary
bi_phrases_df = pd.DataFrame(detected_bi_phrases, columns=['phrases', 'score'])
tri_phrases_df = pd.DataFrame(detected_tri_phrases,
                              columns=['phrases', 'score'])
bi_phrases_df.sort_values(by='score', inplace=True, ascending=False)
+ 'threshold: ' + str(threshold) + '\t' \ + 'top_ngram: ' + str(top_ngram) print(log_str) # ----------------------------Entrainement # Train bigram model. if (int(args.order) == 2 and args.save == True) or (int(args.order) >= 2 and args.save == False): start = time.time() bigram_phrases = Phrases( sentences, min_count=args.mincount[0], threshold=args.threshold[0], scoring='npmi') bigram = Phraser(bigram_phrases) if args.save == True: bigram_phrases.save('result/model_' + filename + '_m_' + mincount + '_t_' + threshold + '_bigram_model.pkl') score_bigram = sorted(list(set(bigram_phrases.export_phrases( sentences))), key=lambda x: x[1], reverse=True) train_time = time.time() - start ngram = score_bigram log_train('bigram', len(sentences), train_time, len(ngram), ngram[0][1], ngram[-1][1], mincount, threshold, ngram[:10]) # Train trigram model. if (int(args.order) == 3 and args.save == True) or (int(args.order) >= 3 and args.save == False): start = time.time() if args.save == True: bigram_phrases = Phrases.load( 'result/model_' + filename + '_m_' + mincount + '_t_' + threshold + '_bigram_model.pkl') bigram = Phraser(bigram_phrases) trigram_phrases = Phrases(
def process_and_save_worker(
        infile,
        threshold=10,
        min_count=50,
        min_len=5,
        delete_orig=False,
        num_phrasing_rounds=2,
):
    """
    Single threaded worker for the text preprocessing and saving of files.
    Called by process_and_save().

    :param infile: str
        Path to the .txt.bz2 file to be processed.
    :param threshold: float
        The threshold kwarg of Gensim's Phrases() object.
    :param min_count: int
        The min_count kwarg of Gensim's Phrases() object.
    :param min_len: int
        minimum number of words in a post to keep it. Posts shorter than
        min_len words AFTER PROCESSING are discarded.
    :param delete_orig: bool
        True to delete the original .bz2 file (from Archive.org) after
        processing, False to keep it. Deleting it can save disk space.
    :param num_phrasing_rounds: int
        Maximum number of Phrases()/Phraser() passes over the processed
        file. Phrasing stops early as soon as a pass finds no phrases.
    :return: int
        Always 0.
    """
    # Derive the output paths from the (slash-normalised) input path.
    normalized = infile.replace("\\", "/")
    suff = normalized.split("/")[-1][:-8]  # file name minus ".txt.bz2"
    S = f"{suff:<20s}"
    raw_out = (normalized
               .replace(".txt.bz2", "_raw.txt.bz2")
               .replace("By Subreddit/FINAL/", "Processed Files/"))
    processed_out = (normalized
                     .replace(".txt.bz2", "_processed.txt.bz2")
                     .replace("By Subreddit/FINAL/", "Processed Files/"))
    # Temporary file used to shunt data while phrasing.
    working_file = processed_out.replace("_processed.txt.bz2",
                                         "_working.txt.bz2")

    # Create a total count variable that we'll use to update tqdm appropriately.
    total = 0
    with bz2.open(infile, "rt", encoding="utf8") as I, \
            bz2.open(raw_out, "wt", encoding="utf8") as R, \
            bz2.open(processed_out, "wt", encoding="utf8") as P:
        for post in tqdm(I, desc=f"{S}: Preprocessing", mininterval=5,
                         position=1):
            # Preprocess, and skip if length is too low.
            text = process_string(literal_eval(post))
            if len(text) < min_len:
                continue
            text = " ".join(text)
            # Write the stripped original line so that the raw and processed
            # files stay aligned, one line per post.
            R.write(f"{post.strip()}\n")
            P.write(f"{text}\n")
            total += 1

    # Stream the processed files through a Gensim Phrases() object.
    # If there are any phrases to be found, stream through a Phraser()
    # object and into a temp file. Then overwrite the original
    # processed file and repeat.
    for round_no in range(num_phrasing_rounds):
        found_phrases = True
        with bz2.open(processed_out, "rt", encoding="utf8") as IN, \
                bz2.open(working_file, "wt", encoding="utf8") as OUT:
            p = Phrases(
                (line.strip().split() for line in tqdm(
                    IN,
                    total=total,
                    desc=f"{S} Phrase-finding {round_no+1}",
                    mininterval=5,
                    position=1
                )),
                threshold=threshold,
                min_count=min_count,
                # for some reason I get errors if the delimiter isn't a bytestring
                delimiter=b'_'
            )
            IN.seek(0)
            # See if there were any phrases found. If not, abort phrasing early.
            try:
                next(p.export_phrases(line.strip().split() for line in IN))
            except StopIteration:
                found_phrases = False
            else:
                pp = Phraser(p)
                IN.seek(0)
                for line in tqdm(IN, total=total,
                                 desc=f"{S} Applying phraser {round_no+1}",
                                 mininterval=5, position=1):
                    OUT.write(f"{' '.join(list(pp[line.strip().split()]))}\n")
        if not found_phrases:
            # Nothing was written to the working file; remove it so no empty
            # leftover remains next to the outputs.
            os.remove(working_file)
            break
        move(working_file, processed_out)

    # Now, do a final pass to filter posts by length again. This second pass
    # is because the length of a file may have changed considerably after
    # processing. As before, stream to a temporary working file, then
    # overwrite the original when done.
    raw_working = raw_out.replace(".txt.bz2", "_working.txt.bz2")
    processed_working = processed_out.replace(".txt.bz2", "_working.txt.bz2")
    with bz2.open(raw_out, "rt", encoding="utf8") as RAW_IN, \
            bz2.open(processed_out, "rt", encoding="utf8") as PROC_IN, \
            bz2.open(raw_working, "wt", encoding="utf8") as RAW_OUT, \
            bz2.open(processed_working, "wt", encoding="utf8") as PROC_OUT:
        for proc_line, raw_line in zip(PROC_IN, RAW_IN):
            if len(proc_line.split()) >= min_len:
                PROC_OUT.write(proc_line)
                RAW_OUT.write(raw_line)
    move(processed_working, processed_out)
    move(raw_working, raw_out)

    # Kept as a strict comparison on purpose: deleting the source is
    # destructive, so only an explicit True (or 1) triggers it.
    if delete_orig == True:  # noqa: E712
        os.remove(infile)

    return 0
def _generate_phrase(self, pd_data, load_model=False, section='phrase'):
    """
    (Private) Replace tokens in the data with phrases detected by gensim.

    Depending on `load_model`, either a saved Phraser is loaded and applied,
    or a new Phrases model is trained on the data, applied, saved, and its
    phrases and scores are dumped to a tab-separated file. All settings are
    read from `self.configparser`.

    Inputs:
        pd_data: (pd.Series) Data which will be used to generate phrases.
        load_model: (bool, optional) Load a saved model instead of training.
        section: (str, optional) Section name of the .ini file.

    Returns:
        pd_data: (pd.Series) Input data but using phrases, or the input
            unchanged when phrase generation is disabled in the config.
    """
    config = self.configparser

    if not config.getbool('generate_phrase', section):
        log.info('Skipping phrase generation...')
        return pd_data

    if load_model:
        saved_path = config.getstr('phrase_model', section)
        phraser = Phraser.load(saved_path)

        # apply phrase model
        log.info('Applying loaded phrase model...')
        return pd_data.apply(lambda x: phraser[x], convert_dtype=False)

    log.info('Generating new phrases...')

    # this is our training data
    training_sentences = pd_data.tolist()

    # detect phrases using the configuration
    detector = Phrases(
        training_sentences,
        min_count=config.getint('min_count', section),
        threshold=config.getfloat('threshold', section),
        max_vocab_size=config.getint('max_vocab_size', section),
        progress_per=config.getint('progress_per', section),
        scoring=config.getstr('scoring', section))

    # apply trained model to generate phrase
    log.info('Applying phrase model...')
    pd_data = pd_data.apply(lambda x: detector[x], convert_dtype=False)

    # save phrase model
    model_path = config.getstr('phrase_model', section)
    log.info('Saving phrase model to \'%s\'...', model_path)
    detector.save(model_path)

    # dump phrase and its score as text
    rows = [
        [phrase.decode('utf-8'), score]
        for phrase, score in detector.export_phrases(training_sentences)
    ]
    score_table = pd.DataFrame(rows, columns=['phrase', 'score'])
    score_table.drop_duplicates(subset='phrase', inplace=True)

    dump_path = config.getstr('phrase_dump_filename', section)
    log.info('Dumping phrases to \'%s\'...', dump_path)
    score_table.to_csv(dump_path, sep='\t', index=False)

    return pd_data
#     print(bigram[i])
# Apply the bigram model to every test sentence and cache the result on disk.
result = [bigram[i] for i in test]
with open('phrases_res_nost.pkl', 'wb') as fout:
    pickle.dump(result, fout)

### based on bigram phrases to detect trigram and 4-grams
# The pickle is the file written just above; the context manager closes it.
with open('phrases_res_nost.pkl', 'rb') as fin:
    bg = pickle.load(fin)
phrases2 = Phrases(bg, min_count=10, threshold=0.1, scoring='npmi')

# Map each decoded phrase to its score (the last occurrence wins).
phr_score = {}
for phrase, score in phrases2.export_phrases(bg):
    # print(phrase.decode(), score)
    phr_score[phrase.decode()] = score
# The comma between the label and the sorted list was missing, which made
# this line a syntax error.
print('The npmi score of phrases:',
      sorted(phr_score.items(), key=lambda d: d[1], reverse=True))

trigram = Phraser(phrases2)
# for i in bg[:100]:
#     print(trigram[i])
bi_tri_res = [trigram[i] for i in bg]
# Time bigram training/export on growing prefixes of the sentence list.
number_points = 15
sentences_taken = []
bigramme_taken = []
time_taken = []
for i in range(1, number_points + 1):
    print(i)
    start = time.time()
    # Prefix covering i/number_points of all sentences.
    sentences = list_sentences[0: int(i * (len_total_sentences/number_points))]
    bigram_phrases = Phrases(sentences, min_count=1, threshold=10)
    # Unique (phrase, score) pairs, highest score first.
    score_bigram = sorted(set(bigram_phrases.export_phrases(sentences)),
                          key=lambda x: x[1], reverse=True)
    end = time.time()
    time_taken.append(end - start)
    sentences_taken.append(len(sentences))
    bigramme_taken.append(len(score_bigram))

with open("10mil_text_sentences_bigramme_time.csv", "w") as f:
    writer = csv.writer(f)
    # Write the bigram measurements collected above; this previously
    # referenced the trigram lists (trigramme_taken, tri_time_taken).
    writer.writerows(zip(sentences_taken, bigramme_taken, time_taken))

# ===============================================================
# Information for the trigrams
sents = PathLineSentences(PATH_TRAINING, limit=160000)
list_sentences = list(sents)
# Collect phrases of three or more words and dump them, best score first.
all_phrases = {}
phrases = Phrases(DOCUMENTS, **param)
bi_gram = Phraser(phrases)

# Bigram-transform every document before looking for longer phrases.
Bi_PHRASES = []
for doc in DOCUMENTS:
    bi_grams = bi_gram[doc]
    Bi_PHRASES.append(bi_grams)

# {(10, 10), (15, 20), (20, 10)}
tri_phrases = Phrases(Bi_PHRASES)
TRI_PHRASES = {}
for phrase, score in tri_phrases.export_phrases(Bi_PHRASES):
    phrase = phrase.decode("utf-8").replace("_", " ")
    if len(phrase.split()) <= 2:
        # keep only phrases made of more than two words
        continue
    all_phrases[phrase] = score

ranked_items = sorted(
    all_phrases.items(), key=lambda item: item[1], reverse=True)
results = dict(ranked_items)

print(f"Model Dumping {index}")
with open(
        f"models/phrases_ahsan_{param['min_count']}_{param['threshold']}.json",
        "w") as out_json:
    json.dump(results, out_json, ensure_ascii=False, indent=4)
def salient_trigrams(phrases: Phrases):
    """Finds the most salient trigrams

    For every slice returned by ``read_corpus()`` the bigram model
    ``phrases`` is updated, the slice is bigram-transformed and added to a
    fresh trigram model, and every slice up to and including the current one
    is re-scored. Phrases with exactly two ``_`` delimiters are collected,
    and the top ten plus summary counts are written with ``output``.

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    trigram = Phrases()

    # `current` rather than `slice`, to avoid shadowing the builtin.
    for current in read_corpus():
        # prepare the bigram
        # NOTE(review): this adds the *current* slice once per earlier slice,
        # not each earlier slice - confirm whether `previous_slice` was meant.
        for previous_slice in read_corpus():
            phrases.add_vocab(read_slice(current))
            if previous_slice == current:
                break

        # transform sentences into possible bigrams
        bigram_phraser = Phraser(phrases)

        def bigrammed(slice: str):
            for sent in read_slice(slice):
                yield bigram_phraser[sent]

        trigram.add_vocab(bigrammed(current))

        # evaluate all previous corpus slices
        found = set()
        total_trigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in trigram.export_phrases(
                    bigrammed(previous_slice)):
                if phrase.count(b'_') == 2:
                    found.add((phrase, score))
                    total_trigrams_encountered += 1
                # phrase is bytes, so the membership test needs a bytes
                # needle; a str needle raises TypeError on Python 3.
                elif b'_' in phrase:
                    print(phrase)
            if previous_slice == current:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no trigrams found?
        if not found:
            output(current, "")

        # log the top ten trigrams
        for phrase, score in found[:10]:
            output(current,
                   "{phrase}, {score}".format(phrase=phrase, score=score))

        # log the total counts
        # The middle element of the sorted list is the median, so the label
        # now says "Median" instead of "Mean".
        output(
            current, """
Total trigrams: {total}
Unique trigrams: {unique}
Median score:{median}
Max score:{max}
Min score:{min}
""".format(total=total_trigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2] if found else 0,
           max=found[0] if found else 0,
           min=found[-1] if found else 0))

        # will log a time if command line args were enabled
        Timer.try_to_time()