def __init__(self):
    '''
    Training parameters:
    '''
    self.w2v_dim = 100
    self.num_feature = 400
    self.batch_size = 16
    self.num_epoch = 30
    # self.w2v_model = Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)
    # self.bigram = None
    # self.trigram = None
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    print('Build model...')
    self.model = Sequential()
    self.model.add(Dropout(0.2, input_shape=(self.num_feature,)))
    self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
    self.model.add(Activation('softmax'))
    self.model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode="categorical")
    print('Model has been built!')
def testScoringDefault(self):
    """Test the default scoring, from the Mikolov word2vec paper."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
    seen_scores = set()
    test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))

    min_count = float(bigram.min_count)
    len_vocab = float(len(bigram.vocab))
    graph = float(bigram.vocab[b"graph"])
    data = float(bigram.vocab[b"data"])
    data_and_graph = float(bigram.vocab[b"data_and_graph"])
    human = float(bigram.vocab[b"human"])
    interface = float(bigram.vocab[b"interface"])
    human_interface = float(bigram.vocab[b"human_interface"])

    assert seen_scores == set([
        # score for data and graph
        round((data_and_graph - min_count) / data / graph * len_vocab, 3),
        # score for human interface
        round((human_interface - min_count) / human / interface * len_vocab, 3),
    ])
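# For reference, a minimal standalone sketch of the default (Mikolov et al.) scorer that
# the assertion above reconstructs by hand; gensim ships this as phrases.original_scorer,
# and the parameter names below follow gensim's custom-scoring convention.
def original_scorer_sketch(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    # (count(a b) - min_count) / count(a) / count(b) * |vocab|
    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab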
def build_trigram_model(self, sentences, bigram):
    print("In Trigram Model")
    trigram = Phrases(bigram[sentences])
    dest = self.models + 'trigram_model'
    trigram.save(dest)
    return trigram
def build(self):
    self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold)
    # run additional merge rounds
    for i in range(2, self.bigram_iter + 1):
        self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold * (1.0 / self.decay) ** (i - 1))
    # prune phrases
    self.prune()
    # save model to file
    self.save()
def testExportPhrases(self):
    """Test Phrases bigram export_phrases functionality."""
    bigram = Phrases(sentences, min_count=1, threshold=1)
    seen_bigrams = set()
    for phrase, score in bigram.export_phrases(sentences):
        seen_bigrams.add(phrase)
    assert seen_bigrams == {b'response time', b'graph minors', b'human interface'}
def testMultipleBigramsSingleEntry(self):
    """Test that a single entry produces multiple bigrams."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1)
    seen_bigrams = set()
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_bigrams.add(phrase)
    assert seen_bigrams == {b'graph minors', b'human interface'}
def setUp(self):
    """Set up FrozenPhrases models for the tests."""
    bigram_phrases = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words)
    self.bigram = FrozenPhrases(bigram_phrases)
    bigram_default_phrases = Phrases(self.sentences, connector_words=self.connector_words)
    self.bigram_default = FrozenPhrases(bigram_default_phrases)
def generating_bigrams(final_df):
    eligibility_criteria = final_df['features']
    bigrams_input = [each_row.split() for each_row in eligibility_criteria]
    bigram_transformer = Phrases(bigrams_input, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=4)
    with open("bigrams.txt", 'a') as fd:
        for phrase, score in bigram_transformer.export_phrases(bigrams_input):
            fd.write(u'{0} {1}\n'.format(phrase, score))  # one phrase per line
    return bigram_transformer
def testCustomScorer(self):
    """Test using a custom scoring function."""
    bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
    seen_scores = []
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.append(score)
    assert all(seen_scores)  # all scores 1
    assert len(seen_scores) == 3  # 'graph minors', 'survey human' and 'interface system'
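# `dumb_scorer` is used by this test but not defined in this section. A plausible sketch,
# matching gensim's required custom-scorer signature (all six keyword parameters must be
# accepted) and the test's expectation that every candidate bigram scores 1, so that
# threshold=.001 keeps them all:
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    return 1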
def create_ngrams(category, lang):
    """Given a category, create n-grams for the text, clean the corpus
    and return the cleaned sentences.

    Parameters
    ----------
    category : string
        Name of the domain, e.g. "Santé", "Business", etc.
    lang : string
        default = "fr"

    Returns
    -------
    sentences : list of string
        sentences[i] is a cleaned sentence from the corpus
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang)  # to lemmatize words
    sentences = []
    bigrams_model = Phrases(min_count=100, threshold=10.0, delimiter="-")  # to create bigrams
    filename = "../data/" + category + ".txt"
    with open(filename, "r", encoding="utf8") as ins:
        for line in ins:
            lines = line.split('.')
            for l in lines:
                sentence = nltk.word_tokenize(l)
                if sentence:
                    sentences.append(sentence)
                    bigrams_model.add_vocab([sentence])  # train the bigram model as we go
    bigrams = list(bigrams_model[sentences])
    # to create trigrams
    trigrams_model = Phrases(bigrams, min_count=50, threshold=10.0, delimiter="-")
    sentences = list(trigrams_model[bigrams])
    n = len(sentences)
    for i in range(n):
        tags = tagger.tag_text(sentences[i])
        text = [
            tag.split('\t')[2] for tag in tags
            if tag.split('\t')[1] != "NUM" and tag.split('\t')[1] != "PUN"
        ]
        text = " ".join(text)
        text = clean_text_simple(text)
        sentences[i] = text
        if i % 10000 == 0:
            print(i, "sentences processed")
    sentences = [sent for sent in sentences if len(sent) != 0]
    return sentences
def testMultipleBigramsSingleEntry(self):
    """Test that a single entry produces multiple bigrams."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
    seen_bigrams = set()
    test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_bigrams.add(phrase)
    assert seen_bigrams == set([
        b'data and graph',
        b'human interface',
    ])
def test_save_load_with_connector_words(self):
    """Test saving and loading a Phrases object."""
    connector_words = frozenset({'of'})
    bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=connector_words)
    with temporary_file("test.pkl") as fpath:
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        assert bigram_loaded.connector_words == connector_words
def show_phrases(corpus, threshold=1000, shown=1000):
    # Training the multi-word expression detector
    tokenized_sentences = tokenize_sentences(corpus)
    phrases = Phrases(tokenized_sentences, threshold=threshold)
    i = 0
    for phrase, score in phrases.export_phrases(tokenized_sentences):
        if i > shown:
            break
        print("Expression : {0}, score = {1}".format(phrase.decode('utf-8'), score))
        i = i + 1
def test_create_and_decode_phrases(self):
    df = pd.read_csv('text_analytics/tests/NYT.Corruption')
    phrases = Phrases(
        sentences=read_clean(df),
        min_count=100,
        threshold=0.70,
        scoring="npmi",
        max_vocab_size=100000000,
        delimiter="_",
    )
    exported = phrases.export_phrases()
    return exported
def make_phraser(infile):
    """Train the phraser object and save it.

    :param infile: path to xml file with the wikipedia dump
    :return:
    """
    p = Phrases(tqdm((i.split() for i in file_yielder(infile)), desc="Phrase-finding"))
    p = Phraser(p)
    p.save("../models/phraser")
    return 0
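# Usage sketch for the phraser saved above; the load path mirrors the save() call, and
# which tokens get merged depends entirely on the training corpus.
from gensim.models.phrases import Phraser

p = Phraser.load("../models/phraser")
print(p["new york stock exchange".split()])  # e.g. ['new_york', 'stock_exchange'] if those phrases were learned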
def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """
    :param g_DataQueue: global queue holding the rows read from the database
    :param g_FinishRead: flag indicating whether the database has been fully read
    :param savePath: path where the phrase model is saved
    :param priorPhrasePath: path of the previously trained phrase model
    :return:
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if priorPhrasePath is None:
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while g_FinishRead.value == 0 or (not g_DataQueue.empty()):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)
        words = []
        reSub0 = re.compile(
            "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]")  # URL
        reSub1 = re.compile("[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")  # replace with " "
        reSub2 = re.compile("'[.?;!]")  # replace with "."; mainly handles possessives and the edge cases around single quotes
        reSplit1 = re.compile("\.[^a-z0-9]|[?!;]")  # split into sentences
        for t in data:
            if t[0] is not None:
                st = re.sub(reSub0, " ", t[0].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if len(sen_word) > 6:
                        words.append(sen_word)
            if t[1] is not None:
                st = re.sub(reSub0, " ", t[1].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if len(sen_word) > 6:
                        words.append(sen_word)
        del data
        gc.collect()
        # train phrases
        if priorPhraser is None:  # first round of training
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        # print(len(phrase.vocab))
        gc.collect()
        phrase.save(savePath)
def setUp(self):
    """Set up Phraser models for the tests."""
    bigram_phrases = Phrases(sentences, min_count=1, threshold=1)
    self.bigram = Phraser(bigram_phrases)
    bigram_default_phrases = Phrases(sentences)
    self.bigram_default = Phraser(bigram_default_phrases)
    bigram_utf8_phrases = Phrases(sentences, min_count=1, threshold=1)
    self.bigram_utf8 = Phraser(bigram_utf8_phrases)
    bigram_unicode_phrases = Phrases(unicode_sentences, min_count=1, threshold=1)
    self.bigram_unicode = Phraser(bigram_unicode_phrases)
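# A minimal standalone illustration of how the Phraser objects built in setUp are applied;
# the toy corpus below is illustrative, not the test fixture.
from gensim.models.phrases import Phrases, Phraser

toy = [['graph', 'minors', 'survey'], ['graph', 'minors', 'report']]
phraser = Phraser(Phrases(toy, min_count=1, threshold=1))
print(phraser[['graph', 'minors', 'survey']])  # -> ['graph_minors', 'survey']
for sent in phraser[toy]:  # item access also streams lazily over a whole corpus
    print(sent)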
def testExportPhrases(self):
    """Test Phrases bigram export phrases."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
    seen_bigrams = set(bigram.find_phrases(self.sentences).keys())
    assert seen_bigrams == {
        'response time',
        'graph minors',
        'human interface',
    }
def testCustomScorer(self):
    """Test using a custom scoring function."""
    bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer, common_terms=self.common_terms)
    seen_scores = []
    test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.append(score)
    assert all(seen_scores)  # all scores 1
    assert len(seen_scores) == 2  # 'data and graph', 'survey for human'
def __init__(self, dataset=CLASSIC3_JSON_DS):
    # loading the corpus
    corpus = Sentences(dataset)
    # Using a phrase model to refine the corpus
    bigram = Phraser(Phrases(corpus))
    trigram = Phraser(Phrases(bigram[corpus]))
    trig_corpus = trigram[bigram[corpus]]
    self.vocab = list(set([term for doc in trig_corpus for term in doc]))
    # creating standard Dictionary representation of corpus and standard doc-term matrix
    dct = Dictionary(trig_corpus)
    bow_corpus = [dct.doc2bow(line) for line in trig_corpus]
    self.doc_term_mat = corpus2csc(bow_corpus).T
def __init__(self):
    reader = Reader()
    print('loading data')
    self.X_train = reader.getData(TRAIN)
    print('train data has been loaded!')
    self.X_valid = reader.getData(DEV)
    print('valid data has been loaded!')
    self.X_test = reader.getData(TEST)
    print('test data has been loaded!')
    self.c_title = []
    self.c_body = []
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')
def testScoringNpmi(self):
    """Test normalized pointwise mutual information scoring."""
    bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
    seen_scores = set()
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))
    assert seen_scores == {
        .882,  # score for graph minors
        .714,  # score for human interface
    }
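# A standalone sketch of the NPMI scorer these tests exercise, following the standard
# definition that gensim implements as phrases.npmi_scorer: PMI normalized by -ln p(a, b),
# so scores fall in [-1, 1] and `threshold` stays comparable across corpora.
import math

def npmi_scorer_sketch(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    pa = worda_count / corpus_word_count
    pb = wordb_count / corpus_word_count
    pab = bigram_count / corpus_word_count
    return math.log(pab / (pa * pb)) / -math.log(pab)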
class GramFacade:

    def __init__(self, model_dir, min_count_bigrams=8, min_count_trigrams=7):
        self.model_dir = model_dir
        self.min_count_bigrams = min_count_bigrams
        self.min_count_trigrams = min_count_trigrams

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' + BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' + TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' + BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' + TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list, min_count=self.min_count_bigrams)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list], min_count=self.min_count_trigrams)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' + BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' + TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' + BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' + TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set(
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold
        )
        return word_not_in_doc
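# A usage sketch for GramFacade; the directory and the tiny tokenized corpus are
# illustrative only (with so few documents and the default min_counts, no phrases
# would actually be merged; a real corpus is needed for merges to appear).
docs = [['new', 'york', 'stock', 'exchange'], ['new', 'york', 'times']]
facade = GramFacade(model_dir='./models')  # assumes ./models already exists
facade.create_model(docs)
print(facade.phrase(['new', 'york', 'stock', 'exchange']))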
def testScoringNpmi(self):
    """Test normalized pointwise mutual information scoring."""
    bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')
    seen_scores = set()
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))
    assert seen_scores == set([
        .882,  # score for graph minors
        .714,  # score for human interface
    ])
def testScoringDefault(self):
    """Test the default scoring, from the Mikolov word2vec paper."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1)
    seen_scores = set()
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))
    assert seen_scores == {
        5.167,  # score for graph minors
        3.444,  # score for human interface
    }
def testSaveLoadCustomScorer(self):
    """Test saving and loading a Phrases object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        seen_scores = []
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.append(score)
        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 3  # 'graph minors', 'survey human' and 'interface system'
def get_ngram(n, sentence):
    """
    Function to get n-grams, to examine relationships between words in the news content.
    """
    if n == 1:
        return sentence
    # first pass: merge bigrams that occur at least once
    phraser = Phraser(Phrases(sentence, min_count=1, threshold=1))
    transformed = list(phraser[sentence])
    # each additional pass can merge one more token into an existing phrase,
    # so n-grams need (n - 2) extra rounds after the bigram pass
    for _ in range(3, n + 1):
        phraser = Phraser(Phrases(transformed, min_count=1, threshold=1))
        transformed = list(phraser[transformed])
    return transformed
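# Illustration of the pass-per-order behaviour of get_ngram above: the first Phraser pass
# merges bigrams, and each extra round can glue one more token onto an existing phrase.
# The repeated toy sentence is illustrative; actual merges depend on the computed scores.
corpus = [['new', 'york', 'stock', 'exchange']] * 5
print(get_ngram(3, corpus))  # may yield [['new_york_stock_exchange'], ...] once both passes fire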
def build_model():
    """Build doc2vec model from cases."""
    # get urls for cases
    urls = make_links()
    shuffle(urls)
    # async downloads
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(coordinate_downloads(urls))
    cases = [c for c in loop.run_until_complete(future) if len(c[1]) > 25]
    print("retrieved {} usable cases".format(len(cases)))
    lls = []
    for label, case in cases:
        lls.append(LabeledSentence(words=case.split(), tags=[label]))  # tags must be a list
    model = Doc2Vec(size=300, window=10, min_count=5, workers=6, alpha=0.025, min_alpha=0.025)
    model.build_vocab(lls)
    for epoch in range(10):
        model.train(lls)
    print("trained")
    for dv in model.docvecs:
        print(dv)
    input()
    print(model.most_similar("court"))
    # make sentences
    print("preprocessing text...")
    sentences = []
    for c in cases:
        sentences.extend(sentence_maker.split_into_sentences(c[1], lower=True))
    print("found {} sentences".format(len(sentences)))
    # phrase pre-processing
    print("building phrases...")
    phrases = Phrases(sentences, min_count=5, threshold=100)
    bigramphraser = Phraser(phrases)
    # produce a representation of the text including 2 and 3 word phrases
    trg_phrases = Phrases(bigramphraser[sentences], min_count=5, threshold=100)
    trigram_phraser = Phraser(trg_phrases)
    phrased_sentences = list(trigram_phraser[list(bigramphraser[sentences])])
    print("building Word2Vec model...")
    return Word2Vec(phrased_sentences, min_count=10, workers=6)
def testExportPhrases(self):
    """Test Phrases bigram export phrases."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ')
    seen_bigrams = set(bigram.find_phrases(self.sentences).keys())
    assert seen_bigrams == set([
        'human interface',
        'graph of trees',
        'data and graph',
        'lack of interest',
    ])
def testCustomScorer(self):
    """Test using a custom scoring function."""
    bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    seen_scores = list(bigram.find_phrases(test_sentences).values())
    assert all(score == 1 for score in seen_scores)
    assert len(seen_scores) == 3  # 'graph minors', 'survey human' and 'interface system'
def testScoringDefault(self):
    """Test the default scoring, from the Mikolov word2vec paper."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1, delimiter=' ')
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
    seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())
    assert seen_scores == {
        5.167,  # score for graph minors
        3.444,  # score for human interface
    }
def testScoringNpmi(self):
    """Test normalized pointwise mutual information scoring."""
    bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
    seen_scores = set(round(score, 3) for score in bigram.find_phrases(test_sentences).values())
    assert seen_scores == {
        .882,  # score for graph minors
        .714,  # score for human interface
    }
def testScoringNpmi(self):
    """Test normalized pointwise mutual information scoring."""
    bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi', common_terms=self.common_terms)
    seen_scores = set()
    test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))
    assert seen_scores == set([
        .74,   # score for data and graph
        .894,  # score for human interface
    ])
def testExportPhrases(self):
    """Test Phrases bigram export_phrases functionality."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
    seen_bigrams = set()
    for phrase, score in bigram.export_phrases(self.sentences):
        seen_bigrams.add(phrase)
    assert seen_bigrams == set([
        b'human interface',
        b'graph of trees',
        b'data and graph',
        b'lack of interest',
    ])
def setUp(self):
    self.bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
    self.bigram_default = Phrases(self.sentences, common_terms=self.common_terms)
    self.bigram_utf8 = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)
    self.bigram_unicode = Phrases(self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)
def build_ngrams(df, min_count=5, threshold=2):
    """Build bigram and ngram columns from the `clean` token column."""
    print("Building Bigrams")
    phrases = Phrases(tqdm(df.clean), min_count=min_count, threshold=threshold)
    bigrams = Phraser(phrases)  # Phrases -> Phraser: lighter/faster object, but can't be updated
    df['bigrams'] = df.clean.progress_apply(lambda r: bigrams[r])
    print("Building Ngrams")
    phrases_2 = Phrases(tqdm(df.bigrams), min_count=min_count, threshold=threshold)
    ngrams = Phraser(phrases_2)
    # apply to the bigrams column, which is what the second model was trained on
    df['ngrams'] = df.bigrams.progress_apply(lambda r: ngrams[r])
def build_vocab():
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_comments.txt')
    if not os.path.isdir(config.PROCESSED_PATH):
        try:
            os.mkdir(config.PROCESSED_PATH)
        except OSError:
            pass
    vocab = {}
    train_df = read_file(train_path)
    test_df = read_file(test_path)
    print('tokenizing vocab file')
    texts = np.concatenate([
        train_df.comment_text.fillna('N/A').values,
        test_df.comment_text.fillna('N/A').values,
    ])
    with open(normalized_text_path, 'w') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')
    gc.collect()
    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)
    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')
    comments = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(comments)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    bigram_dict.add_documents([['<pad>']])
    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        f.write('VOCAB_SIZE = {}'.format(len(bigram_dict)))
    print('time passed: {} minutes'.format((time.time() - start) / 60))
def testCompatibility(self):
    phrases = Phrases.load(datapath("phrases-3.6.0.model"))
    phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
    test_sentences = ['trees', 'graph', 'minors']
    self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
    self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])
def testCustomScorer(self):
    """Test using a custom scoring function."""
    bigram = Phrases(
        self.sentences, min_count=1, threshold=.001,
        scoring=dumb_scorer, connector_words=self.connector_words,
    )
    test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    seen_scores = list(bigram.find_phrases(test_sentences).values())
    assert all(seen_scores)  # all scores 1
    assert len(seen_scores) == 2  # 'data and graph', 'survey for human'
def testMultipleBigramsSingleEntry(self):
    """Test a single entry produces multiple bigrams."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=self.connector_words, delimiter=' ')
    test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
    seen_bigrams = set(bigram.find_phrases(test_sentences).keys())
    assert seen_bigrams == set([
        'data and graph',
        'human interface',
    ])
def learn_word_embeddings(corpus_fpath, vectors_fpath, cbow, window, iter_num, size, threads, min_count, detect_phrases=True):
    tic = time()
    sentences = GzippedCorpusStreamer(corpus_fpath)
    if detect_phrases:
        print("Extracting phrases from the corpus:", corpus_fpath)
        phrases = Phrases(sentences)
        bigram = Phraser(phrases)
        input_sentences = list(bigram[sentences])
        print("Time, sec.:", time() - tic)
    else:
        input_sentences = sentences
    print("Training word vectors:", corpus_fpath)
    model = Word2Vec(input_sentences,
                     min_count=min_count, size=size, window=window,
                     max_vocab_size=None, workers=threads,
                     sg=(1 if cbow == 0 else 0), iter=iter_num)
    model.wv.save_word2vec_format(vectors_fpath, binary=False)
    print("Vectors:", vectors_fpath)
    print("Time, sec.:", time() - tic)
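# Example invocation, assuming a gzipped one-sentence-per-line corpus readable by
# GzippedCorpusStreamer; both paths are illustrative.
learn_word_embeddings('corpus.txt.gz', 'vectors.w2v',
                      cbow=1, window=5, iter_num=5, size=100, threads=4, min_count=5)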
def __init__(self, train_data, dev_data, test_data):
    self.train_data = train_data
    self.dev_data = dev_data
    self.test_data = test_data

    # Hyper-parameters
    self.learningRate = 0.01
    self.trainSize = 2000
    self.testSize = 1000
    self.totalSize = self.trainSize + self.testSize
    self.maxEpochs = 10000
    self.num_processed = -1

    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')
def testSaveLoadNoCommonTerms(self):
    """Ensure backwards compatibility with old versions of Phrases, before common_terms."""
    bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
    self.assertEqual(bigram_loaded.common_terms, frozenset())
    # can make a phraser, cf #1751
    phraser = Phraser(bigram_loaded)  # does not raise
    phraser[["human", "interface", "survey"]]  # does not raise
def testSaveLoad(self):
    """Test saving and loading a Phrases object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))
        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444,  # score for human interface
        ])
def __init__(self, sentences, filename=None):
    # model parameters
    self.sentences = sentences
    self.dataset = "CASEREPORT"
    self.tokenizer = "RAW"
    self.prune_stopwords = stopwords("pubmed")
    self.phrases = None
    self.threshold = 250
    self.decay = 2
    self.bigram_iter = 3

    # data file path
    models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
    if filename is None:
        filename = "PHRASE_%s_%s_%s_%s" % (self.threshold, self.decay, self.dataset, self.tokenizer)
    self.filepath = os.path.join(models_folder, filename)

    # does an identical model already exist?
    model_exists = os.path.isfile(self.filepath)
    if model_exists:
        logging.info("LOADING - loading phrase data..")
        self.phrases = Phrases.load(self.filepath)
    else:
        logging.info("CREATE - creating phrase data..")
        self.build()
def testExportPhrases(self):
    """Test Phrases bigram export_phrases functionality."""
    bigram = Phrases(sentences, min_count=1, threshold=1)
    # with this setting we should get response_time and graph_minors
    bigram1_seen = False
    bigram2_seen = False
    for phrase, score in bigram.export_phrases(sentences):
        if not bigram1_seen and b'response time' == phrase:
            bigram1_seen = True
        elif not bigram2_seen and b'graph minors' == phrase:
            bigram2_seen = True
        if bigram1_seen and bigram2_seen:
            break
    self.assertTrue(bigram1_seen)
    self.assertTrue(bigram2_seen)
def testSaveLoadCustomScorer(self):
    """Test saving and loading a Phrases object with a custom scorer."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
        bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
        seen_scores = []
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.append(score)
        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 3  # 'graph minors', 'survey human' and 'interface system'
    finally:
        if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
def testSaveLoad(self):
    """Test saving and loading a Phrases object."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
        bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))
        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444,  # score for human interface
        ])
    finally:
        if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoad_temp_save.pkl")
def testCompatibility(self):
    phr = Phraser.load(datapath("phraser-3.6.0.model"))
    model = Phrases.load(datapath("phrases-3.6.0.model"))
    test_sentences = ['trees', 'graph', 'minors']
    expected_res = ['trees', 'graph_minors']
    phr_out = phr[test_sentences]
    model_out = model[test_sentences]
    self.assertEqual(phr_out, expected_res)
    self.assertEqual(model_out, expected_res)
def testSaveLoadStringScoring(self):
    """Test saving and loading a Phrases object with a string scoring parameter.
    This should ensure backwards compatibility with the previous version of Phrases."""
    bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
    seen_scores = set()
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    for phrase, score in bigram_loaded.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))
    assert seen_scores == set([
        5.167,  # score for graph minors
        3.444,  # score for human interface
    ])
def testSaveLoadNoScoring(self):
    """Test saving and loading a Phrases object with no scoring parameter.
    This should ensure backwards compatibility with old versions of Phrases."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        del bigram.scoring
        bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
        bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))
        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444,  # score for human interface
        ])
    finally:
        if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
def __init__(self):
    '''
    Training parameters:
    '''
    self.w2v_dim = 100
    self.num_feature = 400
    self.batch_size = 16
    self.num_epoch = 1
    # self.w2v_model = Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    print('Build model...')
    param_dist = {
        "n_estimators": sp_randint(20, 250),
        "criterion": ["gini", "entropy"],
        "max_depth": sp_randint(10, 300),
        "min_samples_split": sp_randint(1, 30),
        "min_samples_leaf": sp_randint(1, 30),
        "max_features": sp_randint(1, 200),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    # build a classifier
    clf = RandomForestClassifier(n_jobs=8)
    # run randomized search
    self.model = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=10, cv=9, n_jobs=8)
    print('Model has been built!')
def update(self, new_corpus, count, wrkers, sze, wndow):
    sentences = Corpus_Sentence_Extractor(new_corpus)
    bigram = Phrases.load(self.models + 'bigram_model')
    trigram = Phrases.load(self.models + 'trigram_model')
    bigram.add_vocab(sentences)
    trigram.add_vocab(bigram[sentences])
    self.train(sentences, trigram, self.word2vec, count, wrkers, sze, wndow)
def build_bigram_model(self, sentences, count):
    print("In Bigram Model")
    bigram = Phrases(sentences, min_count=count)
    dest = self.models + 'bigram_model'
    bigram.save(dest)
    return bigram
def __init__(self):
    self.session = tf.Session()
    '''
    Training parameters:
    '''
    self.w2v_dim = 30
    self.num_feature = 400
    self.batch_size = 32
    self.num_epoch = 10000
    self.num_hidden_1 = 50
    self.num_hidden_2 = 3
    self.number_of_layers = 1
    self.max_len_title = 6
    self.max_len_body = 38
    self.w2v_model = Word2Vec.load('data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    # Model inputs: title and body sequences for both items of the pair
    self.input_0 = tf.placeholder(tf.float32, [self.max_len_title, self.batch_size, self.w2v_dim])
    self.input_1 = tf.placeholder(tf.float32, [self.max_len_title, self.batch_size, self.w2v_dim])
    self.input_0_ = tf.placeholder(tf.float32, [self.max_len_body, self.batch_size, self.w2v_dim])
    self.input_1_ = tf.placeholder(tf.float32, [self.max_len_body, self.batch_size, self.w2v_dim])
    self.dropout_input = tf.placeholder(tf.float32)
    self.dropout_hidden = tf.placeholder(tf.float32)
    self.target = tf.placeholder(tf.float32, [self.batch_size, 3])

    input_0 = array_ops.unpack(self.input_0)
    input_1 = array_ops.unpack(self.input_1)
    input_0_ = array_ops.unpack(self.input_0_)
    input_1_ = array_ops.unpack(self.input_1_)

    def _rnn(inputs, reverse=False):
        # shared GRU encoder producing a fixed-size state for a sequence
        with tf.variable_scope("GRU_RNN") as scope:
            cell = rnn_cell.GRUCell(self.w2v_dim)
            cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=self.dropout_input)
            stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers)
            state = stacked_cell.zero_state(self.batch_size, tf.float32)
            if reverse:
                inputs = reversed(inputs)
            for time, input_ in enumerate(inputs):
                if time > 0:
                    scope.reuse_variables()
                output, state = stacked_cell(input_, state)
            return state

    with tf.variable_scope('Feature_Generator') as scope:
        state_0 = _rnn(input_0)
        scope.reuse_variables()
        state_1 = _rnn(input_1)
        state_0_ = _rnn(input_0_)
        state_1_ = _rnn(input_1_)

    # 2-layer NN over similarity features of the encoded title/body pairs
    with tf.variable_scope("NN", initializer=tf.random_uniform_initializer(-1.0, 1.0)):
        self.W_mul = tf.get_variable("W_mul", [state_0_.get_shape()[1] * 2, self.num_hidden_1])
        self.W_sub = tf.get_variable("W_sub", [state_0_.get_shape()[1] * 2, self.num_hidden_1])
        self.b = tf.get_variable("b", [self.num_hidden_1])
        self.W_softmax = tf.get_variable("W_softmax", [self.num_hidden_1, self.num_hidden_2])
        self.b_softmax = tf.get_variable("b_softmax", [self.num_hidden_2])

    h_mul = tf.concat(1, [tf.mul(state_0, state_1), tf.mul(state_0_, state_1_)])
    h_sub = tf.concat(1, [tf.abs(tf.sub(state_0, state_1)), tf.abs(tf.sub(state_0_, state_1_))])
    y_1 = tf.nn.sigmoid(tf.matmul(h_mul, self.W_mul) + tf.matmul(h_sub, self.W_sub) + self.b)
    y_2 = tf.matmul(y_1, self.W_softmax) + self.b_softmax

    self.y_pred = tf.nn.softmax(y_2)
    self.cross_entropy = -tf.reduce_mean(self.target * tf.log(self.y_pred))

    # Optimizer with gradient clipping
    global_step = tf.Variable(0)
    optimizer = tf.train.AdagradOptimizer(0.1)
    gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
    gradients, _ = tf.clip_by_global_norm(gradients, 10)
    self.optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    print('Model has been built!')
def __init__(self):
    self.session = tf.Session()
    '''
    Training parameters:
    '''
    self.w2v_dim = 10
    self.num_feature = 400
    self.batch_size = 32
    self.num_epoch = 10000
    self.num_hidden_1 = 100
    self.num_hidden_2 = 50
    self.num_hidden_3 = 3
    self.number_of_layers = 1
    self.max_len_title = 13
    self.max_len_body = 50
    self.w2v_model = Word2Vec.load('data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    # Model inputs
    self.input_0 = tf.placeholder(tf.float32, [self.max_len_title, self.batch_size, self.w2v_dim])
    self.input_1 = tf.placeholder(tf.float32, [self.max_len_title, self.batch_size, self.w2v_dim])
    self.input_0_ = tf.placeholder(tf.float32, [self.max_len_body, self.batch_size, self.w2v_dim])
    self.input_1_ = tf.placeholder(tf.float32, [self.max_len_body, self.batch_size, self.w2v_dim])
    self.dropout_input = tf.placeholder(tf.float32)
    self.dropout_hidden_1 = tf.placeholder(tf.float32)
    self.target = tf.placeholder(tf.float32, [self.batch_size, 3])

    input_0 = array_ops.unpack(self.input_0)
    input_1 = array_ops.unpack(self.input_1)
    input_0_ = array_ops.unpack(self.input_0_)
    input_1_ = array_ops.unpack(self.input_1_)

    def _encoder(inputs, reverse=False):
        # LSTM encoder returning the final state of the sequence
        with tf.variable_scope("GRU_RNN") as scope:
            cell = rnn_cell.BasicLSTMCell(self.w2v_dim)
            stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers)
            state = stacked_cell.zero_state(self.batch_size, tf.float32)
            if reverse:
                inputs = reversed(inputs)
            for time, input_ in enumerate(inputs):
                if time > 0:
                    scope.reuse_variables()
                output, state = stacked_cell(input_, state)
            return state

    def _decoder(state, inputs):
        # LSTM decoder initialized with the (doubled) encoder state
        with tf.variable_scope("GRU_RNN") as scope:
            cell = rnn_cell.BasicLSTMCell(self.w2v_dim)
            stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers * 2)
            for time, input_ in enumerate(inputs):
                if time > 0:
                    scope.reuse_variables()
                output, state = stacked_cell(input_, state)
            return output

    with tf.variable_scope('Encoder') as scope:
        state = _encoder(input_0_)
        scope.reuse_variables()
        state_reversed = _encoder(input_0_, reverse=True)
    with tf.variable_scope('Decoder') as scope:
        state = _decoder(tf.concat(1, [state, state_reversed]), input_1_)

    with tf.variable_scope("to_score", initializer=tf.random_uniform_initializer()):
        self.W = tf.get_variable("W", [state.get_shape()[1], 3])
        self.b = tf.get_variable("b", [3])
        score = tf.matmul(state, self.W) + self.b

    self.y_pred = tf.nn.softmax(score)
    self.cross_entropy = -tf.reduce_mean(self.target * tf.log(self.y_pred))

    # Optimizer with gradient clipping
    global_step = tf.Variable(0)
    optimizer = tf.train.GradientDescentOptimizer(0.1)
    gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
    gradients, _ = tf.clip_by_global_norm(gradients, 20)
    self.optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    print('Model has been built!')