def __init__(self):
    self.stopwords = stopwords.words('english')
    # Lemmatizer
    self.lmtzr = WordNetLemmatizer()
    # Stemmer
    self.stemmer = PorterStemmer()
    self.word2vec_model = None
    self.words = re.compile(r"\w+", re.I)
    try:
        self.bigrams = Phrases.load('slm/app/cached_models/bigrams.gensim')
    except Exception:
        self.bigrams = None
    try:
        self.trigrams = Phrases.load('slm/app/cached_models/trigrams.gensim')
    except Exception:
        self.trigrams = None
    try:
        self.dictionary = corpora.Dictionary.load('slm/app/cached_models/dictionary.dict')
    except Exception:
        self.dictionary = None
    try:
        self.tfidf = TfidfModel.load('slm/app/cached_models/tfidf.gensim')
    except Exception:
        self.tfidf = None
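# Sketch (not from the original source): one way the cached artifacts loaded above could be
# produced and saved. `tokenized_docs` and the min_count values are assumptions for illustration.
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models.phrases import Phrases

tokenized_docs = [["machine", "learning", "is", "fun"],
                  ["deep", "learning", "is", "machine", "learning"]]

bigrams = Phrases(tokenized_docs, min_count=1)             # learn bigram collocations
trigrams = Phrases(bigrams[tokenized_docs], min_count=1)   # learn trigrams on top of the bigrams
dictionary = corpora.Dictionary(trigrams[bigrams[tokenized_docs]])
tfidf = TfidfModel(dictionary=dictionary)

bigrams.save('slm/app/cached_models/bigrams.gensim')
trigrams.save('slm/app/cached_models/trigrams.gensim')
dictionary.save('slm/app/cached_models/dictionary.dict')
tfidf.save('slm/app/cached_models/tfidf.gensim')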
def __init__(self):
    '''
    Training parameters:
    '''
    self.w2v_dim = 100
    self.num_feature = 400
    self.batch_size = 16
    self.num_epoch = 30
    # self.w2v_model = Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)
    # self.bigram = None
    # self.trigram = None
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    print('Build model...')
    self.model = Sequential()
    self.model.add(Dropout(0.2, input_shape=(self.num_feature,)))
    self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
    self.model.add(Activation('softmax'))
    self.model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode="categorical")
    print('Model has been built!')
def getTrigramList(g_DataQueue, g_FinishRead, savePath, bigramPath, trigramPath):
    """
    :param g_DataQueue: queue holding batches of sentences read from the database
    :param g_FinishRead: flag signalling that reading has finished
    :param savePath: path to save the vocabulary dictionary
    :param bigramPath: path of the trained bigram model
    :param trigramPath: path of the trained trigram model
    :return:
    """
    count = 0
    vocabulary_dic = {}
    bigram = Phraser(Phrases.load(bigramPath))
    trigram = Phraser(Phrases.load(trigramPath))
    while g_FinishRead.value == 0 or (not g_DataQueue.empty()):
        words = g_DataQueue.get()
        count += len(words)
        print("have processed sentences:", count)
        # extract phrases
        trigram_list = trigram[bigram[words]]
        del words
        gc.collect()
        # add phrases to the vocabulary dictionary
        for phrase_list in trigram_list:
            for phrase in phrase_list:
                if phrase not in vocabulary_dic:
                    vocabulary_dic[phrase] = 0
                vocabulary_dic[phrase] += 1
    # persist to disk
    fw = codecs.open(savePath, "w", encoding="utf-8")
    fw.write(json.dumps(vocabulary_dic))
    fw.close()
    del vocabulary_dic
    gc.collect()
def train_w2v_model() -> (Phraser, Word2Vec):
    # Build Word2Vec model
    if not Path(model_file).exists():
        sent = [row.split() for row in df['clean_lyrics'] if row]
        # Build collocations
        if not Path(bigrams_file).exists():
            bigram_phrases = Phrases(sent, min_count=30, progress_per=10000,
                                     max_vocab_size=200000, common_terms=sentiment_terms)
            bigram = Phraser(bigram_phrases)
            bigram.save(bigrams_file)
            trigram_phrases = Phrases(bigram[sent], min_count=30, progress_per=10000,
                                      max_vocab_size=200000, common_terms=sentiment_terms)
            trigram = Phraser(trigram_phrases)
            trigram.save(trigrams_file)
        trigram = Phrases.load(trigrams_file)
        sentences = trigram[sent]

        cores = multiprocessing.cpu_count()
        w2v_model = Word2Vec(
            min_count=20,  # Remove rare words
            window=2,
            size=300,
            sample=6e-5,
            alpha=0.03,
            min_alpha=0.0007,
            negative=20,
            workers=cores - 1)

        t = time()
        w2v_model.build_vocab(sentences, progress_per=10000)
        print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
        w2v_model.vocabulary.save(vocabulary_file)

        t = time()
        w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
        print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
        w2v_model.save(model_file)

    trigram = Phrases.load(trigrams_file)
    w2v_model = Word2Vec.load(model_file)
    return trigram, w2v_model
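# Hypothetical usage of train_w2v_model() above (a sketch, not from the original source):
# the returned phrase model is applied to a tokenized lyric before looking tokens up in word2vec.
trigram, w2v_model = train_w2v_model()
tokens = "walking down the lonely road tonight".split()
phrased = trigram[tokens]                                   # collocations may be merged, e.g. 'lonely_road'
vectors = [w2v_model.wv[tok] for tok in phrased if tok in w2v_model.wv]
print('found vectors for', len(vectors), 'of', len(phrased), 'tokens')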
def test_build_phrase_models_real(self, doc_content_stream):
    from eea.corpus.processing.phrases.phrases import build_phrase_models
    from eea.corpus.utils import rand
    from gensim.models.phrases import Phrases
    from itertools import tee, chain
    import os.path
    import tempfile

    content_A, content_B, test_A = tee(doc_content_stream, 3)

    # proof that the simple_content_stream can be used for phrases
    # ph_model = Phrases(content_A)
    # phrases = list(ph_model.export_phrases(sents))
    # assert phrases[0][0].decode('utf-8') == 'freshwater resources'

    base_dir = tempfile.gettempdir()

    b_name = rand(10)
    base_path = os.path.join(base_dir, b_name)
    build_phrase_models(content_A, base_path, {'level': 2})
    assert b_name + '.2' in os.listdir(base_dir)
    assert not (b_name + '.3' in os.listdir(base_dir))
    os.remove(base_path + '.2')

    t_name = rand(10)
    base_path = os.path.join(base_dir, t_name)
    build_phrase_models(content_B, base_path, {'level': 3})
    assert t_name + '.2' in os.listdir(base_dir)
    assert t_name + '.3' in os.listdir(base_dir)

    pm2 = Phrases.load(base_path + '.2')
    pm3 = Phrases.load(base_path + '.3')
    os.remove(base_path + '.2')
    os.remove(base_path + '.3')

    # an iterator of sentences, each a list of words
    test_A = chain.from_iterable(doc.tokenized_text for doc in test_A)

    trigrams = pm3[pm2[test_A]]
    words = chain.from_iterable(trigrams)
    w2, w3 = tee(words, 2)

    bigrams = [w for w in w2 if w.count('_') == 1]
    assert len(bigrams) == 27622
    assert len(set(bigrams)) == 2060

    trigrams = [w for w in w3 if w.count('_') == 2]
    assert len(trigrams) == 11268
    assert len(set(trigrams)) == 706

    assert 'freshwater_resources' in bigrams
    assert 'water_stress_conditions' in trigrams
def __init__(self):
    reader = Reader()
    print('loading data')
    self.X_train = reader.getData(TRAIN)
    print('train data has been loaded!')
    self.X_valid = reader.getData(DEV)
    print('valid data has been loaded!')
    self.X_test = reader.getData(TEST)
    print('test data has been loaded!')
    self.c_title = []
    self.c_body = []
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')
def __init__(self, sentences, filename=None):
    # model parameters
    self.sentences = sentences
    self.dataset = "CASEREPORT"
    self.tokenizer = "RAW"
    self.prune_stopwords = stopwords("pubmed")
    self.phrases = None
    self.threshold = 250
    self.decay = 2
    self.bigram_iter = 3

    # data file path
    models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
    if filename is None:
        filename = "PHRASE_%s_%s_%s_%s" % (self.threshold, self.decay, self.dataset, self.tokenizer)
    self.filepath = os.path.join(models_folder, filename)

    # does an identical model already exist?
    model_exists = os.path.isfile(self.filepath)
    if model_exists:
        logging.info("LOADING - loading phrase data..")
        self.phrases = Phrases.load(self.filepath)
    else:
        logging.info("CREATE - creating phrase data..")
        self.build()
def __init__(self, train_data, dev_data, test_data):
    self.train_data = train_data
    self.dev_data = dev_data
    self.test_data = test_data

    # Hyper-parameters
    self.learningRate = 0.01
    self.trainSize = 2000
    self.testSize = 1000
    self.totalSize = self.trainSize + self.testSize
    self.maxEpochs = 10000
    self.num_processed = -1

    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')
def testSaveLoadCustomScorer(self):
    """Saving and loading a Phrases object with a custom scorer."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
        bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
        seen_scores = []
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
    finally:
        if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
def testCompatibilty(self):
    phrases = Phrases.load(datapath("phrases-3.6.0.model"))
    phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
    test_sentences = ['trees', 'graph', 'minors']

    self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
    self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])
def testSaveLoadNoScoring(self):
    """Saving and loading a Phrases object with no scoring parameter.
    This should ensure backwards compatibility with old versions of Phrases."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        del bigram.scoring
        bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
        bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444   # score for human interface
        ])
    finally:
        if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
def testSaveLoadNoCommonTerms(self):
    """Ensure backwards compatibility with old versions of Phrases, before common_terms."""
    bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
    self.assertEqual(bigram_loaded.common_terms, frozenset())
    # can make a phraser, cf #1751
    phraser = Phraser(bigram_loaded)  # does not raise
    phraser[["human", "interface", "survey"]]  # does not raise
def testSaveLoadNoCommonTerms(self):
    """Ensure backwards compatibility with old versions of Phrases, before connector_words."""
    bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
    self.assertEqual(bigram_loaded.connector_words, frozenset())
    # can make a phraser, cf #1751
    phraser = FrozenPhrases(bigram_loaded)  # does not raise
    phraser[["human", "interface", "survey"]]  # does not raise
def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """
    :param g_DataQueue: global queue holding the records read from the database
    :param g_FinishRead: flag signalling that the database has been fully read
    :param savePath: path where the phrase model is saved
    :param priorPhrasePath: path of the previously trained phrase model
    :return:
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if priorPhrasePath is None:
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while g_FinishRead.value == 0 or (not g_DataQueue.empty()):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)
        words = []
        reSub0 = re.compile(
            "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]")  # URL
        reSub1 = re.compile("[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")  # replace with " "
        reSub2 = re.compile("'[.?;!]")  # replace with "." (mainly handles possessives and other cases around single quotes)
        reSplit1 = re.compile("\.[^a-z0-9]|[?!;]")
        # extract words, sentence by sentence
        for t in data:
            if t[0] is not None:
                st = re.sub(reSub0, " ", t[0].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if len(sen_word) > 6:
                        words.append(sen_word)
            if t[1] is not None:
                st = re.sub(reSub0, " ", t[1].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if len(sen_word) > 6:
                        words.append(sen_word)
        del data
        gc.collect()
        # train the phrase model
        if priorPhraser is None:  # first pass
            phrase.add_vocab(words)
        else:  # a prior model exists, so look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        # print(len(phrase.vocab))
        gc.collect()
    phrase.save(savePath)
def load_phraser_models(models_dir, bigram_model_name, trigram_model_name):
    bigram_model = None
    trigram_model = None
    # check models dir
    if not os.path.isdir(models_dir):
        return bigram_model, trigram_model
    # check bigram model
    elif not os.path.exists(os.path.join(models_dir, bigram_model_name)):
        return bigram_model, trigram_model
    else:
        bigram_model = Phrases.load(os.path.join(models_dir, bigram_model_name))
    # check trigram model
    if os.path.exists(os.path.join(models_dir, trigram_model_name)):
        trigram_model = Phrases.load(os.path.join(models_dir, trigram_model_name))
    return bigram_model, trigram_model
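# Hypothetical usage of load_phraser_models() above (sketch; directory and file names are assumptions):
bigram_model, trigram_model = load_phraser_models('models', 'bigram.model', 'trigram.model')
sentence = ['new', 'york', 'stock', 'exchange']
if bigram_model is not None:
    sentence = bigram_model[sentence]    # e.g. ['new_york', 'stock', 'exchange']
if trigram_model is not None:
    sentence = trigram_model[sentence]   # e.g. ['new_york_stock_exchange']
print(sentence)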
def use_phrase_models(content, files, settings):
    for doc in content:
        text = doc.tokenized_text
        for fpath in files:
            phrases = Phrases.load(fpath)
            text = phrases[text]

        text = ". ".join([" ".join(sent) for sent in text])
        yield set_text(doc, text)
def test_save_load_with_connector_words(self):
    """Test saving and loading a Phrases object."""
    connector_words = frozenset({'of'})
    bigram = Phrases(self.sentences, min_count=1, threshold=1, connector_words=connector_words)
    with temporary_file("test.pkl") as fpath:
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        assert bigram_loaded.connector_words == connector_words
def testCompatibilty(self):
    phr = Phraser.load(datapath("phraser-3.6.0.model"))
    model = Phrases.load(datapath("phrases-3.6.0.model"))
    test_sentences = ['trees', 'graph', 'minors']
    expected_res = ['trees', 'graph_minors']

    phr_out = phr[test_sentences]
    model_out = model[test_sentences]

    self.assertEqual(phr_out, expected_res)
    self.assertEqual(model_out, expected_res)
def load_phrase_models(indir, n):
    """
    Load the 2-gram through n-gram phrase models stored in `indir`.

    :param indir: directory containing the saved phrase models
    :param n: highest n-gram order to load
    :return: list of loaded Phrases models, in ascending order
    """
    models = []
    for i in range(2, n + 1):
        infile = "%s%sgram.phrase.model" % (indir, i)
        models += [Phrases.load(infile)]
    return models
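# Hypothetical usage of load_phrase_models() above (a sketch): the models are applied in
# ascending order so that 2-grams are merged before 3-grams, and so on.
models = load_phrase_models('./models/', 3)
sentence = ['european', 'central', 'bank', 'interest', 'rates']
for model in models:
    sentence = model[sentence]
print(sentence)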
def __init__(self, search_pattern, window=5, lemma=False, document_ids=None, path_to_phrase_model=None):
    self.ids = document_ids
    self.window = window
    self.search_pattern = search_pattern
    self.lemma = lemma
    self.path_to_phrase_model = path_to_phrase_model
    if path_to_phrase_model is not None:
        self.phraser_model = Phraser(Phrases.load(path_to_phrase_model))
def testSaveLoadStringScoring(self):
    """Saving and loading a Phrases object with a string scoring parameter.
    This should ensure backwards compatibility with the previous version of Phrases."""
    bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
    seen_scores = set()
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    for phrase, score in bigram_loaded.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))

    assert seen_scores == set([
        5.167,  # score for graph minors
        3.444   # score for human interface
    ])
def __init__(self):
    '''
    Training parameters:
    '''
    self.w2v_dim = 100
    self.num_feature = 400
    self.batch_size = 16
    self.num_epoch = 1
    # self.w2v_model = Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    print('Build model...')
    param_dist = {
        "n_estimators": sp_randint(20, 250),
        "criterion": ["gini", "entropy"],
        "max_depth": sp_randint(10, 300),
        "min_samples_split": sp_randint(1, 30),
        "min_samples_leaf": sp_randint(1, 30),
        "max_features": sp_randint(1, 200),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }
    # build a classifier
    clf = RandomForestClassifier(n_jobs=8)
    # run randomized search
    self.model = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=10, cv=9, n_jobs=8)
    print('Model has been built!')
def testSaveLoadNoScoring(self):
    """Test backwards compatibility with old versions of Phrases with no scoring parameter."""
    bigram_loaded = Phrases.load(datapath("phrases-no-scoring.pkl"))
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())

    assert seen_scores == set([
        5.167,  # score for graph minors
        3.444   # score for human interface
    ])
def testSaveLoadCustomScorer(self):
    """Saving and loading a Phrases object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        seen_scores = []
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
def test_save_load_string_scoring(self):
    """Test backwards compatibility with a previous version of Phrases with custom scoring."""
    bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
    test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
    seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())

    assert seen_scores == set([
        5.167,  # score for graph minors
        3.444   # score for human interface
    ])
def trainPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    count = 0
    phrase = Phrases(None, min_count=15, threshold=10, max_vocab_size=40000000)
    if priorPhrasePath is None:
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while g_FinishRead.value == 0 or (not g_DataQueue.empty()):
        words = g_DataQueue.get()
        if priorPhraser is None:  # first pass
            phrase.add_vocab(words)
        else:  # a prior model exists, so look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        gc.collect()
    phrase.save(savePath)
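# The two-pass idea behind trainPhrase()/trainSOPhrase() above, shown without the queue machinery
# (a sketch with toy data and demo paths, not from the original source): a first Phrases model
# learns bigrams, and a second model trained on the bigram-merged stream can learn longer phrases.
from gensim.models.phrases import Phrases, Phraser

sentences = [['new', 'york', 'stock', 'exchange']] * 20
bigram = Phrases(sentences, min_count=1, threshold=0.01)
bigram.save('./data/bigram_demo.dat')

bigram_phraser = Phraser(Phrases.load('./data/bigram_demo.dat'))
trigram = Phrases(bigram_phraser[sentences], min_count=1, threshold=0.01)
trigram.save('./data/trigram_demo.dat')
print(trigram[bigram_phraser[['new', 'york', 'stock', 'exchange']]])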
def testSaveLoad(self):
    """Saving and loading a Phrases object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444   # score for human interface
        ])
def testSaveLoad(self):
    """Test saving and loading a Phrases object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = set(round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444   # score for human interface
        ])
def create_dictionary(texts, dest_file: str, build_bigram, working_directory=DIR):
    """
    Builds a gensim dictionary from the given texts and saves it to the dest_file path.

    :param texts: iterable of raw text documents (e.g. tweets).
    :param dest_file: path to save the dictionary to.
    :param build_bigram: 1 if building a new Phrases object is needed, else an already trained bigram model will be loaded.
    :param working_directory: path to the directory where the bigram model files should be saved.
    :return: the fitted dictionary.
    """
    # collect statistics about all tokens
    stoplist = stopwords.words('english')
    if build_bigram:
        bigram = Phrases([tweet.split() for tweet in texts])
        bigram.save(working_directory + '/bigram_model.phrase')
    else:
        bigram = Phrases.load(working_directory + '/bigram_model.phrase')
    phraser = Phraser(bigram)

    # Build dictionary
    dictionary = corpora.Dictionary(phraser[line.lower().split()] for line in texts)

    # remove stop words and words that appear only once
    stop_ids = [
        dictionary.token2id[stopword] for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1
    ]
    dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
    dictionary.filter_extremes(no_below=0.3, no_above=0.85)
    dictionary.compactify()  # remove gaps in id sequence after words that were removed
    dictionary.save(dest_file)
    print(dictionary)
    print(dictionary.token2id)
    return dictionary
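# Hypothetical follow-up to create_dictionary() above (a sketch; the toy texts are assumptions):
# the returned dictionary plus the same bigram phraser turn documents into bag-of-words vectors.
texts = ["great day at the beach with friends", "machine learning on short tweets"]
dictionary = create_dictionary(texts, 'tweets.dict', build_bigram=1)
phraser = Phraser(Phrases.load(DIR + '/bigram_model.phrase'))
corpus = [dictionary.doc2bow(phraser[text.lower().split()]) for text in texts]
print(corpus)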
def identify_phrases(sentence, path_to_gensim_phrase_model):
    """Identify multiword expressions with a trained phrase model.

    Parameters
    ----------
    sentence : {list}
        List with tokens as elements.
    path_to_gensim_phrase_model : {str}
        Absolute path to the model.

    Returns
    -------
    list
        List with tokens as elements.
    """
    phrase_model = Phrases.load(path_to_gensim_phrase_model)
    phraser_model = Phraser(phrase_model)
    new_sentence = phraser_model[sentence]
    return new_sentence
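# Hypothetical usage of identify_phrases() above (sketch; the model path is an assumption):
tokens = ['the', 'european', 'central', 'bank', 'raised', 'interest', 'rates']
merged = identify_phrases(tokens, '/models/phrases.model')
print(merged)  # tokens such as 'european_central_bank' appear merged only if the model learned them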
def testSaveLoadCustomScorer(self):
    """Test saving and loading a Phrases object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = list(bigram_loaded.find_phrases(test_sentences).values())

        assert all(score == 1 for score in seen_scores)
        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
def testSaveLoad(self):
    """Saving and loading a Phrases object."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
        bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444   # score for human interface
        ])
    finally:
        if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoad_temp_save.pkl")
def get_trigram_model(self, recalculate=False, from_scratch=True):
    if not os.path.isfile(self.paths.trigram_model_filepath) or recalculate:
        if not from_scratch:
            raise ValueError('No trigram model file exists but from_scratch is False')

        print('Building tri-gram model...')
        bigram_sentences = LineSentence(self.paths.bigram_sentences_filepath)
        trigram_model = Phrases(bigram_sentences)
        trigram_model = Phraser(trigram_model)

        print('Writing model...')
        trigram_model.save(self.paths.trigram_model_filepath)
    else:
        print('Loading tri-gram model...')
        trigram_model = Phrases.load(self.paths.trigram_model_filepath)
    print('Done!')
    return trigram_model
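# Hypothetical usage of get_trigram_model() above (a sketch): `pipeline` stands in for an instance
# of the owning class, and the input sentence is assumed to have gone through the bigram model already.
trigram_model = pipeline.get_trigram_model()
bigram_sentence = ['new_york', 'stock', 'exchange']
print(trigram_model[bigram_sentence])   # e.g. ['new_york_stock_exchange'] if that trigram was learned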
from reader import Reader, TRAIN, TEST, DEV, EXTRA
from preprocess import preprocess
from gensim.models.phrases import Phrases

reader = Reader()
sentences = reader.getText(TRAIN + EXTRA)

# use the phrase models only once they have already been trained
bigram = Phrases.load('./data/bigram.dat')
trigram = Phrases.load('./data/trigram.dat')

sen_set = set()
with open('./data/text_cleaned_phrase.txt', 'w') as f:
    for sentence in sentences:
        s = preprocess(sentence, bigram=bigram, trigram=trigram)
        if s not in sen_set:
            sen_set.add(s)
            f.write(s)
            f.write('\n')

'''
# for phrase training only
with open('./data/text_cleaned.txt', 'w') as f:
    for sentence in sentences:
        f.write(preprocess(sentence, no_stopwords=True))
        f.write('\n')
'''
def __init__(self):
    self.session = tf.Session()
    '''
    Training parameters:
    '''
    self.w2v_dim = 100
    self.num_feature = 400
    self.batch_size = 32
    self.num_epoch = 10000
    self.num_hidden_1 = 3
    self.num_hidden_2 = 3
    self.number_of_layers = 3
    # self.max_len = 50
    self.max_len_title = 6
    self.max_len_body = 38

    self.d2v_model = Doc2Vec.load('data/word2vec/d2v.model')
    # self.bigram = None
    # self.trigram = None
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    # Model
    self.input = tf.placeholder(tf.float32, [None, self.w2v_dim * 4])
    self.dropout_input = tf.placeholder(tf.float32)
    self.dropout_hidden = tf.placeholder(tf.float32)
    self.target = tf.placeholder(tf.float32, [None, 3])

    # 2-layer NN
    with tf.variable_scope("NN", initializer=tf.random_uniform_initializer()):
        W_1 = tf.get_variable("W_1", [self.w2v_dim * 4, self.num_hidden_1])
        b_1 = tf.get_variable("b_1", [self.num_hidden_1])
        # W_2 = tf.get_variable("W_2", [self.num_hidden_1, self.num_hidden_2])
        # b_2 = tf.get_variable("b_2", [self.num_hidden_2])

        # input = tf.nn.dropout(input, self.dropout_input)
        # y_1 = tf.sigmoid(tf.matmul(self.input, W_1) + b_1)
        # y_1 = tf.nn.dropout(y_1, self.dropout_hidden)
        # y_2 = tf.matmul(y_1, W_2) + b_2
        y_2 = tf.matmul(self.input, W_1) + b_1

    self.y_pred = tf.nn.softmax(y_2)
    self.y_pred = tf.clip_by_value(self.y_pred, 1e-7, 1.0)
    self.cross_entropy = -tf.reduce_mean(self.target * tf.log(self.y_pred))

    # Optimizer.
    global_step = tf.Variable(0)
    # optimizer = tf.train.GradientDescentOptimizer(0.1)
    # optimizer = tf.train.AdamOptimizer(0.01)
    # gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
    # gradients, _ = tf.clip_by_global_norm(gradients, 50)
    # self.optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    self.optimizer = tf.train.AdamOptimizer(0.01).minimize(self.cross_entropy)

    print('Model has been built!')
def __init__(self): self.session = tf.Session() ''' Training parameters: ''' self.w2v_dim=30 self.num_feature=400 self.batch_size=32 self.num_epoch=10000 self.num_hidden_1=50 self.num_hidden_2=3 self.number_of_layers=1 #self.max_len = 50 self.max_len_title=6 self.max_len_body=38 #self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True) self.w2v_model=Word2Vec.load('data/word2vec/w2v.model') self.index2word_set = set(self.w2v_model.index2word) #self.bigram = None #self.trigram =None self.bigram=Phrases.load('./data/bigram.dat') self.trigram=Phrases.load('./data/trigram.dat') # Model self.input_0=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim]) self.input_1=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim]) self.input_0_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim]) self.input_1_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim]) self.dropout_input = tf.placeholder(tf.float32) self.dropout_hidden = tf.placeholder(tf.float32) self.target = tf.placeholder(tf.float32, [self.batch_size, 3]) input_0=array_ops.unpack(self.input_0) input_1=array_ops.unpack(self.input_1) input_0_=array_ops.unpack(self.input_0_) input_1_=array_ops.unpack(self.input_1_) def _rnn(inputs, reverse=False): with tf.variable_scope("GRU_RNN") as scope: cell=rnn_cell.GRUCell(self.w2v_dim) cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=self.dropout_input) stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers) state = stacked_cell.zero_state(self.batch_size, tf.float32) if reverse: inputs=reversed(inputs) for time, input_ in enumerate(inputs): if time > 0: scope.reuse_variables() output, state = stacked_cell(input_, state) return state with tf.variable_scope('Feature_Generator') as scope: state_0 = _rnn(input_0) scope.reuse_variables() state_1 = _rnn(input_1) state_0_ = _rnn(input_0_) state_1_ = _rnn(input_1_) ''' with tf.variable_scope('Feature_Generator_body') as scope: state_0_ = _rnn(input_0_) scope.reuse_variables() state_1_ = _rnn(input_1_) ''' ''' with tf.variable_scope('Feature_Generator_body_reverse') as scope: state_0_reverse = _rnn(input_0_, reverse=True) scope.reuse_variables() state_1_reverse = _rnn(input_1_, reverse=True) ''' ''' with tf.variable_scope('Feature_Generator_title') as scope: state_0 = _rnn(input_0) scope.reuse_variables() state_1 = _rnn(input_1) with tf.variable_scope('Feature_Generator_body') as scope: state_0_ = _rnn(input_0_) scope.reuse_variables() state_1_ = _rnn(input_1_) # state=tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.mul(state_0,state_1), # tf.abs(tf.sub(state_0_,state_1_)),tf.mul(state_0_,state_1_)]) # state=tf.concat(1,[state_0,state_1, state_0_, state_1_]) # state = tf.ones([32,10]) # state=tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.mul(state_0,state_1)]) ''' # 2-layer NN with tf.variable_scope("NN", initializer=tf.random_uniform_initializer(-1.0,1.0)): self.W_mul = tf.get_variable("W_mul", [state_0_.get_shape()[1]*2,self.num_hidden_1]) self.W_sub = tf.get_variable("W_sub", [state_0_.get_shape()[1]*2,self.num_hidden_1]) self.b = tf.get_variable("b", [self.num_hidden_1]) self.W_softmax=tf.get_variable("W_softmax", [self.num_hidden_1,self.num_hidden_2]) self.b_softmax = tf.get_variable("b_softmax", [self.num_hidden_2]) # h_mul = tf.mul(state_0,state_1) # h_sub = tf.abs(tf.sub(state_0,state_1)) h_mul = tf.concat(1,[tf.mul(state_0,state_1),tf.mul(state_0_,state_1_)]) h_sub = 
tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.abs(tf.sub(state_0_,state_1_))]) y_1 = tf.nn.sigmoid(tf.matmul(h_mul, self.W_mul)+tf.matmul(h_sub, self.W_sub)+self.b) y_2 = tf.matmul(y_1, self.W_softmax)+self.b_softmax # regularizers = (tf.nn.l2_loss(self.W_1) + tf.nn.l2_loss(self.b_1)+tf.nn.l2_loss(self.W_2) + tf.nn.l2_loss(self.b_2)) ''' state_0_title_normalized = tf.nn.l2_normalize(state_0, 1) state_1_title_normalized = tf.nn.l2_normalize(state_1, 1) state_0_body_normalized = tf.nn.l2_normalize(state_0_, 1) state_1_body_normalized = tf.nn.l2_normalize(state_1_, 1) dist_title_ = tf.mul(state_0_title_normalized, state_1_title_normalized) dist_body_ = tf.mul(state_0_body_normalized, state_1_body_normalized)s dist_title=tf.reduce_sum(dist_title_, 1, keep_dims=True) dist_body=tf.reduce_sum(dist_body_, 1, keep_dims=True) feature = tf.concat(1, [dist_title,dist_body]) with tf.variable_scope("log_reg", initializer=tf.random_uniform_initializer()): self.W = tf.get_variable("W", [feature.get_shape()[1],3]) self.b = tf.get_variable("b", [3]) y_2 = tf.matmul(feature, self.W)+self.b ''' ''' with tf.variable_scope("log_reg", initializer=tf.random_uniform_initializer()): self.W_1 = tf.get_variable("W_1", [state.get_shape()[1],self.num_hidden_1]) self.b_1 = tf.get_variable("b_1", [self.num_hidden_1]) self.W_2 = tf.get_variable("W_2", [self.num_hidden_1,self.num_hidden_2]) self.b_2 = tf.get_variable("b_2", [self.num_hidden_2]) ''' ''' # Create model def multilayer_perceptron(_X, _weights, _biases): layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])) #Hidden layer with RELU activation layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2'])) #Hidden layer with RELU activation return tf.matmul(layer_2, _weights['out']) + _biases['out'] # Store layers weight & bias weights = { 'h1': tf.Variable(tf.random_normal([10, 10])), 'h2': tf.Variable(tf.random_normal([10, 5])), 'out': tf.Variable(tf.random_normal([5, 3])) } biases = { 'b1': tf.Variable(tf.random_normal([10])), 'b2': tf.Variable(tf.random_normal([5])), 'out': tf.Variable(tf.random_normal([3])) } # Construct model self.y_pred = multilayer_perceptron(state, weights, biases) # Define loss and optimizer self.cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self.y_pred, self.target)) # Softmax loss self.optimizer = tf.train.AdamOptimizer(learning_rate=0.1).minimize(self.cross_entropy) # Adam Optimizer ''' # self.W = tf.Variable(tf.zeros([10, 3])) # self.b = tf.Variable(tf.zeros([3])) # y_1 = tf.sigmoid(tf.matmul(state, self.W_1)+self.b_1) # y_2 = tf.sigmoid(tf.matmul(y_1, self.W_2)+self.b_2) # self.y_pred = tf.nn.softmax(tf.nn.sigmoid(tf.add(tf.matmul(state, self.W),self.b))) self.y_pred=tf.nn.softmax(y_2) # self.y_pred = tf.nn.softmax(tf.nn.sigmoid(tf.matmul(state, self.W_1)+self.b_1)) self.cross_entropy = -tf.reduce_mean(self.target*tf.log(self.y_pred)) # self.optimizer = tf.train.AdamOptimizer().minimize(self.cross_entropy) # self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.cross_entropy) # self.optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(self.cross_entropy) # self.gradstep = self.optimizer.compute_gradients(self.cross_entropy) # Optimizer. 
global_step = tf.Variable(0) # optimizer = tf.train.GradientDescentOptimizer(0.1) optimizer = tf.train.AdagradOptimizer(0.1) gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy)) gradients, _ = tf.clip_by_global_norm(gradients, 10) self.optimizer= optimizer.apply_gradients(zip(gradients, v), global_step=global_step) print('Model has been built!')
def __init__(self): self.session = tf.Session() ''' Training parameters: ''' self.w2v_dim=10 self.num_feature=400 self.batch_size=32 self.num_epoch=10000 self.num_hidden_1=100 self.num_hidden_2=50 self.num_hidden_3=3 self.number_of_layers=1 #self.max_len = 50 self.max_len_title=13 self.max_len_body=50 # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True) self.w2v_model=Word2Vec.load('data/word2vec/w2v.model') self.index2word_set = set(self.w2v_model.index2word) #self.bigram = None #self.trigram =None self.bigram=Phrases.load('./data/bigram.dat') self.trigram=Phrases.load('./data/trigram.dat') # Model self.input_0=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim]) self.input_1=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim]) self.input_0_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim]) self.input_1_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim]) self.dropout_input = tf.placeholder(tf.float32) self.dropout_hidden_1 = tf.placeholder(tf.float32) self.target = tf.placeholder(tf.float32, [self.batch_size, 3]) input_0=array_ops.unpack(self.input_0) input_1=array_ops.unpack(self.input_1) input_0_=array_ops.unpack(self.input_0_) input_1_=array_ops.unpack(self.input_1_) def _encoder(inputs, reverse=False): with tf.variable_scope("GRU_RNN") as scope: cell=rnn_cell.BasicLSTMCell(self.w2v_dim) stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers) # state = tf.zeros([1, cell.state_size]) state = stacked_cell.zero_state(self.batch_size, tf.float32) if reverse: inputs=reversed(inputs) for time, input_ in enumerate(inputs): if time > 0: scope.reuse_variables() output, state = stacked_cell(input_, state) return state def _decoder(state, inputs): with tf.variable_scope("GRU_RNN") as scope: cell=rnn_cell.BasicLSTMCell(self.w2v_dim) stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers*2) for time, input_ in enumerate(inputs): if time > 0: scope.reuse_variables() output, state = stacked_cell(input_, state) return output with tf.variable_scope('Encoder') as scope: state = _encoder(input_0_) scope.reuse_variables() state_reversed = _encoder(input_0_, reverse=True) with tf.variable_scope('Decoder') as scope: state = _decoder(tf.concat(1,[state,state_reversed]), input_1_) with tf.variable_scope("to_score", initializer=tf.random_uniform_initializer()): self.W = tf.get_variable("W", [state.get_shape()[1],3]) self.b = tf.get_variable("b", [3]) score = tf.matmul(state, self.W)+self.b # score_1 = tf.sigmoid(tf.matmul(out_1, self.W)+self.b) # state=tf.concat(1,[score_0,score_1]) ''' with tf.variable_scope("to_final", initializer=tf.random_uniform_initializer()): self.W = tf.get_variable("W", [state.get_shape()[1],3]) self.b = tf.get_variable("b", [3]) final = tf.matmul(state, self.W)+self.b ''' self.y_pred=tf.nn.softmax(score) self.cross_entropy = -tf.reduce_mean(self.target*tf.log(self.y_pred)) # Optimizer. global_step = tf.Variable(0) optimizer = tf.train.GradientDescentOptimizer(0.1) # optimizer = tf.train.AdamOptimizer(0.1) gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy)) gradients, _ = tf.clip_by_global_norm(gradients, 20) self.optimizer= optimizer.apply_gradients(zip(gradients, v), global_step=global_step) print('Model has been built!')