def setUp(self): """Set up Phraser models for the tests.""" bigram_phrases = Phrases(sentences, min_count=1, threshold=1) self.bigram = Phraser(bigram_phrases) bigram_default_phrases = Phrases(sentences) self.bigram_default = Phraser(bigram_default_phrases) bigram_utf8_phrases = Phrases(sentences, min_count=1, threshold=1) self.bigram_utf8 = Phraser(bigram_utf8_phrases) bigram_unicode_phrases = Phrases(unicode_sentences, min_count=1, threshold=1) self.bigram_unicode = Phraser(bigram_unicode_phrases)
def setUp(self):
    self.bigram = Phrases(self.sentences, min_count=1, threshold=1,
                          common_terms=self.common_terms)
    self.bigram_default = Phrases(self.sentences, common_terms=self.common_terms)
    self.bigram_utf8 = Phrases(self.sentences, min_count=1, threshold=1,
                               common_terms=self.common_terms)
    self.bigram_unicode = Phrases(self.unicode_sentences, min_count=1, threshold=1,
                                  common_terms=self.common_terms)

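# A minimal sketch of what `common_terms` does, assuming gensim < 4.0 (the
# parameter was renamed `connector_words` in 4.0). The toy corpus and the low
# threshold below are illustrative only: the default scorer scales with
# vocabulary size, which is tiny here.
from gensim.models.phrases import Phrases

toy_corpus = [["data", "and", "graph"]] * 3 + [["data", "survey"], ["graph", "survey"]]
toy_phrases = Phrases(toy_corpus, min_count=1, threshold=0.1, common_terms={"and"})
# "and" is treated as a connector, so the detected phrase spans it:
print(toy_phrases[["data", "and", "graph"]])  # ['data_and_graph']
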
def extract_phrases(app_files, bigram_min, trigram_min):
    bigram_fp = os.path.join("..", "model", "bigram.model")
    trigram_fp = os.path.join("..", "model", "trigram.model")
    rst = build_input(app_files)
    gen = list(itertools.chain.from_iterable(rst))  # flatten
    bigram = Phrases(gen, threshold=5, min_count=bigram_min)
    trigram = Phrases(bigram[gen], threshold=3, min_count=trigram_min)
    w2v_model = Word2Vec(trigram[bigram[gen]], min_count=1, size=200)
    # write
    bigram.save(bigram_fp)
    trigram.save(trigram_fp)
    return w2v_model

def build_ngrams(df, min_count=5, threshold=2):
    """
    This function builds bigrams and ngrams. Please don't modify, it may explode.
    """
    print("Building Bigrams")
    phrases = Phrases(tqdm(df.clean), min_count=min_count, threshold=threshold)
    # Phrases -> Phraser: lighter/faster object, but can't be updated
    bigrams = Phraser(phrases)
    df['bigrams'] = df.clean.progress_apply(lambda r: bigrams[r])

    print("Building Ngrams")
    phrases_2 = Phrases(tqdm(df.bigrams), min_count=min_count, threshold=threshold)
    ngrams = Phraser(phrases_2)
    # apply the second-pass phraser to the bigrammed tokens (not the raw
    # `clean` tokens), otherwise no higher-order ngrams can form
    df['ngrams'] = df.bigrams.progress_apply(lambda r: ngrams[r])

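# Hypothetical usage sketch for build_ngrams. It assumes a DataFrame with a
# tokenized `clean` column and that `tqdm.pandas()` has been called once so
# `progress_apply` exists; min_count/threshold are lowered for the toy corpus.
import pandas as pd
from tqdm import tqdm

tqdm.pandas()
toy_df = pd.DataFrame({"clean": [["new", "york", "city"], ["new", "york", "times"]]})
build_ngrams(toy_df, min_count=1, threshold=1)
print(toy_df["ngrams"].tolist())  # e.g. [['new_york', 'city'], ['new_york', 'times']]
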
def create_model(self, doc_list):
    self.bigrams_phrases = Phrases(doc_list, min_count=self.min_count_bigrams)
    self.bigrams_phraser = Phraser(self.bigrams_phrases)
    self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                    min_count=self.min_count_trigrams)
    self.trigrams_phraser = Phraser(self.trigrams_phrases)
    self.bigrams_phraser.save(self.model_dir + '/' + BIGRAMS_PHRASER_FILENAME)
    self.trigrams_phraser.save(self.model_dir + '/' + TRIGRAMS_PHRASER_FILENAME)
    self.bigrams_phrases.save(self.model_dir + '/' + BIGRAMS_PHRASES_FILENAME)
    self.trigrams_phrases.save(self.model_dir + '/' + TRIGRAMS_PHRASES_FILENAME)

def init_phraser(self, components=False, **kwargs):
    sentences = LineSentence(self.path + 'sentences.txt.gz')
    phrases = Phrases(sentences, **kwargs)
    self.phraser = GensimPhraser(phrases)
    self.phraser.components = components
    self.phraser.save(self.path + 'phraser.pkl')
    del phrases

def bigrammer(source_file, outfile, mincount=100, threshold=0.99,
              scoring='npmi', commonfile='common_tagged.txt'):
    """
    Detects bigrams in a corpus and writes the bigrammed text out.

    :param source_file: path to the input corpus (one sentence per line)
    :param outfile: path to write the bigrammed text to
    :param mincount: minimum collocation frequency (Phrases min_count)
    :param threshold: phrase score threshold (NPMI scores lie in [-1, 1])
    :param scoring: Phrases scoring function, e.g. 'npmi'
    :param commonfile: file with common/function words, one per line
    :return: number of phrases learned
    """
    with open(commonfile, 'r') as f:
        common = set(word.strip() for word in f)
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data, min_count=mincount,
                                 threshold=threshold, scoring=scoring,
                                 max_vocab_size=400000000, delimiter=b':::',
                                 progress_per=100000, common_terms=common)
    bigrams = Phraser(bigram_transformer)
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    with smart_open(outfile, 'a') as tempfile:
        for i in bigrams[data]:
            tempfile.write(' '.join(i) + '\n')
    return len(bigrams.phrasegrams)

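# Hypothetical invocation of bigrammer; the file names are placeholders.
# It assumes `corpus.txt` holds one whitespace-tokenized sentence per line and
# that `common_tagged.txt` lists function words to treat as connectors.
import sys

n_phrases = bigrammer('corpus.txt', 'corpus_bigrammed.txt',
                      mincount=50, threshold=0.5, scoring='npmi')
print('learned %d phrases' % n_phrases, file=sys.stderr)
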
def update_namespaces(self, project_id, log_words):
    all_words = self.minio_client.get_project_object(
        project_id, "project_log_unique_words")
    for word in log_words:
        all_words[word] = 1
    self.minio_client.put_project_object(all_words, project_id,
                                         "project_log_unique_words")
    phrases = Phrases([w.split(".") for w in all_words], min_count=1, threshold=1)
    potential_project_namespaces = {}
    for word in all_words:
        potential_namespace = phrases[word.split(".")][0]
        if "_" not in potential_namespace:
            continue
        if potential_namespace not in potential_project_namespaces:
            potential_project_namespaces[potential_namespace] = 0
        potential_project_namespaces[potential_namespace] += 1
    chosen_namespaces = {}
    for item, cnt in potential_project_namespaces.items():
        if cnt > 10:
            chosen_namespaces[item.replace("_", ".")] = cnt
    logger.debug("Chosen namespaces %s", chosen_namespaces)
    self.minio_client.put_project_object(chosen_namespaces, project_id,
                                         "chosen_namespaces")

def __init__(self, df):
    self.sent = df.tolist()
    self.phrases = Phrases(self.sent, min_count=30, threshold=1)
    self.bigram = Phraser(self.phrases)
    self.sentences = self.bigram[self.sent]
    self.w2v_model = Word2Vec(min_count=30,
                              window=3,
                              size=252,
                              sample=6e-5,  # sample=1e-5
                              alpha=0.01,
                              min_alpha=0.0005,
                              negative=5,
                              workers=multiprocessing.cpu_count() - 1)

def w2v():
    import multiprocessing
    cores = multiprocessing.cpu_count()
    txt_list = []
    df = pd.read_csv("bibs.csv")
    for doc in df["Abs"]:
        txt_list.append(cleaning(doc))
    df["clean"] = txt_list
    sent = [row.split() for row in df['clean']]
    phrases = Phrases(sent, min_count=10, progress_per=10)
    bigram = Phraser(phrases)
    sentences = bigram[sent]
    w2v_model = Word2Vec(min_count=4,
                         window=5,
                         size=10,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=4,
                         workers=cores - 1)
    w2v_model.build_vocab(sentences, progress_per=10)
    # aki = Acute Kidney Injury
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count,
                    epochs=30, report_delay=1)
    w2v_model.save("word2vec.model")
    w2v_model.init_sims(replace=True)
    res = w2v_model.wv.most_similar(positive=["surgical"])
    pprint(res)

def testSaveLoadCustomScorer(self):
    """Saving and loading a Phrases object with a custom scorer."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001,
                         scoring=dumb_scorer)
        bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
        bigram_loaded = Phrases.load(
            "test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
        seen_scores = []
        test_sentences = [
            ['graph', 'minors', 'survey', 'human', 'interface', 'system']
        ]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.append(score)
        assert all(seen_scores)  # all scores 1
        # 'graph minors' and 'survey human' and 'interface system'
        assert len(seen_scores) == 3
    finally:
        if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")

def _phrase(self, token):
    bigram = Phrases(token, min_count=5, threshold=100)
    bigram_mod = Phraser(bigram)
    # trigram = Phrases(bigram_mod[token], min_count=5, threshold=100)
    # trigram_mod = Phraser(trigram)
    # return [trigram_mod[bigram_mod[doc]] for doc in token]
    return [bigram_mod[doc] for doc in token]

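# A small sketch of the trade-off behind freezing Phrases into a Phraser:
# the Phraser is lighter and faster to apply, but unlike Phrases it cannot
# absorb new sentences. The toy data and lowered threshold are illustrative.
from gensim.models.phrases import Phrases, Phraser

toy_docs = [["machine", "learning", "rocks"]] * 6
updatable = Phrases(toy_docs, min_count=5, threshold=0.1)
updatable.add_vocab([["machine", "learning", "pipeline"]])  # Phrases can keep learning
frozen = Phraser(updatable)                                 # Phraser is read-only
print(frozen[["machine", "learning", "rocks"]])  # e.g. ['machine_learning', 'rocks']
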
def deserialize(self, type, name, language='en'):
    serializer = self.serializers[type]
    if type == "lda_model":
        with open(name, "rb") as f:
            data = pickle.load(f)
    else:
        with codecs.open(name, "r", encoding="utf-8") as f:
            data = json.load(f)
    deserialized = serializer(data).deserialize()
    if type == "phrases":
        if language == 'en':
            common_terms = self.function_words_single
        else:
            common_terms = safe_get_stop_words(language)
        phrases = Phrases(delimiter="_", connector_words=common_terms)
        phrases.phrasegrams = deserialized
        deserialized = phrases
    return deserialized

def prepareDocs(self, phrases=1):
    preppedDocs = []
    # Clean
    for i, doc in enumerate(self.uncleanDocList):
        cleanedDoc = ftfy.fix_text(doc, normalization='NFKC')
        cleanedDoc = cleanedDoc.replace('?', ' ')
        cleanedDoc = ' '.join(cleanedDoc.splitlines())
        cleanedDoc = re.sub(r'http\S+', '', cleanedDoc)
        cleanedDoc = re.sub(r'https\S+', '', cleanedDoc)
        translator = str.maketrans(punctuation, ' ' * len(punctuation))
        cleanedDoc = cleanedDoc.translate(translator)
        cleanedDoc = cleanedDoc.translate({ord(k): None for k in digits})
        cleanedDoc = cleanedDoc.lower()
        cleanedDoc = ' '.join(cleanedDoc.split())
        preppedDocs.append(cleanedDoc)
        print('%d of %d documents cleaned.' % (i + 1, len(self.uncleanDocList)))
    # Detect phrases (optional)
    if phrases is not None:
        print('Phrase detection requested. Running...')
        tokenizedDocs = [doc.split() for doc in preppedDocs]
        bigrammer = Phrases(tokenizedDocs)
        preppedDocs = [' '.join(bigrammer[tokdoc]) for tokdoc in tokenizedDocs]
        print('Documents are now phrase-collocated.')
    # Save prepared documents to class instance
    self.preppedDocList = preppedDocs

def _train_phraser(self, min_count, phrase_threshold, delimiter):
    print("Training collocation detector...")
    return Phraser(
        Phrases(self.line_iterator,
                min_count=min_count,
                threshold=phrase_threshold,
                delimiter=delimiter))

def make_trigrams(bigram_sentences: list):
    from gensim.models.phrases import Phrases
    trigram_model = Phrases(bigram_sentences, threshold=40)
    results = []
    for doc in bigram_sentences:
        results.append(trigram_model[doc])
    return results

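# Sketch of the two-pass pattern that feeds make_trigrams: a first Phrases
# pass glues bigrams ("new_york"), then make_trigrams runs a second pass over
# the bigrammed sentences so a trigram can emerge as bigram-plus-word
# ("new_york_city"). On this toy corpus the strict default threshold=40 in
# make_trigrams leaves the trigram unmerged; real corpora supply the counts
# and vocabulary size needed to clear it.
from gensim.models.phrases import Phrases

raw = [["new", "york", "city"]] * 8 + [["york"], ["city"]]
bigram_model = Phrases(raw, min_count=5, threshold=0.1)
bigram_sentences = [bigram_model[s] for s in raw]
print(make_trigrams(bigram_sentences)[0])  # e.g. ['new_york', 'city']
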
def __call__(self, docs):
    phrases = Phrases(docs, min_count=10)
    bigram = self.phraser(phrases)
    # `cores` and `Pool` are assumed to be defined at module level
    p = Pool(cores)
    docs = p.starmap(self.append_bigram, zip(docs, [bigram] * len(docs)))
    p.close()  # the Pool is bound to `p`
    return docs

def advb_bigram_detect(sentences):
    # first build the list of adverbs from the word list file
    list_of_adverb = Utility.read_words_file_into_list(
        save_folder_name + "/List_of_advb.txt", 1)
    # use # as delimiter to distinguish from ~ used in previous stages
    phrases = Phrases(sentences,
                      max_vocab_size=max_vocab_size,
                      min_count=bigram_minimum_count_threshold,
                      threshold=threshold,
                      delimiter=delimiter,
                      progress_per=progress_per)
    with open(save_folder_name + '/' + 'advb_bigram.txt', "w") as bigram_2_file:
        c = 1
        for key in phrases.vocab.keys():
            a = key.decode()
            a = a.split("#")
            if len(a) > 1:
                flag = False
                flag2 = False
                for w in a:
                    if w in list_of_adverb:
                        flag = True
                    if len(w) > 4 and w[-3:] == 'ing':
                        flag2 = True
                if flag and flag2:
                    s = key.decode()
                    print('{0}\t\t{1:<10}'.format(c, s), file=bigram_2_file)
                    c += 1
    logger.info("PROGRESS: Finished advb_bigram_detect")

def createEmbeddingSpace(filename):
    # You need to remake key common phrases: "new york" should really be
    # "new_york" as a collective token, since "new" and "york" have different
    # meanings used together vs. separately.
    # https://stackoverflow.com/questions/35716121/how-to-extract-phrases-from-corpus-using-gensim
    with open(filename, 'r') as f:
        sentencesAll = [line.split(" ") for line in f if line is not None]
    # takes about ~10 min
    random.shuffle(sentencesAll)
    phrases = Phrases(sentencesAll, min_count=1, threshold=2, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sentencesAll]
    print(len(sentences))  # 15,786,808
    print(sentences[0])

    # Building and Training the Model
    cores = multiprocessing.cpu_count()
    # min_count removed; it is unclear which words get excluded without it
    w2v_model = Word2Vec(window=6,
                         size=100,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cores - 1)
    t = time()
    w2v_model.build_vocab(sentences, progress_per=10000)
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))  # 6.71 mins
    t = time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count,
                    epochs=30, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
    print("Sentence[0]: in embedding Model {}".format(sentences[0]))
    print("Sentence[1]: in embedding Model {}".format(sentences[1]))
    print("Similarity is: {}".format(
        w2v_model.wv.wmdistance(sentences[0], sentences[1])))
    return w2v_model

def testSaveLoadNoScoring(self):
    """Saving and loading a Phrases object with no scoring parameter.
    This should ensure backwards compatibility with old versions of Phrases."""
    try:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        del bigram.scoring
        bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
        bigram_loaded = Phrases.load(
            "test_phrases_testSaveLoadNoScoring_temp_save.pkl")
        seen_scores = set()
        test_sentences = [
            ['graph', 'minors', 'survey', 'human', 'interface', 'system']
        ]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))
        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444,  # score for human interface
        ])
    finally:
        if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
            os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")

def load_vector_data(dataset_name, bgr=False):
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv",
                            delimiter=",").astype(str).fillna("").values.tolist()
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv",
                          delimiter=",", dtype=types).astype(str)["a"].tolist()
    # gensim's FastText.load() takes no `binary` flag; that option belongs to
    # the word2vec-format loaders
    vector_model = FastText.load("../models/word_embeddings/" + dataset_name + "_fasttext")
    # replace placeholders (" "), make one-string-sentences
    for index, sample in enumerate(sentences):
        sentences[index] = list(filter((" ").__ne__, sample))
    inputs = [" ".join(sentence) for sentence in sentences]
    if bgr:
        tokenized = [t.split() for t in inputs]
        phrases = Phrases(tokenized)
        bigram = Phraser(phrases)
        # make bigrams for inputs
        bigrammed = [bigram[sentence.split()] for sentence in inputs]
        inputs = []
        for sent in bigrammed:
            if sent:
                inputs.append(np.sum(vector_model.wv[sent], 0).tolist())
            else:
                inputs.append(np.zeros(32))
    else:
        inputs = [vector_model.wv[sample] for sample in inputs]
    inputs = np.array(inputs)
    train_x, test_x, train_y, test_y = train_test_split(inputs, targets, test_size=0.2)
    return train_x, test_x, train_y, test_y

def preprocess(segments, dct=None, bigram=None):
    processed_segments = []
    for seg in segments:
        processed_seg = []
        for word in seg:
            # skip whitespace, stopword and punctuation tokens
            if word.is_space or word.is_stop or word.is_punct:
                continue
            processed_seg.append(word.lemma_.lower())
        processed_segments.append(processed_seg)
    if bigram is None:
        phrases = Phrases(processed_segments, min_count=3, threshold=3)
        bigram = Phraser(phrases)
    processed_segments = bigram[processed_segments]
    if dct is None:
        dct = Dictionary(processed_segments)
    else:
        dct.add_documents(processed_segments)
    return [dct.doc2bow(line) for line in processed_segments], dct, processed_segments, bigram

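# Hypothetical two-pass usage of preprocess: fit the Dictionary and bigram
# Phraser on a training split, then reuse both on a test split so token ids
# and phrases stay consistent. A spaCy pipeline is assumed (and the small
# English model installed), since the function reads token attributes such
# as `is_stop` and `lemma_`.
import spacy

nlp = spacy.load("en_core_web_sm")
train_docs = [nlp("the new york subway is crowded")]
test_docs = [nlp("new york again")]
train_bow, dct, _, bigram = preprocess(train_docs)
test_bow, dct, _, _ = preprocess(test_docs, dct=dct, bigram=bigram)
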
def word2vec_sentence(data, save_path):
    phrases = Phrases(data, min_count=1, progress_per=50000)
    bigram = Phraser(phrases)
    sentences = bigram[data]
    print(sentences)
    # init
    w2v_model = Word2Vec(min_count=3,
                         window=4,
                         size=300,
                         sample=1e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=multiprocessing.cpu_count() - 1)
    # build vocab
    start = time()
    w2v_model.build_vocab(sentences, progress_per=50000)
    print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))
    # train
    start = time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count,
                    epochs=30, report_delay=1)
    print('Time to train the model: {} mins'.format(
        round((time() - start) / 60, 2)))
    w2v_model.init_sims(replace=True)
    w2v_model.save("word2vec.model")
    w2v_model.wv.save_word2vec_format(save_path, binary=False)

def testScoringDefault(self):
    """Test the default scoring, from the mikolov word2vec paper."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1,
                     common_terms=self.common_terms)
    seen_scores = set()
    test_sentences = [
        ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']
    ]
    for phrase, score in bigram.export_phrases(test_sentences):
        seen_scores.add(round(score, 3))
    min_count = float(bigram.min_count)
    len_vocab = float(len(bigram.vocab))
    graph = float(bigram.vocab[b"graph"])
    data = float(bigram.vocab[b"data"])
    data_and_graph = float(bigram.vocab[b"data_and_graph"])
    human = float(bigram.vocab[b"human"])
    interface = float(bigram.vocab[b"interface"])
    human_interface = float(bigram.vocab[b"human_interface"])
    assert seen_scores == set([
        # score for data and graph
        round((data_and_graph - min_count) / data / graph * len_vocab, 3),
        # score for human interface
        round((human_interface - min_count) / human / interface * len_vocab, 3),
    ])

def testScoringDefault(self):
    """Test the default scoring, from the mikolov word2vec paper."""
    bigram = Phrases(self.sentences, min_count=1, threshold=1,
                     connector_words=self.connector_words)
    test_sentences = [
        ['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']
    ]
    seen_scores = set(
        round(score, 3)
        for score in bigram.find_phrases(test_sentences).values())
    min_count = float(bigram.min_count)
    len_vocab = float(len(bigram.vocab))
    graph = float(bigram.vocab["graph"])
    data = float(bigram.vocab["data"])
    data_and_graph = float(bigram.vocab["data_and_graph"])
    human = float(bigram.vocab["human"])
    interface = float(bigram.vocab["interface"])
    human_interface = float(bigram.vocab["human_interface"])
    assert seen_scores == set([
        # score for data and graph
        round((data_and_graph - min_count) / data / graph * len_vocab, 3),
        # score for human interface
        round((human_interface - min_count) / human / interface * len_vocab, 3),
    ])

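# The two versions of testScoringDefault above differ only in API level:
# gensim < 4.0 uses `common_terms`, bytes vocab keys and export_phrases()
# yielding (phrase, score) pairs, while 4.0+ uses `connector_words`, str keys
# and find_phrases() returning a dict. A minimal 4.0+ sketch on a toy corpus:
from gensim.models.phrases import Phrases

toy = [["human", "interface"]] * 5 + [["human"], ["interface", "time"]]
bigram = Phrases(toy, min_count=1, threshold=0.1)
print(bigram.find_phrases([["human", "interface", "time"]]))  # {'human_interface': <score>}
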
def preprocess(self):
    from nltk import word_tokenize
    print("Starting to preprocess...")
    for split in ['train', 'test']:
        unigrams = [word_tokenize(sentence[0]) for sentence in self.data[split].values]
        ps = PorterStemmer()
        for idx, review in enumerate(unigrams):
            stemmedSentence = []
            for word in review:
                # stemmedSentence.append(ps.stem(word))  # stemming takes too long ...
                stemmedSentence.append(word)
            self.data[split].iloc[idx, 0] = " ".join(stemmedSentence)
        bigrams = Phrases(unigrams, min_count=2)
        bigram_phraser = Phraser(bigrams)
        if self.representation == 'GloVe':
            # let X be a list of tokenized texts (i.e. list of lists of tokens)
            self.word_model = gensim.models.Word2Vec(bigram_phraser[unigrams], min_count=1)
            self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
        elif self.representation == 'fasttext':
            self.word_model = FastText(bigram_phraser[unigrams], min_count=1)
            self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
    print("Finished preprocessing.")

def fit(self, sentencesPath):
    """
    Train phrases.
    :param sentencesPath: path of the text file; the file format should be
        one sentence per line
    """
    self.phrasers = []
    # path detect
    for path in self.savePhraserPaths:
        if not os.path.exists(os.path.dirname(path)):
            raise FileNotFoundError(os.path.dirname(path) + " does not exist")
    for path in self.savePhraserPaths:
        if not os.path.exists(path):  # need train
            self.phrasers = None
            break
    if self.phrasers is not None and not self.file_overwrite:
        logging.info("models already exist, will read them")
        for path in self.savePhraserPaths:
            self.phrasers.append(Phraser.load(path))
        return True
    self.phrasers = []
    c = 2
    for path in self.savePhraserPaths:
        logging.info("getting %d-gram phrase......" % c)
        c += 1
        phraser = Phraser(
            Phrases(sentences=TxtIter(sentences=codecs.open(sentencesPath, mode="r",
                                                            encoding="utf-8"),
                                      ngrams=self.phrasers),
                    min_count=self.min_count,
                    threshold=self.threshold,
                    max_vocab_size=self.max_vocab_size,
                    delimiter=self.delimiter,
                    scoring=self.scoring))
        phraser.save(path)
        self.phrasers.append(phraser)

def build_phrases(sentences):
    phrases = Phrases(sentences,
                      min_count=2,
                      threshold=10)
    return Phraser(phrases)

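# Hypothetical usage of build_phrases. Note that with threshold=10 a pair only
# merges once it is strongly over-represented in a reasonably large corpus; on
# a toy corpus like this the output typically comes back unchanged.
token_stream = [["barack", "obama", "spoke"], ["barack", "obama", "won"]]
phraser = build_phrases(token_stream)
print(phraser[["barack", "obama", "spoke", "today"]])
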
def learn_word_embeddings(corpus_fpath, vectors_fpath, cbow, window, iter_num,
                          size, threads, min_count, detect_phrases=True):
    tic = time()
    sentences = GzippedCorpusStreamer(corpus_fpath)
    if detect_phrases:
        print("Extracting phrases from the corpus:", corpus_fpath)
        phrases = Phrases(sentences)
        bigram = Phraser(phrases)
        input_sentences = list(bigram[sentences])
        print("Time, sec.:", time() - tic)
    else:
        input_sentences = sentences
    print("Training word vectors:", corpus_fpath)
    model = Word2Vec(input_sentences,
                     min_count=min_count,
                     size=size,
                     window=window,
                     max_vocab_size=None,
                     workers=threads,
                     sg=(1 if cbow == 0 else 0),
                     iter=iter_num)
    model.wv.save_word2vec_format(vectors_fpath, binary=False)
    print("Vectors:", vectors_fpath)
    print("Time, sec.:", time() - tic)

def collocation(in_path):
    """Creates a corpus considering collocations: frequent co-occurring
    bigrams are merged (new york -> new_york)."""
    corpus = LineSentence(in_path)
    bigram = Phraser(Phrases(corpus))
    collocation_corpus = bigram[corpus]
    for sentence in collocation_corpus:
        print(' '.join(sentence))

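# Hypothetical invocation of collocation; "corpus.txt" is a placeholder for a
# whitespace-tokenized file with one sentence per line (the format LineSentence
# reads). With gensim's default min_count/threshold, merges such as "new_york"
# only appear once a pair is frequent enough in a realistically sized corpus.
collocation("corpus.txt")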