def prep_corpus(docs, additional_stopwords=set(), no_below=5, no_above=0.5):
    print('Building dictionary...')
    dictionary = Dictionary(docs)

    # Remove stopwords (skip stopwords that never made it into the dictionary).
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = [dictionary.token2id[w] for w in stopwords if w in dictionary.token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    # Remove short words, len(word) <= 3 (tokens may carry a '/POS' suffix).
    shortword_ids = [
        tokenid for tokenid, word in dictionary.iteritems()
        if len(word.split('/')[0]) <= 3
    ]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()

    # Remove words that appear in only one document.
    # Note: dictionary.dfs is a plain dict, so use .items(), not .iteritems().
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()

    # Filter extreme document frequencies.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus
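# Usage sketch for the helper above (assumptions: nltk_stopwords() wraps NLTK's
# English stop word list; the toy docs and the LdaModel call are illustrative,
# not part of the original snippet).
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from nltk.corpus import stopwords

def nltk_stopwords():
    return set(stopwords.words('english'))

docs = [
    ['human', 'interface', 'computer', 'survey'],
    ['graph', 'minors', 'survey', 'interface'],
]
dictionary, corpus = prep_corpus(docs, no_below=1, no_above=1.0)
lda = LdaModel(corpus, id2word=dictionary, num_topics=2)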
def produce(self):
    print('Getting src docs')
    docs = []
    doctokens = []  # aka Gensim's "text"
    stopwords = nltk.corpus.stopwords.words('english')
    for doc in self.src_doc_generator():
        (doc_id, doc_label, doc_str) = doc
        docs.append(doc)
        doctokens.append([token for token in nltk.word_tokenize(doc_str) if token not in stopwords])
        if len(docs) % 1000 == 0:
            print(len(docs))

    print('Creating the dictionary')
    dictionary = Dictionary(doctokens)
    #dictionary.compactify()
    #dictionary.filter_extremes(keep_n=None)
    if self.dictfile:
        dictionary.save_as_text(self.dictfile + '.dict', sort_by_word=True)

    with self.dbi as db:
        print('Creating WORD')  # aka Gensim's "dictionary"
        db.create_table('word')
        for word_id, word_str in dictionary.iteritems():
            db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)', (word_id, word_str))

        print('Creating DOC and DOCWORD')
        db.create_table('doc')
        db.create_table('docword')
        for doc_idx, doc in enumerate(docs):
            db.cur.execute('INSERT INTO doc (doc_index, doc_id, doc_label, doc_str) VALUES (?,?,?,?)',
                           (doc_idx, doc[0], doc[1], doc[2]))
            doc_id = doc[0]
            for word_id, word_count in dictionary.doc2bow(doctokens[doc_idx]):
                word_str = dictionary.get(word_id)  # Is this valid? I believe it is.
                db.cur.execute('INSERT INTO docword (doc_index, doc_id, word_id, word_str, word_count) VALUES (?,?,?,?,?)',
                               (doc_idx, doc_id, word_id, word_str, word_count))
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import (
    preprocess_string, strip_punctuation, strip_multiple_whitespaces, strip_numeric,
)


def prepare_word_embedding():
    """Construct vocabulary file and word embedding file."""
    df = pd.read_csv(
        "data/raw/train.csv",
        usecols=["original_phrase1", "original_phrase2", "ytrue"],
    )
    model = KeyedVectors.load_word2vec_format(
        "/data/mayu-ot/Data/Model/GoogleNews-vectors-negative300.bin.gz", binary=True
    )
    CUSTOM_FILTERS = [
        lambda x: x.lower(),
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_numeric,
    ]
    doc = [preprocess_string(x, CUSTOM_FILTERS) for x in df.values[:, :2].ravel()]
    dct = Dictionary(doc)

    # Drop tokens that have no pretrained word2vec vector.
    bad_ids = [k for k, v in dct.iteritems() if v not in model]
    dct.filter_tokens(bad_ids)
    dct.compactify()

    # Peek at the first few surviving (id, token) pairs.
    for k, v in dct.iteritems():
        print(k, v)
        if k == 10:
            break

    dct.save_as_text("data/processed/dictionary.txt")
    word_emb = np.ones((len(dct), 300))
    for k, v in dct.iteritems():
        word_emb[k] = model[v]
    np.save("data/processed/word2vec", word_emb)
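# Sanity-check sketch for the artifacts written above: row k of the saved
# embedding matrix should be the vector for token dct[k] (paths assumed as above;
# np.save appends the .npy extension).
import numpy as np
from gensim.corpora import Dictionary

dct = Dictionary.load_from_text("data/processed/dictionary.txt")
word_emb = np.load("data/processed/word2vec.npy")
assert word_emb.shape == (len(dct), 300)
print(dct[0], word_emb[0][:5])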
def test_dict_interface(self):
    """Test Python 2 dict-like interface in both Python 2 and 3."""
    d = Dictionary(self.texts)

    self.assertTrue(isinstance(d, Mapping))

    self.assertEqual(list(zip(d.keys(), d.values())), list(d.items()))

    # Even in Py3, we want the iter* members.
    self.assertEqual(list(d.items()), list(d.iteritems()))
    self.assertEqual(list(d.keys()), list(d.iterkeys()))
    self.assertEqual(list(d.values()), list(d.itervalues()))

    # XXX Do we want list results from the dict members in Py3 too?
    if not PY3:
        self.assertTrue(isinstance(d.items(), list))
        self.assertTrue(isinstance(d.keys(), list))
        self.assertTrue(isinstance(d.values(), list))
def produce(self):
    doc_n = 0
    docs = []
    doctokens = []  # AKA gensim "text"
    stopwords = nltk.corpus.stopwords.words('english')
    NOALPHA = re.compile('[^a-z]+')

    def prep_string(my_string, pattern=NOALPHA):
        return re.sub(pattern, ' ', my_string.strip().lower())

    print('Getting src docs')
    for doc in self.src_doc_generator():
        # Lowercase before applying NOALPHA, otherwise capitals get stripped.
        content = prep_string(doc)  # Do this in the corpus generator?
        docs.append(content)
        doctokens.append([token for token in nltk.word_tokenize(content) if token not in stopwords])
        doc_n += 1
        if doc_n % 1000 == 0:
            print(doc_n)

    print('Creating the dictionary')
    dictionary = Dictionary(doctokens)
    dictionary.compactify()
    dictionary.filter_extremes(keep_n=None)
    if self.dictfile:
        dictionary.save_as_text(self.dictfile + '.dict', sort_by_word=True)

    with self.dbi as db:
        print('Creating DOC')
        db.create_table('doc')
        for i, doc in enumerate(docs):
            db.cur.execute('INSERT INTO doc VALUES (?,?)', (i, doc))
        print('Creating WORD')
        db.create_table('word')
        for item in dictionary.iteritems():
            db.cur.execute('INSERT INTO word (word_id, word_str) VALUES (?,?)', item)
        print('Creating DOCWORD')
        db.create_table('docword')
        for i, tokens in enumerate(doctokens):
            for word_id, word_count in dictionary.doc2bow(tokens):
                db.cur.execute('INSERT INTO docword (doc_id, word_id, word_count) VALUES (?,?,?)', (i, word_id, word_count))
        dict2 = Dictionary.load(os.path.join(dirpath, "dic_s.txt"))
    elif storage_env == "remote":
        # todo: construct Dictionary objects from remote file contents
        # texts = [
        #     ['human', 'interface', 'computer']
        # ]
        # my_dict = Dictionary(texts)
        raise NotImplementedError
    return dict1, dict2


if __name__ == "__main__":
    texts = [['human', 'interface', 'computer']]
    my_dict = Dictionary(texts)
    for s in my_dict.iteritems():
        print(s)

    d1, d2 = load_dictionaries()
    print("-----------------------------")
    print("DICTIONARY 1", type(d1), len(d1))
    for s in d1.iteritems():
        print(s)
    print("-----------------------------")
    print("DICTIONARY 2", type(d2), len(d2))
    for s in d2.iteritems():
        print(s)
def extract_and_save_biterm(fname, embed_size=300, min_count=5, max_percent=0.5, iteration=200):
    '''Simple preprocessing for biterms.

    A biterm is an unordered word pair, drawn from a single document
    rather than from the whole corpus.
    '''
    docs = read_corpus(fname, labeled=False, tokens_only=True)
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    # Remove one-character words and stop words.
    docs = [[token for token in doc if len(token) > 1 and token not in STOP_WORDS]
            for doc in docs]
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # Append detected bigrams (tokens containing '_') to their documents.
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:  # Token is a bigram, add to document.
                docs[idx].append(token)

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=min_count, no_above=max_percent)
    dictionary.compactify()

    # Encode: keep only in-vocabulary tokens, then map tokens to ids.
    docs = [[token for token in doc if token in dictionary.token2id] for doc in docs]
    # Remove docs that contain fewer than 2 unique words.
    docs = [doc for doc in docs if len(set(doc)) > 1]
    # gensim < 4.0 Word2Vec API (size/iter keyword arguments).
    model = gensim.models.Word2Vec(docs, workers=4, size=embed_size, iter=100, min_count=2)
    docs = [dictionary.doc2idx(doc) for doc in docs]

    # Enumerate biterms per document: biterms maps (id1, id2) -> biterm index,
    # and each doc_bitems entry maps biterm index -> count within that document.
    biterms = {}
    i = 0
    doc_bitems = []
    for doc in docs:
        d_bi = {}
        doc = sorted(doc)
        for x in range(len(doc) - 1):
            for y in range(x + 1, len(doc)):
                if doc[x] == doc[y]:
                    continue
                biterm = (doc[x], doc[y])
                if biterm not in biterms:
                    biterms[biterm] = i
                    idx = i
                    i += 1
                else:
                    idx = biterms[biterm]
                d_bi[idx] = d_bi.get(idx, 0) + 1
        doc_bitems.append(d_bi)

    fname = os.path.basename(fname).split('.')[0]
    dirc = os.path.join(os.getcwd(), 'Data', 'unsupervised')
    if not os.path.exists(dirc):
        os.makedirs(dirc)

    embeddings = {}
    for key, token in dictionary.iteritems():
        embeddings[key] = model.wv[token]

    dictionary.save(os.path.join(dirc, fname + '_dic.pkl'))
    biterms = dict([key, val] for val, key in biterms.items())  # invert: index -> (id1, id2)
    with open(os.path.join(dirc, fname + '_bit.pkl'), 'wb') as f:
        pickle.dump(biterms, f)
    with open(os.path.join(dirc, fname + '_doc_bit.pkl'), 'wb') as f:
        pickle.dump(doc_bitems, f)
    with open(os.path.join(dirc, fname + '_emb.pkl'), 'wb') as f:
        pickle.dump(embeddings, f)
    with open(os.path.join(dirc, fname + '_doc.pkl'), 'wb') as f:
        pickle.dump(docs, f)
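# Read-back sketch for the pickles written above ('corpus' is a hypothetical
# file stem, i.e. the input filename without its extension).
import os
import pickle
from gensim.corpora import Dictionary

dirc = os.path.join(os.getcwd(), 'Data', 'unsupervised')
stem = 'corpus'  # hypothetical

dictionary = Dictionary.load(os.path.join(dirc, stem + '_dic.pkl'))
with open(os.path.join(dirc, stem + '_bit.pkl'), 'rb') as f:
    biterms = pickle.load(f)  # biterm index -> (word_id, word_id)
with open(os.path.join(dirc, stem + '_doc_bit.pkl'), 'rb') as f:
    doc_bitems = pickle.load(f)  # per document: {biterm index: count}

# Decode the first biterm of the first document back into words.
first_idx = next(iter(doc_bitems[0]))
w1, w2 = biterms[first_idx]
print(dictionary[w1], dictionary[w2])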
from gensim.corpora import Dictionary

# doc2bow converts a document into a bag of words: a list of (token_id, token_count) pairs.

# 1. Documents
texts = [['humans', 'interface', 'computer'], ["hello", "wife", "computer"]]

# 2. Create a Dictionary
dct = Dictionary(texts)

# 3. Add new documents
dct.add_documents([["cat", "say", "meow", "dog"], ["dog"]])

# 4. Inspect the Dictionary's (id, token) items
for key, value in dct.iteritems():
    print(key, value)

# 5. Convert a document to a bag of words (out-of-vocabulary tokens are ignored)
bow = dct.doc2bow(["dog", "computer", "non_existent_word", "computer"])
print(bow)

# 6. Filter out extremes
dct.filter_extremes(no_below=1, no_above=0.5, keep_n=3)
for key, value in dct.iteritems():
    print(key, value)

from gensim import corpora, models

# TfidfModel expects a corpus (an iterable of bag-of-words documents), not a
# single bow; with a one-document corpus every idf is zero, so this only
# demonstrates the API (see the corpus-level example below).
tfidf = models.TfidfModel([bow])
corpus_tfidf = tfidf[bow]
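# The more typical TF-IDF pattern: train the model on the whole bag-of-words
# corpus so the IDF statistics are meaningful (minimal sketch, independent of
# the filtering step above).
from gensim import models
from gensim.corpora import Dictionary

texts = [['humans', 'interface', 'computer'], ['hello', 'wife', 'computer']]
dct = Dictionary(texts)
corpus = [dct.doc2bow(t) for t in texts]

tfidf = models.TfidfModel(corpus)  # IDF computed across both documents
for doc in tfidf[corpus]:
    # 'computer' appears in every document, so it gets tf-idf weight 0 and is dropped.
    print(doc)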