def genCorpus(corpus_name, f_inputs=[], word_count=20, word_doc_freq=0.3):
    """Generate a corpus (bag-of-words representation of documents plus an
    id2word dictionary) for topic models such as LDA.

    Args:
        corpus_name (str): path where the corpus is (or will be) stored.
        f_inputs (list of str): files to be converted into the corpus,
            e.g. ['file1', 'file2', ...].
        word_count (int): drop tokens appearing in fewer than this many documents.
        word_doc_freq (float): drop tokens appearing in more than this fraction
            of documents.

    Returns:
        corpus_docBow (list): [[doc1 bag of words], [doc2 bag of words], ...]
        wordDict (gensim.corpora.Dictionary): the id2word mapping.
    """
    if not path.exists(corpus_name):
        docs = SentenceIter(*f_inputs)
        wordDict = dictionary.Dictionary(documents=docs, prune_at=None)
        # drop tokens with document frequency < word_count or appearing in
        # more than word_doc_freq of all documents
        wordDict.filter_extremes(no_below=word_count, no_above=word_doc_freq)
        corpus_docBow = [wordDict.doc2bow(doc) for doc in docs]
        with open(corpus_name, 'wb') as fo:
            pickle.dump([corpus_docBow, wordDict], fo)
    else:
        with open(corpus_name, 'rb') as fi:
            corpus_docBow, wordDict = pickle.load(fi)
    return corpus_docBow, wordDict
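# Hedged standalone sketch of the same Dictionary -> filter_extremes ->
# doc2bow -> pickle pattern used by genCorpus above. The toy documents,
# thresholds, and output filename are made up; SentenceIter (assumed to yield
# one tokenized document per input file) is not needed here.
import pickle
from gensim.corpora import dictionary

docs = [["human", "computer", "interface"],
        ["graph", "trees", "minors"],
        ["graph", "minors", "survey"]]
wordDict = dictionary.Dictionary(docs)
wordDict.filter_extremes(no_below=1, no_above=0.9)  # analogous to word_count / word_doc_freq
docBow = [wordDict.doc2bow(doc) for doc in docs]
with open("toy_corpus.pkl", "wb") as fo:
    pickle.dump([docBow, wordDict], fo)  # same [corpus, dictionary] layout as genCorpus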
def fit(self, training, training_info):
    # store training sets
    self.training = training
    self.training_info = training_info
    print("creating train tokens")
    train_tokens = training_info["tokens"].apply(
        lambda tokens: tokens.split(" ")).values.tolist()
    print("creating train dict")
    train_my_dict = dictionary.Dictionary(train_tokens)
    print("creating train corpus")
    train_corpus = [train_my_dict.doc2bow(token) for token in train_tokens]
    print("training Lsi model")
    if os.path.isfile('temp/model.lsi') and self.use_pretrained_model:
        self.lsi = models.LsiModel.load('temp/model.lsi')
    else:
        self.lsi = models.LsiModel(train_corpus, id2word=train_my_dict,
                                   num_topics=500)
        self.lsi.save('temp/model.lsi')
    print("creating train Lsi matrix")
    self.lsi_train_matrix = np.array(
        [self.lsi[document] for document in train_corpus])
    self.address_books = create_address_books(training, training_info)
    self.mids_sender_recipient = create_dictionary_mids(
        training, training_info)
def __init__(self, dataframe):
    self.dataframe = dataframe
    self.tokens = dictionary.Dictionary()
    # Retrieve tokens from documents, populating the tokens dictionary
    self.dataframe = self.dataframe.apply(self.get_tokens)
    # print(tabulate(self.dataframe, headers='keys', tablefmt='psql'))
    print(self.dataframe)
    self.dataframe.apply(self.dump_tokens)
def __init__(self, docs):
    self.documents = docs
    self.tokens = dictionary.Dictionary()
    # Retrieve tokens from the documents and add them to the corpus,
    # populating the tokens dictionary
    for doc in self.documents:
        content = [[word for word in open(doc).read().lower().split()
                    if word not in [",", "%", "(", ")", ",", ":", "\n", "$"]]]
        self.tokens.add_documents(content)
    print("[*] Retrieved %s tokens from %s documents in the corpus"
          % (len(self.tokens), len(self.documents)))
def testBuild(self):
    d = dictionary.Dictionary(self.texts)
    expected = {0: 2, 1: 2, 2: 2, 3: 2, 4: 2, 5: 3,
                6: 2, 7: 3, 8: 2, 9: 3, 10: 3, 11: 2}
    self.assertEqual(d.docFreq, expected)
    expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
                'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
                'system': 5, 'time': 6, 'trees': 9, 'user': 7}
    self.assertEqual(d.token2id, expected)
    expected = dict((v, k) for k, v in expected.items())
    self.assertEqual(d.id2token, expected)
def __init__(self, docs):
    self.documents = docs
    self.tokens = dictionary.Dictionary()
    # Retrieve tokens from documents, populating the tokens dictionary
    for doc in self.documents:
        content = []
        for c in [word for word in open(doc).read().lower().split()
                  if word not in [",", "%", "(", ")", ",", ":", "\n", "$"]]:
            # keep only tokens made of alphanumerics and a small set of punctuation
            if re.match('^[A-Za-z0-9();<>:./-]+$', c):
                content.append(c)
        self.tokens.add_documents([content])
    print("[*] Retrieved %s tokens from %s documents in the corpus"
          % (len(self.tokens), len(self.documents)))
def filter_tokenlist():
    id2token_dictionary = dictionary.Dictionary.load_from_text(corpus_path)
    token2id = {}
    print(len(id2token_dictionary))
    tokenTuples = [tuple(map(int, i.split(' ')))
                   for i in open(feature_mapping_path)]
    # with open('/home/miriam/malwaredetection/utils/source_codes/token_filtered_new.txt', 'w+') as fh:
    for i, token in id2token_dictionary.items():
        # keep only tokens made of alphanumerics and a small set of punctuation
        if re.match('^[A-Za-z0-9();<>:./-]+$', token):
            # fh.write(token.encode('utf-8') + '\n')
            token2id[token] = i
    # build a new Dictionary holding only the filtered tokens
    # (token2id must be a {token: id} mapping)
    tokens_filtered = dictionary.Dictionary()
    tokens_filtered.token2id = token2id
    print(tokens_filtered)
    print()
def vectorize(ori_data, train_file_path, test_file_path):
    train_data = ori_data['train']
    corpus = []
    for label, data in train_data:
        corpus.append(data)
    dic = dictionary.Dictionary(corpus)
    corpus = [dic.doc2bow(doc) for doc in corpus]
    lda = ldamodel.LdaModel(corpus, passes=40, num_topics=80)
    # training data
    fp = open(train_file_path, 'w')
    for label, data in train_data:
        vec = lda[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
    # test data
    fp = open(test_file_path, 'w')
    for label, data in ori_data['test']:
        vec = lda[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
def vectorize(ori_data, train_file_path, test_file_path):
    train_data = ori_data['train']
    corpus = []
    for label, data in train_data:
        corpus.append(data)
    dic = dictionary.Dictionary(corpus)
    corpus = [dic.doc2bow(doc) for doc in corpus]
    tfidf = tfidfmodel.TfidfModel(corpus)
    # training data
    fp = open(train_file_path, 'w')
    for label, data in train_data:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
    # test data
    fp = open(test_file_path, 'w')
    for label, data in ori_data['test']:
        vec = tfidf[dic.doc2bow(data)]
        fp.write("%s %s\n" % (label, sparse(vec)))
    fp.close()
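# The sparse() helper used by both vectorize() variants above is not shown in
# these snippets. A hypothetical implementation consistent with how it is
# called (turning gensim's list of (feature_id, weight) pairs into a single
# string field) might look like this -- an assumption, not the original code:
def sparse(vec):
    # vec is a gensim sparse vector: [(feature_id, weight), ...]
    return " ".join("%d:%f" % (feature_id, weight) for feature_id, weight in vec)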
def get_tfidf(files, update):
    if update:
        tokenizer, lemmatizer, stop = (RegexpTokenizer(r'[a-z]{4,}'),
                                       WordNetLemmatizer(),
                                       set(stopwords.words('english')))
        bug_list = bugs_exceeding_count(files, 10)
        replacements = get_replacements(bug_list)
        with open(files.replacements, 'w') as outf:
            json.dump(replacements, outf)
        with open(files.abstracts) as documents:
            word_dict = dictionary.Dictionary(
                get_words(id_abstract, replacements, tokenizer, lemmatizer)
                for id_abstract in documents)
            # get rid of stop words (filter_tokens expects token ids, not strings)
            stop_ids = [word_dict.token2id[w] for w in stop
                        if w in word_dict.token2id]
            word_dict.filter_tokens(bad_ids=stop_ids)
            word_dict.filter_extremes()  # filter out very rare and very common tokens
            documents.seek(0)  # reset to beginning of file iterator
            corpus = (word_dict.doc2bow(
                get_words(id_abstract, replacements, tokenizer, lemmatizer))
                for id_abstract in documents)
            mmcorpus.MmCorpus.serialize(files.corpus, corpus)
            tfidf_model = models.tfidfmodel.TfidfModel(dictionary=word_dict,
                                                       id2word=word_dict)
            tfidf_model.save(files.tfidf)
    tfidf_model = models.tfidfmodel.TfidfModel.load(files.tfidf)
    return tfidf_model
def similarity(folder_path, stop_word_file, user_dict, cos_result_file,
               sim_result_file):
    print("start time is :",
          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    files = load_files(folder_path)
    words = [cut(file, stop_word_file, user_dict) for file in files]
    # build the dictionary and the bag-of-words corpus
    doc_dict = dictionary.Dictionary(words)
    doc_corpus = [doc_dict.doc2bow(item) for item in words]
    # the number of features is the number of keys in token2id
    tfidf, index = tfidf_calc(doc_corpus, len(doc_dict.token2id.keys()))
    log = idx_out(files, index)
    # sims = simhash(words, doc_corpus)
    sims = simhash_tfidf(words, tfidf)
    log1 = sim_out(files, sims)
    write_file(cos_result_file, log)
    write_file(sim_result_file, log1)
    print("end time is :",
          time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
from gensim.utils import to_unicode
from gensim.interfaces import TransformedCorpus
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                            ucicorpus, malletcorpus, textcorpus,
                            indexedcorpus, dictionary)
from gensim.models import (tfidfmodel, word2vec, ldamodel)

print('start')
train_set = []
for line in open('articles.txt'):
    items = line.strip().split('\t', 1)
    if len(items) < 2:
        continue
    words = items[1].strip().split(' ')
    train_set.append(words)

print('construct dict')
dic = dictionary.Dictionary(train_set)

print('doc2bow')
corpus = [dic.doc2bow(text) for text in train_set]

print('tfidf')
tfidf = tfidfmodel.TfidfModel(corpus)

print('tfidf corpus')
corpus_tfidf = tfidf[corpus]

print('lda model')
lda = ldamodel.LdaModel(corpus_tfidf, id2word=dic, num_topics=1000,
                        iterations=1300, alpha=0.15, eta=0.01)

print('corpus_lda')
corpus_lda = lda[corpus_tfidf]
lda.save('lda_model')
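# Follow-up sketch (not part of the original script): once 'lda_model' has been
# saved as above, the model can be reloaded and used to infer topics for a new
# document. 'dic' and 'tfidf' are assumed to still be in scope (or persisted
# alongside the model); the sample text is made up.
lda = ldamodel.LdaModel.load('lda_model')
new_doc = "some new article text".split(' ')
new_bow = dic.doc2bow(new_doc)
new_topics = lda[tfidf[new_bow]]  # topic distribution for the unseen document
print(new_topics)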
def testFilter(self):
    d = dictionary.Dictionary(self.texts)
    d.filterExtremes(noBelow=2, noAbove=1.0, keepN=4)
    expected = {0: 3, 1: 3, 2: 3, 3: 3}
    self.assertEqual(d.docFreq, expected)
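# Note: the camelCase members used in the test above (filterExtremes, noBelow,
# noAbove, keepN, docFreq) come from an old gensim release. A rough modern
# equivalent, assuming current gensim where the calls are filter_extremes()
# and the document-frequency map is the dfs attribute, might look like this:
def test_filter_modern(self):
    d = dictionary.Dictionary(self.texts)
    d.filter_extremes(no_below=2, no_above=1.0, keep_n=4)
    expected = {0: 3, 1: 3, 2: 3, 3: 3}
    self.assertEqual(d.dfs, expected)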
def get_dic(self):
    """
    Create a gensim dictionary from the stored tokens and return it.

    :return: Gensim dictionary
    """
    return dictionary.Dictionary(self.tokens)
logging.basicConfig(filename='build_corpus_and_dictionary.log',
                    format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             os.path.pardir))

from gensim.corpora import dictionary

import settings
from models import tokenizer
from models import corpus

punct_tags = list(',.:)"') + ['CD', 'IN']
pt_tokenizer = tokenizer.PretaggedTokenizer(stopword_list=None,
                                            filter_tags=punct_tags)
lj_corpus = corpus.LazyJSONCorpus(tokenizer=pt_tokenizer, dictionary=None,
                                  path_to_text="tagged")

glob_pattern = os.path.join(settings.PROC_DIR, '*.json')
# glob_pattern = os.path.join(settings.PROC_DIR, '60182*.json')
lj_corpus.glob_documents(glob_pattern)

with open(os.path.join(settings.PERSIST_DIR, 'document_index'), 'w') as fout:
    for floc in iglob(glob_pattern):
        doc_id = os.path.basename(floc).split('.')[0]
        fout.write(doc_id + '\n')

my_dict = dictionary.Dictionary(lj_corpus)
lj_corpus.dictionary = my_dict
my_dict.save(os.path.join(settings.PERSIST_DIR, 'my_dict'))
def predict(self, test, test_info):
    print("creating test tokens")
    test_tokens = test_info["tokens"].apply(
        lambda tokens: tokens.split(" ")).values.tolist()
    print("creating test dictionary")
    test_my_dict = dictionary.Dictionary(test_tokens)
    print("creating test corpus")
    test_corpus = [test_my_dict.doc2bow(token) for token in test_tokens]
    print("creating lsi test matrix")
    lsi_test_matrix = np.array([self.lsi[doc] for doc in test_corpus])
    print("prediction per sender")
    predictions_per_sender = {}
    for nb_done, row in test.iterrows():
        print("Progression: %f" % (nb_done / len(test)))
        # retrieve sender attributes
        sender = row[0]
        mids_sender = self.training[
            self.training["sender"] == sender]["mids"].values[0]
        mids_sender = np.array(mids_sender.split(" "), dtype=int)
        position_mails_training = self.training_info[
            self.training_info["mid"].isin(mids_sender)].index.values
        lsi_mails_sender = self.lsi_train_matrix[position_mails_training]
        index = similarities.MatrixSimilarity(lsi_mails_sender)
        # This dictionary is used to recover the positions in the
        # training set from the positions in the similarity matrix
        dict_Sim_Training = dict(
            zip(position_mails_training,
                range(len(position_mails_training))))
        # get IDs of the emails for which recipient prediction is needed
        mids_predict = np.array(row[1].split(" "), dtype=int)
        # initialize list to store predictions
        lsi_preds = []
        for mid_predict in mids_predict:
            # get the position of the current mail in the test_info dataset
            position_mail_predict = test_info[
                test_info["mid"] == mid_predict].index.values
            lsi_mail_predict = lsi_test_matrix[position_mail_predict]
            # sims: similarity scores ordered by position in the similarity index
            sims = index[lsi_mail_predict][0]
            scores = []
            for recipient, nb_occurrences in self.address_books[sender]:
                mids_recipient = self.mids_sender_recipient[(sender, recipient)]
                positions_mids_recipient = self.training_info[
                    self.training_info["mid"].isin(mids_recipient)].index.values
                ind_sim = np.array([dict_Sim_Training[ind]
                                    for ind in positions_mids_recipient])
                similarities_recipient = sims[ind_sim]
                scores.append((recipient, similarities_recipient.mean()))
            # sort the scores and keep the recipients with the highest scores
            prediction = [
                recipient for recipient, score in sorted(
                    scores, key=lambda elt: elt[1], reverse=True)
                [:self.nb_recipients_to_predict]
            ]
            lsi_preds.append(prediction)
        predictions_per_sender[sender] = [mids_predict, lsi_preds]
    return predictions_per_sender
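# Minimal standalone sketch of the LSI + MatrixSimilarity pattern that fit()
# and predict() above rely on. The toy documents and num_topics value are made
# up; only the gensim calls mirror the surrounding code.
from gensim.corpora import dictionary
from gensim import models, similarities

train_tokens = [["meeting", "tomorrow", "agenda"],
                ["budget", "report", "agenda"],
                ["lunch", "tomorrow"]]
train_dict = dictionary.Dictionary(train_tokens)
train_corpus = [train_dict.doc2bow(tokens) for tokens in train_tokens]
lsi = models.LsiModel(train_corpus, id2word=train_dict, num_topics=2)

# index the training documents in LSI space
index = similarities.MatrixSimilarity(lsi[train_corpus])

# similarity of one query document against every indexed training document
query_bow = train_dict.doc2bow(["agenda", "tomorrow"])
sims = index[lsi[query_bow]]
print(list(enumerate(sims)))  # (training position, cosine similarity) pairs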
def prune_dict(docs, lower=0.1, upper=0.9):
    dicti = dictionary.Dictionary(docs)
    # no_below is an absolute document count, so convert the lower fraction
    lower *= len(docs)
    dicti.filter_extremes(no_above=upper, no_below=int(lower))
    return dicti
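# Hedged usage example for prune_dict above (assumes the surrounding module
# imports `from gensim.corpora import dictionary`; the sample documents are
# made up):
docs = [["the", "apple", "pie"],
        ["the", "banana", "split"],
        ["the", "apple", "cake"],
        ["the", "banana", "bread"]]
pruned = prune_dict(docs, lower=0.5, upper=0.9)
# "the" (too common) and the singleton tokens (too rare) are removed,
# leaving only "apple" and "banana"
print(pruned.token2id)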
def corpus(words):
    # words is expected to be a list of tokenized documents
    doc_dict = dictionary.Dictionary(words)
    # doc2bow expects one document at a time, so convert each document separately
    doc_corpus = [doc_dict.doc2bow(doc) for doc in words]
    return doc_corpus