def create_corpus(src, out_dir, no_below=20, keep_words=_DEFAULT_KEEP_WORDS):
    """Build the dictionary, the bag-of-words corpus and the TF-IDF corpus for the cables."""
    wordid_filename = os.path.join(out_dir, 'cables_wordids.pickle')
    bow_filename = os.path.join(out_dir, 'cables_bow.mm')
    tfidf_filename = os.path.join(out_dir, 'cables_tfidf.mm')
    predicate = None  # Could be set to something like pred.origin_filter(pred.origin_germany)
    # 1. Create word dict
    dct = Dictionary()
    dct_handler = DictionaryHandler(dct)
    handler = create_filter(dct_handler)
    handle_source(src, handler, predicate)
    dct.filter_extremes(no_below=no_below, no_above=0.1, keep_n=keep_words)
    dct.save(wordid_filename)
    # 2. Reiterate through the cables and create the vector space
    corpus_handler = CorpusHandler(out_dir, dct=dct, allow_dict_updates=False)
    handler = create_filter(corpus_handler)
    handle_source(src, handler, predicate)
    # 3. Load corpus
    mm = MmCorpus(bow_filename)
    # 4. Create TF-IDF model
    tfidf = TfidfModel(mm, id2word=dct, normalize=True)
    # 5. Save the TF-IDF model
    MmCorpus.serialize(tfidf_filename, tfidf[mm], progress_cnt=10000)
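The same five steps can be reproduced with plain gensim primitives when the cable-specific helpers (DictionaryHandler, CorpusHandler, handle_source) are not available. A minimal sketch under that assumption; the token lists and output paths are placeholders:

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

# placeholder documents standing in for the tokenized cables
tokenized_docs = [
    ["cable", "from", "embassy"],
    ["cable", "about", "trade"],
]

dct = Dictionary(tokenized_docs)                     # 1. create word dict
bow = [dct.doc2bow(doc) for doc in tokenized_docs]   # 2. create the vector space
MmCorpus.serialize("example_bow.mm", bow)            # ...and persist it
mm = MmCorpus("example_bow.mm")                      # 3. load corpus (streamed from disk)
tfidf = TfidfModel(mm, id2word=dct, normalize=True)  # 4. create TF-IDF model
MmCorpus.serialize("example_tfidf.mm", tfidf[mm])    # 5. save the TF-IDF corpus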
def lda():
    data = pd.read_csv("./lda_model/process.csv", encoding="utf-8", header=None)
    # data[2] = data[1].apply(lambda x: re.split(r'\s*', x))
    data[2] = data[1].apply(lambda x: x.split(' '))
    corpora_documents = []
    # print(data[2])
    for item_str in data[2]:
        # print(item_str)
        corpora_documents.append(item_str)
    del corpora_documents[0]
    print(corpora_documents[0])

    dict_1 = corpora.Dictionary(corpora_documents)
    dict_1.save('./lda_model/dict_v2')
    dict_corpora = [dict_1.doc2bow(i) for i in corpora_documents]
    print('Dictionary construction finished')
    # each element of a vector is the number of times that word occurs in the document
    # print(corpus)
    from gensim.corpora.mmcorpus import MmCorpus
    MmCorpus.serialize('ths_corpora.mm', dict_corpora)

    tfidf = models.TfidfModel(dict_corpora)
    corpus_tfidf = tfidf[dict_corpora]
    tfidf.save("./lda_model/my_model.tfidf")

    np.random.seed(SOME_FIXED_SEED)
    lda = models.LdaModel(corpus_tfidf, num_topics=78, id2word=dict_1, iterations=1000)
    # lda.save('./lda_model/mylda_v2')
    lda.show_topics()
def save_corpus(cls, tokens_file, corpus_file, dictionary_path):
    print "Instantiating corpus"
    corpus = UserCorpus(tokens_file)
    print "Filtering extremes"
    corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)
    print "Serializing corpus"
    MmCorpus.serialize(corpus_file, corpus, progress_cnt=10000)
    print "Serializing dictionary"
    corpus.dictionary.save_as_text(dictionary_path)
def SNAP_generateMmCorpus(self, topic):
    if topic == 'all':
        topics = ['bieber', 'cyrus', 'syria', 'ufo']
        for t in topics:
            self.SNAP_generateMmCorpus(t)
        return

    corpus = self.SNAP_corpusForTopic(topic)
    outputPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'snap_data',
        "gensim_snap_mmcorpus_%s.mm" % topic
    )
    id2word = self.SNAP_id2word()
    MmCorpus.save_corpus(outputPath, corpus, id2word)
    return
def convert_corpus_to_sparse_tfidf(
    metadata_index_outpath: Path,
    vectorized_corpus_outpath: Path,
    path_to_jsonl_index: Path = BIOPAPERS_JSON_PATH,
    path_to_bow: Path = BOW_PATH,
    tfidf_vectorizer: Path = TFIDF_VECTORIZER,
):
    """
    Convert corpora of a specific category into a tfidf sparse matrix.

    It saves:
    1. MM Corpus sparse matrix indexed by id
    2. metadata index as gz pickle file

    Parameters
    ----------
    metadata_index_outpath: Path
        Path to metadata index (without extension)
    vectorized_corpus_outpath: Path
        Path to corpus matrix, does not require extension
    path_to_jsonl_index: Path
        Path to jsonl_index for that specific category
    path_to_bow: Path
        Path to BOW dictionary
    tfidf_vectorizer: Path
        Path to TFIDF model for vectorization of BOW corpus
    """
    # Load dictionary
    if path_to_bow.exists():
        bow_dictionary = Dictionary.load(str(path_to_bow))
    else:
        bow_dictionary = create_bow_from_biopapers()
    # Load tfidf model:
    if tfidf_vectorizer.exists():
        tfidf_model = TfidfModel.load(str(tfidf_vectorizer))
    else:
        tfidf_model = create_tfidf_from_papers()
    # Add jsonl suffix to the metadata index path:
    metadata_index_outpath = metadata_index_outpath.with_suffix(".jsonl")
    # Load corpus generator:
    tfidf_corpus = BiopapersCorpus(
        bow_dictionary=bow_dictionary,
        path_to_JSONL_index=path_to_jsonl_index,
        tfidf_vectorizer=tfidf_model,
        metadata_index_outpath=metadata_index_outpath,
    )
    # Save corpus and index to file:
    MmCorpus.serialize(str(vectorized_corpus_outpath), tfidf_corpus)
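Once serialized, the artifacts written above can be streamed back without loading everything into memory. A hedged read-back sketch, assuming placeholder output paths and that the metadata index has one JSON line per document:

import json
from gensim.corpora import MmCorpus

tfidf_corpus = MmCorpus("biopapers_tfidf.mm")      # placeholder for vectorized_corpus_outpath
with open("biopapers_metadata.jsonl") as fh:       # placeholder for metadata_index_outpath
    metadata = [json.loads(line) for line in fh]

for doc_id, doc in enumerate(tfidf_corpus):
    # each doc is a list of (token_id, tfidf_weight) pairs, assumed aligned with metadata[doc_id]
    pass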
def load_model(output_path):
    """
    Load working model

    Loads working model, BoW corpus and initial dataframe with tokens

    Outputs: Writes a .txt file of the top 7 words per topic for subsequent inspection.
    """
    # load data
    combined_df = pd.read_csv(
        pkg_resources.resource_filename(resource_package, "data/data_processed.csv"),
        index_col=0)
    corpus = MmCorpus(
        pkg_resources.resource_filename(resource_package, "data/BoW_corpus.mm"))
    print('Data loaded.')
    # load the mallet model
    ldamallet = gensim.models.wrappers.LdaMallet.load(
        pkg_resources.resource_filename(
            resource_package, 'model/working_ldamallet_model.gensim'))
    print('Model loaded.')
    # write out topics to a text file
    topics = ldamallet.print_topics(num_topics=-1, num_words=7)
    with open(os.path.join(output_path, 'LDA_topics.txt'), 'w') as topic_file:
        for topic in topics:
            topic_file.write(str(topic) + '\n')
    print('Topics written to data folder.')
    return (corpus, ldamallet, combined_df)
def main(training_datafile, output_path):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger = logging.getLogger('Archive.gensim')

    filters = [strip_punctuation, strip_multiple_whitespaces, strip_numeric,
               remove_stopwords, strip_short]

    logger.info('Creating Corpus object...')
    corpus = ArchiveCorpus(datafile=training_datafile, preprocess=filters)

    filename = ''.join(training_datafile.split('/')[-1])
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    outfile_path = os.path.join(output_path, filename)

    logger.info('Saving corpus to disk: {}.mm'.format(filename))
    MmCorpus.serialize('{}.mm'.format(outfile_path), corpus, progress_cnt=1000)

    logger.info('Saving dictionary to disk: {}.dict'.format(filename))
    corpus.dictionary.save('{}.dict'.format(outfile_path))
def setUp(self):
    self.corpus = MmCorpus(datapath('testcorpus.mm'))
    self.class_ = ldamodel.LdaModel
    self.model = self.class_(common_corpus, id2word=common_dictionary, num_topics=2, passes=100)
def create_corpus(column, shortname, type_n):
    my_corpus = Corpus_Column(fname, column)
    dicti = my_corpus.dictionary
    # shortname1 has only freq < 5 removed; others have filter_extremes
    once_ids = [tokenid for tokenid, docfreq in dicti.dfs.iteritems() if docfreq < 5]
    dicti.filter_tokens(bad_ids=once_ids)
    # dicti.filter_extremes()
    dicti.compactify()
    dicti.save(path_join(cache_dir, "%s_%s_nltk_filtered_dic.pickle" % (type_n, shortname)))
    MmCorpus.serialize(path_join(cache_dir, "%s_%s_nltk_filtered.corpus.mtx" % (type_n, shortname)),
                       my_corpus, id2word=dicti)
    print my_corpus.dictionary
    print "50 most used in %s" % column
    i = 0
    for k, v in sorted(dicti.dfs.items(), key=operator.itemgetter(1), reverse=True):
        if i < 50:
            print dicti[k], v
        i = i + 1
def load_models(args):
    """
    Load tfidf model, corpus, and dictionary if specified in arguments.

    input: args (argparse object): input arguments

    return: loaded tfidf object, corpus, and dictionary if specified
    """
    try:
        tfidf = models.TfidfModel.load(os.path.join(args.tfidf_model_dir_path, "model"))
        corpus = mm = MmCorpus(os.path.join(args.tfidf_model_dir_path, "corpus"))
        mydict = corpora.Dictionary.load(os.path.join(args.tfidf_model_dir_path, "dictionary"))
    except FileNotFoundError:
        print(timestamp(), "Tf-idf model directory path must contain model, corpus, and dictionary.",
              file=sys.stderr)
        exit(1)
    return tfidf, corpus, mydict
def train(self, corpus, chunksize=10000, use_temp_files=True):
    """
    Train the underlying linear mappings.

    @param corpus: a corpus in a gensim-compatible format
    @param use_temp_files: determines whether to use temporary files to store the
        intermediate representations of the corpus used to train the next layer.
        Setting the flag to True will not greatly affect memory usage, but will
        temporarily require a significant amount of disk space. Using temp files
        will strongly speed up training, especially as the number of layers increases.
    """
    ln.info("Training mSDA with %s layers.", len(self.mda_layers) + 1)
    if not use_temp_files:
        ln.warn("Training without temporary files. May take a long time!")
        self.reduction_layer.train(corpus, chunksize=chunksize)
        current_representation = self.reduction_layer[corpus]

        for layer_num, layer in enumerate(self.mda_layers):
            # We feed the corpus through all intermediate layers to get the current representation;
            # that representation is then used to train the next layer.
            # This is memory-independent, but will probably be very slow.
            ln.info("Training layer %s.", layer_num)
            layer.train(current_representation, chunksize=chunksize)
            if layer_num < len(self.mda_layers) - 1:
                current_representation = layer[current_representation]
    else:
        ln.info("Using temporary files to speed up training.")
        ln.info("Beginning training on %s layers." % (len(self.mda_layers) + 1))

        self.reduction_layer.train(corpus, chunksize=chunksize)

        # serialize the intermediate representation
        MmCorpus.serialize(".msda_intermediate.mm", self.reduction_layer[corpus], progress_cnt=chunksize)
        # load corpus to train the next layer
        current_representation = MmCorpus(".msda_intermediate.mm")

        for layer_num, layer in enumerate(self.mda_layers):
            layer.train(current_representation, chunksize=chunksize)

            os.remove(".msda_intermediate.mm")
            os.remove(".msda_intermediate.mm.index")

            if layer_num < len(self.mda_layers) - 1:
                MmCorpus.serialize(".msda_intermediate.mm", layer[current_representation], progress_cnt=chunksize)
                current_representation = MmCorpus(".msda_intermediate.mm")

    ln.info("mSDA finished training.")
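The temp-file strategy the docstring describes can be shown in isolation: transform the corpus lazily, serialize the transformed stream to disk, then re-open it as an MmCorpus to train the next stage. A small sketch with TfidfModel standing in for a layer's transformation (all names and paths are placeholders):

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel

docs = [["alpha", "beta"], ["beta", "gamma", "gamma"]]
dct = Dictionary(docs)
bow = [dct.doc2bow(d) for d in docs]

stage_one = TfidfModel(bow)                              # "train" the first stage
MmCorpus.serialize(".intermediate.mm", stage_one[bow])   # materialize its output on disk
intermediate = MmCorpus(".intermediate.mm")              # stream it back, independent of corpus size
# ...train the next stage on `intermediate`, then remove .intermediate.mm and .intermediate.mm.index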
def corpus_tfidf():
    path = ""
    corpus = MmCorpus(path + "corpus.mm")
    id2word = Dictionary.load(path + 'corpus.mm.dict')

    # TF-IDF the corpus
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    tfidf.save("5_topics_tfidf_only.model")

    lda_model_tfidf = models.LdaModel(corpus_tfidf, num_topics=5, id2word=id2word)
    # models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=id2word, passes=2, workers=4)  # better model
    print('\nPerplexity: ', lda_model_tfidf.log_perplexity(corpus))  # a measure of how good the model is; lower is better
    for idx, topic in lda_model_tfidf.print_topics(-1):
        print('Topic: {} Word: {}'.format(idx, topic))

    lda_model_tfidf.save(path + "5_topics_test.model")
    # lda_model_tfidf.wv.save(path + "5_topics_test_kv.model")  # LdaModel has no .wv attribute; the full model is already saved above
def exp_variable_selection(dict_name, corpus_name, N=2, n_noise_term=10, n_epoches=20, \
                           learning_rate=.001, batch_size=30, n_hidden=50):
    """
    Main function for selecting variables and calculating embeddings for the
    selected embedding vectors by using a vanilla RBM.
    """
    # load existing dictionary (or create a new dictionary from scratch)
    # code for creating new dictionary ...
    ngram_dict = corpora.Dictionary.load(dict_name)
    # select key ids of some random ngram terms from the loaded dictionary as dictionary noise
    random_terms = list(set(ngram_dict.keys()) - set(PRESERV_TERMS))
    noise_terms = random.sample(random_terms, n_noise_term)
    print("[%s] [Var Select] %d noise terms have been added: %s" % \
        (arrow.now(), len(noise_terms), [ngram_dict[key] for key in noise_terms]), file=sys.stderr)
    # # shrink dictionary to a subset in accordance with PRESERV_TERMS
    # sub_ngram_dict = sub_dictionary(ngram_dict, PRESERV_TERMS, by_key=True)

    # load existing corpus
    corpus = MmCorpus(corpus_name)
    dense_corpus = corpus2dense(corpus, num_terms=len(ngram_dict)).transpose()
    print("[%s] [Var Select] raw corpus has been loaded with size (%d, %d)" % \
        (arrow.now(), dense_corpus.shape[0], dense_corpus.shape[1]), file=sys.stderr)
    # slice the corpus by PRESERV_TERMS and PRESERV_DOCS
    # (remove columns which are not included in PRESERV_TERMS)
    # note: indexing arrays could not be broadcast together,
    # e.g. dense_corpus[PRESERV_DOCS, PRESERV_TERMS]
    corpus_slice = dense_corpus[:, PRESERV_TERMS + noise_terms]
    corpus_slice = corpus_slice[PRESERV_DOCS, :]
    print("[%s] [Var Select] corpus has been sliced with size (%d, %d)" % \
        (arrow.now(), corpus_slice.shape[0], corpus_slice.shape[1]), file=sys.stderr)
    # mat2img(np.log(corpus_slice))

    rbm = GBRBM(n_visible=corpus_slice.shape[1], n_hidden=n_hidden, \
                learning_rate=learning_rate, momentum=0.95, err_function='mse', \
                use_tqdm=False, sample_visible=False, sigma=1.)
    rbm.fit(corpus_slice, n_epoches=n_epoches, batch_size=batch_size, \
            shuffle=True, verbose=True)
    embeddings = rbm.transform(corpus_slice).round().astype(int)
    # w, vbias, hbias = rbm.get_weights()
    # mat2img(w)
    return corpus_slice, embeddings
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 10 18:21:00 2019

@author: evefl
"""
from pprint import pprint
from gensim.corpora.mmcorpus import MmCorpus
from gensim.corpora.dictionary import Dictionary
from gensim import models

# set path to wherever you download the files
path = '/data/'

corpus = MmCorpus("corpus.mm")  # MmCorpus('%scorpus.mm' % path)  # BOW
id2word = Dictionary.load('corpus.mm.dict')  # '%scorpus.mm.dict' % path

for doc in corpus[:1]:
    for word in doc[:2000]:
        print(word)
        print(id2word[word[0]])

# TF-IDF the corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# for doc in corpus_tfidf:  # preview tfidf scores for first document
#     pprint(doc)
#     break
print "we will lemmatize ('you were'->'be/VB')" mname = prefix + '_lemmatized_tfidf' else: print "you don't have pattern: we will tokenize ('you were'->'you','were')" mname = prefix + '_tokenized_tfidf' try: id2token = Dictionary.load_from_text(mname + '_wordids.txt') mm = MmCorpus(mname + '_bow.mm') print ">>> Loaded corpus from serialized files" except: print ">>> Extracting articles..." corpus = CDS_Corpus(FOLDER) corpus.dictionary.save_as_text(mname + '_wordids.txt') print ">>> Saved dictionary as " + mname + "_wordids.txt" MmCorpus.serialize(mname + '_bow.mm', corpus, progress_cnt=1000) print ">>> Saved MM corpus as " + mname + "_bow.mm" id2token = Dictionary.load_from_text(mname + '_wordids.txt') mm = MmCorpus(mname + '_bow.mm') del corpus print ">>> Using TF-IDF" tfidf = models.TfidfModel(mm, id2word=id2token, normalize=True) corpus_tfidf = tfidf[mm] lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=id2token, num_topics=N_TOPICS, alpha='auto', update_every=1, chunksize=800, passes=50) f = open(mname + '.ldamodel', 'w') cPickle.dump(lda, f)
            for topic_id in range(num_topics):
                # calc p(w): p(w) = sum_z(p(z) * p(w|z))
                prob_topic = doc_topics_ist[i][topic_id][1]
                prob_topic_word = topic_word_list[topic_id][word]
                prob_word += prob_topic * prob_topic_word
            prob_doc += math.log(prob_word)  # p(d) = sum(log(p(w)))
        prob_doc_sum += prob_doc
        testset_word_num += doc_word_num
    prep = math.exp(-prob_doc_sum / testset_word_num)  # perplexity = exp(-sum(log(p(d))) / sum(Nd))
    # print("the perplexity of this ldamodel is : %s" % prep)
    return prep


topicnum_perplexity = []
corpus = MmCorpus('./ths_corpora.mm')

testset = []
import random
for i in random.sample(range(corpus.num_docs), corpus.num_docs // 100):
    testset.append(corpus[i])

for topic_num in range(20, 60, 3):
    # lda = models.LdaModel(dict_corpora, num_topics=topic_num, id2word=dict_1, iterations=1000)
    # prep = perplexity(lda, testset, dict_1, len(dict_1.keys()), topic_num)
    # print(topic_num, "success!!!!!!!!!!!!!!!!!!!", prep)
    # topicnum_perplexity.append([topic_num, prep])
    lda_tfidf = models.LdaModel(corpus_tfidf, num_topics=topic_num, id2word=dict_1, iterations=1000)
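For comparison, gensim's LdaModel can report held-out perplexity directly: log_perplexity() returns a per-word likelihood bound, and the corresponding perplexity is 2 raised to the negative bound. A short continuation sketch using lda_tfidf and testset as defined above:

per_word_bound = lda_tfidf.log_perplexity(testset)
perplexity = 2 ** (-per_word_bound)
print("perplexity at num_topics=%d: %.2f" % (topic_num, perplexity))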
    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_wiki(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)

    def __len__(self):
        return self.clip_docs


# create a stream of bag-of-words vectors
wiki_corpus = WikiCorpus(fileLocation + 'enwiki-latest-pages-articles.xml.bz2', id2word_wiki)
vector = next(iter(wiki_corpus))
print(vector)  # print the first vector in the stream

MmCorpus.serialize(fileLocation + 'wikiModels/wiki_bow.mm', wiki_corpus)
mm_corpus = MmCorpus(fileLocation + 'wikiModels/wiki_bow.mm')
print(mm_corpus)

clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 4000)
lda_model = gensim.models.LdaModel(clipped_corpus, num_topics=10, id2word=id2word_wiki, passes=4)

# store all trained models to disk
lda_model.save(fileLocation + 'wikiModels/lda_wiki.model')
# lsi_model.save('./data/lsi_wiki.model')
# tfidf_model.save('./data/tfidf_wiki.model')
id2word_wiki.save(fileLocation + 'wikiModels/wiki.dictionary')

loaded_lda_model = gensim.models.LdaModel.load(fileLocation + 'wikiModels/lda_wiki.model')
def search(request):
    if request.method == 'POST':
        global catch
        catch = request.POST['title']
        data = [catch]

        stop_words = stopwords.words('indonesian')
        stop_words2 = stopwords.words('english')
        stop_words.extend(stop_words2)
        stop_words.extend([
            'of', 'in', 'and', 'the', 'for', 'on', 'using', 'based', 'from',
            'with', 'to', 'by', 'as', 'an', 'pengaruh', 'effect', 'analisis',
            'at', 'pre', 'pro', 'analysis', 'berbasis', 'tahun', 'between',
            'kualitas', 'method', 'metode', 'through', 'menggunakan', 'hasil'
        ])

        # Remove numbers
        data = [re.sub(" \d+", ' ', sent) for sent in data]
        data = [re.sub('[^a-zA-Z]', ' ', sent) for sent in data]
        # Remove new line characters
        data = [re.sub('\s+', ' ', sent) for sent in data]
        # Remove distracting single quotes
        data = [re.sub("\'", "", sent) for sent in data]

        def sent_to_words(sentences):
            for sentence in sentences:
                yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuation

        coba = sent_to_words(data)
        data_words = list(coba)

        # Build the bigram and trigram models
        bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
        trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        # Define functions for stopwords, bigrams, trigrams and lemmatization
        # from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
        def remove_stopwords(texts):
            return [[
                word for word in simple_preprocess(str(doc))
                if word not in (stop_words or stop_words2)
            ] for doc in texts]

        def make_bigrams(texts):
            return [bigram_mod[doc] for doc in texts]

        def make_trigrams(texts):
            return [trigram_mod[bigram_mod[doc]] for doc in texts]

        def lemmatization(texts):
            """https://spacy.io/api/annotation"""
            texts_out = []
            for sent in texts:
                doc = nlp(" ".join(sent))
                texts_out.append([token.lemma_ for token in doc])
            return texts_out

        # Remove stop words
        data_words_nostops = remove_stopwords(data_words)
        # Form bigrams
        data_words_bigrams = make_bigrams(data_words_nostops)

        nlp = spacy.load('en_core_web_sm')
        data_lemmatized = lemmatization(data_words_bigrams)

        # stem each of the words
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        for x in range(len(data_lemmatized) - 1):
            for y in range(len(data_lemmatized[x]) - 1):
                data_lemmatized[x][y] = stemmer.stem(data_lemmatized[x][y])

        id2wordd = corpora.Dictionary(data_lemmatized)
        # Create Corpus
        texts = data_lemmatized
        # Term Document Frequency
        corpuss = [id2wordd.doc2bow(text) for text in texts]

        id2word = Dictionary.load('papers/id2word_new.dict')
        corpus = MmCorpus('papers/corpus_new.mm')

        # import gensim
        model = gensim.models.ldamodel.LdaModel.load('papers/mallet_18_lda.mdl', mmap='r')
        new_doc_bow = id2word.doc2bow(data_lemmatized[0])
        hasil = model.get_document_topics(new_doc_bow)

        topic = 0
        nilai = -99
        for i, row in (hasil):
            if (row > nilai):
                topic = i
                nilai = row

        keywords = []
        for i, nilai in model.show_topic(topic):
            keywords.append(i)

        # topics = Topics.objects.filter(id_topic=topic).values_list('id_publication', flat=True)

        # load data
        df = pd.read_csv('papers/label18baru.csv')
        with open("papers/lemma_new.txt", "rb") as fp:  # Pickling
            data_lemmatizedd = pickle.load(fp)

        # initialize containers for the results
        hasil_cosine_keseluruhan = []
        hasil_cosine = []

        # select the rows that match the topic
        # topic=df
        topik = df.loc[df['Topic1'] == topic]

        # build the lemma data, corpus and dictionary from the documents within this single topic
        res_list = [data_lemmatizedd[i] for i in topik.index]
        # Create Dictionary
        id2worddd = corpora.Dictionary(res_list)
        # Create Corpus
        texts = res_list
        # Term Document Frequency
        corpusss = [id2worddd.doc2bow(text) for text in res_list]

        # compute the cosine similarity of the query title against all existing titles
        index_tmpfile = get_tmpfile("index")
        index = Similarity(index_tmpfile, corpusss, num_features=len(id2worddd))
        index = MatrixSimilarity(corpusss, num_features=len(id2worddd))
        sims = index[corpuss]

        sort_index = np.argsort(sims[0])
        reversed_arr = sort_index[::-1]
        hasil = pd.DataFrame(reversed_arr)
        hasilbaru = hasil.iloc[:40, :]
        hasilmantep = hasilbaru.to_numpy()

        idfix = []
        for i in range(0, 40):
            idfix.append(hasilmantep[i][0])

        ngetest = topik.to_numpy()
        id_artikel = []
        for i in idfix:
            id_artikel.append(ngetest[i][9])

        global user_list
        user_list = Papers.objects.filter(id_pub__in=id_artikel).order_by('id_pub')

        topic_dict = {
            '0': 'Kimia',
            '1': 'Industri',
            '2': 'Biologi-Tumbuhan',
            '3': 'Biologi-Pangan',
            '4': 'Mikrobiologi',
            '5': 'Studi-Penemuan',
            '6': 'Sosial-Masyarakat-Sejarah',
            '7': 'Habitat Makhluk Hidup',
            '8': 'Elektro-Mesin',
            '9': 'Pendidikan',
            '10': 'Sosial-Pengaruh',
            '11': 'Pertanian',
            '12': 'Data-Citra-Statistik',
            '13': 'Jawa-Indonesia',
            '14': 'Masyarakat',
            '15': 'Biokimia',
            '16': 'Kesehatan',
            '17': 'Kesehatan 2',
        }
        global hasiltopik
        hasiltopik = topic_dict.get(str(topic))

        page = request.GET.get('page', 1)
        paginator = Paginator(user_list, 10)
        try:
            users = paginator.page(page)
        except PageNotAnInteger:
            users = paginator.page(1)
        except EmptyPage:
            users = paginator.page(paginator.num_pages)

        context = {
            'title': 'Halaman Utama',
            'topic': hasiltopik,
            'catch': catch,
            'users': users,
        }
        return render(request, 'papers/index.html', context)
    else:
        page = request.GET.get('page', 1)
        paginator = Paginator(user_list, 10)
        try:
            users = paginator.page(page)
        except PageNotAnInteger:
            users = paginator.page(1)
        except EmptyPage:
            users = paginator.page(paginator.num_pages)

        context = {
            'title': 'Halaman Utama',
            'topic': hasiltopik,
            'catch': catch,
            'users': users,
        }
        return render(request, 'papers/index.html', context)
async def load_corpus():
    if 'corpus' not in model:
        model['corpus'] = MmCorpus(await tasks['corpus'])
    return model['corpus']
def setUp(self):
    self.corpus = MmCorpus(datapath('testcorpus.mm'))
metadata_queue = []


class MetadataRemovedCorpus:
    def __init__(self, corpus):
        self.corpus = corpus

    def __iter__(self):
        for doc, metadata in self.corpus:
            metadata_queue.append(metadata)
            yield doc


tfidf_corpus = tfidf[MetadataRemovedCorpus(wiki_corpus)]


class MetadataAddedCorpus:
    def __init__(self, corpus):
        self.corpus = corpus
        self.metadata = True

    def __iter__(self):
        for doc in self.corpus:
            yield doc, metadata_queue.pop()


tfidf_metadata_corpus = MetadataAddedCorpus(tfidf_corpus)

if vector_format == 'tfidf':
    corpus = tfidf_metadata_corpus
elif vector_format == 'bow':
    corpus = wiki_corpus

MmCorpus.serialize(mm_fname, corpus, progress_cnt=10000, metadata=True)
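A minimal sketch of the same (document, metadata) contract in isolation: with metadata=True, the corpus is expected to yield (doc, metadata) pairs, and gensim writes the metadata to a companion file next to the .mm file (to my understanding a .metadata.cpickle file). All names below are placeholders:

from gensim.corpora import MmCorpus

class TinyMetadataCorpus:
    metadata = True

    def __iter__(self):
        docs = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]
        titles = [("doc-0", "First title"), ("doc-1", "Second title")]
        for doc, meta in zip(docs, titles):
            yield doc, meta

MmCorpus.serialize("tiny_meta.mm", TinyMetadataCorpus(), metadata=True)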
        files. Only `.txt` files will be taken into account.
    dataset_name : str
        A name for the directory where the processed corpus is to be placed.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-docs_dir", type=str)
    parser.add_argument("-dataset_name", type=str)
    args = vars(parser.parse_args())
    documents_dir = args["docs_dir"]
    dataset_name = args["dataset_name"]

    document_paths = [
        os.path.join(documents_dir, d)
        for d in os.listdir(documents_dir)
        if d.endswith(".txt")
    ]

    # Write document index to id mapping
    doc_ids = [
        d.replace(".txt", "")
        for d in os.listdir(documents_dir)
        if d.endswith(".txt")
    ]
    doc_idxs = {i: doc_ids[i] for i in range(len(doc_ids))}
    os.makedirs(os.path.dirname(doc_idxs_file.format(id=dataset_name)))
    with open(doc_idxs_file.format(id=dataset_name), "w") as f:
        json.dump(doc_idxs, f, indent=2)

    # Create tokenizer and tokenize corpus in a single pass
    corpus_file = corpus_file.format(id=dataset_name)
    tokenizer_file = tokenizer_file.format(id=dataset_name)
    os.makedirs(os.path.dirname(corpus_file), exist_ok=True)
    corpus_builder = IterativeCorpusBuilder(document_paths, 2 * 10**6)
    MmCorpus.serialize(fname=corpus_file, corpus=corpus_builder)
    corpus_builder.tokenizer.save(tokenizer_file)
import logging
import sys

from gensim.corpora import WikiCorpus
from gensim.corpora.mmcorpus import MmCorpus

if __name__ == '__main__':
    # Log both to a file and the console
    log_name = 'tut4.log'
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    fh = logging.FileHandler(log_name)
    fh.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
    ch.setFormatter(formatter)
    fh.setFormatter(formatter)
    logger.addHandler(ch)
    logger.addHandler(fh)

    logger.info("running %s" % ' '.join(sys.argv))

    fname = 'simplewiki-20120313-pages-articles.xml.bz2'
    wiki = WikiCorpus(fname)

    # save dictionary and bag-of-words (term-document frequency matrix)
    output = 'simple_wiki'
    wiki.dictionary.save_as_text(output + '_wordids.txt')
    MmCorpus.serialize(output + '_bow.mm', wiki, progress_cnt=10000)
    del wiki
if len(sys.argv) < 3:
    print globals()["__doc__"] % locals()
    sys.exit(1)
input, output = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep 100k most frequent words (out of total ~8.2m unique tokens)
# takes about 9h on a macbook pro, for 3.5m articles (june 2011 wiki dump)
wiki = WikiCorpus(input, keep_words=keep_words)

# save dictionary and bag-of-words (term-document frequency matrix)
# another ~9h
wiki.dictionary.save_as_text(output + "_wordids.txt")
MmCorpus.serialize(output + "_bow.mm", wiki, progress_cnt=10000)
del wiki

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(output + "_wordids.txt")
mm = MmCorpus(output + "_bow.mm")

# build tfidf, ~30min
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
# ~2h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(output + "_tfidf.mm", tfidf[mm], progress_cnt=10000)
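A plausible next step after the TF-IDF corpus, in the spirit of the same wiki tutorial: stream the serialized TF-IDF vectors back and fit an LSI model on them. This is only a sketch; the num_topics value is illustrative and `output`/`id2token` are reused from the script above.

from gensim.models import LsiModel

mm_tfidf = MmCorpus(output + "_tfidf.mm")
lsi = LsiModel(corpus=mm_tfidf, id2word=id2token, num_topics=400)
lsi.save(output + "_lsi.model")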
    def save(self, fname):
        self.model.save(fname)


ln.info("preprocessing corpus")
dictionary = Dictionary()
preprocessor = Preprocessor(use_stemming=False, dictionary=dictionary)
corpusname = "brown"

corpus = [preprocessor.preprocess(" ".join(text), return_bow=True) for text in brown.sents()]
preprocessor.dictionary.filter_extremes(15, 0.1, 30000)
corpus = [preprocessor.preprocess(" ".join(text), allow_update=False, return_bow=True) for text in brown.sents()]

ln.debug("saving/loading corpus")
save = MmCorpus.serialize("test.mm", corpus)
corpus = MmCorpus("test.mm")

dimensions = 2000
params = [{"num_layers": 5, "noise": 0.7}, {"num_layers": 3, "noise": 0.3}][0]

ln.info("training mSDA with %s dimensions. params: %s" % (dimensions, params))
model = mSDAWrapper.train(corpus, dimensions, dictionary, params)

paramstring = "_".join(["%s-%s" % (k, v) for k, v in params.items()])
savestring = "mSDA_%s_%s_" % (corpusname, paramstring)
model.save(savestring)

msda_wrapper = mSDAWrapper(savestring, preprocessor)
preprocessor = Preprocessor(use_stemming=False, dictionary=dictionary)
corpusname = "brown"

corpus = [
    preprocessor.preprocess(" ".join(text), return_bow=True)
    for text in brown.sents()
]
preprocessor.dictionary.filter_extremes(15, 0.1, 30000)
corpus = [
    preprocessor.preprocess(" ".join(text), allow_update=False, return_bow=True)
    for text in brown.sents()
]

ln.debug("saving/loading corpus")
save = MmCorpus.serialize("test.mm", corpus)
corpus = MmCorpus("test.mm")

dimensions = 2000
params = [{"num_layers": 5, "noise": 0.7}, {"num_layers": 3, "noise": 0.3}][0]

ln.info("training mSDA with %s dimensions. params: %s" % (dimensions, params))
model = mSDAWrapper.train(corpus, dimensions, dictionary, params)

paramstring = "_".join(["%s-%s" % (k, v) for k, v in params.items()])
savestring = "mSDA_%s_%s_" % (corpusname, paramstring)
model.save(savestring)

msda_wrapper = mSDAWrapper(savestring, preprocessor)


def get_synonyms(word):
def __init__(self, docs, topic_number=500):
    # Create a dictionary representation of the documents.
    print('training LSI models with topic number = ' + str(topic_number))
    if (not os.path.isfile('./lsi/lsi_dict.dict')):
        print('creating dict')
        dictionary = Dictionary(docs)
        dictionary.save('./lsi/lsi_dict.dict')
    else:
        print('dict already exists')
        dictionary = Dictionary.load("./lsi/lsi_dict.dict")
    self.dictionary = dictionary

    # Create corpora
    if (not os.path.isfile('./lsi/lsi_corpus.mm')):
        # Filter out words that occur in fewer than 20 documents, or more than 50% of the documents.
        print('creating bow corpus')
        dictionary.filter_extremes(no_below=20, no_above=0.5)
        corpus = [dictionary.doc2bow(doc) for doc in docs]
        MmCorpus.serialize("lsi/lsi_corpus.mm", corpus)
    else:
        print('bow corpus already exists')
        corpus = MmCorpus("./lsi/lsi_corpus.mm")

    self.tfidf = models.TfidfModel(corpus)
    if (not os.path.isfile('./lsi/lsi_tf_corpus.mm')):
        print('creating tf corpus')
        tf_corp = self.tfidf[corpus]
        MmCorpus.serialize("lsi/lsi_tf_corpus.mm", tf_corp)
    else:
        print('tf corpus already exists')
        tf_corp = MmCorpus("./lsi/lsi_tf_corpus.mm")

    # Make an index-to-word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    # Create the models and vectors
    if (not os.path.isfile('./lsi/lsi_bow_model' + str(topic_number) + '.model')):
        print('creating bow model')
        bow_model = models.LsiModel(corpus=corpus, num_topics=topic_number, id2word=id2word)
        bow_model.save('lsi/lsi_bow_model' + str(topic_number) + '.model')
    else:
        print('bow model already exists')
        bow_model = models.LsiModel.load('./lsi/lsi_bow_model' + str(topic_number) + '.model')
    bow_vector = bow_model[corpus]
    self.bow_model = bow_model

    if (not os.path.isfile('./lsi/lsi_tf_model' + str(topic_number) + '.model')):
        print('creating tfidf model')
        tf_model = models.LsiModel(corpus=tf_corp, num_topics=topic_number, id2word=id2word)
        tf_model.save('./lsi/lsi_tf_model' + str(topic_number) + '.model')
    else:
        print('tfidf model already exists')
        tf_model = models.LsiModel.load('./lsi/lsi_tf_model' + str(topic_number) + '.model')
    tf_vector = tf_model[tf_corp]
    self.tf_model = tf_model

    # Create indices
    if (not os.path.isfile('./lsi/lsi_bow_model' + str(topic_number) + '.index')):
        print('creating bow index')
        bow_index = similarities.MatrixSimilarity(bow_vector)  # index corpus in bow LSI space
        bow_index.save('lsi/lsi_bow_model' + str(topic_number) + '.index')
    else:
        print('bow index already exists')
        bow_index = similarities.MatrixSimilarity.load('./lsi/lsi_bow_model' + str(topic_number) + '.index')
    self.bow_index = bow_index

    if (not os.path.isfile('./lsi/lsi_tf_model' + str(topic_number) + '.index')):
        print('creating tf index')
        tf_index = similarities.MatrixSimilarity(tf_vector)  # index corpus in tf LSI space
        tf_index.save('lsi/lsi_tf_model' + str(topic_number) + '.index')
    else:
        print('tf index already exists')
        tf_index = similarities.MatrixSimilarity.load('./lsi/lsi_tf_model' + str(topic_number) + '.index')
    self.tf_index = tf_index

    print('model created!')
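A hedged usage sketch for the class above: score a new document against the TF-IDF LSI index. `lsi_wrapper` is assumed to be an instance of this class and `query_tokens` a pre-tokenized document; both names are placeholders.

query_tokens = ["topic", "modelling", "with", "lsi"]
query_bow = lsi_wrapper.dictionary.doc2bow(query_tokens)
query_vec = lsi_wrapper.tf_model[lsi_wrapper.tfidf[query_bow]]   # BoW -> TF-IDF -> LSI space
scores = lsi_wrapper.tf_index[query_vec]                         # cosine similarities against the corpus
top_ten = sorted(enumerate(scores), key=lambda pair: -pair[1])[:10]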
def load(id):
    tokenizer = Dictionary.load(BM25Engine.tokenizer_file.format(id=id))
    corpus = MmCorpus(BM25Engine.corpus_file.format(id=id))
    with open(BM25Engine.doc_idxs_file.format(id=id), "r") as f:
        idxs2id = json.load(f)
    return BM25Engine(tokenizer, corpus, idxs2id)
def setUp(self):
    self.corpus = MmCorpus(datapath('testcorpus.mm'))
    self.model = lsimodel.LsiModel(self.corpus, num_topics=2)
def setUp(self):
    self.corpus_small = MmCorpus(datapath('test_corpus_small.mm'))
    self.corpus_ok = MmCorpus(datapath('test_corpus_ok.mm'))
    self.corpus_empty = []
def saveGensim(self, topic):
    if topic is None:
        # generate all
        self.saveGensim('movie')
        self.saveGensim('celebrity')
        self.saveGensim('syria')
        self.saveGensim('ufo')
        return

    posDocs = []
    negDocs = []

    if topic == 'movie':
        topic = 'movie_reviews'
    elif topic == 'celebrity':
        topic = 'bieber'

    if topic == 'movie_reviews':
        count = 100
        posDocs = self.movieReviews('positive', count)
        negDocs = self.movieReviews('negative', count)
    else:
        posDocs = self.getArticlesHelper('positive', topic)
        negDocs = self.getArticlesHelper('negative', topic)

    listOfTokens = []  # dictionary
    docs = []  # corpus

    for posDoc in posDocs:
        processed = self.processDocForGensim(posDoc)
        tokens = self.tokensFromText(processed)
        listOfTokens.append(tokens)
        docs.append(processed)
    for negDoc in negDocs:
        processed = self.processDocForGensim(negDoc)
        tokens = self.tokensFromText(processed)
        listOfTokens.append(tokens)
        docs.append(processed)

    dictionaryFilename = 'gensim_dictionary.txt'
    corpusFilename = 'gensim_corpus.mm'

    # make destination files if they don't exist
    dictionaryPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'james_data', topic, dictionaryFilename
    )
    corpusPath = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'james_data', topic, corpusFilename
    )
    corpusTempPath = corpusPath + '.tmp'

    if os.path.exists(dictionaryPath):
        os.remove(dictionaryPath)
    if os.path.exists(corpusPath):
        os.remove(corpusPath)
    if os.path.exists(corpusTempPath):
        os.remove(corpusTempPath)

    with open(dictionaryPath, 'w') as f:
        f.write(' ')
    with open(corpusPath, 'w') as f:
        f.write(' ')

    # save dictionary and corpus
    d = Dictionary(listOfTokens)
    d.save(dictionaryPath)

    with open(corpusTempPath, 'w') as f:
        f.write('\n'.join(docs))
    corpus = TextCorpus(corpusTempPath)
    MmCorpus.save_corpus(corpusPath, corpus)
    return
# optional argv[3] = keep_words
if len(sys.argv) < 3:
    print globals()['__doc__'] % locals()
    # sys.exit(1)
input, output = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep 100k most frequent words (out of total ~900k unique tokens)
enron = EnronCorpus(input, keep_words=keep_words)

# save dictionary and bag-of-words (term-document frequency matrix)
enron.dictionary.save_as_text(output + '_wordids.txt')
MmCorpus.serialize(output + '_bow.mm', enron, progress_cnt=10000)
del enron

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(output + '_wordids.txt')
mm = MmCorpus(output + '_bow.mm')

# build tfidf
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
MmCorpus.serialize(output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)
def rekomendasi(input):
    data = [input]
    id2word = Dictionary.load('pdupt_website/id2word_new.dict')
    corpus = MmCorpus('pdupt_website/corpus_new.mm')
    df = pd.read_csv('pdupt_website/reduksifix.csv')
    with open("pdupt_website/lemma_new.txt", "rb") as fp:  # Pickling
        data_lemmatized = pickle.load(fp)

    stop_words = stopwords.words('indonesian')
    stop_words2 = stopwords.words('english')
    stop_words.extend(stop_words2)
    stop_words.extend([
        'of', 'in', 'and', 'the', 'for', 'on', 'using', 'based', 'from',
        'with', 'to', 'by', 'as', 'an', 'pengaruh', 'effect', 'analisis',
        'at', 'pre', 'pro', 'analysis', 'berbasis', 'tahun', 'between',
        'kualitas', 'method', 'metode', 'through', 'menggunakan', 'hasil'
    ])

    # Remove numbers
    data = [re.sub(" \d+", ' ', sent) for sent in data]
    data = [re.sub('[^a-zA-Z]', ' ', sent) for sent in data]
    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]

    def sent_to_words(sentences):
        for sentence in sentences:
            yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuation

    data = sent_to_words(data)
    data_words = list(data)

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Define functions for stopwords, bigrams, trigrams and lemmatization
    # from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    def remove_stopwords(texts):
        return [[
            word for word in simple_preprocess(str(doc))
            if word not in (stop_words or stop_words2)
        ] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts):
        """https://spacy.io/api/annotation"""
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc])
        return texts_out

    # Remove stop words
    data_words_nostops = remove_stopwords(data_words)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    nlp = spacy.load('en_core_web_sm')
    data_lemmatized_search = lemmatization(data_words_bigrams)

    # stem each of the words
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for x in range(len(data_lemmatized_search) - 1):
        for y in range(len(data_lemmatized_search[x]) - 1):
            data_lemmatized_search[x][y] = stemmer.stem(data_lemmatized_search[x][y])

    # import gensim
    model = gensim.models.ldamodel.LdaModel.load('pdupt_website/mallet_18_lda.mdl', mmap='r')
    new_doc_bow = id2word.doc2bow(data_lemmatized_search[0])
    hasil = model.get_document_topics(new_doc_bow)

    topic = 0
    nilai = -99
    for i, row in (hasil):
        if (row > nilai):
            topic = i
            nilai = row

    df_topik = df.loc[df['Topic1'] == topic]
    df_topik = df_topik.astype({"id_judul": int})
    df_topik = df_topik.reset_index(drop=True)

    # build the lemma data, corpus and dictionary from the documents within this single topic
    res_list = [data_lemmatized[int(i) - 1] for i in df_topik.id_judul]
    # Create Dictionary
    id2word_topik = corpora.Dictionary(res_list)
    # Create Corpus
    texts = res_list
    # Term Document Frequency
    corpus_topik = [id2word_topik.doc2bow(text) for text in res_list]

    # build the index for the cosine-similarity computation
    index_tmpfile = get_tmpfile("index")
    index = Similarity(index_tmpfile, corpus_topik, num_features=len(id2word_topik))

    # the query is the term-document vector of the lemmatized search input, built from the per-topic dictionary
    query = id2word_topik.doc2bow(data_lemmatized_search[0])
    similarities = index[query]
    sort_index = np.argsort(similarities)
    reversed_arr = sort_index[::-1]
    list_idx = reversed_arr[:10]
    list_id_artikel = list(df_topik[df_topik.index.isin(list_idx)].id_judul)
    return (list_id_artikel, topic + 1)
        mean_jaccard.append(np.mean(jacc_np))
        mean_bleu.append(np.mean(bleu_np))
        mean_cos.append(np.mean(cos_np))
        mean_fscore.append(np.mean(fscore_np))
    return np.max(np.asarray(mean_bleu)), np.max(np.asarray(mean_jaccard)), \
        np.max(np.asarray(mean_cos)), np.max(np.asarray(mean_fscore))


GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, \
    SO_annotation_intersect, SO_annotation_union = load_annotations()

path = "/home/norberteke/PycharmProjects/Thesis/data/"

dictionary = Dictionary.load(path + 'GH_full_processed_Dictionary.dict')
corpus = MmCorpus(datapath(path + 'corpus_processed_GH_full.mm'))

texts = []
with open(path + 'GH_full_processed_corpus.csv', 'r') as f:
    reader = csv.reader(f)
    texts = list(reader)

terms = []
for (key, value) in dictionary.iteritems():
    terms.append(value)


def write_results_to_file(path, lda_model, max_bleu, max_jaccard, max_cos, max_fscore):
    with open(path, 'a') as f:
        writer = csv.writer(f,
def build_corpus(dictionary):
    MmCorpus.serialize(CORPUS_FILE, BowCorpus(wiki_index.ARTICLES_FILE, dictionary))
    return MmCorpus(CORPUS_FILE)
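A brief usage sketch: `dictionary` is assumed to be the gensim Dictionary built elsewhere in the same module (the load path below is hypothetical), and the serialized Matrix Market file written to CORPUS_FILE can simply be re-opened on later runs.

dictionary = Dictionary.load_from_text('wiki_wordids.txt')  # hypothetical dictionary path
corpus = build_corpus(dictionary)
print(corpus)  # e.g. "MmCorpus(<docs> documents, <terms> features, <nnz> non-zero entries)"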