def read_project_data(mtc, csc, fname):
    d1 = Dictionary.load(mtc + ".dict")
    d2 = Dictionary.load(csc + ".dict")
    #d3 = Dictionary.load('data/postgresql-d4f8dde3-CommitLogCorpus.mallet.dict')
    MultiTextCorpus = MalletCorpus(mtc, d1)
    ChangesetCorpus = MalletCorpus(csc, d2)
    #CommitLogCorpus = MalletCorpus('data/postgresql-d4f8dde3-CommitLogCorpus.mallet', d3)

    u1 = set(d1.values())
    u2 = set(d2.values())
    #u3 = set(d3.values())
    common = u1.intersection(u2)
    uc_set = (len(u1), len(u2))
    u1_uniq = u1.difference(common)
    u2_uniq = u2.difference(common)
    print(u1_uniq)

    # note: the fname argument is overridden here
    fname = "common_words_comparison.txt"
    with open(fname, 'a') as f:
        parts = mtc.split("-")
        f.write(str(parts[0]) + "\n")
        f.write("length of MultiTextCorpus: " + str(len(MultiTextCorpus)) + "\n")
        f.write("length of ChangesetCorpus: " + str(len(ChangesetCorpus)) + "\n\n")
        f.write("(MTC,CSC) in common" + "\n")
        f.write(str(uc_set) + " " + str(len(common)))
        f.write('\n\n')
def evaluate_log(context, config):
    logger.info('Evaluating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname, id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname, id2word=changeset_id2word)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except Exception:
        error('Cannot evaluate LDA models not built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    first_shared = dict()
    for id_ in commit_doc_topic:
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]

        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except KeyError:
            continue

        maximum = 101
        minimum = maximum

        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        first_shared[id_] = minimum

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
            del first_shared[id_]

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
def test_constructor_with_file_wikicorpus(self):
    # load tf-idf corpus
    tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

    # load lda corpus
    # lda_corpus = MmCorpus('/media/sdc1/test_dump/result/test_lda_corpus.mm')

    # load dictionary
    id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

    # load article titles
    document_titles = DocumentTitles.load("/media/sdc1/test_dump/result/test_articles.txt")

    # train esa model
    esa_model = EsaModel(tfidf_corpus, num_clusters=15,
                         document_titles=document_titles,
                         num_features=len(id2token))
    print(esa_model)

    esa_model.save('/media/sdc1/test_dump/result/wiki_esa.model')

    tmp_esa = EsaModel.load('/media/sdc1/test_dump/result/wiki_esa.model')
    print(tmp_esa)
def write_topics(model_path, csv_name, k):
    model = LdaModel.load(model_path)
    topics = []
    for topic_id in range(model.num_topics):
        topics.append(model.return_topic(topicid=topic_id))

    dictionary = Dictionary.load('data/dictionary/tweets.dict')
    word_indices = dictionary.id2token

    writer = csv.writer(open(csv_name, 'w'))
    output = [[0 for i in range(model.num_topics)] for j in range(k)]

    for topic_id, topic in enumerate(topics):
        for rank, index in enumerate(topic.argsort()[::-1]):
            output[rank][topic_id] = {}
            output[rank][topic_id]['word'] = word_indices[index]
            output[rank][topic_id]['p'] = topic[index]
            rank += 1
            if rank >= k:
                break

    for topic_id in range(model.num_topics):
        row = ['z = ' + str(topic_id)]
        for rank in range(k):
            row.append(output[rank][topic_id]['word'] + ':' + str(output[rank][topic_id]['p']))
        writer.writerow(row)
def __iter__(self):
    list_dict = Dictionary.load('terms.dict')
    # list_dict.filter_extremes(no_below=1000, no_above=0.99)
    counter = 0
    doc_id = 0
    for member_id, count in self.members:
        if counter % 100 == 0:
            print('Done', counter)
        self.cursor.execute(self.query, (member_id,))
        expert_text = Counter()
        for result in self.cursor:
            parsed_text = self.parser.parse_list(title=result[1], description=result[2])
            expert_text.update(parsed_text['text'])
        terms = sorted([(e, v) for e, v in expert_text.items() if v > 1],
                       key=operator.itemgetter(1), reverse=True)
        counter += 1
        if len(terms):
            if terms[0][1] > 10:
                word_bag = []
                for k, v in terms:
                    try:
                        word_bag.append((list_dict.token2id[k], v))
                    except KeyError:
                        pass
                expert2doc[member_id] = doc_id
                doc_id += 1
                yield word_bag
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    with BZ2File(corpus_fn) as f:
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace,
                                      n_proc=n_proc_sublex, create_dictionary=False)

        if vocab_fn and os.path.exists(vocab_fn):
            logging.info("Loading vocabulary from %s" % vocab_fn)
            vocab = Dictionary.load(vocab_fn)
        else:
            logging.info("Creating vocabulary")

            start = time.clock()
            vocab = Dictionary(corpus.get_texts())
            end = time.clock()

            logging.info("Vocabulary created in %d seconds" % (end - start))

            if vocab_fn:
                logging.info("Saving dictionary to %s" % vocab_fn)
                vocab.save(vocab_fn)

        corpus.dictionary = vocab
        corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
        corpus.dictionary.compactify()

        if tfidf_fn and os.path.exists(tfidf_fn):
            logging.info("Reading TF-IDF model from %s" % tfidf_fn)
            tfidf = TfidfModel.load(tfidf_fn)
        else:
            logging.info("Creating TF-IDF model")
            tfidf = TfidfModel(corpus)

            if tfidf_fn:
                logging.info("Saving TF-IDF model to %s" % tfidf_fn)
                tfidf.save(tfidf_fn)

        bow_corpus = (tfidf[art] for art in corpus)

        model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)

        model.save(model_fn)
def load(self):
    if os.path.exists(self._lexicon_path):
        self.lexicon = Dictionary.load(self._lexicon_path)
    if os.path.exists(self._tfidf_path):
        self.tfidf = TfidfModel.load(self._tfidf_path)
def load(self):
    '''Load the LDA model and the dictionary.'''
    lda_file = config.get('dmp', 'lda_file')
    dic_file = config.get('dmp', 'dic_file')
    self.lda = LdaModel.load(lda_file)
    self.dic = Dictionary.load(dic_file)
def addHarassingTweet(self, txt):
    '''
    Add a harassing tweet to the model corpus.

    While gensim purports to train models incrementally, it'll crash if you try.
    Instead, we just rebuild the model each time we get a new tweet, remembering
    all the old ones as we go.
    '''
    if txt in self.harassment:
        return
    words = txt.split()
    if os.path.exists('/tmp/MyDict.dict'):
        self.d = Dictionary.load('/tmp/MyDict.dict')
        # merge_with updates the dictionary in place (Dictionary has no merge method
        # that returns the merged dictionary)
        self.d.merge_with(Dictionary([words]))
    else:
        # build dictionary
        self.d = Dictionary([words])

    # generate bag of words
    #bow = self.d.doc2bow(words, allow_update=True)
    #self.harassment[txt] = bow

    #corpus = []
    #for txt in self.harassment:
    #    corpus.append(self.harassment[txt])

    self.model = LdaModel(id2word=self.d)
def __iter__(self):
    list_dict = Dictionary.load('terms.dict')
    # list_dict.filter_extremes(no_below=1000, no_above=0.99)
    counter = 0
    doc_id = 0
    for member_id, count in self.members:
        if counter % 1000 == 0:
            print('Done', counter)
            print(member_id, count)
        self.cursor.execute(self.query, (member_id,))
        expert_text = Counter()
        for result in self.cursor:
            parsed_text = self.parser.parse_list(title=result[1], description=result[2])
            expert_text.update(parsed_text['text'])
            # expert_text.update(parsed_text['bigrams'])
        # build a list here: a generator would be exhausted by the print below
        terms = [(e, v) for e, v in expert_text.items()
                 if v > 10 and any([e.startswith(t) for t in topics])]
        counter += 1
        print(list(terms))
        word_bag = []
        for k, v in terms:
            try:
                word_bag.append((list_dict.token2id[k], v))
            except KeyError:
                pass
        expert2doc[member_id] = doc_id
        doc_id += 1
        yield word_bag
def load_data(self):
    if not self.tf_idf_model:
        if not os.path.exists(self.tf_idf_model_path):
            raise Exception('TF-IDF model file not found')

        self.dictionary = Dictionary.load(self.dictionary_path)
        self.tf_idf_model = TfidfModel.load(self.tf_idf_model_path)
def create_corpus(project, repos, Kind, use_level=True, forced_ref=None):
    corpus_fname_base = project.full_path + Kind.__name__

    if use_level:
        corpus_fname_base += project.level

    if forced_ref:
        corpus_fname_base += forced_ref[:8]

    corpus_fname = corpus_fname_base + '.mallet.gz'
    dict_fname = corpus_fname_base + '.dict.gz'
    made_one = False

    if not os.path.exists(corpus_fname):
        combiner = CorpusCombiner()

        for repo in repos:
            try:
                if repo or forced_ref:
                    corpus = Kind(project=project,
                                  repo=repo,
                                  lazy_dict=True,
                                  ref=forced_ref,
                                  )
                else:
                    corpus = Kind(project=project, lazy_dict=True)
            except KeyError:
                continue
            except TaserError as e:
                if repo == repos[-1] and not made_one:
                    # basically, if we are at the last repo and we STILL haven't
                    # successfully extracted a corpus, ring some bells
                    raise e
                else:
                    # otherwise, keep trying. winners never quit.
                    continue

            combiner.add(corpus)
            made_one = True

        # write the corpus and dictionary to disk. this will take awhile.
        combiner.metadata = True
        MalletCorpus.serialize(corpus_fname, combiner, id2word=combiner.id2word,
                               metadata=True)
        combiner.metadata = False

        # write out the dictionary
        combiner.id2word.save(dict_fname)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
def train(text_corpus_file, dict_file):
    """Train an LSI model from a text corpus."""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    dictionary = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=gutenberg_corpus, id2word=dictionary, num_topics=400)
    lsi.save(model_file)  # model_file is assumed to be defined at module level
    print(lsi.projection.u)
    print(lsi.projection.u.size)
    print(lsi.projection.u[0].size)
def analyze(self, docs):
    # load dictionary and model
    self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
    self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

    # convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above
    docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]
    docTopicDistr = self.getDocumentTopics(docTermMatrix)
    return docTopicDistr
def __init__(self, dictionary, **kwargs):
    super(GensimIDFProvider, self).__init__(**kwargs)
    if {'missing', 'linear'} <= set(kwargs):
        logging.warning('<%s> arguments to GensimIDFProvider can generate incorrect weights '
                        'and should not be used' % '|'.join({'missing', 'linear'}))
    if isinstance(dictionary, (str, unicode)):
        dictionary = Dictionary.load(dictionary)
    self.dictionary = dictionary
    self.tfidf = TfidfModel(dictionary=dictionary, normalize=False)
def create_evaluation_corpora(config, Kind):
    corpus_fname = config.corpus_fname % Kind.__name__

    try:
        id2word = Dictionary.load(corpus_fname + '.dict')
        corpus = MalletCorpus(corpus_fname, id2word=id2word)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')

    word_freq = list(reversed(sorted(count_words(corpus))))
    print("Top 10 words in %s: %s" % (corpus_fname, str(word_freq[:10])))
    print("Bottom 10 words in %s: %s" % (corpus_fname, str(word_freq[-10:])))
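# The snippet above calls a count_words helper that is not included here. A minimal
# sketch, assuming it returns (frequency, word) pairs so that sorting orders the
# vocabulary by frequency (this helper is hypothetical, not the project's own code):
from collections import defaultdict

def count_words(corpus):
    """Aggregate term frequencies over a bag-of-words corpus (assumed helper)."""
    counts = defaultdict(int)
    for doc in corpus:
        for word_id, freq in doc:
            counts[corpus.id2word[word_id]] += freq
    return [(freq, word) for word, freq in counts.items()]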
def main():
    args = parse_args()
    if args.text:
        print('Creating text')
        create_text(args.text_file_name)
    elif args.dict:
        print('Creating dict')
        stream_dict(args.dict)
    elif args.corp:
        print('Creating corpi')
        dictionary = Dictionary.load('cars2.txt')
        corpi()
def __init__(self):
    self.dictionary = Dictionary.load(app.config["RCMDR_DICT"])
    self.corpus = corpora.MmCorpus(app.config["RCMDR_CORPUS"])
    self.tfidf = TfidfModel.load(app.config["RCMDR_TFIDF_MODEL"])
    self.lda_model = LdaModel.load(app.config["RCMDR_LDA_MODEL"])
    self.lsi_model = LsiModel.load(app.config["RCMDR_LSI_MODEL"])
    self.lda_index = Similarity.load(app.config["RCMDR_LDA_INDEX"])
    self.lsi_index = Similarity.load(app.config["RCMDR_LSI_INDEX"])
    self.job_labels = {
        int(k): v
        for k, v in (line.split("=")
                     for line in open(app.config["RCMDR_JOB_LABELS"]).read().strip().split("\n"))
    }
def update(self, docs):
    # load dictionary and model
    self.dictionary = Dictionary.load(self.getModelFilePath("common.dictionary.file"))
    self.ldaModel = LdaModel.load(self.getModelFilePath("common.model.file"))

    # convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above
    docTermMatrix = [self.dictionary.doc2bow(doc) for doc in docs]
    numPasses = self.config.getIntConfig("train.num.pass")[0]
    self.ldaModel.update(docTermMatrix, passes=numPasses)
    docTopicDistr = self.getDocumentTopics(docTermMatrix)
    return docTopicDistr
def test_cluster(self):
    # load tf-idf corpus
    tfidf_corpus = MmCorpus('/media/sdc1/test_dump/result/test_tfidf_corpus.mm')

    # load dictionary
    id2token = Dictionary.load("/media/sdc1/test_dump/result/test_wordids.dict")

    kmedoids = self.get_kmedoids(tfidf_corpus, len(id2token),
                                 num_clusters=15, max_iterations=5)
    clusters = self.cluster(kmedoids)
    print(clusters)
def create_evaluation_corpora_cosine(config, Kind, Kind2):
    corpus1_fname = config.corpus_fname % Kind.__name__
    corpus2_fname = config.corpus_fname % Kind2.__name__

    try:
        id2word1 = Dictionary.load(corpus1_fname + '.dict')
        corpus1 = MalletCorpus(corpus1_fname, id2word=id2word1)
        id2word2 = Dictionary.load(corpus2_fname + '.dict')
        corpus2 = MalletCorpus(corpus2_fname, id2word=id2word2)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')

    word_freq1 = get_word_freq(corpus1)
    word_freq2 = get_word_freq(corpus2)

    total1 = float(sum(x[1] for x in word_freq1.items()))
    total2 = float(sum(x[1] for x in word_freq2.items()))

    all_words = set(word_freq1.keys()) | set(word_freq2.keys())
    for word in all_words:
        if word not in word_freq1:
            word_freq1[word] = 0
        if word not in word_freq2:
            word_freq2[word] = 0

    dist1 = [x[1] / total1 for x in sorted(word_freq1.items())]
    dist2 = [x[1] / total2 for x in sorted(word_freq2.items())]
    rdist = numpy.random.random_sample(len(all_words))

    res = utils.hellinger_distance(dist1, dist2, filter_by=0.0)
    res1 = utils.hellinger_distance(dist1, rdist, filter_by=0.0)
    res2 = utils.hellinger_distance(dist2, rdist, filter_by=0.0)

    logger.info("Hellinger distance between corpora: %f" % res)

    with open(config.path + 'evaluate-hellinger-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([corpus1_fname, corpus2_fname, res, res1, res2])
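# utils.hellinger_distance above is a project-local helper that is not shown. A minimal
# illustrative sketch of the Hellinger distance between two discrete distributions,
# assuming filter_by only prunes near-zero entries and ignoring it here:
import numpy as np

def hellinger_distance(p, q, filter_by=0.0):
    """H(P, Q) = (1 / sqrt(2)) * sqrt(sum_i (sqrt(p_i) - sqrt(q_i))**2)  (assumed helper)."""
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2.0)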
def create(pathtomapping, pathtocorpus, corpusname, window,
           numtokeep=50000, save_raw=True, shifts=(1, 5, 10)):
    """
    Creates a Shifted Positive Pointwise Mutual Information matrix.

    :param pathtomapping: The path to the id2word mapping. If this is left empty,
        the id2word mapping gets recreated. Warning: this takes a long time.
    :param pathtocorpus: The path to the corpus folder. The corpus can be spread out
        over multiple files or folders, and is read iteratively.
    :param corpusname: The name of the corpus. Used for saving the files.
    :param window: The window used to consider co-occurrences.
    :param numtokeep: The number of most frequent words to keep. Note that the matrix
        is non-sparse. Because of this, the memory requirements of the code are quadratic.
    :param save_raw: Whether to save the raw co-occurrence matrix as a numpy matrix.
    :param shifts: The shifts to apply to the co-occurrence matrix. Each shifted matrix
        gets saved as a separate model.
    """
    start = time.time()

    if not pathtomapping:
        id2word = Dictionary(SentenceIter(pathtocorpus), prune_at=None)
        id2word.filter_extremes(no_below=5, keep_n=numtokeep)
        id2word.compactify()
        logger.info("Creating the word2id took {0} seconds".format(time.time() - start))
    else:
        id2word = Dictionary.load(pathtomapping)

    inter = time.time()

    word2id = gensim.utils.revdict(id2word)

    corpus = SentenceIter(pathtocorpus)
    raw = get_cooccur(corpus, word2id, window=window)

    logger.info("Creating raw co-occurrence matrix took {0} seconds".format(time.time() - inter))

    if save_raw:
        np.save('{0}-cooccur.npy'.format(corpusname), raw)

    SPPMIFactory._save_word2id(word2id, "{0}mapping.json".format(corpusname))
    SPPMIFactory._save_freqs(id2word, "{0}freqs.json".format(corpusname))

    raw = SPPMIFactory.raw2pmi(raw)

    for k in shifts:
        sparse = SPPMIFactory.shift_clip_pmi(np.copy(raw), k_shift=k)
        SPPMIFactory._save_sparse_mtr(sparse, "{0}-SPPMI-sparse-{1}-shift.npz".format(corpusname, k))
        del sparse
def main():
    model_file = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/nowiki_v2_3pass_lda_250'
    mod = LdaModel.load(model_file)

    vocab_file = 'f:/projects/elasticsearch-enterprise-system/data/topic_models/voc_vocabulary_0.vocab'
    vocab = Dictionary.load(vocab_file)

    corpfile = 'f:/projects/comperio-text-analytics/models/topicmodel/mojo_lda_100.corp'
    corpus = gensim.corpora.MmCorpus(corpfile)

    print(mod.show_topic(0))
    print(mod.id2word)
    mod.id2word = vocab
    print(mod.show_topic(0))

    pydavis = pyLDAvis.gensim.prepare(mod, corpus, vocab)
    pyLDAvis.save_html(pydavis, 'pydavis_250_v2_3passes.html')
    pyLDAvis.show(pydavis)
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    logger.info('load the articles..')
    article_path = path.join(result_path, p['article_label'])
    wiki = pickle.load(open(path.join(article_path, 'articles.pickle')))

    logger.info('load dictionary and models')
    dictionary = Dictionary.load(path.join(result_path, p['model_label'], 'dic.dict'))
    model_path = path.join(result_path, p['model_label'])
    lsi = LsiModel.load(path.join(model_path, 'lsi.model'))
    pre = pickle.load(open(path.join(model_path, 'pre.model')))
    if int(p['num_topics']) > lsi.num_topics:
        logger.error('model too small')
    lsi.num_topics = int(p['num_topics'])

    data = {}
    for topic, entries in wiki.items():
        logger.info('working on: %s' % topic)

        data[topic] = {}
        data[topic]['keys'] = []
        vecs = []
        data[topic]['ratings'] = []
        for key, val in entries.items():
            data[topic]['keys'].append(key)
            vecs.append(lsi[pre[dictionary.doc2bow(val['text'])]])
            data[topic]['ratings'].append(val['rating'])
        data[topic]['vecs'] = np.squeeze(np.array(vecs)[:, :, 1:2]).T

        U, d, _ = np.linalg.svd(data[topic]['vecs'], full_matrices=False)
        data[topic]['U'] = U
        data[topic]['d'] = d

    f = open(os.path.join(output_dir, "data.pickle"), 'wb')
    pickle.dump(data, f)
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except Exception:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA.
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only. So for a particular topic modeling task,
    it is a lighter option to install and run. Also it can be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print("Dictionary file does not exist. Creating one")
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.items() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print(dictionary)

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print("Corpus file does not exist. Creating one")
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print(mm)

    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics,
                   update_every=1, chunksize=1000, passes=1)

    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
def create_queries(project):
    corpus_fname_base = project.full_path + 'Queries'
    corpus_fname = corpus_fname_base + '.mallet.gz'
    dict_fname = corpus_fname_base + '.dict.gz'

    if not os.path.exists(corpus_fname):
        pp = GeneralCorpus(lazy_dict=True)
        id2word = Dictionary()

        with open(os.path.join(project.full_path, 'ids.txt')) as f:
            ids = [x.strip() for x in f.readlines()]

        queries = list()
        for id in ids:
            with open(os.path.join(project.full_path, 'queries',
                                   'ShortDescription' + id + '.txt')) as f:
                short = f.read()

            with open(os.path.join(project.full_path, 'queries',
                                   'LongDescription' + id + '.txt')) as f:
                long = f.read()

            text = ' '.join([short, long])
            text = pp.preprocess(text)

            # this step will remove any words not found in the dictionary
            bow = id2word.doc2bow(text, allow_update=True)

            queries.append((bow, (id, 'query')))

        # write the corpus and dictionary to disk. this will take awhile.
        MalletCorpus.serialize(corpus_fname, queries, id2word=id2word,
                               metadata=True)

    # re-open the compressed versions of the dictionary and corpus
    id2word = None
    if os.path.exists(dict_fname):
        id2word = Dictionary.load(dict_fname)

    corpus = MalletCorpus(corpus_fname, id2word=id2word)

    return corpus
def __init__(self, ldaModelFile, dictionaryfile, stopfile="english.stop.txt"):
    '''
    Constructor parameters:
        ldaModelFile: the model file that was trained
        dictionaryfile: id2word mapping file
    '''
    logging.info("[Start] Loading the dictionary " + dictionaryfile)
    self.id2word = Dictionary.load(dictionaryfile)
    logging.info("[Done] Loading the dictionary " + dictionaryfile)

    logging.info("[Start] Loading the model file " + ldaModelFile)
    self.ldamodel = LdaModel.load(ldaModelFile)
    logging.info("[Done] Loading the model file " + ldaModelFile)

    logging.info("[Start] Loading all topics")
    self.alltopics = self.ldamodel.show_topics(-1)
    logging.info("[Done] Loading all topics")

    self.stopwords = self.loadStop(stopfile)
def __init__(self, text, dictionary, stopwords=False, stemming=False):
    self.text = text
    self.remove_stopwords = stopwords
    self.stemming = stemming
    self.dictionary = Dictionary.load(dictionary)

    # blacklist of words to be removed from text;
    # combines stopwords from nltk, gensim and the stop_words package
    self.en_stopwords = set(
        stop_words.get_stop_words('en') +
        nltk.corpus.stopwords.words("english") +
        list(gensim.parsing.preprocessing.STOPWORDS)
    )

    # keep -, +, # in words
    self.punctuation = re.sub("[-+#.]", " ", punctuation)

    # make translation table converting punctuation to white space
    self.translate_dict = maketrans(punctuation, ' ' * len(punctuation))

    # replace patterns
    self.invalid_char = re.compile(
        r'[0-9]|\\~|\`|\@|\$|\%|\^|\&|\*|\(|\)|\_|\=|\[|\]|\\|\<|\<|\>|\?|\/|\;|\\.')
    self.url_pattern = re.compile(
        r'(' +
        # Scheme (HTTP, HTTPS, FTP and SFTP):
        r'(?:(https?|s?ftp):\/\/)?' +
        # www:
        r'(?:www\.)?' +
        r'(' +
        # Host and domain (including ccSLD):
        r'(?:(?:[A-Z0-9][A-Z0-9-]{0,61}[A-Z0-9]\.)+)' +
        # TLD:
        r'([A-Z]{2,6})' +
        # IP Address:
        r'|(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' +
        r')' +
        # Port:
        r'(?::(\d{1,5}))?' +
        # Query path:
        r'(?:(\/\S+)*)' +
        r')', re.IGNORECASE)
def get_topics():
    '''Computes distribution over topics for each abstract'''
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore.load('lda.gensim')

    base = 'datasets/dspace'
    new_base = 'datasets/dspace_topics'
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path, 'r') as f:
            d = json.load(f)
            abstract = d['abstract']
            if abstract is not None:
                words = tokenize(abstract.split())
                bow = dictionary.doc2bow(words)
                topics = lda.get_document_topics(bow, minimum_probability=0)
                topics = to_vec(topics)
                d['topics'] = topics
                new_path = os.path.join(new_base, filename)
                with open(new_path, 'w') as new_f:
                    json.dump(d, new_f)
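# to_vec above is a project-local helper that is not shown. A minimal sketch, assuming it
# flattens the (topic_id, probability) pairs returned by get_document_topics (one pair per
# topic when minimum_probability=0) into a JSON-serializable list of floats:
def to_vec(topics):
    """Hypothetical helper: dense topic vector indexed by topic id."""
    vec = [0.0] * len(topics)
    for topic_id, prob in topics:
        vec[topic_id] = float(prob)
    return vec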
def load_dictionary(self):
    path = os.path.join(
        self.model_directory,
        "dictionary-%d-%d-%d.pkl" % (self.ndocs, self.phrase_min_count, self.vocabulary_size))
    self.dictionary = Dictionary.load(path)
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from nltk.corpus import stopwords as nltk_stopwords
from os.path import dirname, realpath

try:
    path_to_directory_of_this_file = dirname(realpath(__file__))

    stopwords = []
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n")
                          if word and not word.startswith("#")])
    stopwords = set(stopwords)

    lsi = LsiModel.load(path_to_directory_of_this_file + "/model")
    dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
except Exception as e:
    print("Exception trying to load LSI index. You can most likely ignore this:", e)


def run(text):
    try:
        words = text.lower().replace("#", " ").replace("_", " ").replace("(", " ").replace(")", " ").replace("/", " ").replace(":", " ").replace(".", " ").split()
        words = [word for word in words if len(word) > 3 and word not in stopwords]
        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
                return sorted(probabilities, key=lambda tup: -1 * tup[1])[0][0]
    except Exception:
        # assumed guard: mirrors the module-level handler for the case where the model failed to load
        return None
#!/usr/bin/env python

from gensim.models import LdaModel
from gensim.corpora import MmCorpus, Dictionary
import sys, os
import pyLDAvis.gensim

if len(sys.argv) < 2:
    print("usage: {0} [path to model.lda]\n".format(sys.argv[0]))
    sys.exit(1)

path, file = os.path.split(sys.argv[1])
corpusname = file.split(".")[0]

dictionary = Dictionary.load(path + "/" + corpusname + ".dict")
corpus = MmCorpus(path + "/" + corpusname + ".mm")
model = LdaModel.load(sys.argv[1])

##############
# cf. https://pyldavis.readthedocs.org/en/latest/modules/API.html

vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.save_html(vis, path + "/" + corpusname + "_interactive.html")
pyLDAvis.show(vis)
### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus, MmCorpus, Dictionary

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

# This will take many hours! Output is Wikipedia in bag-of-words (BOW) sparse matrix.
wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # File will be several GBs.

### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus
dictionary = Dictionary.load("wiki_dict.dict")  # Load a dictionary

### Transformations among vector spaces
from gensim.models import LsiModel, LogEntropyModel

# Log Entropy weights frequencies of all document features in the corpus
logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary)

tokenize_func = wikicorpus.tokenize  # The tokenizer used to create the Wikipedia corpus
document = "Some text to be transformed."
# First, tokenize the document using the same tokenization as was used on the background
# corpus, and then convert it to BOW representation using the dictionary created when
# generating the background corpus.
bow_document = dictionary.doc2bow(tokenize_func(document))
logent_document = logent_transformation[bow_document]
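# A natural continuation of the snippet above (illustrative sketch, not part of the
# original): apply the same log-entropy transformation lazily to the whole background
# corpus and train an LSI model on the weighted vectors.
logent_corpus = logent_transformation[bow_corpus]
lsi = LsiModel(corpus=logent_corpus, id2word=dictionary, num_topics=400)

# New documents can then be folded into the same LSI space:
lsi_document = lsi[logent_document]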
import numpy as np
from gensim.corpora import Dictionary
import os
import keras
from k_max_pooling import *
from keras.models import load_model
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau

# CURRENT_MAIN_PATH and SEQ_LEN are assumed to be defined elsewhere in this script
label_dict = Dictionary.load(CURRENT_MAIN_PATH + '/dicts/label_dict.dict')
total_label = len(label_dict)
print("Total classes : %d" % total_label)

data = np.load(CURRENT_MAIN_PATH + '/npz_data/train.npz')
x_data = data['x_data']
y_data = data['y_data']

# Shuffle the data
indices = np.random.permutation(x_data.shape[0])
x_data = x_data[indices]
y_data = y_data[indices]

x_data = sequence.pad_sequences(x_data, maxlen=SEQ_LEN)
def LDA_Analysis(): #http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb if 0 == 1: with open('data/review_text_all.txt','w') as myfile: myfile.write("") ''' loop through db and write jobs descriptions ''' with open('data/review_text_all.txt','a') as myfile: with Job() as db: a=0 max_ = int(db.getNoJobs()[0][0]) while (a < max_): #print(a) sample_review = db.readJobDetailClean(a)[0][1] if (sample_review != 'Json Error'): myfile.write(str(sample_review)+'\n') a += 1 #unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt') if 0 == 1: with codecs.open('data/unigram_sentences_all.txt', 'w', encoding='utf_8') as f: for sentence in lemmatized_sentence_corpus('data/review_text_all.txt'): f.write(sentence + '\n') unigram_sentences = LineSentence('data/unigram_sentences_all.txt') ''' for unigram_sentence in it.islice(unigram_sentences, 230, 240): print(u' '.join(unigram_sentence)) print(u'') ''' #bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all') if 0 == 1: bigram_model = Phrases('data/unigram_sentences_all.txt') bigram_model.save('data/bigram_model_all') # load the finished model from disk bigram_model = Phrases.load('data/bigram_model_all') #bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt') if 0 == 1: with codecs.open('data/bigram_sentences_all.txt', 'w', encoding='utf_8') as f: for unigram_sentence in unigram_sentences: bigram_sentence = u' '.join(bigram_model[unigram_sentence]) f.write(bigram_sentence + '\n') bigram_sentences = LineSentence('data/bigram_sentences_all.txt') ''' for bigram_sentence in it.islice(bigram_sentences, 230, 240): print(u' '.join(bigram_sentence)) print(u'') ''' #trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all') if 0 == 1: trigram_model = Phrases(bigram_sentences) trigram_model.save('data/trigram_model_all') # load the finished model from disk trigram_model = Phrases.load('data/trigram_model_all') #trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt') if 0 == 1: with codecs.open('data/trigram_sentences_all.txt', 'w', encoding='utf_8') as f: for bigram_sentence in bigram_sentences: trigram_sentence = u' '.join(trigram_model[bigram_sentence]) f.write(trigram_sentence + '\n') trigram_sentences = LineSentence('data/trigram_sentences_all.txt') ''' for trigram_sentence in it.islice(trigram_sentences, 230, 240): print(u' '.join(trigram_sentence)) print(u'') ''' #trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt') if 0 == 1: import csv ''' Variant A: Use Stopwords 1) download StopWords.csv from MySQL table: KeyWords. 
2) Remove all relevant words by hand ;) ''' with open('data/StopWords.csv', newline='') as csvfile: stopwords_ = csv.reader(csvfile, delimiter=' ', quotechar='|') for words_ in stopwords_: #print(words_[0]) STOP_WORDS.add(words_[0]) #print(STOP_WORDS) ''' Varaint B: Use Dictionary ''' with open('data/Dictionary.csv', 'r', newline='') as csvfile: file_ = csv.reader(csvfile, delimiter=',', quotechar='"') dictionary_ = [] for row in file_: dictionary_.append(row[0]) #with open('file.csv', 'r') as f: #reader = csv.reader(f) #your_list = list(reader) with codecs.open('data/trigram_transformed_reviews_all.txt', 'w', encoding='utf_8') as f: for parsed_review in nlp.pipe(line_review('data/review_text_all.txt'), batch_size=10000, n_threads=4): # lemmatize the text, removing punctuation and whitespace unigram_review = [token.lemma_ for token in parsed_review if not punct_space(token)] # apply the first-order and second-order phrase models bigram_review = bigram_model[unigram_review] trigram_review = trigram_model[bigram_review] # remove any remaining stopwords ''' Variant A: ''' #trigram_review = [term for term in trigram_review # if term not in STOP_WORDS]#spacy.en.STOPWORDS] !!!!! CHECK THIS !!!!! module 'spacy' has no attribute 'en' ''' Variant B: ''' trigram_review = [term for term in trigram_review if term in dictionary_]# # write the transformed review as a line in the new file trigram_review = u' '.join(trigram_review) f.write(trigram_review + '\n') ''' print(u'Original:' + u'\n') for review in it.islice(line_review('review_text_all.txt'), 11, 12): print(review) print(u'----' + u'\n') print(u'Transformed:' + u'\n') with codecs.open('trigram_transformed_reviews_all.txt', encoding='utf_8') as f: for review in it.islice(f, 11, 12): print(review) ''' #trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict') if 0 == 1: trigram_reviews = LineSentence('data/trigram_transformed_reviews_all.txt') # learn the dictionary by iterating over all of the reviews trigram_dictionary = Dictionary(trigram_reviews) # filter tokens that are very rare or too common from # the dictionary (filter_extremes) and reassign integer ids (compactify) trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)#,keep_n=100000)#,) trigram_dictionary.compactify() trigram_dictionary.save('data/trigram_dict_all.dict') # load the finished dictionary from disk trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict') #trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm') if 0 == 1: # generate bag-of-words representations for # all reviews and save them as a matrix MmCorpus.serialize('data/trigram_bow_corpus_all.mm', trigram_bow_generator(trigram_dictionary,'data/trigram_transformed_reviews_all.txt')) # load the finished bag-of-words corpus from disk trigram_bow_corpus = MmCorpus('data/trigram_bow_corpus_all.mm') #lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all') if 0 == 1: with warnings.catch_warnings(): warnings.simplefilter('ignore') # workers => sets the parallelism, and should be # set to your number of physical cores minus one lda = LdaMulticore(trigram_bow_corpus, num_topics=15, id2word=trigram_dictionary, workers=1) lda.save('data/lda_model_all') # load the finished LDA model from disk lda = LdaMulticore.load('data/lda_model_all') #explore_topic(lda, topic_number=1) topic_names = {0:u'Risk Management Bank', 1:u'Big Data Report', 2:u'Automotive SAP', 3:u'Microsoft Java Scrum', 4:u'Medical Consultant', 5:u'Java Engineer', 
6:u'Computer Vision Developer', 7:u'Data Analyst', 8:u'BI SAP BW', 9:u'IOT Reporting R', 10:u'Global Project Presentation', 11:u'Cloud Engineer IOT', 12:u'Industry 4.0', 13:u'Risk Consulting', 14:u'Machine Learning Data Science'} #topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl') with open('data/topic_names.pkl', 'wb') as f: pickle.dump(topic_names, f) #load sameple_review from database #sample_review = get_sample_review(10) #lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, sample_review) #LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared') if 0 == 1: #term_ix = np.sort(topic_info.index.unique().values) LDAvis_prepared = pyLDAvis.gensim_.prepare(lda, trigram_bow_corpus, trigram_dictionary) with open('data/ldavis_prepared', 'wb') as f: pickle.dump(LDAvis_prepared, f) ''' export LDA file ''' # load the pre-prepared pyLDAvis data from disk with open('data/ldavis_prepared', 'rb') as f: LDAvis_prepared = pickle.load(f) with open('data/DSJobs_LDA.html', 'w') as f: pyLDAvis.save_html(LDAvis_prepared, f)
def run_training_batch(self, batch, batch_idx): """ :param batch: dict; contains three keys: input_ids, attention_mask, decoder_input_ids Example for 'batch': batch: {'input_ids': tensor([[ 0, 36, 230, ..., 8, 41, 2]]), 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1]]), 'decoder_input_ids': tensor([[ 0, 287, 10, 2107, 111, 10468, 226, 47385, 11579, 1012, 2156, 5, 5302, 47385, 281, 47385, 10003, 255, 47385, 347, 111, 2107, 47385, 574, 47385, 1000, 47385, 398, 47385, 245, 16, 10, 205, 1374, 12576, 479, 646, 1000, 1215, 3388, 510, 742, 85, 128, 579, 65, 9, 5, 357, 3092, 23, 63, 1836, 11, 5, 3555, 111, 672, 2156, 26180, 47385, 642, 111, 3547, 4120, 479, 646, 1000, 1215, 3388, 510, 742, 7192, 8806, 10262, 3444, 7951, 2170, 1318, 2]])} :param batch_idx: number of batch :return: """ # load tokenizer tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') # load config for GSM config = yaml_load(f"{self.default_root_dir}/data/config/gsm.yaml") # load dict dictionary = Dictionary.load(datapath('dict-www-cnndm-unigram')) # remove [SEP] sep_list = [ '[SEP_0]', '[SEP_1]', '[SEP_2]', '[SEP_3]', '[SEP_4]', '[SEP_5]', '[SEP_6]', '[SEP_7]', '[SEP_8]', '[SEP_9]', '<S_SEP>' ] # vocab size for topic modeling vocab_size = len(dictionary) # model config['hidden']['features'][0] = vocab_size # trainer batch config['trainer_batch']['test_sample'] = 1 config = extend_config_reference(config) gsm_trainer = config['GSMtrainer'] gsm_trainer[ 'base_dir'] = f"{self.default_root_dir}/log/bart-large-cnn-finetune" gsm_trainer = GSMTrainer.from_config(gsm_trainer) # number of topics K = config['gsmtopic']['k'] # yaml_dump(gsm_trainer, # os.path.join(f"{self.default_root_dir}/log/bart-large-cnn-finetune", "gsm_trainer.yaml")) # ----------------------------------------- # Topic Modeling - GSM # ----------------------------------------- batch_size = batch['input_ids'].size()[0] docs = [] for batch_num in range(batch_size): # extract the batch_sentence batch_sentence = tokenizer.decode( batch['input_ids'][batch_num].tolist(), skip_special_tokens=True) # change to lowercase and split to list batch_sentence_list = batch_sentence.split(" ") # remove [SEP] batch_sentence_list_nosep = [ item for item in batch_sentence_list if item not in sep_list ] text = ' '.join([x for x in batch_sentence_list_nosep]) fine_text = text.replace(' ##', '').lower() batch_sentence = re.sub(r'[^\w\s]', '', fine_text) # batch_sentence: change to the cleaned news for topic modeling # change to training data format in topic modeling gsm_data_bow = dictionary.doc2bow(batch_sentence.split(" ")) docs.append(gsm_data_bow) # gsm_data: data for topic modeling gsm_data = DataLoader(DocDataset(docs, len(dictionary), device='cuda'), batch_size=config['dataset']['batch_size'], drop_last=False, num_workers=0) gsm_trainer.__dict__['train_iterator'] = gsm_data gsm_loss, gsm_p = gsm_trainer.co_train(vocab_size, training=True) del gsm_data # track grad norms grad_norm_dic = {} # track all metrics for callbacks batch_callback_metrics = [] # track metrics to log batch_log_metrics = [] if batch is None: return AttributeDict(signal=0, grad_norm_dic=grad_norm_dic) # Batch start events with self.profiler.profile('on_batch_start'): # callbacks self.on_batch_start() # hooks if self.is_function_implemented('on_batch_start'): response = self.get_model().on_batch_start(batch) if response == -1: return AttributeDict(signal=-1, grad_norm_dic=grad_norm_dic) splits = [batch] if self.truncated_bptt_steps is not None: model_ref = self.get_model() with 
self.profiler.profile('tbptt_split_batch'): splits = model_ref.tbptt_split_batch(batch, self.truncated_bptt_steps) self.hiddens = None for split_idx, split_batch in enumerate(splits): self.split_idx = split_idx for opt_idx, optimizer in self._get_optimizers_iterable(): # make sure only the gradients of the current optimizer's parameters are calculated # in the training step to prevent dangling gradients in multiple-optimizer setup. if len(self.optimizers) > 1: for param in self.get_model().parameters(): param.requires_grad = False for group in optimizer.param_groups: for param in group['params']: param.requires_grad = True # ------------------- # calculate loss # ------------------- beta = 0.01 opt_closure_result = self.optimizer_closure( split_batch, batch_idx, opt_idx, optimizer, self.hiddens, gsm_p, # topic distribution gsm_loss, # loss for topic modeling K, # number of topics beta, ) # ------------------------------ # POST forward bookkeeping # ------------------------------ batch_callback_metrics.append( opt_closure_result.training_step_output.callback_metrics) batch_log_metrics.append( opt_closure_result.training_step_output.log_metrics) self.add_progress_bar_metrics( opt_closure_result.training_step_output.pbar_on_batch_end) # track hiddens self.hiddens = opt_closure_result.hiddens # check if loss or model weights are nan if self.terminate_on_nan: self.detect_nan_tensors(opt_closure_result.loss) # track total loss for logging (avoid mem leaks) self.batch_loss_value.append(opt_closure_result.loss) # ------------------------------ # BACKWARD PASS # ------------------------------ # gradient update with accumulated gradients if (self.batch_idx + 1) % self.accumulate_grad_batches == 0: # backward grad_norm_dic = self.run_batch_backward_pass( split_batch, batch_idx, opt_idx, optimizer) # calculate running loss for display self.running_loss.append(self.batch_loss_value.mean()) # reset for next set of accumulated grads self.batch_loss_value.reset() # Batch end events with self.profiler.profile('on_batch_end'): # callbacks self.on_batch_end() # model hooks if self.is_function_implemented('on_batch_end'): self.get_model().on_batch_end() # collapse all metrics into one dict batch_log_metrics = { k: v for d in batch_log_metrics for k, v in d.items() } # track all metrics for callbacks self.callback_metrics.update( {k: v for d in batch_callback_metrics for k, v in d.items()}) result = AttributeDict( signal=0, grad_norm_dic=grad_norm_dic, batch_log_metrics=batch_log_metrics, training_step_output_for_epoch_end=opt_closure_result. training_step_output_for_epoch_end) return result
def train(self):
    questions = copy.copy(self.additional)
    for i, q1id in enumerate(self.trainset):
        question = self.trainset[q1id]
        if self.proctrain:
            q1 = [w.lower() for w in question['tokens']] if self.lowercase else question['tokens']
            q1 = self.remove_punctuation(q1) if self.punctuation else q1
            q1 = self.remove_stopwords(q1) if self.stop else q1
        else:
            q1 = question['tokens']
        questions.append(q1)

        duplicates = question['duplicates']
        for duplicate in duplicates:
            rel_question = duplicate['rel_question']
            if self.proctrain:
                q2 = [w.lower() for w in rel_question['tokens']] if self.lowercase else rel_question['tokens']
                q2 = self.remove_punctuation(q2) if self.punctuation else q2
                q2 = self.remove_stopwords(q2) if self.stop else q2
            else:
                q2 = rel_question['tokens']
            questions.append(q2)

            rel_comments = duplicate['rel_comments']
            for rel_comment in rel_comments:
                if self.proctrain:
                    q3 = [w.lower() for w in rel_comment['tokens']] if self.lowercase else rel_comment['tokens']
                    q3 = self.remove_punctuation(q3) if self.punctuation else q3
                    q3 = self.remove_stopwords(q3) if self.stop else q3
                else:
                    q3 = rel_comment['tokens']
                if len(q3) == 0:
                    q3 = ['eos']
                questions.append(q3)

    fname = 'transdict'
    if self.lowercase:
        fname += '.lower'
    if self.stop:
        fname += '.stop'
    if self.punctuation:
        fname += '.punct'
    if self.proctrain:
        fname += '.proctrain'
    fname += '.model'

    path = os.path.join(self.path, fname)
    if not os.path.exists(path):
        self.vocabulary = Dictionary(questions)
        self.vocabulary.save(path)
    else:
        self.vocabulary = Dictionary.load(path)

    self.w_C = compute_w_C(questions, self.vocabulary)  # background lm
    self.model = TRLM([], self.w_C, self.alignments, len(self.vocabulary),
                      alpha=self.alpha, sigma=self.sigma)

    del self.additional
    del self.trainset
comments_text = data['comment_text']
data.drop(['comment_text'], inplace=True, axis=1)

docs = lematize_comments(comments_text, nthreads=16)  # XXX Add phrasing

comments_dictionary = None
if doTrain:
    print("Creating dictionary....")
    comments_dictionary = Dictionary(docs)
    comments_dictionary.filter_extremes(no_below=10, no_above=0.3)
    comments_dictionary.compactify()
    comments_dictionary.save(FLAGS.dictFile)
else:
    print("Loading dictionary...")
    comments_dictionary = Dictionary.load(FLAGS.dictFile)

print("Converting to BOW vectors...")
comments_corpus = [comments_dictionary.doc2bow(d) for d in docs]

model_tfidf = None
if doTrain:
    print("Creating tfidf model...")
    model_tfidf = TfidfModel(comments_corpus)
    model_tfidf.save(FLAGS.tfidfFile)
else:
    print("Loading tfidf model...")
    model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

print("Converting to tfidf vectors...")
comments_tfidf = model_tfidf[comments_corpus]
STOPWORDS = f.readlines()
STOPWORDS = set([item.strip(string.whitespace) for item in STOPWORDS])
STOP_WORDS = STOP_WORDS.union(STOPWORDS)

# encodings: replace common mis-decoded characters
replace_dict = {
    '\ufb01': 'fi',
    '\u2019': '',
    '\u00e9': 'e',
    '\u00a8': '',
    'ямБ': 'fi',
}

# tfidf model
dct = Dictionary.load("../data/models/tfidf/dictionary.model")
tfidf = TfidfModel.load("../data/models/tfidf/tfidf.model")


def clean_chunk(chunk):
    result = []
    for token in chunk:
        # if token.text.lower() == 'the':
        #     print(token.text.lower().strip(), token.text.lower().strip() in STOP_WORDS)
parser.add_argument('-d', '--dictionary', metavar='PATH', default='dict.pk',
                    help="Pickled dictionary file (Gensim)")
parser.add_argument('-e', '--epochs', type=int, metavar='N', default=5,
                    help="Number of epochs to train for")
parser.add_argument('-b', '--batch_size', type=int, metavar='N', default=32,
                    help="Batch size used in training.")
parser.add_argument('-l', '--load', metavar='FILE', help="Load model from file.")
parser.add_argument('-s', '--save', metavar='FILE', help="Save model to file.")
parser.add_argument('-v', '--vector_size', metavar='SIZE', type=int, default=0,
                    help="Size of input vectors (if sequence of vectors)")

args = parser.parse_args()

dictionary = Dictionary.load(args.dictionary)

# Input dataset
data = pk.load(open(args.datafile, 'rb'))

model = RNNModel(vocab_size=len(dictionary), load=args.load,
                 vector_size=args.vector_size)

# Fit and test expect a Dataset object (they use the proper subset)
model.fit(data, epochs=args.epochs, batch_size=args.batch_size)
model.test(data, batch_size=args.batch_size)

if args.save:
    model.save(args.save)
def load_current_dictionary():
    return Dictionary.load(os.path.join(
        module_path, "models",
        f"dictionary_{datetime.now().strftime('%Y-%m-%d')}"))
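# A matching save routine is not part of the original snippet; a hypothetical counterpart,
# assuming dictionaries are rebuilt daily and stamped with the same date format:
def save_current_dictionary(dictionary):
    path = os.path.join(module_path, "models",
                        f"dictionary_{datetime.now().strftime('%Y-%m-%d')}")
    dictionary.save(path)
    return path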
def load(self):
    if os.path.exists(self.path):
        self.id2word = Dictionary.load(self.path)
review_txt_filepath = os.path.join('../Reviews', 'review_text_all.txt')
wrt_trigram_rvs_to_txt(trigram_reviews_filepath, review_txt_filepath,
                       trigram_model, bigram_model)

""" create bag of words """
trigram_reviews_filepath = os.path.join('results', 'trigram_transformed_reviews_all.txt')
trigram_dictionary_filepath = os.path.join('trigram_dict_all.dict')

learn_vocab_corpus(trigram_reviews_filepath, trigram_dictionary_filepath)

# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

trigram_bow_filepath = os.path.join('trigram_bow_corpus_all.mm')

create_bow(trigram_reviews_filepath, trigram_bow_filepath, trigram_dictionary)

# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

""" find topics """
lda_model_filepath = os.path.join('lda_model_all')

create_topics(lda_model_filepath, trigram_bow_corpus, trigram_dictionary)
def LDALoad(self):
    self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
    self.dictionary = Dictionary.load("lda_dictionary.model")
0.10):int(len(trash_tokens) * .90)]

cleared_docs = [[token for token in document if token in cleared_tokens]
                for document in cleared_docs]

## Save dictionary in serialized form
dictionary = Dictionary(cleared_docs)
dictionary.save('./dictionaries/python_tags.dict')

corpus = [dictionary.doc2bow(document) for document in cleared_docs]
MmCorpus.serialize('./dictionaries/python_tags.mm', corpus)

########################################
## Load Data
########################################

if (os.path.exists("./dictionaries/python_tags.dict")):
    dictionary = Dictionary.load('./dictionaries/python_tags.dict')
    corpus = MmCorpus('./dictionaries/python_tags.mm')
    print("Used dictionary generated")
else:
    print("Please run the preprocessing to generate a dictionary file")

########################################
## Create Model
########################################

print(corpus)

tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

########################################
## Applying LSI
########################################
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
# %matplotlib inline

from gensim.corpora import Dictionary, MmCorpus

trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')

# Document to matrix
import numpy as np
from scipy.sparse import csr_matrix

rows = []
cols = []
data = []
Nrow = 1000000  # len(trigram_bow_corpus)
Ncol = len(trigram_dictionary)

for i in range(0, Nrow):
    line = trigram_bow_corpus[i]
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)
# with open(files[0], 'r') as f:
#     s = json.load(f)
# pprint(s)

""" FILTER AND SAVE CORPUS """
print("---[" + "FILTER AND SAVE CORPUS" + "]---")

news = glob('news_corpus/*.txt')
corpus = BOWCorpus(news)
tfidf = TfidfModel(corpus)

filter_low_tfidf(corpus, tfidf)
del tfidf
gc.collect()

corpus.dictionary.save('bow_corpus.dict')
MmCorpus.serialize('bow_corpus.mm', corpus)

print("-" * 6)

""" """
from gensim.corpora import Dictionary

d = Dictionary.load('bow_corpus.dict')
pprint(d.token2id)
def convert_docid2from_from2docids(docid2from):
    from2docids = defaultdict(list)
    for docid, from_name in enumerate(docid2from):
        from2docids[from_name].append(docid)
    return from2docids


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.DEBUG)

    interval = WEEK  # only WEEK is implemented for now
    model = LdaModel.load('result/model_wiki.lda')
    dictionary = Dictionary.load('data/dictionary/report_(NN).dict')
    from2docids = convert_docid2from_from2docids(dictionary.docid2from)
    time2docids = sort_by_time(dictionary.docid2date, interval)
    p_z_d = model.inference(dictionary.corpus)[0].T
    # normalize to make it a probability distribution
    p_z_d = p_z_d / p_z_d.sum(axis=0).reshape(1, p_z_d.shape[1])

    # iterate over every interval
    from_similarity = {}
    for time in range(max(time2docids.keys())):
        print('\ncompute similarity for time = ' + str(time) + '...')
        from_vectors, from_frequencies = create_from_vectors(p_z_d, from2docids,
                                                             time2docids, time)
        from_matrix, from_indices = convert_from_vectors(from_vectors)
        similarities = compute_similarity(from_matrix)
        id_frequencies = convert_from_id(from_frequencies, from_indices)
phraser = Phraser(Phrases(transformed_paragraphs, **phrases_parameters))
phraser.save(phraser_filename.format(phraser_iteration + 1))
reader_kwargs['phraser'] = phraser
transformed_paragraphs = ArXMLivParagraphIterator(*reader_args, **phraser_reader_kwargs)

del transformed_paragraphs
reader_kwargs['phraser'] = phraser
paragraphs = ArXMLivParagraphIterator(*reader_args, **reader_kwargs)

try:
    dictionary = Dictionary.load(dictionary_filename)
except IOError:
    dictionary = Dictionary(paragraphs)
    dictionary.save(dictionary_filename)

try:
    topic_tfidf = TfidfModel.load(topic_tfidf_filename)
except IOError:
    topic_tfidf = TfidfModel(dictionary=dictionary, smartirs='dtb', slope=0.2)
    topic_tfidf.save(topic_tfidf_filename)

try:
    document_tfidf = TfidfModel.load(document_tfidf_filename)
except IOError:
def LDA_Review(review_df, min_topic_freq=0):
    """Take a Pandas DataFrame as input, with one review as a text string per row.

    Accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-processing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and (5) print a
    sorted list of the top topics in the LDA representation.
    """
    from tqdm import tqdm

    text = review_df['FullReview']

    # parse the review text with spaCy and write one lemmatized sentence per line
    with codecs.open('./uni_temporary.txt', 'w', encoding='utf_8') as f:
        for sentence in tqdm(lemmatized_sentence_corpus(text)):
            f.write(sentence + '\n')

    # load and apply the first-order and second-order phrase models
    bigram_model = Phrases.load('./models2/bigram_model.txt')
    trigram_model = Phrases.load('./models2/trigram_model.txt')

    unigram_review = LineSentence('./uni_temporary.txt')
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]

    # remove any remaining stopwords, token by token within each sentence
    trigram_review = [
        [term for term in sentence
         if term not in spacy.lang.en.stop_words.STOP_WORDS]
        for sentence in trigram_review
    ]

    with codecs.open('./tri_temporary.txt', 'w', encoding='utf_8') as ftri:
        for sentence in trigram_review:
            ftri.write(u' '.join(sentence) + '\n')

    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
    lda = LdaMulticore.load('./models2/lda_model')

    trigram_review = LineSentence('./tri_temporary.txt')

    # create a bag-of-words representation for the whole review
    review_bow = trigram_dictionary.doc2bow(
        [term for sentence in trigram_review for term in sentence])

    # create an LDA representation
    review_lda = lda.get_document_topics(review_bow)
    review_lda = sorted(review_lda, key=itemgetter(1), reverse=True)

    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
        # print the most highly related topic names and frequencies
        print('{:25} {}'.format(lda_topics[topic_number], round(freq, 3)))


### Step 2: Generate the contents of the doctors' snapshots.
counter = 0

# The temporary list that stores all of the review highlights in each round of the loop below.
big_str = []

# For every doctor, find two things:
# 1. The most mentioned FIVE topics in their reviews.
#    1.1 The sentiments of these topics.
# 2. The 3 most positive sentences and the 3 most negative sentences.
#    2.1 Rank all sentences according to sentiment analysis.
# Information about individual reviews is NOT kept. All sentences are stored in
# one long list regardless of whether they come from the same review or not.

###########################################################################
# Build the sentence dataframe for the current doctor.
###########################################################################
this_hotel = pd.DataFrame(columns=[
    "HotelName", "Sentence", "Sentiment_neg", "Sentiment_neu",
    "Sentiment_pos", "Sentiment_compound", "topic_1", "topic1_score",
    "topic_2", "topic2_score"
])
sent_count = 0

# For every review sentence
for sentence in unigram_review:
    # Assess sentiment.
    sentiments = senti.polarity_scores(sentence)
    sentiment_neg = sentiments["neg"]
    sentiment_neu = sentiments["neu"]
    sentiment_pos = sentiments["pos"]
    sentiment_compound = sentiments["compound"]

    # Assign topic; default to -1 (no dominant topic).
    this_topic = -1

    # Preprocess the sentence.
    sent_tokens = tokenizer.tokenize(str(sentence).lower())
    cleaned_sent = [p_stemmer.stem(i) for i in sent_tokens]

    # Evaluate for topic: score the sentence against every LDA model.
    sent_topics = []
    for mod_id in range(0, mod_num):
        model = ldamodel[mod_id]
        dicti = dictionary[mod_id]
        lda_score = model[dicti.doc2bow(cleaned_sent)]
        for item in lda_score:
            sent_topics.append((mod_id, item[0], item[1]))
    sent_topics = sorted(sent_topics, key=lambda x: x[2], reverse=True)

    # Assign the most relevant topic to a sentence only if the topic is more than 70% dominant.
    if sent_topics[0][2] > 0.7:
        this_topic = topics_matrix[sent_topics[0][0]][sent_topics[0][1]]

    # Add the processed sentence and its meta information to the sentence dataframe.
    this_doc.loc[sent_count] = [
        sentence, sentiment, this_topic, sent_topics[0][2]
    ]
    sent_count += 1

###########################################################################
# Compile the results for a hotel.
###########################################################################
# Review highlights: save the most positive and most negative sentences.
this_doc2 = this_doc.sort_values(["sentiment"], ascending=[0]).reset_index(drop=True)
this_doc2 = this_doc2.loc[this_doc2["topic"] != -1].reset_index(drop=True)
this_doc2 = this_doc2.loc[this_doc2["topic_score"] > 0.5].reset_index(drop=True)
sent_count_2 = len(this_doc2)

composite = "NONE"
# Save the most polarizing sentences only if there are at least 6 sentences.
if sent_count_2 > 5:
    sent1 = sent2 = sent3 = sent4 = sent5 = sent6 = ""
    # Only keep a positive sentence if its score is above 0.4 (within [-1, 1]).
    if this_doc2.loc[0]["sentiment"] > 0.4:
        sent1 = this_doc2.loc[0]["sentence"]
    if this_doc2.loc[1]["sentiment"] > 0.4:
        sent2 = this_doc2.loc[1]["sentence"]
    if this_doc2.loc[2]["sentiment"] > 0.4:
        sent3 = this_doc2.loc[2]["sentence"]
    # Only keep a negative sentence if its score is below -0.2 (within [-1, 1]).
    if this_doc2.loc[sent_count_2 - 1]["sentiment"] < -0.2:
        sent4 = this_doc2.loc[sent_count_2 - 1]["sentence"]
    if this_doc2.loc[sent_count_2 - 2]["sentiment"] < -0.2:
        sent5 = this_doc2.loc[sent_count_2 - 2]["sentence"]
    if this_doc2.loc[sent_count_2 - 3]["sentiment"] < -0.2:
        sent6 = this_doc2.loc[sent_count_2 - 3]["sentence"]
    composite = ("SSEEPP".join([sent1, sent2, sent3, sent4, sent5, sent6])
                 + "SSEEPP" + str(sent_count))

# Add the review highlights to the doctor dataframe.
doctor_info.set_value(doctor_id, "summary", composite)

# Top topics and their ratings.
# A rating is the percentage of positive sentences belonging to a topic.
doc_topics = [[0 for i in range(2)] for j in range(topic_num)]  # [total count, positive count]
for index2 in range(0, len(this_doc2)):
    topic_index = this_doc2.loc[index2]["topic"]
    if topic_index != -1:
        doc_topics[topic_index][0] += 1
        topic_sentiment = this_doc2.loc[index2]["sentiment"]
        # A topic sentence is positive if its sentiment is bigger than 0.1.
        if topic_sentiment > 0.1:
            doc_topics[topic_index][1] += 1

# Do not display dentist topics for non-dentists.
if not is_dentist:
    doc_topics[3][0] = 0
# Do not output "positive comment" as a topic. It is non-informative.
doc_topics[0][0] = 0

# Put the results into a format to be parsed by the webapp.
doc_topic_tuples = []
for index3, item in enumerate(doc_topics):
    doc_topic_tuples.append((index3, item[0], item[1]))
doc_topic_tuples = sorted(doc_topic_tuples, key=lambda x: x[1], reverse=True)

for index4 in range(0, 5):
    if doc_topic_tuples[index4][1] >= 10:
        topic_name = topics[doc_topic_tuples[index4][0]][0]
        percent_positive = str(
            int(doc_topic_tuples[index4][2] / doc_topic_tuples[index4][1] * 100))
        composite = (topic_name + "SSEEPP" + percent_positive + "SSEEPP"
                     + str(doc_topic_tuples[index4][1]))
        doctor_info.set_value(doctor_id, "percent{0}".format(str(index4 + 1)), composite)
        print(topic_name, "XXXXXX", doctor_info.loc[doctor_id]["specialty"])
        big_str.append(topic_name + "XXXXXX"
                       + str(doctor_info.loc[doctor_id]["specialty"]))
    else:
        doctor_info.set_value(doctor_id, "percent{0}".format(str(index4 + 1)), "NONE")

# Print progress.
print(counter / 5088)
counter += doctor_review_count

del this_doc
del this_doc2
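A hypothetical invocation of LDA_Review, for orientation only; the CSV path and the presence of a 'FullReview' column are assumptions, not part of the original script.

# hypothetical usage sketch -- file path and column name are assumptions
import pandas as pd

sample_reviews = pd.read_csv('./data/reviews_sample.csv')  # must contain a 'FullReview' column
LDA_Review(sample_reviews, min_topic_freq=0.05)            # print topics with at least 5% weight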
    help='language of data', type=check_lang)
parser.add_argument(
    '-f', '--filter',
    dest='filter',
    action='store_true',
    help='remove infrequent and overly frequent words from the dictionary')
parser.set_defaults(lang='en', filter=True)
args = parser.parse_args()

logging.info("Creating training corpora from data in directories: %s" % args.dirs)
logging.info("Language: %s" % args.lang)

dictionary = None if not os.path.exists(dictionary_file) else Dictionary.load(dictionary_file)

# Create BoW corpus and dictionary
logging.info("Creating BoW corpus...")
training_corpus = BowNewsCorpus(input=args.dirs, dictionary=dictionary, language=args.lang)
dictionary = training_corpus.dictionary

if args.filter:
    logging.info("Filtering dictionary...")
    # https://onlinelibrary.wiley.com/doi/epdf/10.1111/j.1756-8765.2010.01108.x
    dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=2000000)
    dictionary.compactify()

# Serialize pre-processed BoW corpus and dictionary to files
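The serialization announced by the final comment is not shown in the snippet; this is a minimal sketch, assuming BowNewsCorpus streams bag-of-words vectors and using an assumed output file name for the corpus.

from gensim.corpora import MmCorpus

# 'training_corpus.mm' is an assumed file name; dictionary_file comes from the script above
dictionary.save(dictionary_file)
MmCorpus.serialize('training_corpus.mm', training_corpus)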
import itertools as it
import warnings
import time

from gensim.corpora import Dictionary
from gensim.models.word2vec import LineSentence

trigram_posts_file = 'trigram_posts.txt'
trigram_dict_file = 'trigram_dict.dict'

trigram_posts = LineSentence(trigram_posts_file)

# learn the dictionary by iterating over all of the posts
trigram_dictionary = Dictionary(trigram_posts)

# filter tokens that are very rare or too common, then reassign ids
trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
trigram_dictionary.compactify()
trigram_dictionary.save(trigram_dict_file)

trigram_dictionary = Dictionary.load(trigram_dict_file)
print(trigram_dictionary)  # 34,487 unique tokens

trigram_threads_bow_file = 'trigram_threads_bow_corpus.mm'
trigram_users_bow_file = 'trigram_users_bow_corpus.mm'


def trigram_bow_generator(filepath):
    """Generator function to read posts from a file and yield a bag-of-words representation."""
    for post in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(post)
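With the generator in place, the two BoW corpora named above can be streamed to disk; a hedged sketch, assuming the trigram-transformed posts live in per-collection text files (the input file names are assumptions).

from gensim.corpora import MmCorpus

# assumed input files with one trigram-transformed post per line
MmCorpus.serialize(trigram_threads_bow_file, trigram_bow_generator('trigram_threads.txt'))
MmCorpus.serialize(trigram_users_bow_file, trigram_bow_generator('trigram_users.txt'))

# the serialized corpora can then be streamed back without re-processing
trigram_threads_bow = MmCorpus(trigram_threads_bow_file)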
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from extractor import Document
from os import getcwd

docs_file = "data/0607.head.uc.head"
docs = []
line_counter = 0

with open(docs_file) as f:
    for line in f:
        splits = line.strip().split('\t')
        assert len(splits) == 2, len(splits)
        content = splits[-1]
        docs.append(Document(content).get_string_clean())
        line_counter += 1
        if line_counter == 200:
            break

for i in range(3):
    print(i, docs[i])
    print("=============================================")

doc_clean = [doc.split() for doc in docs]

path_dictionary = getcwd() + "/data/dictionary"
path_ldamodel = getcwd() + "/data/ldamodel"

dictionary = Dictionary.load(path_dictionary)
lda_load = LdaModel.load(path_ldamodel)

unseen_doc = dictionary.doc2bow(doc_clean[-1])
vector = lda_load[unseen_doc]
print(vector)
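For context, a hedged sketch of how the artefacts at path_dictionary and path_ldamodel could be rebuilt from doc_clean; num_topics and passes are assumptions.

# assumption: rebuild the saved dictionary and LDA model from the cleaned documents
dictionary = Dictionary(doc_clean)
corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=5)
dictionary.save(path_dictionary)
lda.save(path_ldamodel)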
import codecs
import numpy as np
from gensim.corpora import Dictionary
from gensim.utils import tokenize


def doc_processing(documents, stopwordsFilePath='', thres=10, doc=False, dicPath=False):
    # read the stopwords file
    if stopwordsFilePath != '':
        file = codecs.open(stopwordsFilePath, 'r', 'utf-8')
        stopwords = [line.strip() for line in file]
        file.close()
    else:
        stopwords = ''

    if dicPath:
        dictionary = Dictionary.load('../dictionary_2corpora.dic')
        # file = codecs.open(dicPath, 'r', 'utf-8')
        # dictionary = [line.strip() for line in file]
        # file.close()

    N = len(documents)
    wordCounts = []
    word2id = {}
    id2word = {}
    currentId = 0
    my_punctuation = '!"#$%&\'()*+,-./:;<=>?@[]^_`{|}~'

    # generate the word2id and id2word maps and count how many times each word occurs in the documents
    # bigram = gensim.models.Phrases(documents)
    # documents = bigram[documents]
    documents_ = []
    for i, document in enumerate(documents):
        if doc == False:
            words_in_sent = tokenize(document, deacc=False)
            wordCount = {}
            for word in words_in_sent:
                if len(word) > 1 and word not in stopwords and word not in my_punctuation:
                    if word not in word2id.keys():
                        word2id[word] = currentId
                        id2word[currentId] = word
                        currentId += 1
                    if word in wordCount:
                        wordCount[word] += 1
                    else:
                        wordCount[word] = 1
            wordCounts.append(wordCount)
        else:
            doc = []
            words_in_sent = tokenize(document, deacc=False)
            for word in words_in_sent:
                if dicPath:
                    if word in dictionary:
                        doc.append(word)
                elif len(word) > 1 and word not in stopwords and word not in my_punctuation:
                    doc.append(word)
            documents_.append(doc)

    if doc == False:
        word2id_ = {}
        id2word_ = {}
        M = len(word2id)

        # generate the document-word matrix
        X = np.zeros([N, M], dtype=np.int8)
        for i in range(N):
            for word in wordCounts[i]:
                j = word2id[word]
                # I do not know why this happens, but occasionally a (random) negative
                # number appears when the word does not occur; clamp it to zero.
                if wordCounts[i][word] < 0:
                    wordCounts[i][word] = 0
                X[i, j] = wordCounts[i][word]

        # Drop words at the extremes (too rare or too frequent).
        X2 = []
        for w in range(X.shape[1]):
            thres_up = X.shape[0] * 10
            if thres <= np.sum(X[:, w]) < thres_up:
                X2.append(X[:, w])
                word = id2word[w]
                word2id_[word] = word2id[word]
                id2word_[len(X2) - 1] = word
        X2 = np.array(X2)
        X2 = X2.T
        M = X2.shape[1]
        print('Dictionary size: %d' % M)
        return N, M, word2id_, id2word_, X2
    else:
        return documents_, stopwords
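A hypothetical call illustrating the two modes of doc_processing; the documents list and parameter values are assumptions chosen only for illustration.

# assumed inputs, for illustration only
documents = ["the spacecraft shall downlink telemetry daily",
             "the payload shall operate within thermal limits"]

# matrix mode: returns the document-word count matrix and the pruned vocabulary maps
N, M, word2id, id2word, X = doc_processing(documents, stopwordsFilePath='', thres=1)

# token mode: returns the filtered token lists and the stopword list
token_docs, stopwords = doc_processing(documents, stopwordsFilePath='', doc=True)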
from gensim.models import TfidfModel
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from util import *
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
import time

parquetpath = './dataset/final/'

trigram = Phraser.load("./vocab/trigram")
bigram = Phraser.load("./vocab/bigram")
dct = Dictionary.load("./gensim_dct")

reviews = pd.read_parquet(path=parquetpath)
reviews = reviews[reviews["length"] > 5].sample(10000)

# balance positive and negative reviews before vectorising
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(reviews[["text"]], reviews["sentiment"])
X_resampled = pd.DataFrame(X_resampled)
y_resampled = pd.DataFrame(y_resampled)
X_resampled.columns = reviews[["text"]].columns
y_resampled.columns = reviews[["sentiment"]].columns

model = TfidfModel(dictionary=dct)

t0 = time.time()
X_csc = apply_tfidf(dct, model, X_resampled, bigram, trigram)
t1 = time.time()
print("Applied tfidf:", t1 - t0)

# use SVD only
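A minimal sketch of the "SVD only" step hinted at by the last comment, assuming X_csc is a sparse (or array-like) tf-idf matrix returned by apply_tfidf; the number of components is an assumption.

# n_components is an assumption; X_csc is the tf-idf matrix produced above
svd = TruncatedSVD(n_components=100, random_state=42)
X_svd = svd.fit_transform(X_csc)
print("cumulative explained variance:", svd.explained_variance_ratio_.sum())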
    for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)):
        token_f = [x for x in a_tweet.tokens_str.split(",") if len(x) > 1]
        dct.add_documents([token_f], prune_at=None)
    sizeofCorpus = i - 1
    print(f"In total, {sizeofCorpus} tweets in {each_collection}.")
    print("Original size of vocabs: {}".format(len(dct)))

    # control the vocabulary
    dct.filter_extremes(no_below=40, no_above=0.5, keep_n=len(dct), keep_tokens=None)
    print("Truncated size of vocabs: {}".format(len(dct)))
elif preDictTag is not None:
    dct = Dictionary.load('{}{}.dict'.format(corpora_path, preDictTag))

#### Step 2, apply Tf-IDF representation ####
bow_corpus = []
meta_wf = open("{}{}-Meta.csv".format(corpora_path, fileTag), "w")
meta_wf.write("position_index,id_str,created_time\n")

# use Timer to print the elapsed time
with Timer():
    for each_collection in collections:
        print("Transforming the corpus for {}".format(each_collection))
        file_path = f"{corpora_path}{each_collection}-raw-corpus.tsv"
        for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)):
            # gensim's Dictionary.doc2bow ignores words that are not in the dictionary by default
            bow_per_doc = dct.doc2bow(a_tweet.tokens_str.split(","))
            if len(bow_per_doc) > 4:
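The snippet breaks off before the filtered documents are collected. A hedged sketch of how Step 2 could conclude once bow_corpus is populated; the output file name is an assumption, not part of the original pipeline.

# hedged sketch (assumption): fit TF-IDF over the collected BoW corpus and serialize it
from gensim.models import TfidfModel
from gensim.corpora import MmCorpus

tfidf = TfidfModel(bow_corpus)                     # learn IDF weights from the BoW corpus
tfidf_corpus = [tfidf[doc] for doc in bow_corpus]  # re-weight every document
MmCorpus.serialize(f"{corpora_path}{fileTag}-TfIdf-corpus.mm", tfidf_corpus)
meta_wf.close()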
def __init__(self, path='model'):
    self.ldamodel = LdaModel.load(path + "/fixed_time_window_lda.model")
    self.dictionary = Dictionary.load(path + "/lda_dictionary.model")
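This constructor presumably belongs to a small inference wrapper; a hedged sketch of a companion method (the method name and the expectation of pre-tokenised input are assumptions, not part of the original class).

# hypothetical companion method for the wrapper initialised above
def get_topics(self, tokens):
    """Return the topic distribution for an already-tokenised document."""
    bow = self.dictionary.doc2bow(tokens)
    return self.ldamodel.get_document_topics(bow)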
def categorisation(semi, model_name, category, update):
    '''Apply a pre-trained LDA model to a set of space mission design requirements.

    Inputs:
    - semi: if True, the model used for the categorisation is semi-supervised
    - model_name: name of the saved model to load, usually in the format
      'model_topicNumber'; all saved models can be found under LDAmodels
    - category: category of requirements to use for the categorisation test, one of the
      following options, found in Corpora/requirementsCorpus: 'AOCS', 'com', 'environment',
      'GS', 'Launch', 'MA', 'OBDH', 'payload', 'Power', 'prop', 'thermal'
    - update: if True, the unsupervised LDA model retained is updated with the Update
      corpus found in Corpora/updateCorpus, for the chosen category.

    Outputs: the Accuracy Score and Mean Reciprocal Ranking of the categorisation

    CAREFUL 1: LDA model generation being a stochastic process, in the case of an updated
    model the user needs to manually label the topic dictionaries and save them as .txt
    under TopicModeling/inputs4Categorisation. See the labels txt files for the LDA models
    in this folder as examples.
    CAREFUL 2: The same applies to the semi-supervised and unsupervised models. Manual
    label files are provided for the unsupervised and semi-supervised models used in the
    paper, but newly trained models require new labels each time.
    CAREFUL 3: Some modifications in the requirement pre-processing may have changed the
    semi-supervised model result w.r.t. the original paper presented at IAC 2019.'''

    start = time.time()

    # Unsupervised LDA model case
    if not semi:
        # Load LDA model and corresponding dictionary ----------------------------------------
        ldaModel = parentDir + '/TopicModeling/LDAmodels/unsupervised/' + str(model_name)
        lda = models.ldamodel.LdaModel.load(ldaModel)
        print('Model Topics Number:', lda.num_topics)
        dic = parentDir + '/TopicModeling/LDAmodels/unsupervised/dic_' + str(model_name) + '.dict'
        modelDic = Dictionary.load(dic)

        # Recreate the topic dictionaries -----------------------------------------------------
        ldaTopics = lda.show_topics(formatted=False, num_topics=lda.num_topics, num_words=15)
        print('Loaded LDA Topics Dictionaries, top 15 words:', *ldaTopics, sep='\n')

        # Get manual labels --------------------------------------------------------------------
        labels = []
        with open(parentDir + '/TopicModeling/inputs4Categorisation/manualLabels_' + model_name + '.txt',
                  'r', encoding="utf-8") as labelsFile:
            labelLine = labelsFile.read().split('\n')
            for line in labelLine:
                if line:
                    labels.append(line.split(', '))
        labels = [[int(label[0]), label[1]] for label in labels]
        labels = list(itertools.chain.from_iterable(labels))
        print('\n Loaded Model Labels:', labels)

        if update:  # Updated unsupervised LDA model case
            # Update the LDA model with wikipedia pages focused on one topic -------------------
            print('\n Generating a specific LDA model for category', category, ':')
            # Only currently available for GS (Ground Segment), Launch, MA (Mission Analysis),
            # OBDH and payload categories
            filepath = parentDir + '/TopicModeling/Corpora/updateCorpus/' + category + '_update/'

            # Pre-process the .json docs into tokens
            reqdoc = corpusProcessing(filepath)
            # Use the LDA model dictionary to transform them into the document-term matrix
            # understood by the model
            addcorpus = [modelDic.doc2bow(text) for text in reqdoc]
            # Update the model
            lda.update(addcorpus, passes=600, offset=1500)

            # Print the new dictionary of topics
            ldaTopics = lda.show_topics(formatted=False, num_topics=lda.num_topics, num_words=15)
            print('\n LDA Topics after update', *ldaTopics, sep='\n')

            # Get manual labels -----------------------------------------------------------------
            labels = []
            with open(parentDir + '/TopicModeling/inputs4Categorisation/manualLabels_' + model_name + '_' + category + '.txt',
                      'r', encoding="utf-8") as labelsFile:
                labelLine = labelsFile.read().split('\n')
                for line in labelLine:
                    if line:
                        labels.append(line.split(', '))
            labels = [[int(label[0]), label[1]] for label in labels]
            labels = list(itertools.chain.from_iterable(labels))
            print('\n Labels:', labels)

    else:  # Semi-supervised LDA model case
        # Load LDA model and corresponding dictionary
        ldaModel = parentDir + '/TopicModeling/LDAmodels/semisupervised/guided' + str(model_name)
        lda = models.ldamodel.LdaModel.load(ldaModel)
        print('topics number:', lda.num_topics)
        dic = parentDir + '/TopicModeling/LDAmodels/semisupervised/dic_guided' + str(model_name) + '.dict'
        modelDic = Dictionary.load(dic)

        # Recreate the topic dictionaries
        ldaTopics = lda.show_topics(formatted=False, num_topics=lda.num_topics, num_words=20)
        print('LDA Topics ', *ldaTopics, sep='\n')

        # Get manual labels --------------------------------------------------------------------
        labels = []
        with open(parentDir + '/TopicModeling/inputs4Categorisation/manualLabels_' + model_name + '_semisupervised.txt',
                  'r', encoding="utf-8") as labelsFile:
            labelLine = labelsFile.read().split('\n')
            for line in labelLine:
                if line:
                    labels.append(line.split(', '))
        labels = [[int(label[0]), label[1]] for label in labels]
        labels = list(itertools.chain.from_iterable(labels))
        print('\n Labels:', labels)

    # Get the test requirements list -------------------------------------------------------------
    requirementsList = []
    with open(parentDir + '/TopicModeling/Corpora/requirementsCorpus/req_' + category + '.txt',
              'r', encoding="utf-8") as filteredList:
        requirements = filteredList.read().split('\n')
        for req in requirements:
            if req:
                requirementsList.append(req.split(" | "))

    # Categorisation ------------------------------------------------------------------------------
    gt = []
    allResults = []
    all_req = []
    for item in requirementsList:
        req = item[0]
        gt.append(item[1])
        # pre-process the requirement
        req = NLPPipe(req)
        all_req.append(req)
        # Use the same dictionary as the pre-trained model to convert the list of words
        # into bag-of-words format
        unseen_doc = modelDic.doc2bow(req)
        # Get the topic probability distribution for the unseen document
        vector = lda[unseen_doc]
        sorted_vector = sorted(vector, key=itemgetter(1), reverse=True)
        # Threshold: keep the top 2 associated topics, with their probabilities
        results = list(map(list, sorted_vector[0:2]))
        # Associate the top results with the manually assigned labels
        for res in results:
            res[0] = labels[labels.index(res[0] + 1) + 1]
        allResults.append(results)

    # print('\n All requirements:\n', *all_req, sep='\n')
    print('\n All Results for category', category, ' :')
    print(len(requirementsList), ' requirements were analysed.')
    print(*allResults, sep='\n')

    # Categorisation evaluation -------------------------------------------------------------------
    # For each requirement i we have the ground truth gt[i] and the LDA model topic
    # distribution allResults[i].

    # Accuracy calculation
    firstChoice = [item[0][0] for item in allResults]
    firstChoiceAccuracy = accuracy_score(gt, firstChoice)
    print('First Choice Accuracy : ', firstChoiceAccuracy)

    # Mean Reciprocal Ranking
    bigScore = 0
    for i, item in enumerate(allResults):
        score = 0
        if item[0][0] == gt[i]:
            score = 1
        elif len(item) > 1:
            if item[1][0] == gt[i]:
                score = 0.5
        bigScore += score
    meanReciprocalrank = bigScore / len(requirementsList)
    print('Mean Reciprocal Rank : ', meanReciprocalrank, '\n ---------')

    print('Computation Time:', round((time.time() - start) / 60, 2), 'minutes')
    return
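A hypothetical invocation of categorisation; the model name is an assumption (saved models live under TopicModeling/LDAmodels, as the docstring explains).

# hypothetical call -- 'model_8' is an assumed model name, not one shipped with the repo
categorisation(semi=False, model_name='model_8', category='MA', update=False)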