def testMerge(self):
    """Merging dictionaries built from two slices of the texts covers the full vocabulary."""
    full = Dictionary(self.texts)
    head = Dictionary(self.texts[:3])
    tail = Dictionary(self.texts[3:])

    # After the merge, `head` is expected to know every token of both slices.
    head.merge_with(tail)

    expected_tokens = sorted(full.token2id.keys())
    merged_tokens = sorted(head.token2id.keys())
    self.assertEqual(expected_tokens, merged_tokens)
def build_dictionary(db):
    """Build a Dictionary from the ``clean_text`` field of every article.

    :param db: object exposing an ``articles`` collection with a ``find()``
        method that yields mappings containing a ``clean_text`` entry.
    :return: the populated ``Dictionary``.
    """
    vocabulary = Dictionary()
    for record in db.articles.find():
        # Only the side effect matters here: with allow_update=True the call
        # is used to grow the vocabulary; the returned bag-of-words is dropped.
        vocabulary.doc2bow(record['clean_text'], allow_update=True)
    return vocabulary
def similarity_matrix(self):
    """Test similarity_matrix returns expected results."""
    # NOTE(review): inside this class ``self.similarity_matrix`` looks like it
    # resolves to this very method (which takes no extra arguments), and the
    # method name lacks the ``test_`` prefix so unittest will not collect it.
    # Confirm the intended receiver/name before relying on this test.
    tokenized = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
    dictionary = Dictionary(tokenized)
    corpus = [dictionary.doc2bow(document) for document in tokenized]

    # checking symmetry and the existence of ones on the diagonal
    similarity_matrix = self.similarity_matrix(corpus, dictionary).todense()
    self.assertTrue((similarity_matrix.T == similarity_matrix).all())
    # Compare the diagonal against 1, not against the whole matrix.
    self.assertTrue((np.diag(similarity_matrix) == 1).all())

    # checking that thresholding works as expected
    similarity_matrix = self.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
    self.assertEqual(18, np.sum(similarity_matrix == 0))

    # checking that exponent works as expected
    similarity_matrix = self.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
    self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix))

    # checking that nonzero_limit works as expected
    similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
    self.assertEqual(4, np.sum(similarity_matrix == 0))

    similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
    self.assertEqual(20, np.sum(similarity_matrix == 0))
def _generate_vocabulary(self):
    """Build a vocabulary from every question and answer in the database.

    Questions contribute their title and, when present, their content;
    answers contribute their content. Progress is logged every
    ``self.print_per`` rows.

    :return: the populated ``Dictionary``.
    """
    vocab = Dictionary()
    session = DBSession()

    try:
        questions = session.query(Question).yield_per(self.yield_per)
        for i, question in enumerate(questions, start=1):
            if i % self.print_per == 0:
                # len(vocab) is the number of distinct tokens; num_docs would
                # report processed documents, contradicting the message.
                logger.info('Processed %d / %d questions :: %d unique tokens',
                            i, self.n_questions, len(vocab))

            if question.content is not None:
                strings = [question.title, question.content]
            else:
                strings = [question.title]
            vocab.add_documents([CorpusDictionary.tokenize(s) for s in strings])

        answers = session.query(Answer).yield_per(self.yield_per)
        for i, answer in enumerate(answers, start=1):
            if i % self.print_per == 0:
                logger.info('Processed %d / %d answers :: %d unique tokens',
                            i, self.n_answers, len(vocab))
            vocab.add_documents([CorpusDictionary.tokenize(answer.content)])

        session.commit()
    finally:
        # Always release the session, even if a query or tokenization fails.
        session.close()

    return vocab
def main():
    """Build a dictionary for the selected dataset and save it as text.

    Command-line options: ``--dataset`` (``newsgroups`` or ``ndt``),
    ``--dataset-path`` and ``--output``. Exits with status 1 when a required
    option is missing or the dataset name is unknown.
    """
    parser = ArgumentParser()
    parser.add_argument('-d', '--dataset')
    parser.add_argument('-p', '--dataset-path', default=default_dataset_path())
    parser.add_argument('-o', '--output')
    opts = parser.parse_args()

    dataset_name = opts.dataset
    dataset_path = opts.dataset_path
    out_fn = opts.output

    if not out_fn:
        logging.error('--output argument required ...')
        parser.print_usage()
        sys.exit(1)

    if not dataset_name:
        logging.error('--dataset argument required ...')
        parser.print_usage()
        sys.exit(1)

    # These branches must form a single if/elif/else chain: with two separate
    # ``if`` statements the 'newsgroups' case fell through to the final
    # ``else`` and was rejected as unknown.
    if dataset_name == 'newsgroups':
        archive = download_file(newsgroups.NEWSGROUPS_ARCHIVE_URL, dataset_path)
        corpus = (preprocess_ng(doc) for doc in newsgroups.iterator(archive))
    elif dataset_name == 'ndt':
        dataset = NDTDataset(dataset_path=dataset_path)
        dataset.install()
        corpus = (preprocess_ndt(doc) for doc in dataset)
    else:
        logging.error('Unknown dataset %s ...' % dataset_name)
        sys.exit(1)

    d = Dictionary(corpus)
    d.save_as_text(out_fn, sort_by_word=False)
def process_text(corpus, stoplist=None, bigrams=None, trigrams=None, keep_all=False, no_below=10, no_above=0.8):
    """Clean and tokenize a corpus, optionally detect phrases, and build a BOW model.

    :param corpus: raw corpus, passed unchanged to ``clean_and_tokenize``
    :param stoplist: forwarded to ``clean_and_tokenize``
    :param bigrams: phrase threshold for bigram detection; ``None`` disables it
    :param trigrams: phrase threshold for trigram detection; only consulted
        when ``bigrams`` is ``None``
    :param keep_all: forwarded to ``clean_and_tokenize``
    :param no_below: passed to ``Dictionary.filter_extremes`` and used as
        ``min_count`` for bigram detection
    :param no_above: passed to ``Dictionary.filter_extremes``
    :return: processed corpus with phrases, dictionary and BOW corpus
    """
    logging.info("Cleaned and tokenzed dataset")
    text_dataset = clean_and_tokenize(corpus, stoplist=stoplist, keep_all=keep_all)

    if bigrams is not None:
        bi_grams = Phrases(text_dataset, threshold=bigrams, min_count=no_below)
        text_dataset = bi_grams[text_dataset]
    elif trigrams is not None:
        # ``bigrams`` is always None in this branch, so forwarding it as the
        # threshold made Phrases fail; fall back to the library default for
        # the intermediate bigram model instead.
        # NOTE(review): when both thresholds are given only bigrams are
        # applied - confirm that precedence is intended.
        bi_grams = Phrases(text_dataset)
        tri_grams = Phrases(bi_grams[text_dataset], threshold=trigrams)
        text_dataset = tri_grams[bi_grams[text_dataset]]

    dictionary = Dictionary(text_dataset)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    bow_corpus = [dictionary.doc2bow(text) for text in text_dataset]

    return text_dataset, dictionary, bow_corpus
class LDA(Step):
    """Step that fits an LDA model on a token file and maps documents to topic vectors."""

    def __init__(self, num_topics):
        self._n_topics = num_topics
        self._dictionary = None
        self._model = None

    def fit(self, filename):
        """Build the dictionary and train the LDA model from the documents in ``filename``."""
        token_lists = []
        for _, tokens in Reader(filename):
            token_lists.append(tokens)

        self._dictionary = Dictionary(token_lists)
        bows = [self._dictionary.doc2bow(tokens) for tokens in token_lists]
        self._model = LdaModel(bows, num_topics=self._n_topics)

    def transform(self, filename):
        """Return ``(uuids, vectors)`` for the documents in ``filename``."""
        return self._transform(filename)

    def _transform(self, filename):
        """Compute one dense topic vector of length ``num_topics`` per document."""
        uuids = []
        rows = []
        for uuid, tokens in Reader(filename):
            bow = self._dictionary.doc2bow(tokens)
            # The model yields (topic, probability) pairs; topics that are
            # absent from the pairs are filled with 0.
            topic_probs = dict(self._model[bow])
            rows.append([topic_probs.get(topic, 0) for topic in range(self._n_topics)])
            uuids.append(uuid)
        return uuids, np.array(rows)

    @classmethod
    def _read(cls, filename):
        """Yield each document of ``filename`` as a single space-joined string."""
        for _, tokens in Reader(filename):
            yield ' '.join(tokens)
def test_saveAsText(self):
    """A `Dictionary` can be written out as a text file in either sort order."""
    def read_lines(path):
        # Load the serialized dictionary back as a list of unicode lines.
        with codecs.open(path, 'r', encoding='utf-8') as handle:
            return handle.readlines()

    def check(lines, expected_tails):
        self.assertEqual(lines[0], u"3\n")
        self.assertEqual(len(lines), 4)
        # We do not know which word will have which index, so the leading
        # character of every entry line is skipped before comparing.
        for line, tail in zip(lines[1:4], expected_tails):
            self.assertEqual(line[1:], tail)

    tmpf = get_tmpfile('save_dict_test.txt')
    small_text = [
        ["prvé", "slovo"],
        ["slovo", "druhé"],
        ["druhé", "slovo"],
    ]

    d = Dictionary(small_text)

    d.save_as_text(tmpf)
    check(read_lines(tmpf), (u"\tdruhé\t2\n", u"\tprvé\t1\n", u"\tslovo\t3\n"))

    d.save_as_text(tmpf, sort_by_word=False)
    check(read_lines(tmpf), (u"\tslovo\t3\n", u"\tdruhé\t2\n", u"\tprvé\t1\n"))
def test_from_corpus(self):
    """A `Dictionary` rebuilt from a BOW corpus keeps ids and statistics."""
    documents = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    stoplist = set('for a of the and to in'.split())

    texts = []
    for document in documents:
        texts.append([word for word in document.lower().split() if word not in stoplist])

    # Drop words that occur exactly once in the whole collection.
    all_tokens = [word for text in texts for word in text]
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once] for text in texts]

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    dictionary_from_corpus = Dictionary.from_corpus(corpus)

    # Only the ids can be compared: the words themselves are not recoverable
    # from a bag-of-words corpus.
    original_ids = sorted(dictionary.token2id.values())
    rebuilt_ids = sorted(dictionary_from_corpus.token2id.values())
    self.assertEqual(original_ids, rebuilt_ids)

    for attribute in ('dfs', 'num_docs', 'num_pos', 'num_nnz'):
        self.assertEqual(getattr(dictionary, attribute),
                         getattr(dictionary_from_corpus, attribute))
def load_data(fname):
    """Load a labelled text file and vectorize it as dense bag-of-words rows.

    Each non-blank line of ``fname`` is expected to be ``<label> <text>``,
    separated by the first space. A line without any text after the label
    still raises ``IndexError``.

    :param fname: path of the input file
    :return: dict with ``'target'`` (array of single-element label lists) and
        ``'source'`` (array with one dense BOW row per document)
    """
    # Single-argument form prints the same text under Python 2 and Python 3.
    print('input file name: %s' % fname)

    target = []         # labels
    source = []         # document vectors
    document_list = []  # one token list per document

    # Use a context manager so the file handle is closed deterministically.
    with open(fname, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines carry neither label nor text; skip them.
                continue
            sample = stripped.split(' ', 1)
            target.append([sample[0]])
            # preprocess_string tokenizes the text (stop-word removal and
            # stemming, per the original author's note).
            document_list.append(preprocess_string(sample[1]))

    # Build the dictionary, dropping very rare and very frequent words.
    dct = Dictionary(document_list)
    dct.filter_extremes(no_below=3, no_above=0.6)

    # Vectorize every document as a dense bag-of-words row.
    num_terms = len(dct)
    for doc in document_list:
        bow = dct.doc2bow(doc)  # e.g. [(4, 1), (23, 1), ..., (119, 2)]
        dense = list(matutils.corpus2dense([bow], num_terms=num_terms).T[0])
        source.append(dense)

    dataset = {}
    dataset['target'] = np.array(target)
    dataset['source'] = np.array(source)

    return dataset
def test_corpus_summarization(self):
    """Every document selected by ``summarize_corpus`` appears in the reference summary."""
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # Generate the corpus.
    sentences = text.split("\n")
    tokens = [sentence.split() for sentence in sentences]
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    # Extract the most important documents.
    selected_documents = summarize_corpus(corpus)

    # They are compared to the method reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.summ.txt"), mode="r") as f:
        summary = f.read()
        summary = summary.split('\n')

    # Each sentence in the document selection has to be in the model summary.
    for document in selected_documents:
        # Retrieves all words from the document.
        words = [dictionary[token_id] for (token_id, count) in document]

        # Asserts that all of them are in a sentence from the model reference.
        # The closing parenthesis of any() must enclose the ``for sentence``
        # clause; otherwise assertTrue receives a generator object, which is
        # always truthy, and the check can never fail.
        self.assertTrue(any(all(word in sentence for word in words) for sentence in summary))
def read_project_data(mtc,csc, fname):
    """Compare the vocabularies of two Mallet corpora and append a summary to a report.

    ``mtc`` and ``csc`` are corpus paths; each must have a sibling
    ``<path>.dict`` dictionary file.

    NOTE: the ``fname`` argument is overwritten below, so the report is always
    appended to "common_words_comparison.txt".
    """
    mtc_dictionary = Dictionary.load(mtc + ".dict")
    csc_dictionary = Dictionary.load(csc + ".dict")

    multi_text_corpus = MalletCorpus(mtc, mtc_dictionary)
    changeset_corpus = MalletCorpus(csc, csc_dictionary)

    mtc_words = set(mtc_dictionary.values())
    csc_words = set(csc_dictionary.values())

    common = mtc_words & csc_words
    vocabulary_sizes = (len(mtc_words), len(csc_words))

    # Words that occur only in the first vocabulary.
    print(mtc_words - common)

    fname = "common_words_comparison.txt"
    with open(fname, 'a') as report:
        project = mtc.split("-")[0]
        report.write(str(project) + "\n")
        report.write("length of MultiTextCorpus: " + str(len(multi_text_corpus)) + "\n")
        report.write("length of ChangesetCorpus: " + str(len(changeset_corpus)) + "\n" + "\n")
        report.write("(MTC,CSC) in common" + "\n")
        report.write(str(vocabulary_sizes) + " " + str(len(common)))
        report.write('\n' + '\n')
class TermFrequency(object):
    """
    Computes a term frequency distance_matrix
    """

    def __init__(self, documents):
        logging.info("Creating Term Frequency")
        self.id2Word = Dictionary(documents)
        self.num_unique_words = len(self.id2Word)
        self.distance_matrix = self.to_term_frequency_matrix(documents)

    def to_term_frequency_vector(self, document):
        """Return the sparse bag-of-words of a single tokenized document."""
        return self.id2Word.doc2bow(document)

    def to_binary_vector(self, document):
        """Return a dense 0/1 integer array marking which known words occur in ``document``."""
        bow = self.id2Word.doc2bow(document)
        dense = sparse2full(bow, len(self.id2Word.keys()))
        # Converts counts to presence flags.
        return np.array(dense > 0, dtype=int)

    def to_term_frequency_matrix(self, documents):
        """Return one sparse bag-of-words per document."""
        return [self.to_term_frequency_vector(document) for document in documents]

    def binary_matrix(self):
        """
        Turns a regular tf distance_matrix into a binary distance_matrix
        """
        full_matrix = MatrixHelper.gensim_to_python_mdarray(self.distance_matrix, self.num_unique_words)
        binary_rows = []
        for row in full_matrix:
            # Non-positive cells become 0, everything else becomes 1.
            binary_rows.append([0 if cell <= 0 else 1 for cell in row])
        return binary_rows
def main():
    """Train an LSI model on a sublexicalized Wikipedia dump.

    Command-line options: ``--corpus-file`` and ``--model-file`` (the process
    exits when either is missing), ``--parse-procs``, ``--sublexicalize-procs``,
    ``--tfidf-model`` and ``--vocabulary``. The vocabulary and TF-IDF model are
    loaded from their files when those exist; otherwise they are built and,
    if a file name was given, saved.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    parser = OptionParser()
    parser.add_option('-f', '--corpus-file')
    parser.add_option('-p', '--parse-procs', default=1, type=int)
    parser.add_option('-s', '--sublexicalize-procs', default=1, type=int)
    parser.add_option('-t', '--tfidf-model')
    parser.add_option('-v', '--vocabulary')
    parser.add_option('-m', '--model-file')
    opts, args = parser.parse_args()

    corpus_fn = opts.corpus_file or sys.exit()
    n_proc_parse = opts.parse_procs
    n_proc_sublex = opts.sublexicalize_procs
    vocab_fn = opts.vocabulary
    tfidf_fn = opts.tfidf_model
    model_fn = opts.model_file or sys.exit()

    # The archive is opened only so that a missing/unreadable file fails
    # here; the handle itself is never read (WikiCorpus gets the path).
    with BZ2File(corpus_fn):
        corpus = SublexicalizedCorpus(WikiCorpus(corpus_fn, processes=n_proc_parse, dictionary=Dictionary()),
                                      order=(3, 6), clean_func=normalize_whitespace, n_proc=n_proc_sublex,
                                      create_dictionary=False)

    if vocab_fn and os.path.exists(vocab_fn):
        logging.info("Loading vocabulary from %s" % vocab_fn)
        vocab = Dictionary.load(vocab_fn)
    else:
        logging.info("Creating vocabulary")

        # time.clock() was removed in Python 3.8; time.time() gives the
        # elapsed wall-clock seconds on every Python version.
        start = time.time()
        vocab = Dictionary(corpus.get_texts())
        end = time.time()

        logging.info("Vocabulary created in %d seconds" % (end - start))

        if vocab_fn:
            logging.info("Saving dictionary to %s" % vocab_fn)
            vocab.save(vocab_fn)

    corpus.dictionary = vocab
    corpus.dictionary.filter_extremes(no_below=5, no_above=.8)
    corpus.dictionary.compactify()

    if tfidf_fn and os.path.exists(tfidf_fn):
        logging.info("Reading TF-IDF model from %s" % tfidf_fn)
        tfidf = TfidfModel.load(tfidf_fn)
    else:
        logging.info("creating TF-IDF model")
        tfidf = TfidfModel(corpus)

        if tfidf_fn:
            logging.info("Saving TFF-IDF model to %s" % tfidf_fn)
            tfidf.save(tfidf_fn)

    # Lazily re-weight each article so the whole corpus is never held in memory.
    bow_corpus = (tfidf[art] for art in corpus)

    model = LsiModel(corpus=bow_corpus, num_topics=10, id2word=corpus.dictionary)
    model.save(model_fn)
def main():
    """Train an LDA model on the NIPS line corpora for 2010-2014 and print its topics.

    Reads ``data/nips-<year>.dat`` for each year, preprocesses the documents
    with ``proc_corpus``, and fits a 10-topic model over all of them.
    """
    collection_name = "nips"
    years = range(2010, 2015)  # 2010 .. 2014
    n_topics = 10

    documents = []
    for year in years:
        path = "data/{}-{}.dat".format(collection_name, year)
        corpus = list(load_line_corpus(path))
        documents.extend(proc_corpus(corpus))

    dictionary = Dictionary(documents)
    all_corpus = [dictionary.doc2bow(doc) for doc in documents]

    # The leftover pdb.set_trace() breakpoint was removed so the script runs
    # unattended.
    model = LdaModel(all_corpus, num_topics=n_topics,
                     id2word=dictionary,
                     eval_every=10, passes=100)
    # Single-argument print() behaves identically on Python 2 and 3.
    print(model.show_topics())
def evaluate_log(context, config):
    """Compare commit-log and changeset topic rankings and append the result to a CSV.

    For every document id present in both corpora, finds the smallest rank
    ``max(i, j)`` at which a topic appears in both topic lists. The mean of
    those ranks and the individual values are appended as one row to
    ``data/evaluate-log-results.csv``.

    :param context: not used by this function
    :param config: provides ``project.name``, ``model_fname`` and
        ``corpus_fname`` (the latter two are ``%``-format templates)
    """
    logger.info('Evalutating models for: %s' % config.project.name)

    model_fname = config.model_fname % ChangesetCorpus.__name__
    changeset_fname = config.corpus_fname % ChangesetCorpus.__name__
    commit_fname = config.corpus_fname % CommitLogCorpus.__name__

    # ``except Exception`` instead of a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    # NOTE(review): the code below assumes error() does not return - confirm.
    try:
        commit_id2word = Dictionary.load(commit_fname + '.dict')
        commit_corpus = MalletCorpus(commit_fname,
                                     id2word=commit_id2word)
        changeset_id2word = Dictionary.load(changeset_fname + '.dict')
        changeset_corpus = MalletCorpus(changeset_fname,
                                        id2word=changeset_id2word)
    except Exception:
        error('Corpora not built yet -- cannot evaluate')

    try:
        model = LdaModel.load(model_fname)
        logger.info('Opened previously created model at file %s' % model_fname)
    except Exception:
        error('Cannot evalutate LDA models not built yet!')

    changeset_doc_topic = get_doc_topic(changeset_corpus, model)
    commit_doc_topic = get_doc_topic(commit_corpus, model)

    # Sentinel meaning "no shared topic found".
    maximum = 101

    first_shared = dict()
    for id_ in commit_doc_topic:
        commit_topics = [topic[0] for topic in commit_doc_topic[id_]]
        try:
            changeset_topics = [topic[0] for topic in changeset_doc_topic[id_]]
        except Exception:
            # No changeset entry for this id: nothing to compare.
            continue

        minimum = maximum
        for i, topic in enumerate(commit_topics):
            if topic in changeset_topics:
                j = changeset_topics.index(topic)
                minimum = min(minimum, max(i, j))

        for i, topic in enumerate(changeset_topics):
            if topic in commit_topics:
                j = commit_topics.index(topic)
                minimum = min(minimum, max(i, j))

        if minimum == maximum:
            logger.info('No common topics found for %s' % str(id_))
        else:
            first_shared[id_] = minimum

    if not first_shared:
        # Avoid a ZeroDivisionError when no document shares a topic.
        logger.info('No common topics found for any document -- nothing to record')
        return

    mean = sum(first_shared.values()) / len(first_shared)

    with open('data/evaluate-log-results.csv', 'a') as f:
        w = csv.writer(f)
        w.writerow([model_fname, mean] + list(first_shared.values()))
def load_dictionary(self, filepath):
    """Build a Dictionary from a file, treating every line as one document.

    The file is opened in binary mode; each line is split on whitespace and
    every token is lower-cased before being added.

    :param filepath: path of the file to read
    :return: the populated ``Dictionary``
    """
    vocabulary = Dictionary()
    with open(filepath, "rb") as handle:
        for raw_line in handle:
            tokens = [token.lower() for token in raw_line.split()]
            vocabulary.add_documents([tokens])
    return vocabulary
def bag_of_words(lemma):
    """Build a dictionary from lemmatised documents and return their bags of words.

    Side effect: the dictionary is saved to ``text.dict`` in the current
    working directory. No TF-IDF weighting is applied here; the result holds
    whatever ``doc2bow`` returns for each document.

    :param lemma: iterable of tokenized (lemmatised) documents
    :return: ``(bow, dictionary)``
    """
    dictionary = Dictionary(lemma)
    dictionary.save('text.dict')

    bow = []
    for tokens in lemma:
        bow.append(dictionary.doc2bow(tokens))

    return (bow, dictionary)
def save_dictionary(dic: corpora.Dictionary, filename: str) -> None:
    """Save ``dic`` to ``filename`` and print how many items it holds."""
    dic.save(filename)
    item_count = len(dic.values())
    message = "saved dictionary: {} items to {}".format(item_count, filename)
    print(message)
def test_run(self, data):
    """Fit a 30-topic LDA model (2 passes) on ``data`` and return the assembled topics.

    :param data: iterable of tokenized documents
    :return: whatever ``self.assemble_topics`` returns for the fitted model
    """
    vocabulary = Dictionary(data)
    # Drop tokens that appear in more than half of the documents.
    vocabulary.filter_extremes(no_above=0.5)

    bags_of_words = []
    for tokens in data:
        bags_of_words.append(vocabulary.doc2bow(tokens))

    # This can take a while to run:
    lda = LdaModel(bags_of_words, id2word=vocabulary, num_topics=30, passes=2)
    return self.assemble_topics(lda)
def to_corpus(documents):
    """Build a dictionary and a bag-of-words corpus in one pass.

    @documents: iterable of documents, each passed as-is to ``doc2bow``
        (the original note describes them as list[list[tuple[str,int]]] -
        TODO confirm the element type against callers)
    @returns Dictionary, Corpus
    """
    d = Dictionary()
    corpus = []
    for doc in documents:
        # allow_update=True lets the dictionary grow while converting.
        corpus.append(d.doc2bow(doc, allow_update=True))
    return d, corpus
def do_ir2(db, param):
    """Cluster project titles by bigram presence and store the clusters in MongoDB.

    A vocabulary is built from the ``_id`` of every document in
    ``cordis.bi_grams``; each project title from ``cordis.projects`` becomes a
    0/1 vector over that vocabulary, the vectors are clustered with KMeans,
    and one document per cluster is inserted into ``g8.ir2``.

    NOTE(review): ``db`` is only echoed in the progress message; the database
    names used below are hard-coded.

    :param db: label printed in the progress message
    :param param: number of clusters (converted with ``int``)
    """
    # %-formatting keeps the output identical on Python 2 and Python 3.
    print('Computazione di IR2 %s %s ...' % (db, param))

    # Build the stop-word set once instead of on every call to words().
    stopwords = set(nltk.corpus.stopwords.words('english'))

    def words(text):
        return [w for w in nltk.word_tokenize(text.lower())
                if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)

    project = {'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=[project])

    n = max(bigrams.keys())

    def title_vector(title):
        # 0/1 vector with a 1 for every known bigram found in the title.
        bow = bigrams.doc2bow([' '.join(pair) for pair in nltk.bigrams(words(title))])
        vector = [0] * (n + 1)
        for bigram_id, _ in bow:
            vector[bigram_id] = 1
        return vector

    # One pass over the projects: keep each reference next to its vector.
    references = []
    dataset = []
    for doc in project_corpus:
        references.append(doc['reference'])
        dataset.append(title_vector(doc['title']))

    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)

    clusters = defaultdict(list)
    for reference, label in zip(references, alg.predict(dataset)):
        # Cast the numpy integer label to a plain int: leaving it as a numpy
        # scalar made Mongo fail with
        # "InvalidDocument: Cannot encode object: 0".
        clusters[int(label)].append(reference)

    mongo_clusters = []
    for k, v in clusters.items():
        mongo_clusters.append({'cluster': k, 'projects': v})

    print(mongo_clusters)

    # Save to the Mongo collection.
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)

    print('Fatto!')
def run(self, data):
    """Fit a 30-topic LDA model (10 passes) on ``data`` and return the assembled topics.

    :param data: iterable of objects exposing ``tokenized_contents``
    :return: whatever ``self.assemble_topics`` returns for the fitted model
    """
    wordlists = []
    for corpus in data:
        wordlists.append(corpus.tokenized_contents)

    vocabulary = Dictionary(wordlists)
    bags_of_words = [vocabulary.doc2bow(tokens) for tokens in wordlists]

    # This can take a while to run:
    lda = LdaModel(bags_of_words, id2word=vocabulary, num_topics=30, passes=10)
    return self.assemble_topics(lda)
def test_low_distinct_words_corpus_summarization_is_empty_list(self):
    """Summarizing the corpus built from ``testlowdistinctwords.txt`` yields an empty list."""
    text = self._get_text_from_test_data("testlowdistinctwords.txt")

    # Generate the corpus: one document per line, tokens split on whitespace.
    tokens = []
    for sentence in text.split("\n"):
        tokens.append(sentence.split())

    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens]

    summary = summarize_corpus(corpus)
    self.assertEqual(summary, [])
def test_saveAsText_and_loadFromText(self):
    """A `Dictionary` written as text can be read back with the same token ids."""
    tmpf = get_tmpfile('dict_test.txt')

    # Round-trip once for each sort order.
    for sort_by_word in (True, False):
        original = Dictionary(self.texts)
        original.save_as_text(tmpf, sort_by_word=sort_by_word)
        self.assertTrue(os.path.exists(tmpf))

        restored = Dictionary.load_from_text(tmpf)
        self.assertNotEqual(restored, None)
        self.assertEqual(restored.token2id, original.token2id)
def setUp(self): texts = [[u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],[u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],[u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],[ u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', 
u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],[u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', 
u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],[u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', 
u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],[u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', 
u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], [u'technical', u'solarcity', 
u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],[u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', 
u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', 
u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'], ['bank','river','shore','water'],['river','water','flow','fast','tree'],['bank','water','fall','flow'],['bank','bank','water','rain','river'], ['river','water','mud','tree'],['money','transaction','bank','finance'], ['bank','borrow','money'], ['bank','finance'], ['finance','money','sell','bank'],['borrow','sell'],['bank','loan','sell']] # initializing using own LDA sufficient statistics so that we get same results each time. 
sstats = numpy.loadtxt(datapath('sstats_test.txt')) dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats)
def create_dict(self, corpus_file):
    """Build a ``Dictionary`` from a text file holding one document per line.

    Args:
        corpus_file: Path of the corpus file. It is opened in binary mode and
            each line is split on whitespace into the tokens of one document.

    Returns:
        The ``Dictionary`` updated with one document per line of the file.
    """
    dictionary = Dictionary()
    with open(corpus_file, "rb") as infile:
        # Iterate the file lazily instead of loading every line up front.
        for line in infile:
            # add_documents() takes a list of documents, each one a list of
            # tokens, so the line has to be split before it is handed over.
            dictionary.add_documents([line.split()])
    return dictionary
def test_saveAsText_and_loadFromText(self):
    """`Dictionary` can be saved as textfile and loaded again from textfile."""
    tmpf = get_tmpfile('dict_test.txt')
    d = Dictionary(self.texts)
    d.save_as_text(tmpf)
    # the file must exist once the dictionary has been saved
    self.assertTrue(os.path.exists(tmpf))

    # Load from the very path that was written; asking get_tmpfile() for a
    # path a second time only works if that helper is deterministic.
    d_loaded = Dictionary.load_from_text(tmpf)
    self.assertIsNotNone(d_loaded)
    self.assertEqual(d_loaded.token2id, d.token2id)
def test_doc2bow(self):
    """Check ``doc2bow`` with a token list, a bare string and an escaped unicode token."""
    token = "žluťoučký"
    d = Dictionary([[token], [token]])
    expected_bow = [(0, 1)]

    # pass a utf8 string
    self.assertEqual(d.doc2bow([token]), expected_bow)

    # doc2bow must raise a TypeError if passed a string instead of array of strings by accident
    with self.assertRaises(TypeError):
        d.doc2bow(token)

    # unicode must be converted to utf8
    self.assertEqual(d.doc2bow([u'\u017elu\u0165ou\u010dk\xfd']), expected_bow)
def testMallet2ModelOn20NewsGroups(self):
    """Topics of a Mallet LDA model and of its gensim conversion must be equal."""
    tokenized_docs = [simple_preprocess(doc["data"]) for doc in api.load("20-newsgroups")]
    dictionary = Dictionary(tokenized_docs)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    lda_mallet_model = ldamallet.LdaMallet(
        self.mallet_path,
        corpus=corpus,
        num_topics=20,
        id2word=dictionary,
        iterations=500,
    )
    lda_gensim_model = ldamallet.malletmodel2ldamodel(lda_mallet_model, iterations=1000)

    # Compare all 20 topics with 50 words each.
    mallet_topics = lda_mallet_model.show_topics(20, 50)
    gensim_topics = lda_gensim_model.show_topics(20, 50)
    self.assertEqual(mallet_topics, gensim_topics)
# Directory that holds the LTP model files.
LTP_DATA_DIR = 'E:/Program Files/workspace/ltp_data_v3.4.0'
# Word-segmentation model; the model file is named `cws.model`.
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
# Part-of-speech tagging model; the model file is named `pos.model`.
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')

# Segmenter: load the model; the second argument is the path of the external lexicon file.
segmentor = Segmentor()
segmentor.load_with_lexicon(cws_model_path, LTP_DATA_DIR + '/user_dict.txt')

# POS tagger: create the instance and load its model.
postagger = Postagger()
postagger.load(pos_model_path)

# Collect the content of every input file.
combain_comtent = []
for file in file_list:
    combain_comtent.append(get_content(file))
segmentor.release()  # release the segmentation model

# Build the dictionary, the bag-of-words corpus and the LDA model.
dictionary = Dictionary(combain_comtent)
corpus = [dictionary.doc2bow(text) for text in combain_comtent]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=31)

# Save the dictionary.
dictionary.save_as_text(write_path + "dictionary.txt")
# Save the LDA model.
lda.save(write_path + "model")

# Print the first element of each of the 31 topic entries.
for file in lda.print_topics(31):
    print(file[0])

topic_list = []
for i in lda.get_document_topics(corpus):
    listj = []
def out_corp_dic(self, text):
    """Build a dictionary and the matching bag-of-words corpus.

    Args:
        text: Collection of documents; it is handed to ``Dictionary`` and then
            iterated a second time to convert each document with ``doc2bow``.

    Returns:
        A dict with the keys ``"dictionary"`` and ``"corpus"``.
    """
    dictionary = Dictionary(text)
    corpus = list(map(dictionary.doc2bow, text))
    return dict(dictionary=dictionary, corpus=corpus)
print(string_similar('安定区妇幼保健站', '定西市安定区妇幼保健站'))
print(string_similar('柬埔寨特大新闻', '柬埔寨新闻'))

from jieba import lcut
from gensim.similarities import SparseMatrixSimilarity
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# Text collection and search keyword.
texts = [
    '吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
    '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
    '我吃鸡翅,你吃鸡腿',
]
keyword = '玩过吃鸡?今晚一起吃鸡'

# 1. Turn the text collection into lists of segmented words.
texts = list(map(lcut, texts))

# 2. Build the dictionary from the texts and take its size as the feature count.
dictionary = Dictionary(texts)
num_features = len(dictionary.token2id)

# 3.1 Convert the token lists into sparse vectors (the corpus).
corpus = list(map(dictionary.doc2bow, texts))
# 3.2 Convert the search keyword into a sparse vector with the same dictionary.
kw_vector = dictionary.doc2bow(lcut(keyword))

# 4. Create the TF-IDF model, trained on the corpus.
tfidf = TfidfModel(corpus)

# 5. Apply the trained TF-IDF model to the searched texts and to the keyword.
tf_texts = tfidf[corpus]  # the corpus doubles as the set of searched texts
tf_kw = tfidf[kw_vector]

# 6. Compute the similarities.
sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
similarities = sparse_matrix.get_similarities(tf_kw)
for e, s in enumerate(similarities, 1):
    print('kw 与 text%d 相似度为:%.2f' % (e, s))
assuming you want to use a local copy of the corpus, just read: /tutorials/_0/pr_wsj_ft.json ''' ''' assuming the corpus of text is stored in a Mongo DB ''' ## --+ open pipeline #client = MongoClient() ## --+ pick-up db #db = client.digitalTechs ## --+ load the data #df = pd.DataFrame(list(db.press_releases.find())) # dictionary (saved in tutorial _0) in_f = os.path.join('transformation', '.data', 'pr_dictionary.dict') dictionary = Dictionary.load(in_f) # corpus (saved in tutorial _0) in_f = os.path.join('transformation', '.data', 'pr_corpus.mm') corpus = MmCorpus(in_f) # docs phrased (saved in tutorial _0) in_f = os.path.join('transformation', '.data', 'pr_docs_phrased.pickle') with open(in_f, 'rb') as pipe: docs_phrased = pickle.load(pipe) # %% clean data with document attributes # basic cleaning # --+ get timespans df.loc[:, 'year'] = df['date'].dt.year
def _build_dictionary(documents: List[List[str]],
                      filter_parameters: Dict[Any, Any]) -> Dictionary:
    """Create a ``Dictionary`` from ``documents`` and filter its extremes.

    Args:
        documents: Tokenized documents used to build the dictionary.
        filter_parameters: Keyword arguments forwarded unchanged to
            ``Dictionary.filter_extremes``.

    Returns:
        The filtered dictionary.
    """
    dictionary = Dictionary(documents)
    dictionary.filter_extremes(**filter_parameters)
    return dictionary
def preprocess_dataset(dataset: pd.DataFrame, extreme_no_below: int,
                       extreme_no_above: float, enable_bigram: bool,
                       min_bigram_count: int, basic_word_analysis: bool,
                       lemmatizing: bool, temporality: str, language: str,
                       path_to_texts_for_embedding: str,
                       split_by_paragraph: bool) -> Dict[str, Any]:
    """Node for preprocessing the UN General Debates dataset.

    Parameters are taken from conf/base/parameters.yml. The data and the
    parameters will be loaded and provided to this function automatically when
    the pipeline is executed and it is time to run this node.

    The function validates the required columns, optionally splits documents
    by paragraph, drops missing values and rows with date errors, sorts by
    timestamp, lower-cases / transliterates / tokenizes the text, optionally
    adds bigrams, removes stopwords, numbers and one-character words,
    optionally lemmatizes, filters the dictionary extremes, drops empty
    documents, computes the time slices and finally writes the cleaned texts
    to ``path_to_texts_for_embedding`` (one document per line).

    Note that ``dataset`` is modified in place by several of these steps.

    Args:
        dataset: Source data. Must have columns named "text" and "timestamp".
            Dataset must be in catalog.yml.
        extreme_no_below: Forwarded as ``no_below`` to
            ``Dictionary.filter_extremes``. The summary printout renders
            values <= 1 as a percentage of documents.
        extreme_no_above: Forwarded as ``no_above`` to
            ``Dictionary.filter_extremes``. The summary printout renders
            values <= 1 as a percentage of documents.
        enable_bigram: Whether bigrams are added to the documents.
        min_bigram_count: Threshold handed to ``add_bigram``.
        basic_word_analysis: Set to True to print how many vocabulary entries
            each preprocessing step removes (this rebuilds a dictionary after
            every step, so it takes more time).
        lemmatizing: Set to True if lemmatizing is wanted.
        temporality: 'year', 'month' or 'week' according to desired time slices.
        language: Source language for the corpus (used for stopword removal).
        path_to_texts_for_embedding: Text file that receives the material for
            fasttext training.
        split_by_paragraph: Set to True if documents need to be split by
            paragraphs.

    Returns:
        A dict with the keys ``dataset_preprocessed``, ``dictionary``,
        ``vocab_size`` and ``date_range``.

    Raises:
        ValueError: If ``dataset`` has no "text" or no "timestamp" column.
    """
    t0 = time()
    print('\n\nCurrent set of parameters :\n')
    print('\textreme_no_below : {}'.format(extreme_no_below))
    print('\textreme_no_above : {}'.format(extreme_no_above))
    print('\tenable_bigram : {}'.format(enable_bigram))
    print('\tmin_bigram_count : {}'.format(min_bigram_count))
    print('\tlemmatizing : {}'.format(lemmatizing))
    print('\ttemporality : {}'.format(temporality))
    print('\tlanguage : {}\n'.format(language))
    print('\nStart preprocessing of dataset')

    if "text" not in dataset.columns:
        raise ValueError(
            'Dataset does not have a column named "text". You must rename the your text column to "text".'
        )
    if "timestamp" not in dataset.columns:
        raise ValueError(
            'Dataset does not have a column named "timestamp". You must rename your time column to "timestamp".'
        )

    if split_by_paragraph:
        print('\nSplitting by paragraphs...')
        # The boolean parameter shadows the module-level helper of the same
        # name, so calling ``split_by_paragraph(...)`` directly would call a
        # bool. Fetch the helper from the module namespace instead.
        split_into_paragraphs = globals()['split_by_paragraph']
        dataset['text'], dataset['timestamp'] = split_into_paragraphs(
            dataset['text'].values, dataset['timestamp'].values)

    dataset['raw_index'] = dataset.index.values
    init_n_obs = dataset.shape[0]
    print('Starting number of observations : {}'.format(init_n_obs))

    # Dropping NaN
    dataset.dropna(subset=['text', 'timestamp'], inplace=True)
    no_na_n_obs = dataset.shape[0]
    print(
        'Number of observations after deleting missing values : {} = {} missing values'
        .format(no_na_n_obs, init_n_obs - no_na_n_obs))

    # Dropping errors on date
    dataset = handle_errors(dataset, no_na_n_obs)
    final_n_obs = dataset.shape[0]
    print(
        'Final number of observations after handling errors on date : {} = {} errors on date'
        .format(final_n_obs, no_na_n_obs - final_n_obs))
    print('Deleted a total of {} observations.'.format(init_n_obs - final_n_obs))

    dataset['timestamp'] = date_conversion(dataset)
    dataset.sort_values('timestamp', inplace=True)
    dataset.reset_index(drop=True, inplace=True)
    dataset['index'] = dataset.index.values

    docs = dataset['text']
    docs = docs.str.lower()
    docs = docs.apply(lambda x: unidecode.unidecode(x))

    print('\nTokenizing...')
    docs = tokenize(docs)

    if basic_word_analysis:
        print(
            '\nBasic word analysis enabled. It will take more time to compute...\n'
        )
        if enable_bigram:
            print('\nAdding bigrams...')
            before_vocab = len(Dictionary(docs))
            docs = add_bigram(docs, min_bigram_count)
            bigram_vocab = len(Dictionary(docs))
            print('\nFound {} bigrams in text\n'.format(bigram_vocab - before_vocab))

        len_starting_vocab = len(Dictionary(docs))
        print('\nBeginning dictionary contains : {} words\n'.format(
            len_starting_vocab))

        print('\nRemoving stopwords...')
        docs = remove_stop_words(docs, language)
        curr_len_vocab = len(Dictionary(docs))
        len_rm_words = len_starting_vocab - curr_len_vocab
        len_vocab = curr_len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\tRemoved {} stopwords from dictionary. It represents {}% of total words in starting vocabulary'
            .format(len_rm_words, freq))
        print('\tCurrent length of the vocabulary:', len_vocab)

        print('\nRemoving unique numbers (not words that contain numbers)...')
        docs = remove_numbers(docs)
        curr_len_vocab = len(Dictionary(docs))
        len_rm_words = len_vocab - curr_len_vocab
        len_vocab = curr_len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\tRemoved {} numeric words from dictionary. It represents {}% of total words in starting vocabulary'
            .format(len_rm_words, freq))
        print('\tCurrent length of the vocabulary:', len_vocab)

        print('\nRemoving words that contain only one character...')
        docs = remove_word_with_length(docs, length=1)
        curr_len_vocab = len(Dictionary(docs))
        len_rm_words = len_vocab - curr_len_vocab
        len_vocab = curr_len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\tRemoved {} one length characters from dictionary. It represents {}% of total words in starting vocabulary'
            .format(len_rm_words, freq))
        print('\tCurrent length of the vocabulary:', len_vocab)

        print('-' * 100)
        len_rm_words = len_starting_vocab - len_vocab
        freq = round(len_rm_words / len_starting_vocab, 3) * 100
        print(
            '\nRemoved {} total words from beginning dictionary. It represents {}% of total words in starting vocabulary\n'
            .format(len_rm_words, freq))
        print('-' * 100)
    else:
        print('\nWord analysis disabled')
        if enable_bigram:
            docs = add_bigram(docs, min_bigram_count)
        print('\nRemoving stopwords...')
        docs = remove_stop_words(docs, language)
        print('\nRemoving unique numbers (not words that contain numbers)...')
        docs = remove_numbers(docs)
        print('\nRemoving words that contain only one character...')
        docs = remove_word_with_length(docs, length=1)

    if lemmatizing:
        print('\nLemmatizing...')
        docs = lemmatize(docs)

    dataset['text'] = docs
    dictionary = Dictionary(dataset['text'])
    bef = len(dictionary)

    print('\nFiltering extremes...')
    dictionary.filter_extremes(no_below=extreme_no_below,
                               no_above=extreme_no_above)

    if basic_word_analysis:
        print('\n')
        print('-' * 100)
        if (extreme_no_above != 1) or (extreme_no_below != 1):
            # Values above 1 are reported as document counts, others as
            # percentages of documents.
            if extreme_no_below > 1:
                extreme_no_below_str = str(
                    extreme_no_below) + ' ' + 'documents'
            else:
                extreme_no_below_str = str(
                    extreme_no_below * 100) + '%' + ' ' + 'documents'
            if extreme_no_above > 1:
                extreme_no_above_str = str(
                    extreme_no_above) + ' ' + 'documents'
            else:
                extreme_no_above_str = str(
                    extreme_no_above * 100) + '%' + ' ' + 'documents'
            print(
                '\nKeeping words in no less than {} & in no more than {}:'.format(
                    extreme_no_below_str, extreme_no_above_str))
            print(
                'Number of unique tokens reduced from {} to {}, representing {} % of total vocabulary.'
                .format(bef, len(dictionary),
                        np.round(((bef - len(dictionary)) / bef) * 100, 3)))

    # Keep only tokens that survived the filtering. Membership is tested on
    # the token2id mapping itself rather than on a list rebuilt per token.
    kept_tokens = dictionary.token2id
    dataset['text'] = dataset['text'].apply(
        lambda x: [w for w in x if w in kept_tokens])

    print('\nRemoving words that contain only one character...')
    dataset['text'] = remove_word_with_length(dataset['text'], length=1)

    print('\nDeleting rows that do not contain any text...')
    dataset = remove_empty_docs(dataset)
    print('\tDeleted {} rows because of no text'.format(final_n_obs -
                                                        dataset.shape[0]))
    print('\nNumber of unique tokens: %d' % len(dictionary))
    print('\nNumber of documents: %d \n' % len(dataset))

    print('\nPreprocessing timestamps...')
    # The dataset is sorted by timestamp, so the first and last rows give the
    # covered span; year and month are the first two '-'-separated fields.
    first_stamp = str(dataset['timestamp'].iloc[0]).split('-')
    last_stamp = str(dataset['timestamp'].iloc[-1]).split('-')
    n_years = int(last_stamp[0]) - int(first_stamp[0])
    n_months = int(last_stamp[1]) - int(first_stamp[1])
    dataset, date_range = timestamps_preprocessing(dataset, n_years, n_months,
                                                   temporality)
    date_range = [str(i).split(' ')[0] for i in date_range]
    for ind in range(len(date_range) - 1):
        print('Timeslice {} date range : from {} to {}'.format(
            ind, date_range[ind], date_range[ind + 1]))
    for timeslice, group in dataset.groupby('timeslice'):
        print('Number of observations for timeslice {} : {}'.format(
            timeslice, group.shape[0]))
    print('-' * 100)

    # Join the tokens back into strings and drop documents that became empty.
    dataset['text'] = dataset['text'].apply(lambda x: ' '.join(x))
    good_idx = [
        idx for idx in range(dataset.shape[0])
        if dataset['text'].iloc[idx] != ''
    ]
    dataset = dataset.iloc[good_idx]

    print('\nBuilding file for fasttext training....')
    with open(path_to_texts_for_embedding, 'w') as f:
        f.writelines(text + '\n' for text in dataset['text'])

    print('Final data shape : {}'.format(dataset.shape))
    print('\nDone in {} minutes'.format(int((time() - t0) / 60)))
    return dict(dataset_preprocessed=dataset,
                dictionary=dictionary,
                vocab_size=len(dictionary),
                date_range=date_range)
# needed because sample data files are located in the same folder
module_path = os.path.dirname(__file__)


def datapath(fname):
    """Return the path of ``fname`` inside the ``test_data`` folder of this module."""
    return os.path.join(module_path, 'test_data', fname)


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)

# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


def testfile():
    """Return the path of the file that temporary test data is stored to."""
    tmp_dir = tempfile.gettempdir()
    return os.path.join(tmp_dir, 'gensim_models.tst')


class TestLsiModel(unittest.TestCase):
    """Tests of the LSI model on the bundled ``testcorpus.mm`` corpus."""

    def setUp(self):
        corpus_path = datapath('testcorpus.mm')
        self.corpus = mmcorpus.MmCorpus(corpus_path)

    def testTransform(self):
        # create the transformation model
        model = lsimodel.LsiModel(self.corpus, numTopics=2)
import json
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import matutils


def vectorize(docs, vocab_size):
    '''
    Convert sparse documents into a dense matrix via ``matutils.corpus2dense``.

    docs :: iterable of iterable of (int, number)
    vocab_size :: size of the vocabulary
    '''
    dense = matutils.corpus2dense(docs, vocab_size)
    return dense


if __name__ == '__main__':
    # Load the whitespace-separated test documents.
    with open('finance_news_test.json', encoding='utf-8') as f:
        data = json.load(f)
    data = [doc.split() for doc in data]

    # Load the stored dictionary and TF-IDF model.
    dct = Dictionary.load('news.dict')
    corpus = [dct.doc2bow(doc) for doc in data]
    model = TfidfModel.load('news_tfidf.model')
    vocab_size = len(dct.token2id)

    # Print the dense TF-IDF vector of every document.
    for doc in corpus:
        print(vectorize([model[doc]], vocab_size))
def recommend(self, s_title):
    """Recommend the rows of ``self.df`` whose LDA topic mix is closest to a title.

    The texts in ``self.df['text']`` are tokenized and extended with bigrams,
    an LDA model is trained on them, the per-document topic weights are
    written into ``topic_<i>`` columns of ``self.df``, and every row is ranked
    by its Euclidean distance to the picked row, measured on the picked row's
    strongest topics.

    Args:
        s_title: Value of the ``self.indexer`` column identifying the picked
            row (compared case-insensitively).

    Returns:
        A dict with the keys ``result`` (DataFrame with the ``self.indexer``
        and ``lda`` columns of the closest rows, sorted by distance),
        ``n_recommendations``, ``indexer`` and ``feature_names``.

    Raises:
        IndexError: If no row matches ``s_title``.
    """
    from gensim.corpora import Dictionary
    from gensim.models import LdaMulticore, Phrases

    num_topics = 15
    top_k = 10  # number of strongest topics of the picked row that are compared

    # Lowercase, split into words and drop one-character tokens. Iterating
    # the column directly avoids label-based lookups that only work with a
    # default 0..n-1 index.
    tokenizer = RegexpTokenizer(r'\w+')
    docs = [[token for token in tokenizer.tokenize(text.lower()) if len(token) > 1]
            for text in self.df['text']]

    # Append the detected bigrams (tokens containing '_') to each document.
    bigram = Phrases(docs, min_count=5, threshold=10)
    for doc in docs:
        bigram_tokens = [token for token in bigram[doc] if '_' in token]
        doc.extend(bigram_tokens)

    # Create a dictionary representation of the documents and filter out
    # words that occur in less than 20 documents or in more than 50% of them.
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    # Accessing an item is only there to "load" the id -> token mapping.
    dictionary[0]
    id2word = dictionary.id2token

    print('|- Generating Model... -|')
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             random_state=100,
                             chunksize=100,
                             passes=10,
                             alpha=0.01,
                             eta=0.9,
                             iterations=300)

    # Store the topic weights of every document; topics that are not
    # reported for a document stay at 0.
    topic_columns = ['topic_' + str(i) for i in range(num_topics)]
    for position, index in enumerate(self.df.index):
        for column in topic_columns:
            self.df.at[index, column] = 0
        for topic_id, weight in lda_model.get_document_topics(corpus[position]):
            self.df.at[index, 'topic_' + str(topic_id)] = weight

    # user has watched a title
    pick_row = self.df[self.df[self.indexer].str.lower() == s_title.lower()]
    pick_index = pick_row.index.values[0]

    print('|- Generating Euclidean Distances... -|')
    # The picked vector and its strongest topics are the same for every row,
    # so they are computed once.
    pick_vec = np.array([pick_row.iloc[0][column] for column in topic_columns])
    top_idx = np.argsort(pick_vec)[-top_k:]
    pick_top = pick_vec[top_idx]

    # compute lda distances
    filteredData = self.df.copy()
    for index, row in filteredData.iterrows():
        row_top = np.array([row[column] for column in topic_columns])[top_idx]
        filteredData.at[index, 'lda'] = np.linalg.norm(row_top - pick_top)

    print("|- Complete! Stored recommendation DataFrame under the 'recommendations' key! -|")
    filteredData = filteredData[filteredData.index != pick_index]

    # select the nearest rows
    nearest = filteredData.sort_values(by='lda').head(self.n_recommendations)
    return {
        'result': nearest[[self.indexer, 'lda']].sort_values('lda'),
        'n_recommendations': self.n_recommendations,
        'indexer': self.indexer,
        'feature_names': self.feature_names,
    }
parser.add_argument('-f', '--fileTag', type=str, required=True, dest="fileTag", help='fileTag as prefix for all exported files') args = parser.parse_args() fileTag = args.fileTag collections = args.collections corpora_path = "./corpora/" #### Step 1, build dictionary object #### print("Start to build dictionary object.") dct = Dictionary() # use Timer to print elapsed time with Timer(): for each_collection in collections: print("Reading the corpus for {}".format(each_collection)) file_path = f"{corpora_path}{each_collection}-raw-corpus.tsv" for i, a_tweet in enumerate(TweetRawCorpusStream(file_path)): token_f = [ x for x in a_tweet.tokens_str.split(",") if len(x) > 1 ] dct.add_documents([token_f], prune_at=None) sizeofCorpus = i - 1 print(f"Totally {sizeofCorpus} tweets in {each_collection}.") print("Original size of vocabs: {}".format(len(dct))) # control the vocabulary dct.filter_extremes(no_below=40,
class WikipediaDataSet:
    """Tokenized text extracted from ``<doc ...>...</doc>`` dump files.

    The source directory is expected to contain sub-directories of
    bz2-compressed files. Each sub-directory is tokenized once and the result
    is cached with ``dill`` in ``cache_dir_path``; later runs read the cache.
    """

    # Pre-compiled patterns. ``re.DOTALL`` lets ``.`` span newlines, which is
    # what the former ``(.|\s)`` alternation did, without its backtracking.
    _DOC_HEADER_RE = re.compile(r'<doc.*?>\n', re.DOTALL)
    _DOC_FOOTER_RE = re.compile(r'</doc>')
    _ARTICLE_RE = re.compile(r'<doc.*?</doc>\n', re.DOTALL)
    # Part-of-speech markers (noun, verb, adjective) of the words that are kept.
    _KEPT_POS = ('名詞', '動詞', '形容詞')

    def __init__(self, src_dir_path: str, cache_dir_path: str):
        """Tokenize (or load from cache) everything below ``src_dir_path``.

        Args:
            src_dir_path: Directory whose sub-directories hold the bz2 files.
            cache_dir_path: Directory for the per-sub-directory cache files;
                it is created if missing.
        """
        self.src_dir_path = src_dir_path
        self.cache_dir_path = cache_dir_path
        self.dictionary = Dictionary()
        self.cache_file_paths = []
        tokenizer = MeCab.Tagger('-Ochasen')
        self.load_file(tokenizer)

    @staticmethod
    def tokenize(tokenizer: MeCab.Tagger, text):
        """Return field 2 of every parsed row of ``text`` whose field 3 names a kept part of speech.

        Rows are the tab-separated lines of ``tokenizer.parse(text)``; the
        last two lines of the output are dropped (presumably the end marker
        and a trailing empty string). Rows with fewer than four fields are
        skipped.
        """
        words = []
        for row in tokenizer.parse(text).split('\n')[:-2]:
            fields = row.split('\t')
            if len(fields) < 4:
                # Not a complete row; nothing to classify.
                continue
            if any(pos in fields[3] for pos in WikipediaDataSet._KEPT_POS):
                words.append(fields[2])
        return words

    @staticmethod
    def article_to_words(tokenizer: MeCab.Tagger, article: str):
        """Tokenize each non-empty line between the ``<doc ...>`` header and ``</doc>``.

        Returns:
            A list with one token list per non-empty line of the article body.
        """
        header = WikipediaDataSet._DOC_HEADER_RE.search(article)
        article = article[header.end():]
        footer = WikipediaDataSet._DOC_FOOTER_RE.search(article)
        article = article[:footer.start()]
        return [
            WikipediaDataSet.tokenize(tokenizer, line)
            for line in article.split('\n') if line
        ]

    def load_file(self, tokenizer: MeCab.Tagger):
        """Fill the dictionary from every sub-directory, building caches as needed."""
        os.makedirs(self.cache_dir_path, exist_ok=True)
        for subdir_name in os.listdir(self.src_dir_path):
            subdir_path = os.path.join(self.src_dir_path, subdir_name)
            cache_path = os.path.join(self.cache_dir_path, subdir_name)
            if os.path.exists(cache_path):
                # NOTE(review): dill.load executes arbitrary code from the
                # file; only point cache_dir_path at trusted locations.
                with open(cache_path, 'rb') as cache_file:
                    texts = dill.load(cache_file)
            else:
                texts = []
                for file_name in os.listdir(subdir_path):
                    file_path = os.path.join(subdir_path, file_name)
                    with bz2.open(file_path, 'r') as source:
                        raw_articles = source.read().decode('utf-8')
                    # finditer walks the articles in order without re-slicing
                    # the remaining text after every match.
                    for match in self._ARTICLE_RE.finditer(raw_articles):
                        texts += WikipediaDataSet.article_to_words(
                            tokenizer, match.group())
                with open(cache_path, 'wb') as cache_file:
                    dill.dump(texts, cache_file)
            self.dictionary.add_documents(texts)
            self.cache_file_paths.append(cache_path)

    def get_text(self):
        """Yield every cached token list, with files and texts in random order."""
        for file_path_to_load in np.random.permutation(self.cache_file_paths):
            with open(file_path_to_load, 'rb') as cache_file:
                texts = dill.load(cache_file)
            # Shuffle positions rather than the token lists themselves: the
            # lists have different lengths and must not be turned into an array.
            for position in np.random.permutation(len(texts)):
                yield texts[position]

    def __len__(self) -> int:
        """Return the number of documents added to the dictionary."""
        return self.dictionary.num_docs
""" return os.path.join(tempfile.gettempdir(), suffix) @contextlib.contextmanager def temporary_file(name=""): """create a temporary directory and return a path to "name" in that directory At the end of the context, the directory is removed. The function doesn't create the file. """ # note : when dropping python2.7 support, we can use tempfile.TemporaryDirectory tmp = tempfile.mkdtemp() try: yield os.path.join(tmp, name) finally: shutil.rmtree(tmp, ignore_errors=True) # set up vars used in testing ("Deerwester" from the web tutorial) common_texts = [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] common_dictionary = Dictionary(common_texts) common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
def LDALoad(self):
    """Load the stored LDA model and dictionary onto ``self`` and print the dictionary."""
    lda_path = "fixed_time_window_lda.model"
    dictionary_path = "lda_dictionary.model"
    self.ldamodel = LdaModel.load(lda_path)
    self.dictionary = Dictionary.load(dictionary_path)
    print(self.dictionary)
def term_document_matrix(
        documents, dictionary: Dictionary) -> List[List[Tuple[int, int]]]:
    """Convert every document with ``dictionary.doc2bow``.

    Args:
        documents: Iterable of documents, each passed as-is to ``doc2bow``.
        dictionary: Object providing ``doc2bow``.

    Returns:
        A list holding the ``doc2bow`` result of each document, in order.
    """
    to_bow = dictionary.doc2bow
    return list(map(to_bow, documents))
def train_val_test(dataset: pd.DataFrame, dictionary: Dictionary,
                   test_size: float, val_size: float) -> Dict[str, Any]:
    """Randomly split the documents into train / test / validation bags of words.

    The documents are shuffled with ``np.random.permutation``; the first
    ``trSize`` shuffled documents form the train set, the next ``tsSize`` the
    test set and the remaining ``vaSize`` the validation set. Words missing
    from ``dictionary`` are dropped. The test set is additionally split into
    the first and second half of each document's words.

    Note that ``dataset['text']`` is replaced in place by lists of words.

    Args:
        dataset: Frame with a ``text`` column of space-separated words and a
            ``timeslice`` column.
        dictionary: Mapping whose ``items()`` yields ``(id, word)`` pairs.
        test_size: Fraction of the documents used for the test set.
        val_size: Fraction of the documents used for the validation set.

    Returns:
        A dict with the bag-of-words matrices (``BOW_train``, ``BOW_test``,
        ``BOW_test_h1``, ``BOW_test_h2``, ``BOW_val``), the timestamps frames
        (``timestamps_train``, ``timestamps_test``, ``timestamps_val``),
        ``train_vocab_size``, ``train_num_times`` and the document indices of
        each split (``idx_train``, ``idx_test``, ``idx_val``).
    """
    # Make train val test index
    num_docs = len(dataset)
    vaSize = int(np.floor(val_size * num_docs))
    tsSize = int(np.floor(test_size * num_docs))
    trSize = int(num_docs - vaSize - tsSize)
    idx_permute = np.random.permutation(num_docs).astype(int)

    print('Reading data....')
    # Make sure our text column is of type list
    dataset['text'] = dataset['text'].apply(lambda x: x.split(' '))

    word2id = {w: j for j, w in dictionary.items()}
    id2word = dict(dictionary.items())

    # Remove words not in train_data
    print('Starting vocabulary : {}'.format(len(dictionary)))
    vocab = list(dictionary)
    vocab_size = len(vocab)

    # Contiguous slices of the permutation define the three splits.
    idx_tr = idx_permute[:trSize]
    idx_ts = idx_permute[trSize:trSize + tsSize]
    idx_va = idx_permute[trSize + tsSize:num_docs]

    def encode(doc_indices):
        """Map the words of the selected documents to ids, dropping unknown words."""
        return [[word2id[w] for w in dataset['text'][doc_idx] if w in word2id]
                for doc_idx in doc_indices]

    docs_tr = encode(idx_tr)
    timestamps_tr = pd.DataFrame(dataset['timeslice'][idx_tr])
    docs_ts = encode(idx_ts)
    timestamps_ts = pd.DataFrame(dataset['timeslice'][idx_ts])
    docs_va = encode(idx_va)
    timestamps_va = pd.DataFrame(dataset['timeslice'][idx_va])

    print(
        ' Number of documents in train set : {} [this should be equal to {} and {}]'
        .format(len(docs_tr), trSize, len(timestamps_tr)))
    print(
        ' Number of documents in test set : {} [this should be equal to {} and {}]'
        .format(len(docs_ts), tsSize, len(timestamps_ts)))
    print(
        ' Number of documents in validation set: {} [this should be equal to {} and {}]'
        .format(len(docs_va), vaSize, len(timestamps_va)))

    # Split test set in 2 halves, the first containing the first half of the
    # words in documents, and second part the second half of words in
    # documents. Will be use to gather test completion perplexity.
    print('Splitting test documents in 2 halves...')
    docs_ts_h1 = [[w for i, w in enumerate(doc) if i <= len(doc) / 2.0 - 1]
                  for doc in docs_ts]
    docs_ts_h2 = [[w for i, w in enumerate(doc) if i > len(doc) / 2.0 - 1]
                  for doc in docs_ts]

    print('Creating lists of words...')
    words_tr = create_list_words(docs_tr)
    words_ts = create_list_words(docs_ts)
    words_ts_h1 = create_list_words(docs_ts_h1)
    words_ts_h2 = create_list_words(docs_ts_h2)
    words_va = create_list_words(docs_va)

    print(' Total number of words used in train set : ', len(words_tr))
    print(' Total number of words used in test set : ', len(words_ts))
    print(
        ' Total number of words used in test firt set (first half of documents words): ',
        len(words_ts_h1))
    # This line reports the second half; it used to repeat the label of the
    # first half.
    print(
        ' Total number of words used in test second set (second half of documents words): ',
        len(words_ts_h2))
    print(' Total number of words used in val set : ', len(words_va))

    n_docs_tr = len(docs_tr)
    n_docs_ts = len(docs_ts)
    n_docs_ts_h1 = len(docs_ts_h1)
    n_docs_ts_h2 = len(docs_ts_h2)
    n_docs_va = len(docs_va)

    # Get doc indices
    print('Getting doc indices...')
    doc_indices_tr = create_doc_indices(docs_tr)
    doc_indices_ts = create_doc_indices(docs_ts)
    doc_indices_ts_h1 = create_doc_indices(docs_ts_h1)
    doc_indices_ts_h2 = create_doc_indices(docs_ts_h2)
    doc_indices_va = create_doc_indices(docs_va)

    print('Creating bow representation...')
    bow_tr = create_bow(doc_indices_tr, words_tr, n_docs_tr, vocab_size)
    bow_ts = create_bow(doc_indices_ts, words_ts, n_docs_ts, vocab_size)
    bow_ts_h1 = create_bow(doc_indices_ts_h1, words_ts_h1, n_docs_ts_h1,
                           vocab_size)
    bow_ts_h2 = create_bow(doc_indices_ts_h2, words_ts_h2, n_docs_ts_h2,
                           vocab_size)
    bow_va = create_bow(doc_indices_va, words_va, n_docs_va, vocab_size)

    print(' Train bag of words shape : {}'.format(bow_tr.shape))
    print(' Test bag of words shape : {}'.format(bow_ts.shape))
    print(' Test set 1 bag of words shape : {}'.format(bow_ts_h1.shape))
    print(' Test set 2 bag of words shape : {}'.format(bow_ts_h2.shape))
    print(' Val bag of words shape : {}'.format(bow_va.shape))

    print('\nMost import words in train BOW : \n')
    print(get_most_important_words(bow_tr, id2word))
    print('\nMost import words in val BOW : \n')
    print(get_most_important_words(bow_va, id2word))
    print('\nMost import words in test BOW : \n')
    print(get_most_important_words(bow_ts, id2word))
    print('\nDone splitting data.')

    return dict(BOW_train=bow_tr,
                BOW_test=bow_ts,
                BOW_test_h1=bow_ts_h1,
                BOW_test_h2=bow_ts_h2,
                BOW_val=bow_va,
                timestamps_train=timestamps_tr,
                timestamps_test=timestamps_ts,
                timestamps_val=timestamps_va,
                train_vocab_size=vocab_size,
                train_num_times=len(np.unique(timestamps_tr['timeslice'])),
                idx_train=idx_tr,
                idx_test=idx_ts,
                idx_val=idx_va)
def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
    """Initialise the base class, then build the dictionary and TF-IDF model.

    Args:
        cut_off: Accepted by the signature but not read in this method.
        cleanup_urls: Forwarded unchanged to the parent constructor.
        nltk_tokenizer: Forwarded unchanged to the parent constructor.
    """
    parent_options = {
        "cleanup_urls": cleanup_urls,
        "nltk_tokenizer": nltk_tokenizer,
    }
    super().__init__(**parent_options)

    # NOTE(review): self.corpus is presumably populated by the parent
    # constructor called above -- confirm against the base class.
    vocabulary = Dictionary(self.corpus)
    self.dictionary = vocabulary
    self.tfidf = TfidfModel(dictionary=vocabulary)
for word_document in tqdm(word_documents): sws = [sp.tokenize(word) for word in word_document] sw_documents.append(list(chain.from_iterable(sws))) if os.path.exists(config_dic.get("cache_dir")): print( f"Write Cache data. {os.path.join(config_dic.get('cache_dir'), config_dic.get('train_name') + '.sw_documents')}" ) with open( os.path.join(config_dic.get("cache_dir"), config_dic.get('train_name') + ".sw_documents"), "wb") as f: f.write(cloudpickle.dumps(sw_documents)) print("=========== Build vocabulary ===========") special_token_dict = {PADDING: 0, UNKNOWN: 1} word_dic = Dictionary(word_documents) word_dic.filter_extremes(no_below=10, no_above=1.0) word_dic.patch_with_special_tokens(special_token_dict) sw_dic = Dictionary(sw_documents) sw_dic.filter_extremes(no_below=5, no_above=1.0) sw_dic.patch_with_special_tokens(special_token_dict) char_documents = [[[char for char in word] for word in document] for document in word_documents] # Document数 x 文字数 char_dic = Dictionary(list(chain.from_iterable(char_documents))) char_dic.patch_with_special_tokens(special_token_dict) word_dic.save( os.path.join(config_dic.get("vocab_dir"), f"{config_dic.get('train_name')}.word.dic")) char_dic.save( os.path.join(config_dic.get("vocab_dir"),
class Word2VecWmdRelaxSimilarity(Word2VecSimilarityBase):
    """Compare bugs with ``WMD(...).nearest_neighbors`` over TF-IDF weighted words.

    The class reads ``self.corpus``, ``self.bug_ids``, ``self.w2vmodel`` and
    calls ``self.text_preprocess`` / ``self.get_text``; none of them is defined
    here, so they presumably come from ``Word2VecSimilarityBase`` -- confirm.
    """

    def __init__(self, cut_off=0.2, cleanup_urls=True, nltk_tokenizer=False):
        """Build the gensim dictionary and TF-IDF model from ``self.corpus``.

        Args:
            cut_off: Accepted by the signature but not read anywhere in this class.
            cleanup_urls: Forwarded to the base class constructor.
            nltk_tokenizer: Forwarded to the base class constructor.
        """
        super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)
        self.dictionary = Dictionary(self.corpus)
        self.tfidf = TfidfModel(dictionary=self.dictionary)

    def _indexed_vocabulary(self, *extra_documents):
        """Return ``(indices, words)`` for every embeddable word, sorted by index.

        The candidate words are the union of ``extra_documents`` and every
        document in ``self.corpus``, restricted to words present in
        ``self.w2vmodel.wv``.
        """
        words = [
            word
            for word in set(chain(*extra_documents, *self.corpus))
            if word in self.w2vmodel.wv
        ]
        # NOTE(review): this pairs the i-th doc2bow entry with the i-th element
        # of `words`, i.e. it assumes doc2bow yields exactly one entry per word
        # in the same order. `words` comes from a set, so please confirm that
        # the pairing really lines up (kept as in the original code).
        indices, words = zip(
            *sorted(
                (index, word)
                for (index, _), word in zip(self.dictionary.doc2bow(words), words)
            )
        )
        return indices, words

    def _reindexed_tfidf(self, tokens, indices):
        """Return ``[(position in indices, tf-idf weight)]`` for ``tokens``."""
        weights = dict(self.tfidf[self.dictionary.doc2bow(tokens)])
        return [
            (new_index, weights[dict_index])
            for new_index, dict_index in enumerate(indices)
            if dict_index in weights
        ]

    def get_similar_bugs(self, query):
        """Return ids of corpus bugs ordered by ``nearest_neighbors`` for ``query``.

        Args:
            query: Bug mapping; must contain ``"id"`` and whatever
                ``self.get_text`` reads.

        Returns:
            List of entries of ``self.bug_ids`` as ranked by
            ``WMD.nearest_neighbors``, excluding the query's own id.
        """
        # Capture the id before any derived value is computed. The original
        # code rebound `query` to a list of (index, weight) tuples and then
        # evaluated query["id"] in the final filter, which raised TypeError.
        query_id = query["id"]
        query_tokens = self.text_preprocess(self.get_text(query))

        indices, words = self._indexed_vocabulary(query_tokens)
        query_nbow = self._reindexed_tfidf(query_tokens, indices)
        documents = [
            self._reindexed_tfidf(document, indices) for document in self.corpus
        ]

        embeddings = np.array(
            [self.w2vmodel.wv[word] for word in words], dtype=np.float32
        )
        # Each nbow entry is (None, word positions, weights); documents left
        # without any weighted word are omitted.
        nbow = {
            index: list(chain([None], zip(*document)))
            for index, document in enumerate(documents)
            if document
        }
        nbow["query"] = tuple([None] + list(zip(*query_nbow)))

        distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors("query")
        return [
            self.bug_ids[distance[0]]
            for distance in distances
            if self.bug_ids[distance[0]] != query_id
        ]

    def get_distance(self, query1, query2):
        """Return the distance ``nearest_neighbors`` reports from query1 to query2.

        Only the two queries are placed in the nbow mapping, so the first (and
        only) neighbour of ``"query1"`` is ``"query2"``; its distance value is
        returned.
        """
        tokens1 = self.text_preprocess(self.get_text(query1))
        tokens2 = self.text_preprocess(self.get_text(query2))

        indices, words = self._indexed_vocabulary(tokens1, tokens2)
        nbow1 = self._reindexed_tfidf(tokens1, indices)
        nbow2 = self._reindexed_tfidf(tokens2, indices)

        embeddings = np.array(
            [self.w2vmodel.wv[word] for word in words], dtype=np.float32
        )
        nbow = {
            "query1": tuple([None] + list(zip(*nbow1))),
            "query2": tuple([None] + list(zip(*nbow2))),
        }

        distances = WMD(embeddings, nbow, vocabulary_min=1).nearest_neighbors("query1")
        return distances[0][1]
# Tokenise every document returned by mycol.find(), remember for each word the
# index of a document it occurred in, then fit an HDP model and collect the
# words of its topics.
i=0
for x in mycol.find():
    document=(x['text']).lower()
    temp=process_text(document)
    for word in temp:
        # Later documents overwrite earlier ones, so docmap keeps the last
        # index seen for each word.
        docmap[word]=i
    # NOTE(review): the source line had lost its indentation; the counter is
    # assumed to advance once per document (not once per word) -- confirm.
    i=i+1
    train_text.append(temp)
print("no of entries in train text is %d"%len(train_text))
dictionary = Dictionary(train_text)
corpus = [dictionary.doc2bow(text) for text in train_text]
hdpmodel1 = HdpModel(corpus=corpus, id2word=dictionary)
# `x` is reused here: from now on it holds the (topic, description) pairs.
x=hdpmodel1.show_topics(num_topics=30,num_words=200)
twords={}
for topic,word in x:
    # Strip everything except ASCII letters and spaces from the topic
    # description, leaving only whitespace-separated words.
    twords[str(topic)]=(re.sub('[^A-Za-z ]+', '', word)).split()
stemmer = PorterStemmer() translate_tab = {ord(p): u" " for p in punctuation} def text2tokens(raw_text): """Split the raw_text string into a list of stemmed tokens.""" clean_text = raw_text.lower().translate(translate_tab) tokens = [token.strip() for token in tokenizer.tokenize(clean_text)] tokens = [token for token in tokens if token not in eng_stopwords] #stemmed_tokens = [stemmer.stem(token) for token in tokens] return [token for token in tokens if len(token) > 2] # skip short tokens dataset = [text2tokens(txt) for txt in newsgroups['data'] if len(text2tokens(txt))>0] # convert a documents to list of tokens targets = [newsgroups['target'][i] for i,txt in enumerate(newsgroups['data']) if len(text2tokens(txt))>0] from gensim.corpora import Dictionary dictionary = Dictionary(documents=dataset, prune_at=None) dictionary.filter_extremes(no_below=5, no_above=0.6, keep_n=None) # use Dictionary to remove un-relevant tokens dictionary.compactify() vocab = dictionary.token2id print("Newsgroup loaded") print("Downloading fasttext") fasttext_vectors = gensim.downloader.load('fasttext-wiki-news-subwords-300') print("Fasttext downloaded") embeddings = np.zeros((len(dictionary), 300)) for w,i in dictionary.token2id.items(): try: embeddings[i] = fasttext_vectors.wv[w]
import csv from gensim.corpora import Dictionary from seq2seq.seq2seq import Encoder, Decoder device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") corpus = [] with open('dataset.csv', encoding='utf-8') as fp: reader = csv.reader(fp) for i, row in enumerate(reader): if i == 0: pass corpus.append(row[0].split(' ')) corpus.append(row[1].split(' ')) N = len(corpus) // 2 dct = Dictionary(corpus) word2id = dct.token2id initialize = dct[0] dct_len = len(word2id) word2id.update({"<pad>": dct_len, "<eos>": dct_len + 1}) id2word = {v: k for k, v in word2id.items()} seq_len = 10 def load_dataset(): def load_sent_list(training=True): sent_list = [] with open('dataset.csv', encoding='utf-8') as fp: reader = csv.reader(fp) for i, row in enumerate(reader):
from gensim.corpora import Dictionary from string import punctuation from nltk.corpus import stopwords from nltk import word_tokenize from nltk import TweetTokenizer ############# # Fonctions # ############# # élimine les "stop words" stop_words = stopwords.words('english') + list(punctuation) + [ 'nt', 's' ] # + remove stuff like n't, 's, ... # chargement du dictionnaire et du modèle tf-idf dictionary = Dictionary.load("ressources/dictionary") tfidf_model = TfidfModel.load("ressources/tfidf_model") #identifie si une chaîne de caractères est alphanumérique def is_alpha(string): regex = re.compile('[^a-zA-Z]') return bool(regex.sub('', string)) #identifie si une chaîne de caractères est une URL def tokenize(text): words = word_tokenize(text) words = [ re.sub('[\']', '', w.lower()) for w in words if not re.match('//*/*', w)
class LSISimilarity(BaseSimilarity):
    """Similar-bug lookup through TF-IDF, a 300-topic LSI model and a gensim index."""

    def __init__(self, cleanup_urls=True, nltk_tokenizer=False, confidence_threshold=0.8):
        """Read all bugs, then fit the dictionary, LSI model and similarity index.

        All three arguments are forwarded unchanged to the parent constructor.
        """
        super().__init__(
            cleanup_urls=cleanup_urls,
            nltk_tokenizer=nltk_tokenizer,
            confidence_threshold=confidence_threshold,
        )

        # Every corpus entry is a two-element list: [bug id, preprocessed text].
        self.corpus = []
        for bug in bugzilla.get_bugs():
            tokens = self.text_preprocess(self.get_text(bug))
            self.corpus.append([bug["id"], tokens])

        # Give every word a unique integer id.
        texts = [text for _, text in self.corpus]
        self.dictionary = Dictionary(texts)

        # Bag-of-words form of each bug text.
        bow_corpus = [self.dictionary.doc2bow(text) for _, text in self.corpus]

        # TF-IDF weighting fitted on, and applied to, the same corpus.
        tfidf_model = models.TfidfModel(bow_corpus)
        weighted_corpus = tfidf_model[bow_corpus]

        # Project the weighted corpus onto 300 LSI topics and index the result.
        latent_dims = 300
        self.lsi = models.LsiModel(
            weighted_corpus, id2word=self.dictionary, num_topics=latent_dims
        )
        self.index = similarities.Similarity(
            output_prefix="simdata.shdat",
            corpus=self.lsi[weighted_corpus],
            num_features=latent_dims,
        )

    def search_similar_bugs(self, query, k=10):
        """Return ids of the most similar bugs among the top ``k`` index hits.

        The query's own id is filtered out after taking the top ``k``, so the
        result can contain fewer than ``k`` ids.
        """
        raw_text = "{} {}".format(query["summary"], query["comments"][0]["text"])
        tokens = self.text_preprocess(raw_text)

        # Map the query into the LSI space and score it against the index.
        lsi_vector = self.lsi[self.dictionary.doc2bow(tokens)]
        scores = self.index[lsi_vector]
        ranked = sorted(enumerate(scores), key=lambda pair: -pair[1])

        similar_ids = []
        for position, _score in ranked[:k]:
            bug_id = self.corpus[position][0]
            if bug_id != query["id"]:
                similar_ids.append(bug_id)
        return similar_ids

    def get_distance(self, query1, query2):
        """Not supported by this model; always raises ``NotImplementedError``."""
        raise NotImplementedError
import pyLDAvis.gensim # don't skip this import matplotlib.pyplot as plt %matplotlib inline # Enable logging for gensim - optional import logging logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR) import warnings warnings.filterwarnings("ignore",category=DeprecationWarning) # Load a potentially pretrained model from disk. lda_model = models.LdaModel.load('lda_model') # Load previous dictionary id2word = Dictionary.load_from_text('/Users/hellofutrue/Desktop/Insight/Python/Feb/dictionary') posts_influencers = pd.read_csv('/Users/hellofutrue/Desktop/Insight/Python/Feb/files/posts_influencers.csv') posts_influencers = posts_influencers.rename(index=str, columns={'Unnamed: 0': "people", '0': 'content'}) data = posts_influencers.content.values.tolist() def preprocessing(dat): # Tokenization def sent_to_words(sentences): for sentence in sentences: yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations data_words = list(sent_to_words(dat)) # Build the bigram and trigram models bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases. trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
R_all+=R all_count+=1 P_all/=all_count R_all/=all_count try: F1=2*P_all*R_all/(P_all+R_all) except: F1=0.0 return P_all,R_all,F1 f=open(homedir+"/results/ontology/c2n.json",'r') c2n=json.load(f) f.close() prefix='http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#' ncit_dict=[k.split('#')[1] for k in c2n.keys()] dictionary=Dictionary([ncit_dict]);dictionary[0] model_name="MLPsparse_1hidden" model=get_model_S3(model_name) topic_num=[5,10,20,25,40,50,100,200,250] for tn in topic_num: lda=AuthorTopicModel.load(homedir+"/results/models/lda2000_topic"+str(tn)) threshold=0.0 volume=100 while threshold<1.0: alpha=0.0 while alpha<1.0: P,R,F=test_on_doc_S3_atmodel(lda,model,volume,alpha,threshold) f=open(homedir+"/results/logs/lda_eval_topic"+str(tn),'a')
# Command-line script: reads a sentence file, builds a bag-of-words matrix,
# fits a PCA on it and prints the summed explained-variance ratio.
# Arguments (read from sys.argv below): <data_file> <pca_num>
import sys
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense
from gensim.models import word2vec
from sklearn import decomposition

data_file = sys.argv[1]
pca_num = int(sys.argv[2])

# Keep only sentences with at least two tokens.
sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(s) for s in sentences]
# NOTE(review): corpus2dense presumably returns terms x documents, so the
# transpose would give one row per sentence -- confirm against gensim docs.
x = corpus2dense(corpus, len(dic)).T
# random_state is fixed to 1 for the PCA fit.
pca = decomposition.PCA(n_components = pca_num, random_state = 1)
nx = pca.fit_transform(x)
print(sum(pca.explained_variance_ratio_))
class gensim_data(object):
    """Bag-of-words corpora for mashups and APIs plus a gensim model fitted on them.

    The class reads module-level names that are defined outside of it
    (``dataset``, ``meta_data``, ``Dictionary``, ``HdpModel``, ``TfidfModel``,
    ``LdaModel``).
    """

    def __init__(self, mashup_descriptions, api_descriptions, mashup_categories=None,
                 api_categories=None, tag_times=2, mashup_only=False, strict_train=False):
        """Merge texts with tags, build the dictionary and encode everything as BoW.

        Args:
            mashup_descriptions: One token sequence per mashup.
            api_descriptions: One token sequence per API.
            mashup_categories: Optional tag sequence per mashup; when given and
                ``tag_times > 0`` the tags are appended ``tag_times`` times.
            api_categories: Same as ``mashup_categories`` but for APIs.
            tag_times: How many times the tags are appended to the text.
            mashup_only: Stored; selects the training corpus in ``model_pcs``.
            strict_train: Stored; selects the training corpus in ``model_pcs``.

        NOTE(review): ``self.dct`` is only created when ``mashup_only`` and
        ``strict_train`` are both False or both True; the two mixed
        combinations fail below with AttributeError. ``self.train_api_dow``,
        which ``model_pcs`` reads, is never assigned in this class.
        """
        self.mashup_only = mashup_only
        self.strict_train = strict_train

        # Merge text and tag information: one row per mashup / API.
        self.mashup_dow = self._merge_text_and_tags(
            mashup_descriptions, mashup_categories, tag_times)
        self.api_dow = self._merge_text_and_tags(
            api_descriptions, api_categories, tag_times)

        if not self.mashup_only and not self.strict_train:
            self.dct = Dictionary(self.mashup_dow + self.api_dow)
        if self.mashup_only and self.strict_train:
            # Encode only the mashups listed in dataset.crt_ds.his_mashup_ids.
            self.train_mashup_dow = [
                self.mashup_dow[m_id] for m_id in dataset.crt_ds.his_mashup_ids]
            self.dct = Dictionary(self.train_mashup_dow)
            # (word id, count) pairs
            self.train_mashup_dow = [
                self.dct.doc2bow(mashup_info) for mashup_info in self.train_mashup_dow]

        # Whatever the mode, every mashup / API gets a bag-of-words encoding.
        self.mashup_dow = [self.dct.doc2bow(mashup_info) for mashup_info in self.mashup_dow]
        print('self.mashup_dow, num:', len(self.mashup_dow))
        zero_num = sum(1 for mashup_info in self.mashup_dow if len(mashup_info) == 0)
        print('zero_num', zero_num)
        self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]

        self.num_topics = 0
        self.model = None  # gensim model used to process the texts
        self._mashup_features = None  # dense feature matrix extracted from the texts
        self._api_features = None
        self.mashup_topics = None  # top-N topics of each text
        self.api_topics = None

    @staticmethod
    def _merge_text_and_tags(descriptions, categories, tag_times):
        """Return ``descriptions`` as lists of str, tags appended ``tag_times`` times.

        New lists are built instead of extending ``descriptions[i]`` in place,
        so the caller's inputs are no longer mutated (the original ``+=`` on
        the shared list objects grew them on every construction).
        """
        if tag_times > 0 and categories is not None:
            assert len(descriptions) == len(categories)
            merged = [
                list(description) + list(tags) * tag_times
                for description, tags in zip(descriptions, categories)
            ]
        else:
            merged = descriptions
        return [[str(index) for index in indexes] for indexes in merged]

    @staticmethod
    def _binary_rows(bows, num_rows, dict_size):
        """Return ``(0/1 matrix, per-row tuple of word ids)`` for the first rows of ``bows``."""
        matrix = np.zeros((num_rows, dict_size))
        words_list = []
        for i in range(num_rows):
            # A plain generator also copes with an empty bag-of-words, where
            # the original `ids, _ = zip(*bow)` raised ValueError.
            word_ids = tuple(word_id for word_id, _ in bows[i])
            words_list.append(word_ids)
            for j in word_ids:
                matrix[i][j] = 1.0
        return matrix, words_list

    def get_binary_v(self):
        """Return binary word-occurrence matrices and word-id lists.

        Only whether a word occurs in a text is recorded (counts are dropped).

        Returns:
            ``(mashup_binary_matrix, api_binary_matrix, mashup_words_list,
            api_words_list)``; the matrices have ``meta_data.mashup_num`` /
            ``meta_data.api_num`` rows and ``len(self.dct)`` columns.
        """
        dict_size = len(self.dct)
        mashup_binary_matrix, mashup_words_list = self._binary_rows(
            self.mashup_dow, meta_data.mashup_num, dict_size)
        api_binary_matrix, api_words_list = self._binary_rows(
            self.api_dow, meta_data.api_num, dict_size)
        return mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list

    def model_pcs(self, model_name, LDA_topic_num=None):
        """Fit the requested gensim model and return dense feature matrices.

        Args:
            model_name: ``'HDP'``, ``'TF_IDF'`` or ``'LDA'``.
            LDA_topic_num: Optional ``num_topics`` for the LDA model.

        Returns:
            ``(mashup_features, api_features)`` as arrays with
            ``self.num_topics`` columns.

        Raises:
            ValueError: If ``model_name`` is not one of the supported names.
        """
        # Model output has the sparse form [(0, 0.0322...), (1, 0.0236...)].
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                # NOTE(review): self.train_api_dow is not assigned anywhere in
                # this class, so this branch raises AttributeError as written.
                train_corpus = self.train_mashup_dow + self.train_api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]
        else:
            raise ValueError('wrong gensim_model name!')

        # Run every text through the model (sparse output), then densify.
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]
        print('self.mashup_features, num:', len(self.mashup_features))
        zero_num1 = sum(
            1 for mashup_feature in self.mashup_features if len(mashup_feature) == 0)
        print('zero_num1', zero_num1)
        # Show the bag-of-words of every mashup whose feature came back empty.
        for i in range(len(self.mashup_features)):
            if len(self.mashup_features[i]) == 0:
                print(self.mashup_dow[i])
        self.api_features = [self.model[api_info] for api_info in self.api_dow]

        self._mashup_features = np.zeros((meta_data.mashup_num, self.num_topics))
        self._api_features = np.zeros((meta_data.api_num, self.num_topics))
        # Only some dimensions carry a value; scatter them into the dense rows.
        for i in range(meta_data.mashup_num):
            for index, value in self.mashup_features[i]:
                self._mashup_features[i][index] = value
        for i in range(meta_data.api_num):
            for index, value in self.api_features[i]:
                self._api_features[i][index] = value
        return self._mashup_features, self._api_features

    def _top_topic_indexes(self, feature, topTopicNum):
        """Return up to ``topTopicNum`` topic ids of ``feature``, highest value first."""
        ranked = sorted(feature, key=lambda x: x[1], reverse=True)
        try:
            topic_indexes, _ = zip(*ranked)
        except ValueError:
            # The feature can be empty even when the bag-of-words is not. Fall
            # back to random *topic* ids; the original sampled from
            # range(meta_data.mashup_num) / range(meta_data.api_num), i.e. from
            # item ids rather than topic ids.
            topic_indexes = random.sample(
                range(self.num_topics), min(topTopicNum, self.num_topics))
        num = min(len(topic_indexes), topTopicNum)
        return topic_indexes[:num]

    def get_topTopics(self, topTopicNum=3):
        """Return the ``topTopicNum`` highest-weighted topic ids per mashup and API.

        Returns:
            ``(mashup_topics, api_topics)``, each a list with one sequence of
            topic ids per item. Requires ``model_pcs`` to have been called.
        """
        mashup_topics = [
            self._top_topic_indexes(self.mashup_features[index], topTopicNum)
            for index in range(meta_data.mashup_num)
        ]
        api_topics = [
            self._top_topic_indexes(self.api_features[index], topTopicNum)
            for index in range(meta_data.api_num)
        ]
        return mashup_topics, api_topics
class LDAForEvent:
    """Fuzzy matching of event words against an LDA dictionary.

    The original methods call each other as ``self.method(self, ...)``, which
    only works when the class object itself is passed as ``self``. Internal
    calls now go through ``LDAForEvent.method(self, ...)``, which accepts both
    that calling style and ordinary instances.
    """

    height_weight = 8  # the weight of height in Manhattan distance
    delete_weight = 10  # the weight of delete a character when matching
    add_weight = 10  # the weight of add a character when matching
    ldamodel = LdaModel
    dictionary = corpora.Dictionary
    temp_dic = []
    # Overrides the placeholder above; runs when the class body is executed.
    dictionary = Dictionary.load("lda_dictionary.model")

    @staticmethod
    def read_excel(file):
        """Read the first sheet of ``file`` and group characters into time windows.

        Each row is read as ``(start time, end time, character)``. Characters
        whose end time falls inside the current 40000-unit window are
        concatenated into one word; words are joined with single spaces.

        Returns:
            The space-separated words as one string.

        NOTE(review): characters buffered for the last window are never
        appended to the result, and a row whose start or end time equals the
        window end matches no branch and is skipped -- confirm this is intended.
        """
        time_window = 40 * 1000
        data = xlrd.open_workbook(file)
        table = data.sheets()[0]
        end = len(table.col_values(0))  # from the first line to last line
        list_values = ""
        word_brffer = ''
        start_time = float(table.row_values(0)[0])
        end_time = start_time + time_window
        x = 0
        while x < end:
            row = table.row_values(x)
            temp_s_time = float(row[0])
            temp_e_time = float(row[1])
            temp_char = row[2]
            if temp_e_time < end_time:
                # the event is in the current time window
                word_brffer += temp_char
            elif temp_s_time > end_time:
                # the event is out of the window: close the word, advance the
                # window and look at the same row again
                start_time = end_time
                end_time = start_time + time_window
                list_values += word_brffer + ' '
                word_brffer = ""
                x -= 1
            elif temp_s_time < end_time < temp_e_time:
                # the event straddles the window end: it closes the word and
                # the next window starts right after it
                start_time = temp_e_time + 1
                end_time = start_time + time_window
                list_values += word_brffer + temp_char + ' '
                word_brffer = ''
            x += 1
        return list_values

    @staticmethod
    def _grid_position(char):
        """Return ``(height, width)`` of ``char`` on the 7-row character grid.

        NOTE(review): the ``> 140`` test is never true for ASCII letters
        (``ord('z') == 122``), so the ``'a'`` base is effectively unused, and
        ``/ 7`` yields a fractional width. Both are kept as in the original;
        confirm the intended layout before changing them.
        """
        base = ord('a') if ord(char) > 140 else ord('C')
        offset = ord(char) - base
        return offset % 7, offset / 7

    def calcDis(self, char_1, char_2):
        """Return the weighted Manhattan distance between two characters."""
        height1, width1 = LDAForEvent._grid_position(char_1)
        height2, width2 = LDAForEvent._grid_position(char_2)
        partA = self.height_weight * abs(height1 - height2)
        partB = abs(width1 - width2)
        return partA + partB

    def fuzzyEvent(self, s1, s2):
        """Return a similarity score for two words of possibly different length.

        A weighted edit distance is computed (substitution cost from
        ``calcDis``, fixed add/delete costs) and mapped to
        ``(100 - distance) / 100``; the score is negative for distances above 100.
        """
        rows = len(s1) + 1  # one row per prefix of s1
        cols = len(s2) + 1  # one column per prefix of s2
        match_matrix = [[0] * cols for _ in range(rows)]
        # Fill the complete first column and first row. The original loops
        # stopped one short, leaving the border cells of the last row and
        # column at 0 and so underestimating the distance.
        for i in range(rows):
            match_matrix[i][0] = self.add_weight * i
        for j in range(cols):
            match_matrix[0][j] = self.delete_weight * j
        for i in range(1, rows):
            for j in range(1, cols):
                match_matrix[i][j] = min(
                    match_matrix[i - 1][j - 1]
                    + LDAForEvent.calcDis(self, s1[i - 1], s2[j - 1]),
                    match_matrix[i - 1][j] + self.add_weight,
                    match_matrix[i][j - 1] + self.delete_weight)
        sum_distance = match_matrix[len(s1)][len(s2)]
        max_unit = 100  # should change while the add_weight and delete_weight changed
        return (max_unit - sum_distance) / max_unit

    def testEvent(self, doc, dic=None):
        """Turn ``doc`` into a (fuzzy) bag-of-words and run it through the LDA model.

        Args:
            doc: Iterable of words of the test document.
            dic: Dictionary of the LDA model (indexable by word id); when
                omitted or empty, ``self.dictionary`` is used.

        Returns:
            ``self.ldamodel[bag_of_words]``.
        """
        if dic is None or len(dic) == 0:
            dic = self.dictionary
        testV = [[i, 0] for i in range(len(dic))]
        for word in doc:
            f_max = 0
            flag = 1  # number of dictionary words sharing the best score
            temp_testV = [0] * len(testV)
            for index in range(len(dic)):
                if dic[index] == word:
                    # exact match: count it and ignore all fuzzy candidates
                    testV[index][1] += 1
                    break
                if abs(len(word) - len(dic[index])) > 3:
                    continue
                grade = LDAForEvent.fuzzyEvent(self, word, dic[index])
                if f_max < grade:
                    f_max = grade
                    flag = 1
                elif f_max == grade:
                    flag += 1
                temp_testV[index] = grade
            else:
                # No exact match. Share one count between the best fuzzy
                # candidates, but only if some candidate scored above zero.
                # The original also ran this after an exact match and, when
                # f_max stayed 0, credited every untouched index.
                if f_max > 0:
                    for index in range(len(testV)):
                        if f_max == temp_testV[index]:
                            testV[index][1] += 1 / flag
        return self.ldamodel[testV]

    def LDALoad(self):
        """Load the LDA model and its dictionary from their fixed file names."""
        self.ldamodel = LdaModel.load("fixed_time_window_lda.model")
        self.dictionary = Dictionary.load("lda_dictionary.model")
        print(self.dictionary)

    def LDATest(self, test):
        """Return the LDA result of ``testEvent`` for ``test`` using ``self.dictionary``."""
        result = LDAForEvent.testEvent(self, test, self.dictionary)
        return result
with open(out_f, 'wb') as pipe: pickle.dump(docs_phrased, pipe) # check outcome of nlp pipeline print(''' ============================================================================= published article: ----------------------------------------------------------------------------- {} ============================================================================= tokenized article: ----------------------------------------------------------------------------- {} ============================================================================= tri-grammed tokenized article: ----------------------------------------------------------------------------- {} '''.format(docs[1], docs_tokens[1], docs_phrased[1])) # %% get corpus & dictionary to use for further nlp analysis # get dictionary and write it to a file pr_dictionary = Dictionary(docs_phrased) pr_dictionary.save('.data/pr_dictionary.dict') # get corpus and write it to a file pr_corpus = [pr_dictionary.doc2bow(doc) for doc in docs_phrased] out_f = os.path.join('.data', 'pr_corpus.mm') MmCorpus.serialize(out_f, pr_corpus) mm = MmCorpus(out_f)
class LDARetrieval():
    """LDA-based retrieval with dictionary, corpus and document vectors cached on disk."""

    def __init__(self, docs, get_model=False, num_topics=10, passes=6, iterations=40, prep_search=False):
        """Load the cached dictionary/corpus or build and cache them from ``docs``.

        Args:
            docs: Mapping from document id to its token list.
            get_model: When true, also call ``get_model`` with the arguments below.
            num_topics, passes, iterations: Select the saved model file.
            prep_search: Forwarded to ``get_model``.
        """
        dict_path = "./models/lda_dict.dat"
        corpus_path = "./models/lda_corpus.dat"

        if not (os.path.exists(dict_path) and os.path.exists(corpus_path)):
            print("Processing documents...")
            token_lists = [docs[doc_id] for doc_id in docs]
            self.dictionary = Dictionary(token_lists)
            self.dictionary.filter_extremes(no_below=400, no_above=0.333)
            self.corpus = [self.dictionary.doc2bow(tokens) for tokens in token_lists]
            with open(dict_path, "wb") as handle:
                pkl.dump(self.dictionary, handle)
            with open(corpus_path, "wb") as handle:
                pkl.dump(self.corpus, handle)
        else:
            print("Loading corpus from disk...")
            # NOTE: unpickling executes code stored in the file; these are
            # expected to be the cache files written by the branch above.
            with open(dict_path, "rb") as handle:
                self.dictionary = pkl.load(handle)
            with open(corpus_path, "rb") as handle:
                self.corpus = pkl.load(handle)

        if get_model:
            self.get_model(
                num_topics=num_topics,
                passes=passes,
                iterations=iterations,
                prep_search=prep_search,
                docs=docs,
            )

    def train(self, num_topics, chunksize=10000, passes=6, iterations=40, eval_every=40):
        """Train an ``LdaMulticore`` model on ``self.corpus``, save it and return it."""
        model_stem = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass"

        # The lookup result is discarded; presumably the access is what makes
        # the dictionary fill id2token before it is read below -- confirm.
        self.dictionary[0]

        trained = LdaMulticore(
            corpus=self.corpus,
            id2word=self.dictionary.id2token,
            chunksize=chunksize,
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=eval_every,
        )
        trained.save(model_stem + ".pt")
        self.model = trained
        return trained

    def _topic_vector(self, bow):
        """Return the model's topic distribution for ``bow`` as a dense array."""
        dense = np.zeros(self.model.num_topics)
        for topic, weight in self.model[bow]:
            dense[topic] = weight
        return dense

    def prepare_search(self, docs):
        """Load cached document topic vectors or compute and cache them.

        The result is stored in ``self.docvecs`` (document id -> dense vector).
        """
        cache_path = f"./models/docs_{self.model.num_topics}search.dat"

        if not os.path.exists(cache_path):
            print("Preparing docs for search...")
            self.docvecs = {}
            for doc_id in docs:
                bow = self.dictionary.doc2bow(docs[doc_id])
                self.docvecs[doc_id] = self._topic_vector(bow)
            with open(cache_path, "wb") as handle:
                pkl.dump(self.docvecs, handle)
            return

        print("Loading docs for search from disk...")
        with open(cache_path, "rb") as handle:
            self.docvecs = pkl.load(handle)

    def get_model(self, num_topics, passes=6, iterations=40, prep_search=False, docs=None):
        """Load a saved model; return it, or ``None`` when the file is missing.

        When ``prep_search`` is true, ``prepare_search(docs)`` runs after loading.
        """
        model_path = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass" + ".pt"
        if os.path.exists(model_path):
            self.model = LdaModel.load(model_path)
            if prep_search:
                self.prepare_search(docs)
            return self.model

        print("Model not found...")
        return None

    def search(self, query):
        """Rank all prepared documents against ``query``.

        Returns:
            List of ``(document id, score)`` pairs sorted by descending score,
            where the score is the negated ``kl_divergence`` between the
            document vector and the query vector.
        """
        query_bow = self.dictionary.doc2bow(read_ap.process_text(query))
        query_vector = self._topic_vector(query_bow)

        scored = [
            (doc_id, -kl_divergence(doc_vector, query_vector))
            for doc_id, doc_vector in self.docvecs.items()
        ]
        scored.sort(key=lambda pair: -pair[1])
        return scored