def main():
    """Run the full pipeline: load the data, drop fuzzy-matched near-duplicate
    records, plot summary counts, fit a dynamic topic model (DTM), and export
    the topics per time period to a spreadsheet."""
    data = get_data()
    # BUGFIX: `df` must be assigned before it is passed to df_combos();
    # the original called df_combos(df) first, which raises NameError.
    df = manage_data(data)

    ############ VERY TIME CONSUMING!!! ####################
    ############ Get Fuzzy scores - to csv ####################
    df_combos(df)

    # Drop records whose fuzzy-similarity score marks them as near-duplicates.
    scores = pd.read_csv('similarity scores.csv')
    scores = scores.loc[scores['score'] > 87]
    ids = list(scores.id2.unique())
    # NOTE(review): the +2 offset presumably maps DataFrame positions onto the
    # ids used in the scores file (header row + 1-based) — confirm.
    df['id'] = df.index + 2
    df = df.loc[~df['id'].isin(ids)]
    plot_journal_counts(df)
    plot_yearly_counts(df)

    ########## DTM MODELS #####################
    # The DTM requires documents ordered by time before slicing.
    df = df.sort_values(by='Year', ascending=True)
    ## If you want to run lda model uncomment next line
    times = get_time_slice(df)
    doc_processed = df['abstract3'].map(preprocess)
    dictionary = corpora.Dictionary(doc_processed)
    # to prepare a document term matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_processed]
    ldaseq = ldaseqmodel.LdaSeqModel(corpus=doc_term_matrix,
                                     id2word=dictionary,
                                     time_slice=times,
                                     num_topics=25,
                                     chain_variance=0.05)

    #### Set the times
    times = ["1990-1992", "1993-1995", "1996-1998", "1999-2001", "2002-2004",
             "2005-2007", "2008-2010", "2011-2013", "2014-2016", "2017-2019"]
    ### Make the topics in a spreadsheet
    # full, twenty, twentyfive
    full = make_topics_time(times, ldaseq, number="twentyfive")
    full['time period'] = full.apply(fix_times, axis=1)
def run(self, document_collection, topic_count=2, time_group=None):
    """Fit an LdaSeqModel over *document_collection* and print its topics.

    document_collection should be sorted in order of time_slice.

    Parameters
    ----------
    document_collection : list of list of str
        Tokenised documents, already ordered by time slice.
    topic_count : int
        Number of topics to fit.
    time_group : list of int, optional
        Documents per time slice; defaults to [10, 10, 11].

    Returns
    -------
    LdaSeqModel
        The trained sequential model.
    """
    # BUGFIX: avoid a mutable default argument ([10, 10, 11] was shared
    # across calls); build the default inside the call instead.
    if time_group is None:
        time_group = [10, 10, 11]
    dictionary = Dictionary(document_collection)
    corpus = [dictionary.doc2bow(text) for text in document_collection]
    ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                     id2word=dictionary,
                                     num_topics=topic_count,
                                     time_slice=time_group)
    topics = ldaseq.print_topics(1)
    for topic in topics:
        print("TOPIC " + str(topic))
    return ldaseq
def sequence_lda_topic_modeling(self, tokenized_sentence_list, time_slice, num_topics):
    """Train a sequential (dynamic) LDA model on the given sentences.

    Each entry of *tokenized_sentence_list* is a space-joined document; it is
    split back into tokens and stopwords are removed before training. The
    trained model is stored on ``self.ldaseq`` and the slicing on
    ``self.time_slice``.

    Returns a list of (topic_id, per-time-slice topic terms) pairs.
    """
    self.time_slice = time_slice
    # Tokenise each document and filter stopwords.
    texts = []
    for document in tokenized_sentence_list:
        kept = [word for word in document.split(" ")
                if word not in self.__get_stopwords()]
        texts.append(kept)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(doc) for doc in texts]
    self.ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                          id2word=dictionary,
                                          time_slice=self.time_slice,
                                          num_topics=num_topics,
                                          em_max_iter=10)
    # Pair every topic id with its evolution over the time slices.
    return [(topic_id, self.ldaseq.print_topic_times(topic=topic_id))
            for topic_id in range(num_topics)]
def bruteforce_lda_entire_corpus():
    """Attempt a 500-topic LdaSeqModel over the whole corpus and time it.

    Known not to work in practice:
    - reports Bus error on slurm
    - probably requires way too much RAM/compute power
    """
    corpus = mmcorpus.MmCorpus(join(corpus_folder, 'in_corpus.mm'))
    # Per-month document counts define the time slices.
    slice_df = pd.read_csv(join(d_folder, '..', 'notes', 'month_linects.txt'),
                           sep=' ', names=['mth', 'ct'], index_col=False)
    slices = list(slice_df.ct)
    # Pad the last slice so the counts sum to the corpus size.
    slices[-1] += corpus.num_docs - sum(slices)
    dictionary = Dictionary.load(join(corpus_folder, 'in_corpus_dict.dict'))
    print('loaded corpus and dictionary')
    start = time.time()
    ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary,
                            time_slice=slices, num_topics=500)
    elapsed = int(time.time() - start)
    print(f"# seconds = {elapsed}")
def setUp(self):
    """Fixture: build a small mixed corpus (long job-posting excerpts plus
    short bank/river sentences), then train a 2-topic LdaSeqModel over three
    time slices, seeded with pre-computed sufficient statistics so each test
    run starts from the same state."""
    texts = [
        [u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'], [u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'], [u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'], [u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles',
        u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'], [u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical',
        u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'], [u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers',
        u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'], [u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts',
        u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], [u'technical', u'solarcity',
        u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'], [u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups',
        u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases',
        u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'], ['bank', 'river', 'shore', 'water'], ['river', 'water', 'flow', 'fast', 'tree'], ['bank', 'water', 'fall', 'flow'], ['bank', 'bank', 'water', 'rain', 'river'], ['river', 'water', 'mud', 'tree'], ['money', 'transaction', 'bank', 'finance'], ['bank', 'borrow', 'money'], ['bank', 'finance'], ['finance', 'money', 'sell', 'bank'], ['borrow', 'sell'], ['bank', 'loan', 'sell']
    ]
    # initializing using own LDA sufficient statistics so that we get same results each time.
    sstats = np.loadtxt(datapath('sstats_test.txt'))
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # initialize='own' with the loaded sstats pins the model's starting point;
    # time_slice=[10, 10, 11] assigns the documents to three time periods
    # (the counts must sum to len(texts)).
    self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats)
def trigramz(date_1, date_2): """We then grab the time slice and then we can run the ldaseq with the given time_slice""" # grab the time_slice of the documents that we need time_sliced = time_slice(date_1, date_2) print(time_sliced) if time_sliced == 'not in dictionary': raise ValueError('dates must be contained in text timeframe') trigram_dictionary = phrases() print(trigram_dictionary) trigram_bow_corpus = B_O_wCreator(trigram_reviews_filepath, trigram_dictionary) with warnings.catch_warnings(): warnings.simplefilter('ignore') # lda = LdaMulticore(trigram_bow_corpus, # num_topics=50, # id2word=trigram_dictionary, # workers=3) ldaseq = ldaseqmodel.LdaSeqModel(corpus=trigram_bow_corpus, id2word=trigram_dictionary, time_slice=time_sliced, num_topics=50) ldaseq.save(lda_model_filepaths) print(ldaseq)
# Count word frequencies across all documents (idiomatic Counter instead of a
# hand-rolled defaultdict loop).
from collections import Counter
frequency = Counter(token for text in texts for token in text)

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1]
                    for text in texts]
dictionary = corpora.Dictionary(processed_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

# Model
num_topics = 5
ldaseq = ldaseqmodel.LdaSeqModel(corpus=bow_corpus,
                                 id2word=dictionary,
                                 time_slice=time_slice,
                                 num_topics=num_topics)

# Output topic every year
word_num = 12
out_path = "topic"
# BUGFIX: csv.writer requires the file opened with newline='' (otherwise
# blank rows appear between records on Windows).
with open(out_path + ".csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    header = ["topic_" + str(i + 1) for i in range(num_topics)]
    writer.writerow(header)
    # One row per time slice: the topics at that slice.
    for j in range(len(time_slice)):
        writer.writerow(ldaseq.print_topics(j, word_num))

# Visualising Dynamic Topic Models
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(time=0, corpus=bow_corpus)
decay=0.6, offset=0.8, passes=10, iterations=400, eval_every=10,
    model_file=model_file, only_viz=config.DO_NOT_COMPUTE_COHERENCE)
# Log the run summary: topic count, per-slice document counts, and both
# coherence scores (c_v and u_mass) rounded to 4 decimals.
print(
    'topic:' + str(num_topics), ', time_slice' + ' '.join(
        [str(i) + ':' + str(j) for i, j in enumerate(time_slices)]) +
    ', c_v:' + str(round(c_v, 4)) + ', cu:' + str(round(u_mass, 4)))
# Warm-start the dynamic model from the already-trained static LDA model.
dyn_model = ldaseqmodel.LdaSeqModel(initialize='ldamodel',
                                    lda_model=model,
                                    time_slice=time_slices,
                                    corpus=corpus,
                                    id2word=dictionary,
                                    num_topics=num_topics,
                                    passes=10,
                                    random_state=config.SEED)
# NOTE(review): index 3 of TRAIN_PARAMETERS is assumed to be the save path
# for this section — confirm against the config layout.
filename = config.TRAIN_PARAMETERS[section][3]
dyn_model.save(filename)
# Build a pyLDAvis panel for every time slice.
for t in range(0, len(time_slices)):
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dyn_model.dtm_vis(
        time=t, corpus=corpus)
    prepared = pyLDAvis.prepare(topic_term_dists=topic_term,
                                doc_topic_dists=doc_topic,
                                doc_lengths=doc_lengths,
                                vocab=vocab,
                                term_frequency=term_frequency)
per_word_topics=True)

# In[54]:
# seqmodel
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
import numpy
from gensim.matutils import hellinger

# In[60]:
# Documents per time period (13 slices); must sum to len(corpus_new).
time_slice = [50, 145, 116, 81, 76, 19, 43, 64, 57, 65, 55, 16, 7]
#time_slice=[350]
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus_new, id2word=id2word, time_slice=time_slice, num_topics=20)

# In[61]:
# Persist the trained model with pickle.
with open('ldaseq20.dat', "wb") as file:
    pickle.dump(ldaseq, file)

# In[62]:
# Show the topics at the first time slice.
ldaseq.print_topics(time=0)

# In[30]:
# Print the Keyword in the topics
print(lda_model.print_topics())
# Read the time slices (one integer document count per period).
# BUGFIX: use a context manager so the file is closed even on error
# (the original open()/close() pair leaked the handle on exceptions).
with open(main_path + 'corpus/dtm_o/time_series.txt', 'r') as t:
    time_series = [int(i) for i in t.read().split()]

# Build the dynamic topic model from the configured hyper-parameters.
model_gen = ldaseqmodel.LdaSeqModel(
    corpus=corpus,
    time_slice=time_series,
    id2word=corpus.dictionary,
    alphas=para['alpha'],
    num_topics=para['num_topics'],
    initialize=para['initialize'],
    sstats=None,
    lda_model=None,
    obs_variance=para['obs_variance'],
    chain_variance=para['top_chain_var'],
    passes=10,
    random_state=None,
    lda_inference_max_iter=para['lda_inference_max_iter'],
    em_min_iter=para['em_min_iter'],
    em_max_iter=para['em_max_iter'],
    chunksize=100)
# model_gen = LdaSeqModel(corpus = corpus, time_slice=time_series, id2word = dictionary, num_topics = num_topics)
# print() calls behave identically under Python 2 and 3 for a single argument.
print('model training finish')
model_gen.save(main_path + 'result/dtm_o_' + sys.platform + '_topic_' + str(para['num_topics']) + '.model')
print('model saving finish')
#model1 = DtmModel.load('topic1.model')
def generate_timeslice_data(index_arr):
    """Return the consecutive differences of the values of *index_arr*.

    Parameters
    ----------
    index_arr : mapping
        A dict-like object whose values are cumulative document indices,
        in insertion order.

    Returns
    -------
    list of int
        Element-wise differences values[i] - values[i-1]; the per-slice
        document counts fed to LdaSeqModel's ``time_slice``. Empty if the
        mapping has fewer than two values.
    """
    # Idiom fix: list() + zip() replace the two manual index loops.
    boundaries = list(index_arr.values())
    return [later - earlier for earlier, later in zip(boundaries, boundaries[1:])]


# In[4]:
time_slices = generate_timeslice_data(time_index_arr)

# In[ ]:
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=id2word, time_slice=time_slices, num_topics=35)

# In[ ]:
def run(n, folder, verbose=True):
    """Train a 5-topic dynamic topic model over the documents in *folder*,
    render a pyLDAvis panel for the first time slice, and write it to
    dtm3.html.

    Parameters
    ----------
    n : int
        Use only the first n words of each document.
    folder : str
        Directory the documents are loaded from.
    verbose : bool
        When True, raise the root logger to DEBUG during training.
    """
    # documents is a list of lists, where each nested list has the words from one document
    documents = load_documents(folder)

    # remove common words
    stoplist = set(
        'for a an of the and or to in from on is are can we'.split())
    documents = [[word for word in document if word not in stoplist]
                 for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for document in documents:
        for word in document:
            frequency[word] += 1
    documents = [[word for word in document if frequency[word] > 1]
                 for document in documents]

    # use only the first n words per document
    documents = [document[:n] for document in documents]

    class DTMcorpus(textcorpus.TextCorpus):
        def get_texts(self):
            return self.input

        def __len__(self):
            return len(self.input)

    corpus = DTMcorpus(documents)

    # BUGFIX: integer division — `/` yields a float under Python 3 and
    # time_slice entries must be integer document counts.
    first_half = len(documents) // 2
    second_half = len(documents) - first_half
    time_slice = [first_half, second_half]  # n documents split into 2 time slices

    if verbose:
        # activate logging
        logger = logging.getLogger()
        logger.setLevel(logging.DEBUG)

    # run
    ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                     id2word=corpus.dictionary,
                                     time_slice=time_slice,
                                     num_topics=5)

    # Visualizing dynamic topic models
    from gensim.models.wrappers.dtmmodel import DtmModel
    from gensim.corpora import Dictionary, bleicorpus
    import pyLDAvis
    doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(
        time=0, corpus=corpus)
    vis_dtm = pyLDAvis.prepare(topic_term_dists=topic_term,
                               doc_topic_dists=doc_topic,
                               doc_lengths=doc_lengths,
                               vocab=vocab,
                               term_frequency=term_frequency)
    # For ipython notebook:
    # pyLDAvis.display(vis_dtm)
    # This works best for me (then view dtm.html in a browser)
    with open("dtm3.html", "w") as f:
        pyLDAvis.save_html(vis_dtm, f)
    return ("dtm3.html saved.")
# Apply the tf-idf transform to the bag-of-words corpus and spot-check the
# first document in each representation.
model_corpus_17 = tfidfmodel[ldacorpus_17]
print(sent_big_17[0])
print(ldacorpus_17[0])
print(model_corpus_17[0])

time_slice = [3323, 3324, 7485, 7486]  #Time slots I am considering
#First time slot represents the first half of Book1
#Second time slot represents the second half of Book1
#Third time slot represents the first half of Book7
#Fourth time slot represents the second half of Book7
#Run the LDA Seq model that allows to track the dynamics of the topics.
ldaseq = ldaseqmodel.LdaSeqModel(corpus=ldacorpus_17,
                                 id2word=dictionary_big_17,
                                 time_slice=time_slice,
                                 num_topics=3)

# In[43]:
#Look at the three different topics at time 0 (corresponding to the first book)
ldaseq.print_topics(time=0)

# In[44]:
ldaseq.print_topics(
    time=1
)  #Look at the three different topics at time 1 (corresponding to book7)

# In[45]:
# Tail of clean(): keep only tokens longer than 4 characters and re-join them.
normalized = " ".join(word for word in punc_free.split() if len(word) > 4)
    return normalized


# In[5]:
# Clean every document and tokenise it.
doc_clean = [clean(d).split() for d in docs]
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]  #generating a document-term matrix

# In[6]:
#running the model on the data
ldaseq = ldaseqmodel.LdaSeqModel(doc_term_matrix, id2word=dictionary, time_slice=time_slice, num_topics=5)

# In[7]:
#printing the topics generated
ldaseq.print_topics(time=0)

# In[8]:
#printing evolution of topics in time slices
ldaseq.print_topic_times(topic=0)

# In[27]:
#testing the trained model on a document in the corpus
def __len__(self):
    # Corpus length is simply the number of input documents.
    return len(self.input)

corpus = DTMcorpus(documents)
time_slice = [3, 7]  # 10 documents split into 2 time slices

# activate logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# run
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus,
                                 id2word=corpus.dictionary,
                                 time_slice=time_slice,
                                 num_topics=5)

# Visualizing dynamic topic models
from gensim.models.wrappers.dtmmodel import DtmModel
from gensim.corpora import Dictionary, bleicorpus
import pyLDAvis
# Extract the first time slice's distributions and build the pyLDAvis panel.
doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(
    time=0, corpus=corpus)
vis_dtm = pyLDAvis.prepare(topic_term_dists=topic_term,
                           doc_topic_dists=doc_topic,
                           doc_lengths=doc_lengths,
                           vocab=vocab,
                           term_frequency=term_frequency)
dictionary.compactify() # 删除去除单词后的空格 dictionary.save('C:/Users/WCH/Desktop/try-DTM/news_dictionary') # 保存词典 #将文档加载成构造语料库 class MyCorpus(object): def __iter__(self): for line in open('C:/Users/WCH/Desktop/try-DTM/testceshi.txt'): yield dictionary.doc2bow(line.lower().split()) corpus_memory_friendly = MyCorpus() corpus = [vector for vector in corpus_memory_friendly] # 将读取的文档转换成语料库 corpora.BleiCorpus.serialize('C:/Users/WCH/Desktop/try-DTM/news_corpus', corpus) # 存储为Blei lda-c格式的语料库 dictionary = Dictionary.load('C:/Users/WCH/Desktop/try-DTM/news_dictionary') corpus = bleicorpus.BleiCorpus('C:/Users/WCH/Desktop/try-DTM/news_corpus') time_slice = [1000, 1000, 1000, 821] #设置这个语料库的间隔,此处分为三个时期,第一个时期内有438条新闻,第二为430条,第三个为456条。 num_topics = 5 #设置主题数,此处为5个主题 ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=num_topics) #将语料库、词典、参数加载入模型中进行训练 print ("########################################################") corpusTopic = ldaseq.print_topics(time=0) # 输出指定时期主题分布,此处第一个时期主题分布 print ("corpusTopic") print(corpusTopic) topicEvolution = ldaseq.print_topic_times(topic=0) # 查询指定主题在不同时期的演变,此处为第一个主题的 print ("topicEvolution") print(topicEvolution) doc = ldaseq.doc_topics(0) # 查询指定文档的主题分布,此处为第一篇文档的主题分布 print ("########################################################") corpusTopic = ldaseq.print_topics(time=1) # 输出指定时期主题分布,此处第一个时期主题分布 print ("corpusTopic") print(corpusTopic) topicEvolution = ldaseq.print_topic_times(topic=1) # 查询指定主题在不同时期的演变,此处为第一个主题的
WordCloud(background_color='white',
              colormap='brg',
              min_font_size=4,
              max_words=80,
              scale=1).fit_words(dict(LDAmain.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()
    # NOTE(review): backslashes in these paths only work because \L, \d etc.
    # are not escape sequences — raw strings or os.path.join would be safer.
    fig.savefig("visuals\LDAmaintop_%s.png" % t)

#-----------------------4. Dynamic topic modelling-------------------------------
time_slice = list_mon  #declaring time slice
# Warm-start the dynamic model from the static LDAmain model.
ldaseq = ldaseqmodel.LdaSeqModel(initialize='ldamodel',
                                 lda_model=LDAmain,
                                 num_topics=5,
                                 corpus=corpus,
                                 id2word=dictionary,
                                 time_slice=time_slice,
                                 chain_variance=0.05)
ldaseq.print_topics(time=0, top_terms=20)
ldaseq.print_topic_times(topic=0, top_terms=20)
ldaseq.save("models\ldaseqmain")

#exporting results
np.savetxt("topics\dtmtop0file.csv",
           ldaseq.print_topic_times(topic=0),
           delimiter=",",
           fmt='%s')
np.savetxt("topics\dtmtop1file.csv",
           ldaseq.print_topic_times(topic=1),
           delimiter=",",
dictionary.filter_extremes( no_below=10, no_above=0.25 ) #Filter words that appear less than 10 documents and more than 25% of all documents #Create the bag of words for all documents bag_of_words = [dictionary.doc2bow(abstract) for abstract in abstracts] print('- Read and preprocessed the dataset!') ########################## DYNAMIC TOPIC MODELING ########################## #Build the model print('- Training the model') start_time = time.time() #Start count time ldaseq = ldaseqmodel.LdaSeqModel(corpus=bag_of_words, id2word=dictionary, time_slice=time_slices_2years_interval, num_topics=8) print('- Model finish running in', round((time.time() - start_time) / 60), 'min(s)') #Save the model path = datapath('dynamic_model_code') ldaseq.save(path) ########################## EVALUATION ########################## coherence = ldaseq.dtm_coherence(time=0) temp = CoherenceModel(topics=coherence, corpus=bag_of_words, dictionary=dictionary, coherence='u_mass') print("u_mass = ", temp.get_coherence()) temp = CoherenceModel(topics=coherence,