def get_corpus(filename):
    """Load raw data from a file and return a per-faculty text corpus.

    Parameters
    ----------
    filename :
        The path to a json file containing the university database.

    Returns
    -------
    corpus :
        A numpy array with one combined text per faculty (paper titles,
        abstracts, and research areas concatenated), restricted to
        faculties whose paper titles are non-missing.
    """
    df_cleaned = database_cleaner(filename)

    # For nlp, only retaining faculty_name, research_areas, paper_titles, abstracts
    df_filtered = df_cleaned[[
        'faculty_name', 'research_areas', 'paper_titles', 'abstracts'
    ]]

    # Faculties with an empty paper_titles string have no papers recorded.
    missing = df_filtered['paper_titles'] == ''
    # .copy() prevents pandas' SettingWithCopyWarning and guarantees the
    # column assignment below writes to this frame rather than a view.
    df_nlp = df_filtered[~missing].copy()

    # research_areas holds an iterable of strings; flatten to one string.
    df_nlp['research_areas'] = df_nlp['research_areas'].apply(
        lambda x: " ".join(x))

    # Choosing abstracts and paper_titles to predict topics for a professor
    corpus = (df_nlp['paper_titles'] + df_nlp['abstracts'] +
              df_nlp['research_areas']).values
    return corpus
def get_data(filename):
    """Load raw data from a file and return a per-faculty text array.

    Same selection logic as ``get_corpus``, but reports how many
    faculties were dropped for having no recorded papers.

    Parameters
    ----------
    filename :
        The path to a json file containing the university database.

    Returns
    -------
    data :
        A numpy array with one combined text per faculty (paper titles,
        abstracts, and research areas concatenated), restricted to
        faculties whose paper titles are non-missing.
    """
    df_cleaned = database_cleaner(filename)

    # For nlp, only retaining faculty_name, research_areas, paper_titles, abstracts
    df_filtered = df_cleaned[[
        'faculty_name', 'research_areas', 'paper_titles', 'abstracts'
    ]]

    # Faculties with an empty paper_titles string have no papers recorded.
    missing = df_filtered['paper_titles'] == ''
    num_missing = sum(missing)
    # Report the actual source file, not a hard-coded placeholder.
    print(f'{num_missing} faculties have missing papers in {filename}')
    print('Running nlp-pipeline on faculties with non-missing papers...')
    # .copy() prevents pandas' SettingWithCopyWarning and guarantees the
    # column assignment below writes to this frame rather than a view.
    df_nlp = df_filtered[~missing].copy()

    # research_areas holds an iterable of strings; flatten to one string.
    df_nlp['research_areas'] = df_nlp['research_areas'].apply(
        lambda x: " ".join(x))

    # Choosing abstracts and paper_titles to predict topics for a professor
    data = (df_nlp['paper_titles'] + df_nlp['abstracts'] +
            df_nlp['research_areas']).values
    return data
# Fit the (previously configured) LDA model and report its coherence.
lda_model.fit()
print(lda_model.coherence_score())

# Fit LDAMallet to training data(doesn't work with tf-idf)
ldamallet_model = MyGenSimModel(num_topics=11, algorithm='LDAMallet', tf_idf=False,
                                bigrams=True, trigrams=False, lemmatization=True)
# NOTE(review): `data` is assumed to come from an earlier get_data() call — confirm.
ldamallet_model.transform(data)
ldamallet_model.fit()
print(ldamallet_model.coherence_score())

# Append to pge_database with updated predicted_research_areas based on top-10 features
pge_df = database_cleaner('../data/json/majors_database.json')
# Per-document topic assignments from each fitted model.
doc_topics_df_LDA = lda_model.format_document_topics()
doc_topics_df_LDAMallet = ldamallet_model.format_document_topics()
# Column-wise concat: assumes doc-topic rows align positionally with pge_df rows.
pge_df_updated_LDA = pd.concat([pge_df, doc_topics_df_LDA], axis=1)
pge_df_updated_LDAMallet = pd.concat([pge_df, doc_topics_df_LDAMallet], axis=1)
# Persist one enriched database per model.
pge_df_updated_LDA.to_json(
    path_or_buf='../data/json/final_gensim_database_LDA.json')
pge_df_updated_LDAMallet.to_json(
    path_or_buf='../data/json/final_gensim_database_LDAMallet.json')

# Save html for the pyLDAvis visualization of LDAMallet model
vis_LDA = lda_model.visualize_lda_model()
vis_LDAMallet = ldamallet_model.visualize_lda_mallet()
pyLDAvis.save_html(data=vis_LDA, fileobj="templates/LDA.html")
pyLDAvis.save_html(data=vis_LDAMallet, fileobj="templates/LDAMallet.html")