### Tokenize, remove stopwords, save the result ###
# Previous pipeline stage, kept for reference: tokenize the raw corpus and
# persist the result so later runs can load the pickle directly (below).
# preprocesser = Preprocesser()
# preprocesser.tokenize(corpus, remove_stopwords=False)
# corpus_tokenized = preprocesser.corpus_tokenized
# pickle.dump(corpus_tokenized, open('resources/corpus_300k_filtered_tokenized_with_stopwords_cs.c', 'wb'))
# save_file(corpus_tokenized, "corpus_300k_filtered_tokenized_with_stopwords_cs")
# save_file(corpus_tokenized, "corpus_10k_test")

# Load the pre-tokenized corpus (stopwords included) produced by the stage
# above. `with` guarantees the file handle is closed even if unpickling fails.
# NOTE(review): pickle.load on a file you don't control is unsafe; this path
# is a local artifact of our own pipeline, so it is trusted here.
with open(
        "/home/nsaef/projects/CollectionExplorer/web/CollectionExplorer/static/CollectionExplorer/corpora/12/12_tokens_stopwords-included_cs.corpus",
        "rb") as corpus_file:
    corpus_tokenized = pickle.load(corpus_file)

##### Versioning and Duplicates #####
# Hash every tokenized document, then compare hashes/contents to find
# candidate duplicate or near-duplicate versions of the same document.
version_handler = VersionHandler()
version_handler.calc_hashes(corpus_tokenized)
candidates = version_handler.calculate_similarities()

##### Topic Modelling #####
# Disabled stage, kept for reference: vectorize with raw term frequencies
# and fit an LDA topic model over the corpus.
# ### Vectorize the corpus using raw frequencies for lda ###
# processer_rf = Preprocesser()
# corpus_rf = processer_rf.vectorize_frequencies(corpus)
# feature_names = processer_rf.feature_names_raw

# ### Create topic models using LDA ###
# lda = TopicModeller(n_topics=30)
# lda.create_topic_models(corpus_rf, feature_names)
# topics = lda.documents_per_topic(corpus_rf, corpus)
# lda.print_top_words(feature_names, n_top_words=20, collection=topics)