Example #1
    import pickle

    # Preprocesser, VersionHandler and TopicModeller are CollectionExplorer
    # project classes; their imports are omitted in this snippet.

    ### Tokenize (optionally removing stopwords) and save the result ###
    # preprocesser = Preprocesser()
    # preprocesser.tokenize(corpus, remove_stopwords=False)
    # corpus_tokenized = preprocesser.corpus_tokenized
    # pickle.dump(corpus_tokenized, open('resources/corpus_300k_filtered_tokenized_with_stopwords_cs.c', 'wb'))
    # save_file(corpus_tokenized, "corpus_300k_filtered_tokenized_with_stopwords_cs")
    # save_file(corpus_tokenized, "corpus_10k_test")
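
A rough idea of what the commented-out tokenization step could look like. The Preprocesser class belongs to the CollectionExplorer project and its implementation is not shown here, so the regex tokenizer and the tiny stopword list below are illustrative assumptions, not the project's actual code:

    import re

    # Illustrative stopword list; the real project presumably uses a
    # language-specific list (the corpus file name suggests Czech).
    STOPWORDS = {"a", "the", "and", "of", "to", "in"}

    def tokenize_corpus(corpus, remove_stopwords=False):
        """Split each document into lowercase word tokens, optionally
        dropping stopwords (assumption: the real Preprocesser may use
        a proper NLP tokenizer instead of a regex)."""
        tokenized = []
        for doc in corpus:
            tokens = re.findall(r"\w+", doc.lower())
            if remove_stopwords:
                tokens = [t for t in tokens if t not in STOPWORDS]
            tokenized.append(tokens)
        return tokenized

    # Usage mirroring the commented-out block above:
    # corpus_tokenized = tokenize_corpus(corpus, remove_stopwords=False)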

    # Load a previously tokenized corpus (stopwords included) from disk.
    with open(
            "/home/nsaef/projects/CollectionExplorer/web/CollectionExplorer/static/CollectionExplorer/corpora/12/12_tokens_stopwords-included_cs.corpus",
            "rb") as f:
        corpus_tokenized = pickle.load(f)

    ##### Versioning and Duplicates #####
    # Fingerprint every tokenized document, then compare the fingerprints
    # to collect candidate pairs of near-duplicate document versions
    # (a sketch of the idea follows below).
    version_handler = VersionHandler()
    version_handler.calc_hashes(corpus_tokenized)
    candidates = version_handler.calculate_similarities()
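
VersionHandler is likewise project code. Conceptually, calc_hashes fingerprints each tokenized document and calculate_similarities compares documents pairwise to flag candidate versions of the same text. A minimal sketch of that idea, using MD5 digests for exact duplicates and Jaccard overlap of token sets for near-duplicates (both techniques and the threshold are assumptions, not the actual implementation):

    import hashlib
    from itertools import combinations

    def calc_hashes(corpus_tokenized):
        """Digest each document's token stream; identical digests
        indicate exact duplicates."""
        return [hashlib.md5(" ".join(tokens).encode("utf-8")).hexdigest()
                for tokens in corpus_tokenized]

    def similarity_candidates(corpus_tokenized, threshold=0.8):
        """Report pairs whose token sets overlap strongly (Jaccard
        similarity); the 0.8 threshold is a placeholder guess."""
        token_sets = [set(tokens) for tokens in corpus_tokenized]
        candidates = []
        for i, j in combinations(range(len(token_sets)), 2):
            union = token_sets[i] | token_sets[j]
            if not union:
                continue
            jaccard = len(token_sets[i] & token_sets[j]) / len(union)
            if jaccard >= threshold:
                candidates.append((i, j, jaccard))
        return candidates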

    ##### Topic Modelling #####

    # ### Vectorize the corpus using raw term frequencies for LDA ###
    # processer_rf = Preprocesser()
    # corpus_rf = processer_rf.vectorize_frequencies(corpus)
    # feature_names = processer_rf.feature_names_raw

    # ### Create topic models using LDA ###
    # lda = TopicModeller(n_topics=30)
    # lda.create_topic_models(corpus_rf, feature_names)
    # topics = lda.documents_per_topic(corpus_rf, corpus)
    # lda.print_top_words(feature_names, n_top_words=20, collection=topics)
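
The commented-out vectorization and LDA steps go through the project's Preprocesser and TopicModeller wrappers, whose internals are not shown. A self-contained sketch of the same pipeline with scikit-learn, reusing the n_topics=30 and n_top_words=20 settings from above; the use of scikit-learn and the remaining parameters are assumptions about what those wrappers do, not their actual code:

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation

    def lda_topics(corpus, n_topics=30, n_top_words=20):
        """Vectorize raw term frequencies (LDA expects counts rather
        than tf-idf weights) and print the top words per topic."""
        vectorizer = CountVectorizer()
        corpus_rf = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names_out()

        lda = LatentDirichletAllocation(n_components=n_topics, random_state=0)
        lda.fit(corpus_rf)

        for topic_idx, topic in enumerate(lda.components_):
            top = topic.argsort()[:-n_top_words - 1:-1]
            print("Topic %d: %s"
                  % (topic_idx, " ".join(feature_names[i] for i in top)))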