def evaluate_genia_dataset():
    """Benchmark TextRank keyword extraction on the GENIA corpus.

    Pre-processes the GENIA corpus with spaCy once, then runs
    ``keywords_extraction_from_tagged_corpus`` twice -- first with the
    "norm_max" weighting scheme, then with "gcvalue" -- timing each run
    and scoring the extracted keywords against the GENIA gold-standard
    terms via ``evaluate_results``.
    """
    import logging.config
    logging.config.fileConfig(os.path.join('..', 'logging.conf'))

    import multiprocessing
    from jgtextrank import keywords_extraction_from_tagged_corpus
    import time

    reader = WordListCorpusReader('../resource', 'smart-stop-list.txt')
    stop_list = reader.words()
    gs_terms = load_genia_gs_terms('genia_gs_terms.txt')

    corpus_directory = os.path.join('GENIAcorpus302', 'text', 'files')
    pre_processed_corpus = pre_processing_corpus_with_spacy(corpus_directory,
                                                            encoding="utf-8",
                                                            lemma=True)
    print("term extraction from spaCy pre-processed corpus [%s] ..." %
          corpus_directory)
    # Materialise the generator once so both extraction runs can iterate it.
    pre_processed_corpus = list(pre_processed_corpus)

    # The two runs differ only in the weighting scheme and the evaluation
    # label, so loop over the configurations instead of duplicating the code.
    for weight_comb, eval_label in (("norm_max", "genia+pagerank"),
                                    ("gcvalue", "genia+pagerank+gcvalue")):
        start = time.time()
        keywords, _top_vertices = keywords_extraction_from_tagged_corpus(
            pre_processed_corpus,
            window=2,
            top_p=1,
            stop_words=stop_list,
            weight_comb=weight_comb,
            export=False,
            workers=multiprocessing.cpu_count())
        end = time.time()

        print("\n")
        print("Complete. Wall-clock elapsed time: ", end - start, "s")
        print("\n")
        print("top 50 keywords: ", keywords[:50])
        evaluate_results(keywords, gs_terms, eval_label)
def evaluate_hulth2003_testset():
    """Benchmark TextRank keyword extraction on the Hulth2003 test set.

    Pre-processes the Hulth2003 ``Test`` abstracts (``.abstr`` files) with
    spaCy once, then runs ``keywords_extraction_from_tagged_corpus`` twice
    -- window=3 with "norm_max" weighting, and window=2 with "gcvalue" --
    timing each run and scoring the extracted keywords against the
    Hulth2003 gold-standard terms via ``evaluate_results``.
    """
    import logging.config
    logging.config.fileConfig(os.path.join('..', 'logging.conf'))

    import multiprocessing
    from jgtextrank import keywords_extraction_from_tagged_corpus
    import time

    reader = WordListCorpusReader('../resource', 'smart-stop-list.txt')
    stop_list = reader.words()
    # NOTE: the original assigned this path twice; once is enough.
    hulth2003_corpus_directory = os.path.join('Hulth2003', 'Test')
    hulth2003_gs_terms = load_Hulth2003_gs_terms(hulth2003_corpus_directory)

    pre_processed_hulth2003_corpus = pre_processing_corpus_with_spacy(
        hulth2003_corpus_directory,
        encoding="utf-8",
        lemma=True,
        default_file_suffix=".abstr")

    print("term extraction from spaCy pre-processed hulth2003 corpus [%s] ..." % hulth2003_corpus_directory)
    # Materialise the generator once so both extraction runs can iterate it.
    pre_processed_hulth2003_corpus = list(pre_processed_hulth2003_corpus)

    # The two runs differ only in window size, weighting scheme and label,
    # so loop over the configurations instead of duplicating the code.
    for window, weight_comb, eval_label in (
            (3, "norm_max", "hulth2003+pagerank"),
            (2, "gcvalue", "hulth2003+pagerank+gcvalue")):
        start = time.time()
        hulth2003_keywords, _top_vertices = keywords_extraction_from_tagged_corpus(
            pre_processed_hulth2003_corpus,
            window=window,
            top_p=1,
            stop_words=stop_list,
            weight_comb=weight_comb,
            export=False,
            workers=multiprocessing.cpu_count())
        end = time.time()

        print("\n")
        print("Complete. Wall-clock elapsed time: ", end - start, "s")
        print("\n")
        print("top 50 keywords: ", hulth2003_keywords[:50])
        print("\n")
        evaluate_results(hulth2003_keywords, hulth2003_gs_terms, eval_label)
def evaluate_semeval2017_testset():
    """Benchmark TextRank keyword extraction on the SemEval-2017 test set.

    Loads ScienceIE gold-standard terms (``.ann`` files), pre-processes the
    ``semeval_articles_test`` corpus with spaCy once, then runs
    ``keywords_extraction_from_tagged_corpus`` twice -- with "avg" and with
    "gcvalue" weighting -- timing each run and scoring results via
    ``evaluate_results``. Both runs are single-worker (``workers=1``).
    """
    import logging.config
    logging.config.fileConfig(os.path.join('..', 'logging.conf'))

    from jgtextrank import keywords_extraction_from_tagged_corpus
    import time

    gs_terms = load_scienceie_test_dataset("semeval_articles_test",
                                           file_suffix=".ann")
    print(len(gs_terms), " gs terms loaded.")
    reader = WordListCorpusReader('../resource', 'smart-stop-list.txt')
    stop_list = reader.words()
    semeval2017_pre_processed_corpus = pre_processing_corpus_with_spacy(
        "semeval_articles_test", default_file_suffix=".txt")
    # Materialise the generator once so both extraction runs can iterate it.
    semeval2017_pre_processed_corpus = list(semeval2017_pre_processed_corpus)

    # FIX: the first run used weight_comb="avg" but reported its results
    # under the label "pagerank+norm_max"; the label now matches the scheme.
    for weight_comb, eval_label in (("avg", "pagerank+avg"),
                                    ("gcvalue", "pagerank + GC-Value")):
        start = time.time()
        semeval2017_keywords, _top_t_vertices = keywords_extraction_from_tagged_corpus(
            semeval2017_pre_processed_corpus,
            window=3,
            top_p=1,
            stop_words=stop_list,
            weight_comb=weight_comb,
            export=False,
            workers=1)
        end = time.time()

        print("\n")
        print("Complete. Wall-clock elapsed time: ", end - start, "s")
        print("\n")
        print("top 50 keywords: ", semeval2017_keywords[:50])
        print("\n")
        evaluate_results(semeval2017_keywords, gs_terms, eval_label)
def evaluate_aclrdtec1_dataset():
    """Benchmark TextRank keyword extraction on the ACL RD-TEC 1.0 corpus.

    Loads the gold-standard candidate terms, unzips and spaCy-pre-processes
    the corpus, then performs two timed extraction runs (weighting schemes
    "norm_max" and "gcvalue"), scoring each via ``evaluate_results``.
    """
    import logging.config
    logging.config.fileConfig(os.path.join('../jgtextrank', 'logging.conf'))

    import multiprocessing
    from jgtextrank import keywords_extraction_from_tagged_corpus
    import time

    stop_list = WordListCorpusReader('../jgtextrank',
                                     'smart-stop-list.txt').words()

    # C:\\Data\\NLP-corpus\\ACL RD-TEC\\_all_annotated_candid_term\\_all_annotated_candid_term
    gs_path = os.path.join('/home', 'jieg', 'data', 'ACL RD-TEC-1',
                           '_all_annotated_candid_term')
    print("loading ACLRDTEC 1.0 GS terms from [%s] ..." % gs_path)
    gold_terms = load_aclrdtec1_gs_terms(gs_path)
    print("total [%s] normed GS terms loaded" % len(gold_terms))

    zip_path = os.path.join('/home', 'jieg', 'data', 'ACL RD-TEC-1',
                            'ACLRDTEC-1.zip')
    print("term extraction from spaCy pre-processed corpus [%s] ..." %
          zip_path)
    t0 = time.time()
    doc_contents = load_all_files_from_zip_file(zip_path)
    # Materialise the pre-processed corpus so it can feed both runs below.
    tagged_corpus = list(
        pre_processing_unzipped_corpus_with_spacy(doc_contents))
    t1 = time.time()
    print("\n")
    print("Complete corpus pre-processing. Wall-clock elapsed time: ",
          t1 - t0, "s")
    print("\n")

    # Parameters common to both extraction runs.
    shared = dict(window=2, top_p=1, stop_words=stop_list, export=False)

    t0 = time.time()
    keywords, _vertices = keywords_extraction_from_tagged_corpus(
        tagged_corpus,
        weight_comb="norm_max",
        workers=multiprocessing.cpu_count(),
        **shared)
    t1 = time.time()

    print("\n")
    print("Complete. Wall-clock elapsed time: ", t1 - t0, "s")
    print("\n")
    print("top 50 keywords: ", keywords[:50])
    evaluate_results(keywords, gold_terms, "aclrdtec1 + pagerank")

    t0 = time.time()
    gc_keywords, _gc_vertices = keywords_extraction_from_tagged_corpus(
        tagged_corpus,
        weight_comb="gcvalue",
        workers=multiprocessing.cpu_count(),
        **shared)
    t1 = time.time()

    print("\n")
    print("Complete. Wall-clock elapsed time: ", t1 - t0, "s")
    print("\n")
    print("top 50 keywords: ", gc_keywords[:50])
    evaluate_results(gc_keywords, gold_terms, "aclrdtec1+pagerank+gcvalue")