Example #1
0
 def test_annotated_empty(self):
     """Corpus built from the random news sample plus the manual labels has length 0."""
     # NOTE(review): the label file extension is "json1" (digit one), not
     # "jsonl" -- confirm this matches the fixture's real name on disk.
     labels_path = "../../data/test/samples_with_manual_annotation.json1"
     xml_path = "../../data/test/samples_news_clean_random.xml"

     term_labels = corpus.TermLabels(labels_path)
     labelled = corpus.Corpus(xml_path, annotations=term_labels)

     self.assertEqual(len(labelled), 0)
Example #2
0
 def test_subsetting(self):
     """Selecting documents by URL returns as many entries as URLs requested."""
     base_url = "https://theloadstar.com/"
     wanted_urls = [
         base_url + "will-digitisation-kill-off-forwarder/",
         base_url + "alert-airlines-unsafe-hoverboards-iata-calls-stiff-penalty-shippers-mis-declare-battery-devices/",
         base_url + "australian-box-terminal-operators-offset-falling-volumes-by-hiking-fees/",
     ]
     news = corpus.Corpus("../../data/test/samples_news_clean_random.xml")
     selected = news.get_documents_by_urls(wanted_urls)
     self.assertEqual(len(selected), len(wanted_urls))
Example #3
0
def process_manual_annotation():
    """Combine the sampled XML with the manual term labels and write the result.

    Reads the labels from ``MANUAL_DIR/terms/news.jsonl`` and the documents
    from ``PROCESSED_DIR/lda_sampling_15p.xml``, builds a ``corpus.Corpus``
    with those labels passed as ``annotations``, and writes it to
    ``PROCESSED_DIR/lda_sampling_15p.annotated.xml``.
    """
    # The message names PROCESSED_DIR because that is the directory the
    # annotated XML is actually written to below (it previously named
    # RELEVANT_DIR, which this function never touches).
    log.info(
        f"Begin incorporating manual annotation to the XML, result in {PROCESSED_DIR}"
    )
    labels_path = os.path.join(MANUAL_DIR, "terms", "news.jsonl")
    source_xml = os.path.join(PROCESSED_DIR, "lda_sampling_15p.xml")
    target_xml = os.path.join(PROCESSED_DIR, "lda_sampling_15p.annotated.xml")

    anno_json = corpus.TermLabels(labels_path)
    manual_corpus = corpus.Corpus(source_xml, annotations=anno_json)
    manual_corpus.write_xml_to(target_xml)
Example #4
0
def preprocess_corpus(n_sample=10):
    """Combine the scraped input, drop empty documents, and write a sample.

    Builds a ``corpus.Corpus`` from ``SCRAPED_DIR``, calls ``filter_empty()``
    on it, takes a sample via ``get_sample(n_sample)``, and writes that sample
    to ``INTERIM_DIR/sample.xml``.

    Args:
        n_sample: Value forwarded to ``Corpus.get_sample``. Defaults to 10,
            the value that was previously hard-coded, so existing callers
            that pass no argument behave as before.
    """
    log.info(f"Begin combining from {SCRAPED_DIR}")
    combined_corpus = corpus.Corpus(SCRAPED_DIR)

    log.info("Begin filtering empty documents")
    combined_corpus.filter_empty()

    log.info(f"Begin sampling, n={n_sample}")
    sampled_corpus = combined_corpus.get_sample(n_sample)

    log.info(f"Write sample.xml to {INTERIM_DIR}")
    # "sample.xml" is a placeholder filename for now.
    sampled_corpus.write_xml_to(os.path.join(INTERIM_DIR, "sample.xml"))
Example #5
0
def evaluate_terms():
    """Evaluate every extracted term list against ``dev.xml`` and write a report.

    Loads the corpus from ``RELEVANT_DIR/dev.xml``, registers one prediction
    per extraction method (read from CSV files under ``EXTRACTED_DIR``) with
    an ``evaluation.Evaluator``, then calls ``evaluate_and_visualize`` with a
    dated HTML path under ``PLOT_DIR``.
    """
    dev_corpus = corpus.Corpus(os.path.join(RELEVANT_DIR, "dev.xml"))
    log.info("Begin evaluation")
    scorer = evaluation.Evaluator(dev_corpus)

    # (method label, CSV file name) pairs, processed in this order.
    method_files = (
        ("TF-IDF", "tfidf.csv"),
        ("KPM", "kpm.csv"),
        ("YAKE", "yake.csv"),
        ("SingleRank", "singlerank.csv"),
        ("TopicRank", "topicrank.csv"),
        ("MultipartiteRank", "mprank.csv"),
        ("PositionRank", "positionrank.csv"),
        ("EmbedRank", "embedrank_wiki_unigrams.csv"),
    )
    for label, csv_name in method_files:
        csv_path = os.path.join(EXTRACTED_DIR, csv_name)
        predicted = terms.TermsExtractor.read_terms_from(csv_path)
        scorer.add_prediction(label, predicted)

    # The report file name carries today's date as YYYYMMDD.
    today_date = date.today().strftime("%Y%m%d")
    report_path = os.path.join(PLOT_DIR, f"eval_{today_date}.html")
    scorer.evaluate_and_visualize(report_path)
Example #6
0
def create_core_nlp_documents(core_nlp_folder):
    """Load the annotated sample corpus and write it via ``write_to_core_nlp_xmls``.

    Args:
        core_nlp_folder: Passed unchanged to ``Corpus.write_to_core_nlp_xmls``;
            also named in the log message.
    """
    log.info(f"Begin preparing Core NLP Documents to {core_nlp_folder}")
    annotated_path = os.path.join(PROCESSED_DIR,
                                  "lda_sampling_15p.annotated.xml")
    corpus.Corpus(annotated_path).write_to_core_nlp_xmls(core_nlp_folder)
Example #7
0
 def test_news(self):
     """Corpus loaded from the raw news sample has length 2."""
     raw_news_path = "../../data/test/samples_news_raw.xml"
     raw_news = corpus.Corpus(raw_news_path)
     self.assertEqual(len(raw_news), 2)
Example #8
0
 def test_combine_and_filter(self):
     """Corpus from the scrape samples has length 102, then 99 after filter_empty()."""
     scrape_dir = "../../data/test/scrape_samples/"
     merged = corpus.Corpus(xml_input=scrape_dir)

     # Length before filtering.
     self.assertEqual(len(merged), 102)

     # filter_empty() is applied to the same object; check its new length.
     merged.filter_empty()
     self.assertEqual(len(merged), 99)
Example #9
0
 def test_sampling(self):
     """get_sample(3) on the random news sample returns a result of length 3."""
     news = corpus.Corpus("../../data/test/samples_news_clean_random.xml")
     drawn = news.get_sample(3)
     self.assertEqual(len(drawn), 3)
Example #10
0
 def test_wiki(self):
     """Corpus loaded from the wiki sample has length 3."""
     wiki_path = "../../data/test/samples_wiki.xml"
     wiki = corpus.Corpus(wiki_path)
     self.assertEqual(len(wiki), 3)
Example #11
0
 def test_existing_annotation_w_extra_annotation(self):
     """Corpus from samples_with_terms.xml plus the manual labels has length 2."""
     # NOTE(review): the label file extension is "json1" (digit one), not
     # "jsonl" -- confirm this matches the fixture's real name on disk.
     labels_path = "../../data/test/samples_with_manual_annotation.json1"
     xml_path = "../../data/test/samples_with_terms.xml"

     extra_labels = corpus.TermLabels(labels_path)
     with_terms = corpus.Corpus(xml_path, annotations=extra_labels)

     self.assertEqual(len(with_terms), 2)
Example #12
0
 def test_existing_annotation(self):
     """Corpus loaded from samples_with_terms.xml has length 2."""
     xml_path = "../../data/test/samples_with_terms.xml"
     with_terms = corpus.Corpus(xml_path)
     self.assertEqual(len(with_terms), 2)