Beispiel #1
0
def scrape_extract_store_rss(site_name, db):
    """Scrape the RSS feed of ``site_name``, classify each new item and store it.

    Items are taken from ``rss_sites.scrape(site_name)`` in the order the
    scraper yields them. Processing stops at the first item whose ``date`` is
    not later than the value returned by ``db.get_latest_date_of_source``.
    NOTE(review): this presumably relies on the feed being ordered newest
    first -- confirm against the scraper.

    Every processed item gets its ``accident`` and ``organization`` attributes
    set and is then passed to ``db.insert_new_newsflash``; items flagged as
    accidents additionally go through ``extract_geo_features`` first.
    """
    cutoff = db.get_latest_date_of_source(site_name)
    for item in rss_sites.scrape(site_name):
        if item.date <= cutoff:
            # Reached an item that is not newer than what the DB already holds.
            break

        # TODO: pass both title and description, leaving this choice to the classifier
        # For now the description is only used when the title is empty/falsy.
        text_to_classify = item.title or item.description
        item.accident = classify_rss(text_to_classify)
        item.organization = classify_organization(site_name)

        if item.accident:
            # FIX: No accident-accurate date extracted
            extract_geo_features(db, item)

        db.insert_new_newsflash(item)
Beispiel #2
0
def test_classification_statistics_ynet():
    """Check ``classify_rss`` against the labelled Ynet titles.

    Reads ``tests/accidents_definitional_ynet.tsv`` (one ``title<TAB>label``
    row per line, label parsed with ``int``), builds a confusion matrix of
    expected vs. actual classification and asserts that precision, recall and
    F1 each exceed the corresponding ``BEST_*_YNET`` baseline.
    """
    # The classification in the file is "definitional", meaning:
    # We don't care if it is "about" an accident, but rather whether it is "THE report".
    # In other words, is it the _first_ report about a _recent_ accident
    with open('tests/accidents_definitional_ynet.tsv', encoding='utf8') as f:
        # Skip blank lines: a trailing newline at the end of the file would
        # otherwise yield a one-element row and break the unpacking below.
        data = [line.rstrip('\n').split('\t') for line in f if line.strip()]

    # stats[expected][actual] -> number of titles with that outcome
    stats = {True: {True: 0, False: 0}, False: {True: 0, False: 0}}
    for title, expected in data:
        expected = bool(int(expected))
        actual = classify_rss(title)
        stats[expected][actual] += 1

    tp = stats[True][True]
    fp = stats[False][True]
    fn = stats[True][False]
    # Score 0.0 when a denominator is zero so the test fails on the threshold
    # assertions below rather than with an opaque ZeroDivisionError.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    # These constants should (hopefully) only be updated upwards
    assert precision > BEST_PRECISION_YNET
    assert recall > BEST_RECALL_YNET
    assert f1 > BEST_F1_YNET