Ejemplo n.º 1
0
def test_compute_prominence_multiple():
    """compute_prominence yields a non-empty list for several input strings."""
    names = ['biden', 'joe biden', 'donald trump', 'D. Trump']

    # Cluster first, then score the clusters in a single pipeline expression.
    scored = compute_prominence(fuzzy_cluster(names))

    assert isinstance(scored, list)
    assert len(scored) > 0
Ejemplo n.º 2
0
def test_compute_prominence_weight_multipliers():
    """The first prominence score is a float when weight multipliers are given."""
    grouped = fuzzy_cluster(simulate_ner_data())

    # One random multiplier per entry of the fuzzy_cluster output.
    multipliers = np.random.rand(len(grouped))
    scored = compute_prominence(grouped,
                                weight_position=0.5,
                                weight_multipliers=multipliers)

    frame = pd.DataFrame.from_dict(scored)
    first_score = frame.prominence_score.tolist()[0]
    assert isinstance(first_score, float)
Ejemplo n.º 3
0
def run_random(articles,
               entitites,
               id=None,
               scorer=partial_token_set_ratio,
               cutoff=75):
    """Run the clustering pipeline on one article and time it.

    Selects one article (a random one when ``id`` is None), takes its
    entities whose ``placement`` is ``"body"``, and passes them through
    ``fuzzy_cluster`` -> ``compute_prominence`` -> ``match_whitelist``
    (restricted to clusters whose ``entity_group`` is ``"LOC"``). The
    article id, its body and the resulting clusters are printed.

    Args:
        articles: Table indexed like a pandas DataFrame, with the columns
            ``content_id``, ``title``, ``lead`` and ``body``.
        entitites: Table of entities with ``content_id`` and ``placement``
            columns and a ``to_dict(orient="records")`` method. The
            misspelled parameter name is kept so existing keyword callers
            keep working.
        id: ``content_id`` of the article to process. When None, one is
            drawn at random from ``articles.content_id``.
        scorer: Forwarded to ``fuzzy_cluster`` as ``scorer``.
        cutoff: Forwarded to ``fuzzy_cluster`` as ``cutoff``.

    Returns:
        Elapsed time in seconds (float) of the three pipeline calls.

    Note:
        ``whitelist`` and ``ratio`` are not parameters; they must be
        available in the enclosing module scope.
    """
    if id is None:
        id = np.random.choice(articles.content_id.tolist())

    article = articles[articles.content_id == id]
    article = article[['content_id', 'title', 'lead', 'body']]

    # Use the argument that was passed in. The body previously read an
    # unrelated name `entities`, so the parameter was ignored.
    entities = entitites
    article_ents = entities[entities.content_id == id]
    article_ents = article_ents[article_ents.placement == "body"]
    preds = article_ents.to_dict(orient="records")

    # perf_counter is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments.
    t1 = time.perf_counter()

    clusters = fuzzy_cluster(preds,
                             scorer=scorer,
                             workers=4,
                             cutoff=cutoff,
                             merge_output=True)

    clusters = compute_prominence(clusters,
                                  merge_output=True,
                                  weight_position=.5)

    # subset location entities (for matching with cities)
    clusters = [c for c in clusters if c["entity_group"] == "LOC"]

    clusters = match_whitelist(clusters,
                               whitelist=whitelist,
                               scorer=ratio,
                               score_cutoff=95,
                               merge_output=True,
                               aggregate_cluster=True,
                               workers=1)

    t2 = time.perf_counter()

    # Only build and sort a frame when there is something to show.
    if len(clusters) > 0:
        clusters = pd.DataFrame.from_dict(clusters).sort_values(
            by="prominence_rank")

    print(id)
    print(article.body.tolist()[0])
    print(clusters)

    return t2 - t1
Ejemplo n.º 4
0
def test_compute_prominence_none():
    """compute_prominence returns an empty list for clusters of empty input."""
    empty_clusters = fuzzy_cluster([])
    scored = compute_prominence(empty_clusters)

    assert isinstance(scored, list)
    assert len(scored) == 0
Ejemplo n.º 5
0
def test_compute_prominence_single():
    """compute_prominence returns exactly one entry for a single input string."""
    single_input = ["Biden"]
    scored = compute_prominence(fuzzy_cluster(single_input))

    assert isinstance(scored, list)
    assert len(scored) == 1