コード例 #1
0
def calculate_scores(annotated_filepath, original_filepath):
    """Score the article extractor against an annotated reference text.

    The expected text comes from ``extract_annotated_text``; the original
    file is parsed as HTML, passed through ``clean_html`` and handed to
    ``MSSArticleExtractor.extract_article``. Both texts are lower-cased,
    split into word tokens, and the two token sequences are aligned with
    ``SequenceMatcher``. The total size of the matching blocks is the number
    of correctly extracted tokens.

    Side effects: writes the cleaned HTML to ``cleaned_text.html`` and the
    extracted article to ``text.html`` (relative paths, UTF-8).

    Args:
        annotated_filepath: Path passed to ``extract_annotated_text``.
        original_filepath: Path of the original HTML document to extract from.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Precision is 0.0 when
        nothing was extracted, recall is 0.0 when the annotation has no
        tokens, and f1 is 0.0 when precision and recall are both 0.
    """
    text = extract_annotated_text(annotated_filepath)
    expected_terms = re.findall(r"\w+", text.lower(), flags=re.UNICODE)

    article_extractor = MSSArticleExtractor()

    with open(original_filepath, "r") as f:
        raw_html = f.read()

    document = clean_html(html.document_fromstring(raw_html))

    # Serialize once; the same markup is both dumped for inspection and fed
    # to the extractor.
    cleaned_markup = tostring(document)

    with codecs.open("cleaned_text.html", "w", encoding="utf-8") as f:
        f.write(cleaned_markup)

    article = article_extractor.extract_article(cleaned_markup)

    with codecs.open("text.html", "w", encoding="utf-8") as f:
        f.write(article)

    terms = re.findall(r"\w+", article.lower(), flags=re.UNICODE)

    matcher = SequenceMatcher(None, expected_terms, terms)
    # The terminating dummy block returned by get_matching_blocks() has
    # size 0, so it does not affect the sum.
    matched_count = sum(match.size for match in matcher.get_matching_blocks())
    expected_count = len(expected_terms)

    if terms:
        precision = float(matched_count) / float(len(terms))
    else:
        precision = 0.0

    if expected_count > 0:
        recall = float(matched_count) / float(expected_count)
    else:
        recall = 0.0

    # Guard the zero denominator explicitly instead of relying on a bare
    # ``except:``, which would also hide unrelated errors and interrupts.
    denominator = precision + recall
    if denominator > 0:
        f1 = 2 * ((precision * recall) / denominator)
    else:
        f1 = 0.0

    return (precision, recall, f1)
コード例 #2
0
from sys import argv
import codecs

from article_extraction.mss import MSSArticleExtractor

# Command-line script: extract the article from the page at the given URL
# and write it to the given output file as UTF-8.
# Expected arguments: <url> <output file>
if len(argv) != 3:
    print("You must specify a url and an output file")
    # Exit with a non-zero status so callers can detect the usage error.
    # SystemExit is used rather than quit(), which is a helper installed by
    # the ``site`` module for interactive sessions and is not always present.
    raise SystemExit(1)

url = argv[1]
output = argv[2]

article_extractor = MSSArticleExtractor()

article = article_extractor.extract_article_from_url(url)

with codecs.open(output, "w", encoding="utf-8") as f:
    f.write(article)