def calculate_scores(annotated_filepath, original_filepath):
    """Score the article extractor's output against an annotated reference.

    The reference text is obtained from ``annotated_filepath`` via
    ``extract_annotated_text``; the candidate text is produced by running
    ``MSSArticleExtractor`` over the cleaned HTML read from
    ``original_filepath``.  Both texts are lower-cased and split into
    ``\\w+`` tokens before being compared.

    Side effects: every call (re)writes two files in the current working
    directory: ``cleaned_text.html`` (the cleaned markup) and ``text.html``
    (the extracted article).

    :param annotated_filepath: path handed to ``extract_annotated_text``.
    :param original_filepath: path of the HTML document to extract from.
    :returns: a ``(precision, recall, f1)`` tuple of floats.
    """
    annotated_text = extract_annotated_text(annotated_filepath)
    expected_terms = re.findall(r"\w+", annotated_text.lower(), flags=re.UNICODE)

    article_extractor = MSSArticleExtractor()

    with open(original_filepath, "r") as f:
        document = html.document_fromstring(f.read())

    # Serialise the cleaned tree once; the same markup is dumped to disk and
    # fed to the extractor.
    cleaned_markup = tostring(clean_html(document))

    with codecs.open("cleaned_text.html", "w", encoding="utf-8") as f:
        f.write(cleaned_markup)

    article = article_extractor.extract_article(cleaned_markup)

    with codecs.open("text.html", "w", encoding="utf-8") as f:
        f.write(article)

    terms = re.findall(r"\w+", article.lower(), flags=re.UNICODE)
    return _precision_recall_f1(expected_terms, terms)


def _precision_recall_f1(expected_terms, terms):
    """Return ``(precision, recall, f1)`` for ``terms`` against ``expected_terms``.

    The number of matched terms is the total size of the matching blocks
    reported by ``difflib.SequenceMatcher``, so term order matters.  Each
    score falls back to ``0.0`` when its denominator would be zero.
    """
    # NOTE(review): SequenceMatcher is used with its default settings, which
    # include the "autojunk" heuristic for long sequences - confirm that is
    # the intended behaviour for scoring.
    matcher = SequenceMatcher(None, expected_terms, terms)
    matched = sum(block.size for block in matcher.get_matching_blocks())

    # float() keeps true division even without ``from __future__ import division``.
    if terms:
        precision = float(matched) / float(len(terms))
    else:
        precision = 0.0

    if expected_terms:
        recall = float(matched) / float(len(expected_terms))
    else:
        recall = 0.0

    # Test the denominator explicitly instead of catching every exception:
    # the only failure mode here is precision + recall == 0.
    denominator = precision + recall
    if denominator:
        f1 = 2 * ((precision * recall) / denominator)
    else:
        f1 = 0.0

    return (precision, recall, f1)
# Command-line script: extract the article text from a URL and write it,
# UTF-8 encoded, to an output file.
#
# Expects exactly two arguments: <url> <output file>
from sys import argv
import codecs

from article_extraction.mss import MSSArticleExtractor

if len(argv) != 3:
    print("You must specify a url and an output file")
    # quit() is an interactive helper injected by the ``site`` module and
    # exits with status 0; raise SystemExit directly so a usage error is
    # reported to the caller as a failure.
    raise SystemExit(1)

url = argv[1]
output = argv[2]

article_extractor = MSSArticleExtractor()
article = article_extractor.extract_article_from_url(url)

with codecs.open(output, "w", encoding="utf-8") as f:
    f.write(article)