def dont_geotag_if_detail_exists(self, nominatim):
    """Skips geotagging when a fact already has location detail"""
    # nominatim is expected to be a Mock (e.g. injected via a patch decorator)
    gkg = Gkg(
        id=3771256,
        gkgrecordid="20170215174500-2503",
        date=20170215174500,
        document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties")
    self.session.add(gkg)
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit large "
        "parts of India and Pakistan and washed away more than 500 houses")
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()
    fact = Fact(unit='person', term='displaced')
    self.session.add(fact)
    self.session.commit()
    # Attach an existing location to the fact; assumes the test database is
    # pre-seeded with an 'India' Location
    loc1 = self.session.query(Location).filter(
        Location.location_name == 'India').one_or_none()
    fact.locations.append(loc1)
    analysis.facts.append(fact)
    self.session.commit()
    process_locations(analysis)
    assert not nominatim.called

def scrape_pdf(url, analysis):
    """Downloads and extracts content plus metadata for a pdf document

    Parameters
    ----------
    url: the url of the pdf document
    analysis: analysis object to be scraped

    Returns
    -------
    analysis: The updated analysis object
    """
    pdf_file_path, last_modified = download_pdf(url)
    try:
        text = extract_pdf_text(pdf_file_path)
        if not text:
            raise Exception("No text extracted from PDF at {}".format(url))
        text = re.sub(r'\s+', ' ', text)  # collapse all whitespace
        text_clean = cleanup(text)  # clean text for analysis steps
        analysis.domain = urlparse(url).hostname
        analysis.publication_date = last_modified or None
        session = object_session(analysis)  # bind session before any commit
        try:
            analysis.language = detect(text)
        except LangDetectException:
            raise Exception("Unable to determine language")
        if analysis.language != 'en':
            session.commit()
            raise Exception("Article not in English")
        content = DocumentContent(analysis=[analysis],
                                  content=text,
                                  content_clean=text_clean,
                                  content_type='pdf')
        session.add(content)
        session.commit()
        return analysis
    finally:
        os.unlink(pdf_file_path)
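
# The two PDF helpers above are called but not defined in this file. A minimal
# sketch under assumptions follows: download_pdf is assumed to return a
# (local_path, last_modified) pair and extract_pdf_text the plain text of the
# document; the real implementations may differ.
import tempfile

import requests
from dateutil import parser as dateparser
from pdfminer.high_level import extract_text


def download_pdf(url):
    # Fetch the PDF and persist it to a temp file the caller must unlink
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # The Last-Modified header doubles as a publication-date fallback (assumption)
    last_modified = response.headers.get('Last-Modified')
    last_modified = dateparser.parse(last_modified) if last_modified else None
    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as f:
        f.write(response.content)
        return f.name, last_modified


def extract_pdf_text(pdf_file_path):
    # Plain-text extraction via pdfminer.six; any extractor with the same
    # signature would do
    return extract_text(pdf_file_path)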

def test_extract_eviction_facts(self):
    """Extracts eviction-related facts with eviction Term"""
    gkg = Gkg()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="ordered eviction for 2000 people from their homes in Bosnia")
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()
    extract_facts(analysis)
    self.assertEqual(FactTerm.EVICTED, analysis.facts[0].term)

def test_extract_refugee_facts(self):
    """Extracts refugee-related facts with Refugee Term"""
    gkg = Gkg()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when government troops entered "
        "the area and forced more than 20000 refugees to flee.")
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()
    extract_facts(analysis)
    self.assertEqual(FactTerm.REFUGEE, analysis.facts[0].term)

def test_extract_facts_simple(self):
    """Extracts simple facts when present and saves to DB"""
    gkg = Gkg()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit the area "
        "and washed away more than 500 houses")
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()
    extract_facts(analysis)
    self.assertEqual(1, len(analysis.facts))

def test_extract_sacked_facts(self):
    """Extracts sacked-related facts with Sacked Term"""
    gkg = Gkg()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="last week 2000 people have been sacked from their homes in Nigeria")
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()
    extract_facts(analysis)
    self.assertEqual(FactTerm.SACKED, analysis.facts[0].term)

def scrape_html(analysis):
    """Downloads and extracts content plus metadata for html page

    Parameters
    ----------
    analysis: analysis object to be scraped

    Returns
    -------
    analysis: The updated analysis object
    """
    a = newspaper.Article(analysis.gkg.document_identifier)
    a.download()
    if a.download_state == 2:
        a.parse()
        analysis.title = a.title
        analysis.authors = a.authors
        analysis.publication_date = a.publish_date or None
        text = re.sub(r'\s+', ' ', a.text)  # collapse all whitespace
        # Scraping should fail if text is length 0
        if len(text) == 0:
            raise Exception("Content is empty")
        text_clean = cleanup(text)  # clean text for analysis steps
        text_ts = remove_wordcloud_stopwords(text_clean)
        session = object_session(analysis)  # bind session before any commit
        try:
            analysis.language = detect(text)
        except LangDetectException:
            raise Exception("Unable to determine language")
        if analysis.language != 'en':
            session.commit()
            raise Exception("Article not in English")
        content = DocumentContent(analysis=[analysis],
                                  content=text,
                                  content_clean=text_clean,
                                  content_type='text',
                                  content_ts=func.to_tsvector(
                                      'simple_english', text_ts))
        session.add(content)
        session.commit()
        return analysis
    else:
        # Temporary fix to deal with
        # https://github.com/codelucas/newspaper/issues/280
        raise Exception("Retrieval Failed")
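
# cleanup and remove_wordcloud_stopwords are called by both scrapers but
# defined elsewhere. Minimal sketches, assuming cleanup normalizes text for
# the fact-extraction steps and remove_wordcloud_stopwords strips filler
# words before the to_tsvector call; the stopword list here is illustrative.
import re

WORDCLOUD_STOPWORDS = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'on'}


def cleanup(text):
    # Lowercase, drop punctuation, and re-collapse whitespace
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    return re.sub(r'\s+', ' ', text).strip()


def remove_wordcloud_stopwords(text):
    # Keep only words that carry signal for the full-text search vector
    return ' '.join(w for w in text.split() if w not in WORDCLOUD_STOPWORDS)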

def test_use_existing_location(self):
    """Uses existing locations when they exist"""
    gkg = Gkg()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit large "
        "parts of Bosnia and washed away more than 500 houses")
    self.session.add(content)
    location = Location(location_name='Bosnia')
    self.session.add(location)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()
    extract_facts(analysis)
    fact = analysis.facts[0]
    extracted_location = fact.locations[0]
    self.assertEqual(location.id, extracted_location.id)

def test_create_locations_with_names(self):
    """Creates locations for facts only with location names"""
    gkg = Gkg()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    content = DocumentContent(
        content_clean="It was early Saturday when a flash flood hit large "
        "parts of London and Middlesex and washed away more than 500 houses")
    self.session.add(content)
    self.session.commit()
    analysis.content_id = content.id
    self.session.commit()
    extract_facts(analysis)
    facts = analysis.facts
    self.assertEqual(1, len(facts))
    fact = facts[0]
    self.assertEqual(2, len(fact.locations))
    loc_names = [loc.location_name for loc in fact.locations]
    self.assertIn('London', loc_names)
    self.assertIn('Middlesex', loc_names)
    self.assertEqual([None, None], [loc.country for loc in fact.locations])

def test_version_lifecycle(self):
    gkg = self.session.query(Gkg).first()
    analysis = Analysis(gkg=gkg, status=Status.NEW)
    self.session.add(analysis)
    self.session.commit()

    analysis.create_new_version(Status.SCRAPING)
    history = self.session.query(AnalysisHistory).filter(
        AnalysisHistory.gkg == gkg)
    self.assertEqual(1, history.count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.NEW).count())

    content = DocumentContent(content_type="text/html",
                              content="Lorem ipsum")
    analysis.content = content
    analysis.create_new_version(Status.SCRAPED)
    self.assertEqual(2, history.count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.NEW).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPING).count())

    analysis.create_new_version(Status.EXTRACTING)
    self.assertEqual(3, history.count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.NEW).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPING).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPED).count())

    # content is preserved
    scraped = history.filter(
        AnalysisHistory.status == Status.SCRAPED).one_or_none()
    self.assertEqual(analysis.content, scraped.content)

    fact = Fact(analysis_date=datetime.now())
    analysis.facts = [fact]
    analysis.create_new_version(Status.EXTRACTED)
    self.assertEqual(4, history.count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.NEW).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPING).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPED).count())
    self.assertEqual(
        1, history.filter(
            AnalysisHistory.status == Status.EXTRACTING).count())

    # content still preserved
    extracting = history.filter(
        AnalysisHistory.status == Status.EXTRACTING).one_or_none()
    self.assertEqual(analysis.content, extracting.content)

    analysis.create_new_version(Status.EDITING)
    analysis.content = DocumentContent(content_type="text/html",
                                       content="Lorem edited")
    analysis.create_new_version(Status.EDITED)
    self.assertEqual(6, history.count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.NEW).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPING).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPED).count())
    self.assertEqual(
        1, history.filter(
            AnalysisHistory.status == Status.EXTRACTING).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.EDITING).count())

    # content has changed, but facts are preserved
    extracted = history.filter(
        AnalysisHistory.status == Status.EXTRACTED).one_or_none()
    self.assertNotEqual(analysis.content.id, extracted.content.id)
    self.assertCountEqual([f.id for f in analysis.facts],
                          [f.id for f in extracted.facts])

    analysis.create_new_version(Status.EDITING)
    fact2 = Fact(analysis_date=datetime.now())
    analysis.facts.append(fact2)
    analysis.create_new_version(Status.EDITED)
    self.assertEqual(8, history.count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.NEW).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPING).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.SCRAPED).count())
    self.assertEqual(
        1, history.filter(
            AnalysisHistory.status == Status.EXTRACTING).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
    self.assertEqual(
        2, history.filter(AnalysisHistory.status == Status.EDITING).count())
    self.assertEqual(
        1, history.filter(AnalysisHistory.status == Status.EDITED).count())

    edited = history.filter(
        AnalysisHistory.status == Status.EDITED).one_or_none()
    self.assertCountEqual([f.id for f in analysis.facts],
                          [fact.id, fact2.id])
    self.assertCountEqual([f.id for f in edited.facts], [fact.id])
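
# create_new_version is the model method exercised above but not shown here.
# A hedged sketch of what the assertions imply, assuming it lives on the
# Analysis model: snapshot the current status, content, and facts into an
# AnalysisHistory row, then advance the analysis to the new status. The
# actual implementation may differ.
from sqlalchemy.orm import object_session


def create_new_version(self, new_status):
    session = object_session(self)
    snapshot = AnalysisHistory(
        gkg=self.gkg,
        status=self.status,      # record the status being superseded
        content=self.content,    # shared reference, so "content is preserved"
        facts=list(self.facts))  # facts are shared across versions
    session.add(snapshot)
    self.status = new_status
    session.commit()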