Beispiel #1
0
 def dont_geotag_if_detail_exists(self, nominatim):
     """Does not call the geocoder when the fact already has a location.

     ``nominatim`` is presumably a mock of the geocoding call injected by
     a decorator outside this view; only its ``called`` flag is read.
     """
     db = self.session

     # Persist the GDELT record together with its analysis and content.
     gkg_fields = dict(
         id=3771256,
         gkgrecordid="20170215174500-2503",
         date=20170215174500,
         document_identifier="http://www.philstar.com/headlines/2017/02/16/1672746/yasay-harris-affirm-stronger-phl-us-ties",
     )
     article = Gkg(**gkg_fields)
     db.add(article)
     record = Analysis(gkg=article, status=Status.NEW)
     db.add(record)
     story = "It was early Saturday when a flash flood hit large parts of India and Pakistan and washed away more than 500 houses"
     document = DocumentContent(content_clean=story)
     db.add(document)
     db.commit()
     record.content_id = document.id
     db.commit()

     # Attach a fact that is linked to the 'India' location looked up
     # through the session.
     displaced = Fact(unit='person', term='displaced')
     db.add(displaced)
     db.commit()
     india_query = db.query(Location).filter(
         Location.location_name == 'India')
     india = india_query.one_or_none()
     displaced.locations.append(india)
     record.facts.append(displaced)
     db.commit()

     process_locations(record)
     assert not nominatim.called
Beispiel #2
0
def scrape_pdf(url, analysis):
    """Downloads a PDF, extracts its text and stores it for the analysis

    Parameters
    ----------
    url: address of the PDF document to download
    analysis: analysis object to be scraped; expected to be attached to a
        session, because the results are committed through that session

    Returns
    -------
    analysis: The updated analysis object

    Raises
    ------
    Exception: if no text could be extracted, the language cannot be
        determined, or the document is not in English
    """
    # Resolve the session first: the non-English branch below commits before
    # the content row is created, so the name has to be bound by then.
    # (Previously it was assigned only after that branch, which turned the
    # intended "Article not in English" error into an UnboundLocalError.)
    session = object_session(analysis)
    pdf_file_path, last_modified = download_pdf(url)
    try:
        text = extract_pdf_text(pdf_file_path)
        if not text:
            raise Exception("No text extracted from PDF at {}".format(url))
        text = re.sub(r'\s+', ' ', text)  # collapse all whitespace
        text_clean = cleanup(text)  # Clean text for analysis steps
        analysis.domain = urlparse(url).hostname
        analysis.publication_date = last_modified or None
        try:
            analysis.language = detect(text)
        except LangDetectException as err:
            raise Exception("Unable to determine language") from err
        if analysis.language != 'en':
            # Persist the detected language before rejecting the document.
            session.commit()
            raise Exception("Article not in English")
        content = DocumentContent(analysis=[analysis],
                                  content=text,
                                  content_clean=text_clean,
                                  content_type='pdf')
        session.add(content)
        session.commit()
        return analysis
    finally:
        # Remove the downloaded file whether scraping succeeded or failed.
        os.unlink(pdf_file_path)
Beispiel #3
0
 def test_extract_eviction_facts(self):
     """An eviction sentence produces a fact whose term is EVICTED"""
     db = self.session
     sentence = "ordered eviction for 2000 people from their homes in Bosnia"

     # Store the analysis and its cleaned content, then link the two.
     record = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(record)
     document = DocumentContent(content_clean=sentence)
     db.add(document)
     db.commit()
     record.content_id = document.id
     db.commit()

     extract_facts(record)

     first_fact = record.facts[0]
     self.assertEqual(FactTerm.EVICTED, first_fact.term)
Beispiel #4
0
 def test_extract_refugee_facts(self):
     """A refugee sentence produces a fact whose term is REFUGEE"""
     db = self.session
     sentence = "It was early Saturday when government troops entered the area and forced more than 20000 refugees to flee."

     # Store the analysis and its cleaned content, then link the two.
     record = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(record)
     document = DocumentContent(content_clean=sentence)
     db.add(document)
     db.commit()
     record.content_id = document.id
     db.commit()

     extract_facts(record)

     first_fact = record.facts[0]
     self.assertEqual(FactTerm.REFUGEE, first_fact.term)
Beispiel #5
0
 def test_extract_facts_simple(self):
     """A single reportable sentence yields exactly one stored fact"""
     db = self.session
     sentence = "It was early Saturday when a flash flood hit the area and washed away more than 500 houses"

     # Store the analysis and its cleaned content, then link the two.
     record = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(record)
     document = DocumentContent(content_clean=sentence)
     db.add(document)
     db.commit()
     record.content_id = document.id
     db.commit()

     extract_facts(record)

     fact_total = len(record.facts)
     self.assertEqual(1, fact_total)
Beispiel #6
0
 def test_extract_sacked_facts(self):
     """A 'sacked' sentence produces a fact whose term is SACKED"""
     db = self.session
     sentence = "last week 2000 people have been sacked from their homes in Nigeria"

     # Store the analysis and its cleaned content, then link the two.
     record = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(record)
     document = DocumentContent(content_clean=sentence)
     db.add(document)
     db.commit()
     record.content_id = document.id
     db.commit()

     extract_facts(record)

     first_fact = record.facts[0]
     self.assertEqual(FactTerm.SACKED, first_fact.term)
Beispiel #7
0
def scrape_html(analysis):
    """Downloads and extracts content plus metadata for html page

    Parameters
    ----------
    analysis: analysis object to be scraped; expected to be attached to a
        session, because the results are committed through that session

    Returns
    -------
    analysis: The updated analysis object

    Raises
    ------
    Exception: if retrieval fails, the page has no text, the language
        cannot be determined, or the article is not in English
    """
    # Resolve the session first: the non-English branch below commits before
    # the content row is created, so the name has to be bound by then.
    # (Previously it was assigned only after that branch, which turned the
    # intended "Article not in English" error into an UnboundLocalError.)
    session = object_session(analysis)

    a = newspaper.Article(analysis.gkg.document_identifier)
    a.download()
    # NOTE(review): 2 is presumably newspaper's "download succeeded" state;
    # confirm against the installed newspaper version.
    if a.download_state != 2:
        # Temporary fix to deal with https://github.com/codelucas/newspaper/issues/280
        raise Exception("Retrieval Failed")

    a.parse()
    analysis.title = a.title
    analysis.authors = a.authors
    analysis.publication_date = a.publish_date or None

    text = re.sub(r'\s+', ' ', a.text)  # collapse all whitespace
    # Scraping should fail if text is length 0
    if not text:
        raise Exception("Content is empty")
    text_clean = cleanup(text)  # Clean text for analysis steps
    text_ts = remove_wordcloud_stopwords(text_clean)
    try:
        analysis.language = detect(text)
    except LangDetectException as err:
        raise Exception("Unable to determine language") from err
    if analysis.language != 'en':
        # Persist the detected language before rejecting the article.
        session.commit()
        raise Exception("Article not in English")
    content = DocumentContent(analysis=[analysis],
                              content=text,
                              content_clean=text_clean,
                              content_type='text',
                              content_ts=func.to_tsvector(
                                  'simple_english', text_ts))
    session.add(content)
    session.commit()
    return analysis
Beispiel #8
0
 def test_use_existing_location(self):
     """A location row that already exists is reused by fact extraction"""
     db = self.session
     sentence = "It was early Saturday when a flash flood hit large parts of Bosnia and washed away more than 500 houses"

     # Store the analysis, its content and a pre-existing 'Bosnia' row.
     record = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(record)
     document = DocumentContent(content_clean=sentence)
     db.add(document)
     known_place = Location(location_name='Bosnia')
     db.add(known_place)
     db.commit()
     record.content_id = document.id
     db.commit()

     extract_facts(record)

     # The first extracted fact must point at the row created above.
     found_place = record.facts[0].locations[0]
     self.assertEqual(known_place.id, found_place.id)
Beispiel #9
0
 def test_create_locations_with_names(self):
     """New locations are created with a name only and no country"""
     db = self.session
     sentence = "It was early Saturday when a flash flood hit large parts of London and Middlesex and washed away more than 500 houses"

     # Store the analysis and its cleaned content, then link the two.
     record = Analysis(gkg=Gkg(), status=Status.NEW)
     db.add(record)
     document = DocumentContent(content_clean=sentence)
     db.add(document)
     db.commit()
     record.content_id = document.id
     db.commit()

     extract_facts(record)

     extracted = record.facts
     self.assertEqual(1, len(extracted))
     only_fact = extracted[0]
     self.assertEqual(2, len(only_fact.locations))

     names = [place.location_name for place in only_fact.locations]
     for expected_name in ('London', 'Middlesex'):
         self.assertIn(expected_name, names)

     countries = [place.country for place in only_fact.locations]
     self.assertEqual([None, None], countries)
Beispiel #10
0
    def test_version_lifecycle(self):
        """Walks an analysis through its statuses and checks the history.

        After every ``create_new_version`` call the test expects exactly
        one more ``AnalysisHistory`` row for the gkg, carrying the status
        the analysis had before the call, and checks that content and
        facts recorded on earlier rows match what was set at that time.
        """
        # Start from the first Gkg row in the database with a NEW analysis.
        gkg = self.session.query(Gkg).first()
        analysis = Analysis(gkg=gkg, status=Status.NEW)
        self.session.add(analysis)
        self.session.commit()

        # NEW -> SCRAPING: the NEW state is expected in the history.
        analysis.create_new_version(Status.SCRAPING)

        # This query object is re-run by every count()/filter() call below,
        # so it reflects the history rows present at that point.
        history = self.session.query(AnalysisHistory).filter(
            AnalysisHistory.gkg == gkg)
        self.assertEqual(1, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())

        # SCRAPING -> SCRAPED, with content attached before the transition.
        content = DocumentContent(content_type="text/html",
                                  content="Lorem ipsum")
        analysis.content = content
        analysis.create_new_version(Status.SCRAPED)

        self.assertEqual(2, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())

        # SCRAPED -> EXTRACTING.
        analysis.create_new_version(Status.EXTRACTING)

        self.assertEqual(3, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())

        # content is preserved
        scraped = history.filter(
            AnalysisHistory.status == Status.SCRAPED).one_or_none()
        self.assertEqual(analysis.content, scraped.content)

        # EXTRACTING -> EXTRACTED, with one fact attached before the
        # transition.
        fact = Fact(analysis_date=datetime.now())
        analysis.facts = [fact]
        analysis.create_new_version(Status.EXTRACTED)

        self.assertEqual(4, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())

        # content still preserved
        extracting = history.filter(
            AnalysisHistory.status == Status.EXTRACTING).one_or_none()
        self.assertEqual(analysis.content, extracting.content)

        # First edit cycle: EXTRACTED -> EDITING -> EDITED, replacing the
        # content in between. Two transitions, so two more history rows.
        analysis.create_new_version(Status.EDITING)
        analysis.content = DocumentContent(content_type="text/html",
                                           content="Lorem edited")
        analysis.create_new_version(Status.EDITED)

        self.assertEqual(6, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITING).count())

        # content has changed, but the facts are preserved
        extracted = history.filter(
            AnalysisHistory.status == Status.EXTRACTED).one_or_none()
        self.assertNotEqual(analysis.content.id, extracted.content.id)
        self.assertCountEqual([f.id for f in analysis.facts],
                              [f.id for f in extracted.facts])

        # Second edit cycle: EDITED -> EDITING -> EDITED, appending a second
        # fact in between.
        analysis.create_new_version(Status.EDITING)
        fact2 = Fact(analysis_date=datetime.now())
        analysis.facts.append(fact2)
        analysis.create_new_version(Status.EDITED)

        self.assertEqual(8, history.count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.NEW).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.SCRAPED).count())
        self.assertEqual(
            1,
            history.filter(
                AnalysisHistory.status == Status.EXTRACTING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EXTRACTED).count())
        self.assertEqual(
            2,
            history.filter(AnalysisHistory.status == Status.EDITING).count())
        self.assertEqual(
            1,
            history.filter(AnalysisHistory.status == Status.EDITED).count())

        # The live analysis holds both facts, while the single EDITED
        # history row is expected to hold only the first one (fact2 was
        # appended after the first edit cycle finished).
        edited = history.filter(
            AnalysisHistory.status == Status.EDITED).one_or_none()
        self.assertCountEqual([f.id for f in analysis.facts],
                              [fact.id, fact2.id])
        self.assertCountEqual([f.id for f in edited.facts], [fact.id])