Esempio n. 1
0
 def test_counts_with_spaces(self):
     # A number written with a space between digit groups ("565 749") is
     # expected to be reported as the single count 565749.
     annotated = AnnoDoc("Ther were 565 749 new cases")
     annotated.add_tier(self.annotator)
     case_counts = []
     for span in annotated.tiers['counts'].spans:
         # Only spans carrying the 'case' attribute take part in the check.
         if 'case' in span.metadata['attributes']:
             case_counts.append(span.metadata['count'])
     self.assertEqual(case_counts, [565749])
Esempio n. 2
0
 def test_age_elimination(self):
     # The "5" in "under the age of 5" must not become a count of its own:
     # exactly one span is expected, and its count is 1200.
     sentence = (
         '1200 children under the age of 5 are afflicted with a mystery illness'
     )
     annotated = AnnoDoc(sentence)
     annotated.add_tier(self.annotator)
     count_spans = annotated.tiers['counts'].spans
     test_utils.assertHasProps(count_spans[0].metadata, {'count': 1200})
     self.assertEqual(len(count_spans), 1)
Esempio n. 3
0
 def test_raw_counts(self):
     # The sentence is expected to yield exactly one count span whose
     # metadata has count 5 and attributes ['incremental'].
     annotated = AnnoDoc('There are 5 new ones.')
     annotated.add_tier(self.annotator)
     count_spans = annotated.tiers['counts'].spans
     expected_props = {'count': 5, 'attributes': ['incremental']}
     test_utils.assertHasProps(count_spans[0].metadata, expected_props)
     self.assertEqual(len(count_spans), 1)
 def test_dashes(self):
     # Two dash-formatted dates joined by "between ... and ..." are expected
     # to form one range ending on the day after the second date.
     annotated = AnnoDoc('Adenoviruses, first seen between 2010-1-1 and 2010-1-2')
     annotated.add_tier(self.annotator)
     first_range = annotated.tiers['dates'].spans[0].datetime_range
     expected_range = [
         datetime.datetime(2010, 1, 1),
         datetime.datetime(2010, 1, 3),
     ]
     self.assertEqual(first_range, expected_range)
 def test_dashes_2(self):
     # The two dates are separated by a spaced dash instead of "and"; the
     # expected range runs from the first date to the day after the second.
     range_start = datetime.datetime(2010, 1, 1)
     range_end = datetime.datetime(2011, 1, 2)
     annotated = AnnoDoc('First seen between 2010-1-1 - 2011-1-1')
     annotated.add_tier(self.annotator)
     date_spans = annotated.tiers['dates'].spans
     self.assertEqual(date_spans[0].datetime_range, [range_start, range_end])
 def test_inexact_range(self):
     # A month-to-month range with a single trailing year is expected to run
     # from the first day of May 2009 to the first day of September 2009.
     sentence = 'From May to August of 2009 we languished there.'
     annotated = AnnoDoc(sentence)
     annotated.add_tier(self.annotator)
     expected_range = [
         datetime.datetime(2009, 5, 1),
         datetime.datetime(2009, 9, 1),
     ]
     first_span = annotated.tiers['dates'].spans[0]
     self.assertEqual(first_span.datetime_range, expected_range)
 def test_1950s(self):
     # A decade reference ("the 1950s") is expected to map to the range from
     # 1 Jan 1950 to 1 Jan 1960.
     sentence = (
         'Adenoviruses, first isolated in the 1950s from explanted adenoid tissue.'
     )
     annotated = AnnoDoc(sentence)
     annotated.add_tier(self.annotator)
     decade_range = annotated.tiers['dates'].spans[0].datetime_range
     self.assertEqual(
         decade_range,
         [datetime.datetime(1950, 1, 1), datetime.datetime(1960, 1, 1)])
 def test_month_of_year(self):
     # "3rd month of the year [2017]" is expected to resolve to March 2017,
     # i.e. the range from 1 Mar 2017 to 1 Apr 2017.
     month_start = datetime.datetime(2017, 3, 1)
     next_month_start = datetime.datetime(2017, 4, 1)
     annotated = AnnoDoc(
         "Dengue cases were increasing in the 3rd month of the year [2017].")
     annotated.add_tier(self.annotator)
     first_span = annotated.tiers['dates'].spans[0]
     self.assertEqual(
         first_span.datetime_range, [month_start, next_month_start])
 def test_since_date(self):
     # "since <date>" is expected to produce a range that starts at the
     # stated date and ends at the document date passed to AnnoDoc.
     publication_date = datetime.datetime(2010, 12, 10)
     annotated = AnnoDoc(
         'nearly 5000 cases have been reported since 1 Sep 2010.',
         date=publication_date)
     annotated.add_tier(self.annotator)
     since_range = annotated.tiers['dates'].spans[0].datetime_range
     expected_range = [
         datetime.datetime(2010, 9, 1),
         datetime.datetime(2010, 12, 10),
     ]
     self.assertEqual(since_range, expected_range)
 def test_count_table(self):
     # A slash-delimited table of counts mentions no dates, so the dates
     # tier is expected to hold no spans at all.
     table_text = '''
     Type / Suspected / Confirmed / Recovered / Ongoing / Total
     Cases / 8 / 34 / 18 / 16 / 70
     Deaths / 7 / 33 / 17 / 15 / 65
     '''
     annotated = AnnoDoc(table_text)
     annotated.add_tier(self.annotator)
     date_spans = annotated.tiers['dates'].spans
     self.assertEqual(date_spans, [])
 def test_week_parsing(self):
     # "the 2nd week of October 2017" is expected to resolve to the range
     # from 8 Oct 2017 to 15 Oct 2017.
     sentence = "AES had taken 13 lives in the 2nd week of October 2017."
     annotated = AnnoDoc(sentence)
     annotated.add_tier(self.annotator)
     week_range = annotated.tiers['dates'].spans[0].datetime_range
     week_start = datetime.datetime(2017, 10, 8)
     week_end = datetime.datetime(2017, 10, 15)
     self.assertEqual(week_range, [week_start, week_end])
Esempio n. 12
0
    def test_species_false_positive(self):
        # Sierra wil be detected as a species if using naive ontology keyword
        # matching.
        doc = AnnoDoc("""
Guinea, 3 new cases and 5 deaths; Liberia, 8 new cases with 7 deaths;
and Sierra Leone 11 new cases and 2 deaths.""")
        doc.add_tier(self.annotator)
        self.assertEqual(len(doc.tiers['species']), 0)
    def test_global_cases(self):
        doc = AnnoDoc("""
As of June 2018 a total of 100 cases have been reported globally.
""")
        doc.add_tier(self.annotator)
        self.assertEqual(
            doc.tiers['incidents'][0].metadata['locations'][0]['name'],
            'Earth')
Esempio n. 14
0
    def test_multipart_names(self):
        # "Seattle, WA" is expected to come back as one combined geoname
        # span, giving two geoname spans in total, and annotating must leave
        # the document text unchanged.
        sentence = 'From Seattle, WA, Canada is not far away.'
        annotated = AnnoDoc(sentence)
        annotated.add_tier(self.annotator)

        self.assertEqual(annotated.text, sentence)
        geoname_spans = annotated.tiers['geonames'].spans
        self.assertEqual(len(geoname_spans), 2)
        first_geoname = annotated.tiers['geonames'].spans[0]
        self.assertEqual(first_geoname.text, "Seattle, WA")
 def assertHasCounts(self, sent, counts):
     """Assert that annotating ``sent`` yields the expected count spans.

     ``counts`` is a sequence of dicts, one per expected span and in span
     order.  The 'count' value of every dict is first compared against the
     'count' metadata of the spans found; afterwards each dict is checked
     in full against the matching span's metadata with
     ``test_utils.assertHasProps``.
     """
     annotated = AnnoDoc(sent)
     annotated.add_tier(self.annotator)

     found_values = []
     for span in annotated.tiers['counts']:
         found_values.append(span.metadata.get('count'))
     wanted_values = []
     for props in counts:
         wanted_values.append(props.get('count'))
     self.assertEqual(found_values, wanted_values)

     span_and_props = zip(annotated.tiers['counts'].spans, counts)
     for span, props in span_and_props:
         test_utils.assertHasProps(span.metadata, props)
Esempio n. 16
0
 def test_day_of_week(self):
     # A date prefixed with an abbreviated weekday is expected to resolve to
     # that single day: 19 Aug 2017 up to 20 Aug 2017.
     annotated = AnnoDoc("Sat 19 Aug 2017")
     annotated.add_tier(self.annotator)
     day_start = datetime.datetime(2017, 8, 19)
     next_day = datetime.datetime(2017, 8, 20)
     first_span = annotated.tiers['dates'].spans[0]
     self.assertEqual(first_span.datetime_range, [day_start, next_day])
Esempio n. 17
0
 def test_relative_date_range(self):
     # Weekday names are resolved relative to the document date
     # (15 Jul 2017); the expected range is 13 Jul 2017 to 15 Jul 2017.
     reference_date = datetime.datetime(2017, 7, 15)
     annotated = AnnoDoc("between Thursday and Friday", date=reference_date)
     annotated.add_tier(self.annotator)
     resolved_range = annotated.tiers['dates'].spans[0].datetime_range
     self.assertEqual(
         resolved_range,
         [datetime.datetime(2017, 7, 13), datetime.datetime(2017, 7, 15)])
Esempio n. 18
0
 def test_formatted_date(self):
     # The compact "DD-MON-YY" form is expected to resolve to the single day
     # 8 Feb 2017 (range end: 9 Feb 2017).
     reference_date = datetime.datetime(2017, 7, 15)
     annotated = AnnoDoc("08-FEB-17", date=reference_date)
     annotated.add_tier(self.annotator)
     expected_range = [
         datetime.datetime(2017, 2, 8),
         datetime.datetime(2017, 2, 9),
     ]
     date_spans = annotated.tiers['dates'].spans
     self.assertEqual(date_spans[0].datetime_range, expected_range)
Esempio n. 19
0
 def test_dateparse_bug_2(self):
     # Regression check: with a table like this one, the date annotator has
     # tried to parse "72\n1994", which triggered an exception in the
     # dateparse library.  Nothing is asserted on the result; the test only
     # requires that annotation finishes without raising.
     table_text = """
     Year Cases Fatal
     1991 46,320 697\n1992 31,870 208\n1993 6,833 72\n1994 1,785 16\n1995 2,160 23"""
     annotated = AnnoDoc(table_text)
     annotated.add_tier(self.annotator)
Esempio n. 20
0
 def test_bug_causing_sentence(self):
     # Nothing is asserted on the result; the test only requires that
     # annotating this passage finishes without raising.
     annotated = AnnoDoc(u"""
     In late June 2012, an increase in cases of prolonged fever for ≥3 days
     was reported from the Vanimo General Hospital in
     Vanimo, Sandaun Province.
     """)
     annotated.add_tier(self.annotator)
Esempio n. 21
0
 def test_northeast(self):
     # Annotates a long article excerpt that mentions "Central and Northeast
     # China" several times and checks that no geoname span consists of the
     # bare text "Northeast".
     doc = AnnoDoc(u"""
      Instead, a novel virus was isolated from a patient’s blood. Since March 2010, there were frequent reports of a unique group of hospitalized patients who presented with clinical symptoms similar to those of SFTS in Central and Northeast China (Fig. 1). On the basis of data from a primary investigation in 2009, an enhanced surveillance was implement- ed in selected provinces in China to further in- vestigate the cause and epidemiologic character- istics of SFTS. Here we describe the discovery and characterization of a novel phlebovirus in the Bunyaviridae family, designated SFTS bunyavirus (SFTSV), which is associated with SFTS. We also discuss the clinical manifestations of SFTS and the epidemiologic investigations. Methods Case Definition and Surveillance Methods Since 2009, we have implemented an active sur- veillance program in selected areas in Hubei and Henan provinces to identify patients with SFTS. The syndrome was characterized by acute fever (temperatures of 38°C or more) and thrombocyto- penia (platelet count, <100,000 per cubic millime- ter) of unknown cause.2 We collected blood sam- ples from hospitalized patients whose symptoms fulfilled the criteria of the case definition. We excluded patients whose symptoms fit these crite- ria but who had other clinical or laboratory-con- firmed diagnoses. We defined a laboratory-confirmed case as meeting one or more of the following criteria: the isolation of SFTSV from the patient’s serum, the detection of SFTSV RNA in the patient’s se- rum during the acute phase of the illness, or the detection of seroconversion or an elevation by a factor of four in serum IgG antibodies against SFTSV on enzyme-linked immunosorbent assay (ELISA), indirect immunof luorescence assay, or neutralization testing in serum obtained during the convalescent phase. If possible, we collected serum samples within 2 weeks after the onset of fever and again during the convalescent phase. 
We also collected serum samples from 200 patient- matched healthy persons living in the same areas and during the same time period. The research protocol was approved by the human bioethics committee of the Chinese Center for Disease Con- trol and Prevention, and all participants provided written informed consent. Isolation of an Unknown Pathogen In June 2009, a blood sample in heparin antico- agulant was obtained on day 7 after the onset of illness from a patient from Xinyang City in Henan Province. Because the cause of the illness was un- known, we designed a strategy to isolate the patho- gen by inoculating multiple cell lines susceptible to both viral and rickettsial agents, including hu- man cell line HL60; animal cell lines DH82, L929, Vero, and Vero E6; and tick cell line ISE6. The pa- tient’s white cells were used to inoculate cell mono- layers. The cells were cultured at 37°C in a 5% carbon dioxide atmosphere with media changes twice a week. In 2010, we used a related strategy to isolate an additional 11 strains of the virus by inoculation of serum or homogenized white cells onto Vero cells. Electron Microscopy A DH82-cell monolayer that was infected with SFTSV in T25 flasks was fixed for transmission electron microscopy with Ito solution, as de- scribed previously.3 Ultrathin sections were cut on a Reichert–Leica Ultracut S ultramicrotome, stained with lead citrate and examined in a Phil- ips 201 or CM-100 electron microscope at 60 kV. Negative-stain electron microscopy was performed on virions purified from a clarified culture super- natant of infected Vero cells concentrated by a factor of 100.4,5 Genetic Analysis For the first SFTSV isolate, formalin-fixed cell cul- ture was used to extract viral RNA using a High Pure FFPE RNA Micro Kit (Roche Applied Sci- ence). 
The virus was sequenced with the use of the restriction-fragment–length-polymorphism assay with amplified complementary DNA, as described previously.6 For the remaining 11 strains of the virus, the whole genomes were sequenced with the use of the sequence-independent, single-primer amplification (SISPA) method.7 The 5' and 3' ter- minals of viral RNA segments were determined with a RACE Kit (Invitrogen). Phylogenetic analy- ses were performed with the neighbor-joining method with the use of the Poisson correction and complete deletion of gaps. Neutralization Assay For microneutralization testing, serial dilutions of serum samples were mixed with an equal vol- ume of 100 median tissue-culture infectious dos- es of SFTSV (strain HB29) and incubated at 37°C for 1.5 hours. The mixture was then added to a 96-well plate containing Vero cells in quadrupli- cate. The plates were incubated at 37°C in a 5% carbon dioxide atmosphere for 12 days. Viral in- fection was detected on specific immunofluores- cence assays in serum samples from patients with laboratory-confirmed infection. The end-point ti- ter was expressed as the reciprocal of the highest dilution of serum that prevented infection. Polymerase Chain Reaction RNA that was extracted from serum, whole blood, or homogenized arthropods was amplified with the use of a one-step, multiplex real-time reverse- transcriptase polymerase chain reaction (RT-PCR) with primers for SFTSV (Qiagen). The cutoff cycle- threshold value for a positive sample was set at 35 cycles. Nested RT-PCR and sequencing were used to verify samples from which only one ge- nomic segment was amplified. Virus Isolation The first SFTSV (strain DBM) was isolated from a 42-year-old man from Henan Province. A month after inoculation of cell monolayers with white cells obtained from the patient, virus-induced cellular changes visible on light microscopy (cyto- pathic effect) were observed in DH82 cells but not in the other cell lines. 
The morphologic features of infected DH82 cells changed from round mono- cytes to an elongated shape, which had granular particles in the cytoplasm (Fig. 2A). After several passages in culture, the cytopathic effect usually appeared on day 4 after inoculation of a fresh monolayer. Subsequently, 11 additional strains of the virus were isolated from serum samples ob- tained from patients during the acute phase of illness in six provinces with the use of Vero cells (Table 1 in the Supplementary Appendix, available with the full text of this article at NEJM.org). SFTSV can infect a variety of cells, including L929, Vero E6, Vero (Fig. 2B), and DH82 cells, but it re- sulted in the cytopathic effect only in DH82 cells. The viral particles were spheres with a diameter of 80 to 100 nm. Negative-stain electron microscopy of SFTSV particles that were purified from the su- pernatants of infected Vero cells revealed complex surface projections (Fig. 2C). Transmission electron microscopy revealed viral particles in the DH82-cell cytoplasm. The virions were observed inside vacu- oles, presumably in the Golgi apparatus (Fig. 2D). Partial sequences were obtained from the first isolated virus strain DBM, and the complete ge- nomes of 11 additional human isolates of SFTSV were determined. (GenBank accession numbers are provided in Table 1 in the Supplementary Ap- pendix.) All isolates including strain DBM were closely related (96% homology of nucleotide se- quences for all segments). The terminals of the three genomic segments of SFTSV were found to be similar to counterparts in other phlebovirus- es.8 The L segment contains 6368 nucleotides with one open reading frame encoding 2084 amino acids. The M segment contains 3378 nu- cleotides with one open reading frame encoding 1073 amino acid precursors of glycoproteins (Gn and Gc). 
The S segment contains 1744 nucleo- tides of ambisense RNA encoding two proteins, the N and NSs proteins, in opposite orientations, separated by a 62-bp intergenic region. Phylogenetic trees based on partial or complete viral genomic sequences of L, M, and S segments from strains DBM, HN6, and HB29 showed that SFTSV was related to prototypic viruses of the five genera of Bunyaviridae (Fig. 1 in the Supple- mentary Appendix). Among the genera orthobun- yavirus, hantavirus, nairovirus, phlebovirus, and tospovirus, SFTSV belongs to the phlebovirus genus8 but was more distantly related to proto- typic viruses in the other four genera. To verify this finding, we carried out a phylogenetic analy- sis, using complete deduced amino acid sequenc- es coding for RNA-dependent RNA polymerase, glycoproteins (Gn and Gc), and N and NSs pro- teins of SFTSV (strains HB29, HN6, AN12, LN2, JS3, and SD4) from six provinces in China, as com- pared with the other known phleboviruses (Fig. 3). The generated phylogenetic tree showed that all SFTSV isolates clustered together but were near- ly equidistant from the other two groups,9 the Sandfly fever group (Rift Valley fever virus, Punta Toro virus, Toscana virus, Massila virus, and Sandfly fever Sicilian virus) and the Uukuniemi group. This suggested that SFTSV is the proto- type of a third group in the phlebovirus genus. A comparison of the similarity of amino acid sequences provided further evidence that SFTSV is distinct from the other phleboviruses (Table 2 in the Supplementary Appendix). Both RNA- dependent RNA polymerase and glycoproteins of SFTSV are slightly more closely related to coun- terparts in Uukuniemi virus. However, N pro- teins in SFTSV and Rift Valley fever virus had 41.4% similarity. In contrast, the amino acids in NSs proteins encoded by the S segment showed a similarity of only 11.2 to 16.0% with amino acids in other phleboviruses. 
Serologic Analysis We evaluated seroconversion against SFTSV in pa- tients with SFTS using three different methods: immunof luorescence assay, ELISA, and microneu- tralization. We chose a cohort of 35 patients with RT-PCR–confirmed SFTSV infection who had se- rum samples from both acute and convalescent phases of the illness. An elevation in the anti- body titer by a factor of four or seroconversion was observed in all 35 patients, as seen especially on microneutralization (Table 1). These results indi- cated that high levels of neutralizing antibodies were generated during the convalescent phase of the illness. An antibody titer of more than 1:25,600 on ELISA was present in 15 convalescent-phase serum samples, indicating a robust humoral im- mune response against SFTSV. Among the 35 se- ropositive samples, all SFTSV infections were confirmed on viral RNA sequencing, and 11 were confirmed on virus isolation. It is noteworthy that specific neutralizing antibodies against SFTSV persisted in some convalescent-phase serum sam- ples even 1 year after recovery. Clinical Symptoms The first patient, a 42-year-old male farmer, pre- sented with fever (temperatures of 39.2 to 39.7°C), fatigue, conjunctival congestion, diarrhea, abdom- inal pain, leukocytopenia, thrombocytopenia, pro- teinuria, and hematuria. Later, a unique group of hospitalized patients with acute high fever with thrombocytopenia was identified. We analyzed only 81 patients with laboratory-confirmed SFTSV infection who had a complete medical record for the clinical spectrum of SFTS. The clinical symp- toms of SFTS were nonspecific, and the major symptoms included fever and gastrointestinal symptoms. Regional lymphadenopathy was also frequently observed (Table 2). The most common abnormalities on laboratory testing were thrombo- cytopenia (95%) and leukocytopenia (86%) (Table 3). 
Multiorgan failure developed rapidly in most patients, as shown by elevated levels of serum ala- nine aminotransferase, aspartate aminotransfer- ase, creatine kinase, and lactate dehydrogenase. Proteinuria (in 84% of patients) and hematuria (in 59%) were also observed. Among the 171 con- firmed cases, there were 21 deaths (12%). However, it is not clear how SFTSV caused these deaths. Epidemiologic Investigation From June 2009 through September 2010, we de- tected SFTS bunyavirus RNA, specific antiviral antibodies, or both in 171 patients among 241 hospitalized patients who met the case defini- tion for SFTS2 in Central and Northeast China. These patients included 43 in Henan, 52 in Hubei, 93 in Shandong, 31 in Anhui, 11 in Jiangsu, and 11 in Liaoning provinces. In 2010, a total of 148 of 154 laboratory-confirmed cases (96%) occurred from May to July. The ages of the patients ranged from 39 to 83 years, and 115 of 154 patients (75%) were over 50 years of age. Of these 154 patients, 86 (56%) were women, and 150 (97%) were farm- ers living in wooded and hilly areas and working in the fields before the onset of disease. No SFTSV was identified on real-time RT-PCR and no anti- bodies against SFTSV were identified in serum samples that were collected from 200 patient- matched healthy control subjects in the endemic areas, from 180 healthy subjects from nonendem- ic areas, and from 54 patients with suspected hem- orrhagic fever with renal syndrome. Mosquitoes and ticks were commonly found in the patients’ home environment. However, viral RNA was not detected in any of 5900 mosquitoes tested. On the other hand, 10 of 186 ticks (5.4%) of the species Haemaphysalis longicornis that were collected from domestic animals in the areas where the patients lived contained SFTSV RNA. 
The viruses in the ticks were isolated in Vero cell culture, and the RNA sequences of these viruses were very closely related but not identical to the SFTSV isolated in samples obtained from the patients (data not shown). There was no epidemiologic evidence of human-to-human transmission of the virus. Discussion Although we have not fulfilled Koch’s postulates for establishing a causal relationship between a mi- crobe and a disease in their entirety, our findings suggest that SFTS is caused by a newly identified bunyavirus. These data include epidemiologic, clinical, and laboratory findings and several lines of evidence that include virus isolation, viral RNA detection, and molecular and serologic analyses. SFTS has been identified in Central and Northeast China, which covers all six provinces where sur- veillance for SFTS was carried out.
      """)
     doc.add_tier(self.annotator)
     # The comparison is against whole span texts, so only a span that is
     # exactly "Northeast" makes this fail.
     self.assertTrue(
         'Northeast' not in [
             span.text for span in doc.tiers['geonames'].spans]
     )
Esempio n. 22
0
 def test_date_range(self):
     # Two dates with bracketed years joined by a dash are expected to form
     # the first date span: 19 May 2018 up to the day after 5 Jun 2018.
     sentence = (
         "The 7 new cases age between 17 and 70, and their onset dates vary between 19 May [2018] - 5 Jun [2018]."
     )
     annotated = AnnoDoc(sentence)
     annotated.add_tier(self.annotator)
     first_range = annotated.tiers['dates'].spans[0].datetime_range
     expected_range = [
         datetime.datetime(2018, 5, 19),
         datetime.datetime(2018, 6, 6),
     ]
     self.assertEqual(first_range, expected_range)
Esempio n. 23
0
 def test_dashes_3(self):
     # The dash sits directly between the first year and the second day
     # ("2017-17"); the expected range is 1 Jul 2017 up to the day after
     # 17 Apr 2018.
     caption = (
         'Distribution of reported yellow fever cases from 1 Jul 2017-17 Apr 2018'
     )
     range_start = datetime.datetime(2017, 7, 1)
     range_end = datetime.datetime(2018, 4, 18)
     annotated = AnnoDoc(caption)
     annotated.add_tier(self.annotator)
     first_span = annotated.tiers['dates'].spans[0]
     self.assertEqual(first_span.datetime_range, [range_start, range_end])
 def test_duration_with_years(self):
     # Uses its own DateAnnotator(include_end_date=False) instead of
     # self.annotator.  The sentence is expected to yield a single date span
     # covering 1 Jan 1999 to 1 Jan 2002; "three years" must not add a span.
     sentence = 'I lived there for three years, from 1999 until late 2001'
     annotated = AnnoDoc(sentence)
     annotated.add_tier(DateAnnotator(include_end_date=False))
     date_spans = annotated.tiers['dates'].spans
     self.assertEqual(len(date_spans), 1)
     expected_range = [
         datetime.datetime(1999, 1, 1),
         datetime.datetime(2002, 1, 1),
     ]
     self.assertEqual(date_spans[0].datetime_range, expected_range)
 def test_relative_date_range_2(self):
     # "In the past 20 days" is resolved against the document date
     # (21 Dec 2018); the expected range is 1 Dec 2018 to 21 Dec 2018.
     reference_date = datetime.datetime(2018, 12, 21)
     sentence = (
         "In the past 20 days 285 cases of swine flu have been reported across the state."
     )
     annotated = AnnoDoc(sentence, date=reference_date)
     annotated.add_tier(self.annotator)
     resolved_range = annotated.tiers['dates'].spans[0].datetime_range
     self.assertEqual(
         resolved_range,
         [datetime.datetime(2018, 12, 1), datetime.datetime(2018, 12, 21)])
    def test_incident_3(self):
        doc = AnnoDoc("""
As of [Thu 7 Sep 2017], there have been at total of:
157 laboratory-confirmed cases of MERS-CoV infection, including
69 deaths [reported case fatality rate 40.2 percent],
103 recoveries, and 0 currently active cases/infections in Greece.
        """)
        doc.add_tier(self.annotator)
        test_utils.assertHasProps(
            doc.tiers['incidents'].spans[0].metadata, {
                'value':
                157,
                'type':
                'cumulativeCaseCount',
                'status':
                'confirmed',
                'resolvedDisease': {
                    'label': 'Middle East respiratory syndrome',
                    'id': 'https://www.wikidata.org/wiki/Q16654806'
                },
                'dateRange': [
                    datetime.datetime(2017, 9, 7, 0, 0),
                    datetime.datetime(2017, 9, 8, 0, 0)
                ]
            })
        test_utils.assertHasProps(
            doc.tiers['incidents'].spans[1].metadata['locations'][0], {
                'latitude': 39.0,
                'name': 'Hellenic Republic',
                'id': '390903',
                'countryCode': 'GR',
                'asciiname': 'Hellenic Republic',
                'countryName': 'Hellenic Republic',
                'featureCode': 'PCLI',
                'namesUsed': 'Greece',
                'admin1Code': '00',
                'longitude': 22.0
            })
        test_utils.assertHasProps(
            doc.tiers['incidents'].spans[1].metadata, {
                'value':
                69,
                'type':
                'cumulativeDeathCount',
                'species': {
                    'id': 'tsn:180092',
                    'label': 'H**o sapiens'
                },
                'resolvedDisease': {
                    'label': 'Middle East respiratory syndrome',
                    'id': 'https://www.wikidata.org/wiki/Q16654806'
                },
                'dateRange': [
                    datetime.datetime(2017, 9, 7, 0, 0),
                    datetime.datetime(2017, 9, 8, 0, 0)
                ]
            })
 def test_dateparser_bug(self):
     # This input triggers an exception in the dateparser library, described
     # in this bug report:
     # https://github.com/scrapinghub/dateparser/issues/339
     # The test only checks that the exception is handled; the date range in
     # the text is still not properly parsed, so nothing is asserted on it.
     annotated = AnnoDoc("week 1 - 53, 2015")
     annotated.add_tier(self.annotator)
 def test_simple_date(self):
     # A fully written date is expected to produce exactly one date span
     # covering that single day: 7 Oct 2010 up to 8 Oct 2010.
     annotated = AnnoDoc('I went to Chicago Friday, October 7th 2010.')
     annotated.add_tier(self.annotator)
     date_spans = annotated.tiers['dates'].spans
     self.assertEqual(len(date_spans), 1)
     day_start = datetime.datetime(2010, 10, 7)
     next_day = datetime.datetime(2010, 10, 8)
     self.assertEqual(date_spans[0].datetime_range, [day_start, next_day])
Esempio n. 29
0
    def test_multipart_names_5(self):
        # With split_compound_geonames=True, "Seattle, WA" is expected to be
        # reported as two separate spans ("Seattle" and "WA"), giving three
        # geoname spans in total; the document text must stay unchanged.
        sentence = 'From Seattle, WA, Canada is not far away.'
        annotated = AnnoDoc(sentence)
        annotated.add_tier(self.annotator, split_compound_geonames=True)

        self.assertEqual(annotated.text, sentence)
        geoname_spans = annotated.tiers['geonames'].spans
        self.assertEqual(len(geoname_spans), 3)
        for position, expected_text in enumerate(["Seattle", "WA"]):
            found_text = annotated.tiers['geonames'].spans[position].text
            self.assertEqual(found_text, expected_text)
Esempio n. 30
0
 def test_url_names(self):
     # The text names South Sudan in its title and again inside a URL slug;
     # exactly one geoname is expected in total.
     post_text = u"""
     [1] Cholera - South Sudan
     Date: 19 Jul 2014
     Source: Radio Tamazuj [edited]
     https://radiotamazuj.org/en/article/south-sudan-100-total-cholera-deaths
     """
     annotated = AnnoDoc(post_text)
     annotated.add_tier(self.annotator)
     geoname_tier = annotated.tiers['geonames']
     self.assertEqual(len(geoname_tier), 1)
Esempio n. 31
0
def create_annotations(article_uri, content):
    """Annotate ``content`` and store the resulting spans via SPARQL updates.

    Every annotator in the module-level ``annotators`` list is applied to an
    ``AnnoDoc`` built from ``content``.  For each of the geonames, diseases,
    hosts, modes, pathogens and symptoms tiers, one update query is rendered
    from a template and passed to ``sparql_utils.update``.

    Args:
        article_uri: URI of the source document.  It is written as the
            ``anno:source_doc`` of every span, used as the subject of the
            ``anno:annotated_by`` triple, and hashed into each span URI.
        content: Text to annotate.
    """
    annotated_doc = AnnoDoc(content)
    for annotator in annotators:
        annotated_doc.add_tier(annotator)

    def to_bytes(value):
        # hashlib digests only accept bytes on Python 3, and on Python 2 a
        # non-ASCII unicode value fails the implicit ASCII encoding.
        # Encoding text as UTF-8 yields the same digest as before for ASCII
        # input.
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    def get_span_uri(span):
        """Return a span URI built from an MD5 of the article URI and the
        span's start/end offsets."""
        h = hashlib.md5()
        h.update(to_bytes(article_uri))
        h.update(to_bytes(str(span.start) + ':' + str(span.end)))
        return "http://www.eha.io/types/annotation/annie/span/" + str(h.hexdigest())

    # NOTE(review): the "rdf" prefix in the template is bound to the RDF
    # Schema namespace, so "rdf:type" expands to .../rdf-schema#type rather
    # than the usual rdf-syntax-ns#type.  Left unchanged because other
    # queries may depend on it — confirm before changing.
    for tier_name in ['geonames', 'diseases', 'hosts', 'modes', 'pathogens', 'symptoms']:
        tier = annotated_doc.tiers[tier_name]
        # One INSERT DATA per span (plus, for the diseases tier, the
        # relations to the entities returned by resolve_keyword), followed
        # by a final INSERT DATA marking the document as annotated.
        update_query = make_template("""
        prefix anno: <http://www.eha.io/types/annotation_prop/>
        prefix eha: <http://www.eha.io/types/>
        prefix rdf: <http://www.w3.org/2000/01/rdf-schema#>
        prefix dc: <http://purl.org/dc/terms/>
        {% for span in spans %}
        INSERT DATA {
            <{{get_span_uri(span)}}> anno:annotator eha:annie
                {% if span.geoname %}
                    ; rdf:type eha:geoname_annotation
                    ; anno:geoname <http://sws.geonames.org/{{span.geoname.geonameid}}>
                {% else %}
                    ; rdf:type eha:keyword_annotation
                    ; anno:category "{{tier_name}}"
                {% endif %}
                ; anno:label "{{span.label | escape}}"
                ; anno:source_doc <{{source_doc}}>
                ; anno:start {{span.start}}
                ; anno:end {{span.end}}
                ; anno:selected-text "{{span.text | escape}}"
        } ;
        {% if tier_name == "diseases" %}
            INSERT DATA {
                {% for entity_uri in resolve_keyword(span.label) %}
                     <{{entity_uri}}> dc:relation <{{get_span_uri(span)}}> .
                {% endfor %}
            } ;
        {% endif %}
        {% endfor %}
        INSERT DATA {
            <{{source_doc}}> anno:annotated_by eha:annie_1
        }
        """).render(
            get_span_uri=get_span_uri,
            resolve_keyword=resolve_keyword,
            source_doc=article_uri,
            tier_name=tier_name,
            spans=tier.spans)
        sparql_utils.update(update_query)
Esempio n. 32
0
import hashlib
from epitator.annotator import AnnoDoc
from epitator.keyword_annotator import KeywordAnnotator
from epitator.geoname_annotator import GeonameAnnotator
import re
from pylru import lrudecorator

# Annotators applied, in list order, to every document.
annotators = [KeywordAnnotator(), GeonameAnnotator()]

# Import-time sanity check of the keyword annotator: every disease label
# found in the probe sentence must be one of the probe words.  A run that
# finds no diseases at all also passes, since the empty set is a subset.
test_doc = AnnoDoc("ebola influenza glanders dermatitis")
for annotator in annotators:
    test_doc.add_tier(annotator)
assert {
    disease.label for disease in test_doc.tiers["diseases"].spans
}.issubset("ebola influenza glanders dermatitis".split(" "))

@lrudecorator(500)
def resolve_keyword(keyword):
    query = make_template("""
    prefix anno: <http://www.eha.io/types/annotation_prop/>
    prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    prefix obo: <http://purl.obolibrary.org/obo/>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?entity
    WHERE {
        BIND (obo:DOID_4 AS ?disease)
        ?entity rdfs:subClassOf* ?disease .
Esempio n. 33
0
    def diagnose(
        self,
        content,
        diseases_only=False,
        content_date=None,
        use_infection_annotator=False,
        include_incidents=False):
        """
        Classify `content` and extract annotated features from it.

        Parameters:
            content: The text to diagnose. It is fed to the keyword
                extractor for classification and wrapped in an AnnoDoc for
                annotation.
            diseases_only: When true, return ``{'diseases': [...]}`` right
                after classification and skip all annotation work.
            content_date: Passed to AnnoDoc as its ``date`` argument.
            use_infection_annotator: When true, counts come from
                ``self.infection_annotator`` (its 'infections' tier is
                renamed to 'counts' and the 'infection' attribute is
                remapped to 'case'); otherwise ``self.count_annotator``
                is used.
            include_incidents: When true, an IncidentAnnotator tier is added
                and the result gains an 'incidents' list.

        Returns:
            A dict. With `diseases_only` it has the single key 'diseases'.
            Otherwise it has 'diagnoserVersion', 'dateOfDiagnosis',
            'diseases', 'structuredIncidents' and 'features', plus
            'incidents' when `include_incidents` is true.
        """
        time_sofar = time_sofar_gen(datetime.datetime.now())
        base_keyword_dict = self.keyword_extractor.transform([content])[0]
        feature_dict = self.keyword_processor.transform([base_keyword_dict])
        X = self.dict_vectorizer.transform(feature_dict)[0]

        logger.info(next(time_sofar) + 'Computed feature vector')

        def diagnosis(i, p):
            """Describe class `i` (probability `p`) and its scored keywords."""
            scores = self.classifier.coef_[i] * X
            # Scores are normalized so they can be compared across different
            # classifications.
            norm = np.linalg.norm(scores)
            if norm > 0:
                scores /= norm
            scores *= p

            # Split the positively scored keywords into those present in the
            # extracted keyword dict and those that are not. A single pass is
            # used so the zip result is never iterated twice.
            # Names and scores might be numpy types. They are coerced to
            # native python types so the output is easily serialized as json.
            keywords = []
            inferred_keywords = []
            for kwd, score in zip(self.keywords, scores):
                if not score > 0:
                    continue
                entry = {
                    'name': unicode(kwd),
                    'score': float(score),
                }
                if kwd in base_keyword_dict:
                    keywords.append(entry)
                else:
                    inferred_keywords.append(entry)

            return {
                'name': unicode(self.classifier.classes_[i]),
                'probability': float(p),
                'keywords': keywords,
                'inferred_keywords': inferred_keywords
            }

        diseases = [diagnosis(i, p) for i, p in self.best_guess(X)]
        if diseases_only:
            return {
                'diseases': diseases
            }
        logger.info(next(time_sofar) + 'Diagnosed diseases')

        anno_doc = AnnoDoc(content, date=content_date)
        anno_doc.add_tier(self.keyword_annotator)
        logger.info('keywords annotated')
        anno_doc.add_tier(self.resolved_keyword_annotator)
        logger.info('resolved keywords annotated')
        anno_doc.add_tier(self.date_annotator)
        logger.info('dates annotated')
        if use_infection_annotator:
            anno_doc.add_tier(self.infection_annotator)
            # Expose the infection spans under the 'counts' tier name so the
            # code below handles both annotators the same way.
            anno_doc.tiers['counts'] = anno_doc.tiers.pop('infections')
            attribute_remappings = {
                'infection': 'case'
            }
            for span in anno_doc.tiers['counts']:
                span.metadata['attributes'] = [
                    attribute_remappings.get(attribute, attribute)
                    for attribute in span.metadata['attributes']]
        else:
            anno_doc.add_tier(self.count_annotator)
        logger.info('counts annotated')
        anno_doc.add_tier(self.geoname_annotator)
        logger.info('geonames annotated')
        anno_doc.add_tier(StructuredIncidentAnnotator())
        logger.info('structured incidents annotated')
        anno_doc.filter_overlapping_spans(
            tier_names=[ 'dates', 'geonames', 'diseases', 'hosts', 'modes',
                         'pathogens', 'symptoms' ]
        )
        logger.info('filtering overlapping spans done')

        dates = []
        for span in anno_doc.tiers['dates']:
            range_start, range_end = span.datetime_range
            dates.append({
                'type': 'datetime',
                'name': span.text,
                'value': span.text,
                'textOffsets': [
                    [span.start, span.end]
                ],
                'timeRange': {
                    'beginISO': range_start.isoformat().split('T')[0],
                    'begin': {
                        'year': range_start.year,
                        'month': range_start.month,
                        'date': range_start.day
                    },
                    # The date range does not include the end day.
                    'endISO': range_end.isoformat().split('T')[0],
                    'end': {
                        'year': range_end.year,
                        'month': range_end.month,
                        'date': range_end.day
                    },
                }
            })

        # One feature per distinct geoname id; repeated mentions only add
        # another pair of text offsets to the existing feature.
        geonames_grouped = {}
        for span in anno_doc.tiers['geonames']:
            geonameid = span.geoname['geonameid']
            if geonameid not in geonames_grouped:
                geonames_grouped[geonameid] = {
                    'type': 'location',
                    'name': span.geoname.name,
                    'geoname': span.geoname.to_dict(),
                    'textOffsets': [
                        [span.start, span.end]
                    ]
                }
            else:
                geonames_grouped[geonameid]['textOffsets'].append(
                    [span.start, span.end]
                )
        logger.info(next(time_sofar) + 'Annotated geonames')

        counts = []
        for span in anno_doc.tiers['counts'].without_overlaps(anno_doc.tiers['structured_data']):
            count_dict = dict(span.metadata)
            count_dict['type'] = 'count'
            count_dict['text'] = span.text
            count_dict['label'] = span.label
            count_dict['textOffsets'] = [[span.start, span.end]]
            counts.append(count_dict)
            # Include legacy case counts so the diagnostic dashboard
            # doesn't break.
            if 'case' in count_dict['attributes']:
                counts.append({
                    'type': 'caseCount',
                    'text': count_dict['text'],
                    'value': count_dict['count'],
                    'modifiers': count_dict['attributes'],
                    'cumulative': "cumulative" in count_dict['attributes'],
                    'textOffsets': count_dict['textOffsets']
                })

        # NOTE(review): the overlap filtering above names the tiers
        # 'diseases', 'hosts', ... while this loop reads 'keyword.<type>'
        # tiers -- confirm both sets of tier names exist on the document.
        keyword_types = ['diseases', 'hosts', 'modes', 'pathogens', 'symptoms']
        keyword_groups = {}
        for keyword_type in keyword_types:
            keyword_groups[keyword_type] = {}
            for span in anno_doc.tiers['keyword.' + keyword_type]:
                if span.label not in keyword_groups[keyword_type]:
                    keyword_groups[keyword_type][span.label] = {
                        'type': keyword_type,
                        'value': span.label,
                        'textOffsets': [[span.start, span.end]]
                    }
                else:
                    keyword_groups[keyword_type][span.label]['textOffsets'].append(
                        [span.start, span.end]
                    )

        resolved_keywords = []
        for span in anno_doc.tiers['resolved_keywords'].without_overlaps(anno_doc.tiers['geonames']):
            resolved_keywords.append({
                'type': 'resolvedKeyword',
                'resolutions': span.metadata['resolutions'],
                'text': span.text,
                'textOffsets': [[span.start, span.end]]})

        # Flatten every feature kind into one list. The dict values are
        # wrapped in list() so the concatenation does not depend on
        # dict.values() returning a list.
        features = counts + list(geonames_grouped.values()) + dates
        for keyword_type in keyword_types:
            features.extend(keyword_groups[keyword_type].values())
        features.extend(resolved_keywords)

        result = {
            'diagnoserVersion': self.__version__,
            'dateOfDiagnosis': datetime.datetime.now(),
            'diseases': diseases,
            'structuredIncidents': [
                dict(
                    structured_span.metadata,
                    textOffsets=[[structured_span.start, structured_span.end]])
                for structured_span in anno_doc.tiers['structured_incidents']],
            'features': features}
        if include_incidents:
            result['incidents'] = []
            anno_doc.add_tier(IncidentAnnotator())
            for incident_span in anno_doc.tiers['incidents']:
                metadata = incident_span.metadata
                incident_data = {
                    # Offsets of the incident span itself, not of whatever
                    # span an earlier loop happened to finish on.
                    'offsets': [incident_span.start, incident_span.end],
                    'type': metadata['type'],
                    'value': metadata['value'],
                    'dateRange': [d.isoformat().split('T')[0] for d in metadata['dateRange']],
                    'locations': metadata['locations'],
                    'species': metadata['species'],
                    'status': metadata.get('status'),
                    'resolvedDisease': metadata.get('resolvedDisease'),
                    'annotations': {
                        'case': [{ 'offsets': [incident_span.start, incident_span.end] }]
                    }
                }
                if 'count_annotation' in metadata:
                    # Incidents built from a count annotation also report
                    # the date, location and disease annotations they used.
                    count_annotation = metadata['count_annotation']
                    incident_data['annotations'] = {
                        'case': [{ 'offsets': [count_annotation.start, count_annotation.end] }],
                        'date': [
                            { 'offsets': [anno.start, anno.end] }
                            for anno in metadata['date_territory'].metadata
                        ],
                        'location': [
                            { 'offsets': [anno.start, anno.end] }
                            for anno in metadata['geoname_territory'].metadata
                        ],
                        'disease': [
                            { 'offsets': [anno.start, anno.end] }
                            for anno in metadata['disease_territory'].metadata
                        ]
                    }
                result['incidents'].append(incident_data)
        return result