Esempio n. 1
0
def _annotate_all_tiers(text):
    """Build an ``AnnoDoc`` for *text* and run every annotator over it.

    The geoname, count, resolved-keyword and date annotators are all
    instantiated up front and then applied to the document in that order.

    Returns the annotated ``AnnoDoc``.
    """
    doc = AnnoDoc(text)
    # Instantiate everything first so construction order matches usage order.
    annotators = (
        GeonameAnnotator(),
        CountAnnotator(),
        ResolvedKeywordAnnotator(),
        DateAnnotator(),
    )
    for annotator in annotators:
        doc.add_tiers(annotator)
    return doc
def annotated_example():
    """Return the annotated example document, using a pickle file as a cache.

    The fixture lives at ``../data/fixtures/annotated_example.pkl`` relative
    to this module.  If that file exists it is unpickled and returned.
    Otherwise the example text is annotated with ``CountAnnotator``, passed
    through ``delete_non_epitator_name_entity_tiers``, pickled to the fixture
    path (creating the directory if needed) and returned.
    """
    fixture_path = os.path.join(
        os.path.dirname(__file__), '..', 'data', 'fixtures', 'annotated_example.pkl'
    )

    # Fast path: a previously pickled fixture is available.
    if os.path.isfile(fixture_path):
        with open(fixture_path, 'rb') as stream:
            return pickle.load(stream)

    # Imported lazily so the annotation stack is only loaded when the
    # fixture actually has to be generated.
    from epitator.annotator import AnnoDoc
    from epitator.count_annotator import CountAnnotator

    sample_text = ('I am in Berlin. Here are 5 confirmed cases of influenza. '
                   'Still, less worse than those 100 confirmed and 200 suspected cases last year.')
    doc = AnnoDoc(sample_text)
    doc.add_tiers(CountAnnotator())
    doc = delete_non_epitator_name_entity_tiers(doc)

    os.makedirs(os.path.dirname(fixture_path), exist_ok=True)
    with open(fixture_path, 'wb') as stream:
        pickle.dump(doc, stream)
    return doc
Esempio n. 3
0
class Summarizer:
    """Reduce a text to its most frequently mentioned disease and location.

    A single string is annotated with ``ResolvedKeywordAnnotator`` and
    ``GeonameAnnotator``; a non-string input is treated as a collection of
    texts and summarized in parallel worker processes.
    """

    def __init__(self):
        # Document annotated by the most recent single-text summarize() call.
        self.doc: Optional[AnnoDoc] = None

    def summarize(
        self, text: Union[str, List[str]]
    ) -> Union[Dict[str, str], List[Dict[str, str]]]:
        """Summarize one text, or each text of a collection.

        Args:
            text: A single string, or a collection of strings.

        Returns:
            For a string, a dict with the keys ``"disease"`` and
            ``"geoname"`` (each ``""`` when nothing was found).  For any
            other input, a list of such dicts, one per text.
        """
        if not isinstance(text, str):
            return self._batch_summarize(text)

        self.doc = AnnoDoc(text)
        self.doc.add_tiers(ResolvedKeywordAnnotator())
        self.doc.add_tiers(GeonameAnnotator())
        return {
            "disease": self._extract_key_disease(),
            "geoname": self._extract_key_geoname(),
        }

    @staticmethod
    def _pool_size(available: Optional[int]) -> int:
        """Return the worker count to use given *available* CPUs.

        One CPU is left free, but the result is never below 1: a pool with
        zero processes cannot be created.  ``None`` (CPU count unknown) is
        treated as a single CPU.
        """
        return max(1, (available or 1) - 1)

    def _batch_summarize(self, texts) -> List[Dict[str, str]]:
        """Summarize every entry of *texts* in a pool of worker processes."""
        # Clamp the pool size: on a single-core host ``cpu_count() - 1`` is 0
        # and Pool() would raise ValueError.
        with Pool(self._pool_size(cpu_count())) as p:
            return p.map(self.summarize, texts)

    def _extract_key_disease(self):
        """Return the most common disease label in ``self.doc``, or ``""``.

        The label is read from the first resolution of each span in the
        ``"resolved_keywords"`` tier.  A missing tier/key or an empty result
        yields the empty string.
        """
        try:
            diseases = [
                i.metadata["resolutions"][0]["entity"]["label"]
                for i in self.doc.tiers["resolved_keywords"].spans
            ]
            # most_common(1) is empty when no labels were found -> IndexError.
            return Counter(diseases).most_common(1)[0][0]
        except (KeyError, IndexError):
            return ""

    def _extract_key_geoname(self):
        """Return the most common country among the first three geonames.

        Only the first three spans of the ``"geonames"`` tier are considered.
        A geoname without a ``country_name`` attribute contributes its
        ``name`` instead.  A missing tier or an empty tier yields ``""``.
        """
        try:
            geonames = [i.geoname for i in self.doc.tiers["geonames"].spans]
            country_of_geoname = []
            for geoname in geonames[:3]:
                try:
                    country_of_geoname.append(geoname.country_name)
                except AttributeError:
                    # No country on this geoname; fall back to its own name.
                    country_of_geoname.append(geoname.name)
            return Counter(country_of_geoname).most_common(1)[0][0]
        except (KeyError, IndexError):
            return ""
Esempio n. 4
0
def main(in_file, out_file):
    """Annotate the text in *in_file* and write the findings to *out_file*.

    The input is read line by line and the lines are joined with a single
    space (line endings are kept).  The text is annotated with the geoname,
    date and resolved-keyword annotators, and the result is written as JSON
    with the keys ``location``, ``date`` and ``resolved_keyword``.

    Args:
        in_file: Path of the text file to annotate.
        out_file: Path of the JSON file to create or overwrite.
    """
    # Read inside a context manager so the handle is closed even on error.
    with open(in_file) as src:
        txt = ' '.join(src.readlines())

    adoc = AnnoDoc(txt)
    adoc.add_tiers(GeonameAnnotator())
    adoc.add_tiers(DateAnnotator())
    adoc.add_tiers(ResolvedKeywordAnnotator())

    # Location
    geo = [x.to_dict() for x in adoc.tiers['geonames'].spans]
    # Date spans are stored via their string representation.
    dates = [str(x) for x in adoc.tiers['dates'].spans]
    # Other
    other = [x.to_dict() for x in adoc.tiers['resolved_keywords'].spans]

    d = {'location': geo, 'date': dates, 'resolved_keyword': other}

    with open(out_file, 'w') as dest:
        json.dump(d, dest)
Esempio n. 5
0
 def _annotate(self, text: str, entity: str) -> AnnoDoc:
     """Annotate *text* with the annotator registered for *entity*.

     *entity* must be ``"counts"`` or ``"dates"``; any other value raises
     ``KeyError``.  Returns the annotated ``AnnoDoc``.
     """
     # Both annotators are built eagerly; only the requested one is applied.
     annotators = {"counts": CountAnnotator(), "dates": DateAnnotator()}
     doc = AnnoDoc(text)
     chosen = annotators[entity]
     doc.add_tiers(chosen)
     return doc
Esempio n. 6
0
def _annotate(text: str, to_optimize: str) -> AnnoDoc:
    """Annotate *text* with the annotator selected by *to_optimize*.

    *to_optimize* must be ``'counts'`` or ``'dates'``; any other value raises
    ``KeyError``.  Returns the annotated ``AnnoDoc``.
    """
    # Both annotators are built eagerly; only the requested one is applied.
    annotators = {'counts': CountAnnotator(), 'dates': DateAnnotator()}
    doc = AnnoDoc(text)
    chosen = annotators[to_optimize]
    doc.add_tiers(chosen)
    return doc