def _annotate_all_tiers(text):
    """Wrap ``text`` in an AnnoDoc and attach every supported annotation tier.

    The geoname, count, resolved-keyword and date annotators are applied
    one after another, in that order.

    Returns:
        The AnnoDoc after all four annotators have been added.
    """
    doc = AnnoDoc(text)
    annotators = (
        GeonameAnnotator(),
        CountAnnotator(),
        ResolvedKeywordAnnotator(),
        DateAnnotator(),
    )
    for annotator in annotators:
        doc.add_tiers(annotator)
    return doc
def annotated_example():
    """Return the annotated example document, creating the fixture on first use.

    The document is cached as a pickle at
    ``../data/fixtures/annotated_example.pkl`` relative to this file. When
    the pickle already exists it is loaded and returned; otherwise the
    example text is annotated with ``CountAnnotator``, passed through
    ``delete_non_epitator_name_entity_tiers``, pickled to that path and
    returned.
    """
    here = os.path.dirname(__file__)
    path = os.path.join(here, '..', 'data', 'fixtures', 'annotated_example.pkl')

    # Fast path: the fixture has already been generated.
    if os.path.isfile(path):
        with open(path, 'rb') as fixture:
            return pickle.load(fixture)

    # Imported here so the annotator modules are only loaded when the
    # fixture actually has to be built.
    from epitator.annotator import AnnoDoc
    from epitator.count_annotator import CountAnnotator

    example_text = (
        'I am in Berlin. Here are 5 confirmed cases of influenza. '
        'Still, less worse than those 100 confirmed and 200 suspected cases last year.'
    )
    annotated = AnnoDoc(example_text)
    annotated.add_tiers(CountAnnotator())
    annotated = delete_non_epitator_name_entity_tiers(annotated)

    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as fixture:
        pickle.dump(annotated, fixture)
    return annotated
class Summarizer:
    """Pick the most prominent disease and location mentioned in a text.

    ``summarize`` annotates the text with ``ResolvedKeywordAnnotator`` and
    ``GeonameAnnotator`` and reduces each tier to its most frequent entry.
    """

    def __init__(self):
        # The most recently annotated document; stays None until
        # ``summarize`` has been called with a single string.
        self.doc: Optional[AnnoDoc] = None

    def summarize(
        self, text: Union[str, List[str]]
    ) -> Union[Dict[str, str], List[Dict[str, str]]]:
        """Summarize one text, or each text of a collection.

        Args:
            text: A single string, or a collection of strings. Anything
                that is not a ``str`` is handed to ``_batch_summarize``.

        Returns:
            For a string, a dict with the keys ``"disease"`` and
            ``"geoname"`` (each an empty string when nothing was found).
            For a collection, a list of such dicts in input order.
        """
        if not isinstance(text, str):
            return self._batch_summarize(text)
        self.doc = AnnoDoc(text)
        self.doc.add_tiers(ResolvedKeywordAnnotator())
        self.doc.add_tiers(GeonameAnnotator())
        return {
            "disease": self._extract_key_disease(),
            "geoname": self._extract_key_geoname(),
        }

    @staticmethod
    def _worker_count() -> int:
        """Return the pool size: one less than the CPU count, never below 1."""
        # ``Pool`` rejects a process count of 0, which is what
        # ``cpu_count() - 1`` yields on a single-CPU host. The ``or 1``
        # also covers cpu_count implementations that return None.
        return max(1, (cpu_count() or 1) - 1)

    def _batch_summarize(self, texts) -> List[Dict[str, str]]:
        """Summarize every entry of ``texts`` in a pool of worker processes."""
        with Pool(self._worker_count()) as p:
            return p.map(self.summarize, texts)

    def _extract_key_disease(self):
        """Return the most frequent resolved keyword label of ``self.doc``.

        For each span of the ``"resolved_keywords"`` tier the label of its
        first resolution is counted. Returns ``""`` when the tier is
        missing, has no spans, or a span lacks the expected entries.
        """
        try:
            diseases = [
                span.metadata["resolutions"][0]["entity"]["label"]
                for span in self.doc.tiers["resolved_keywords"].spans
            ]
            return Counter(diseases).most_common(1)[0][0]
        except (KeyError, IndexError):
            return ""

    def _extract_key_geoname(self):
        """Return the most frequent country among the first three geonames.

        Only the first three spans of the ``"geonames"`` tier are counted.
        A geoname without a ``country_name`` attribute contributes its
        ``name`` instead. Returns ``""`` when the tier is missing or empty.
        """
        try:
            geonames = [span.geoname for span in self.doc.tiers["geonames"].spans]
            country_of_geoname = []
            for geoname in geonames[:3]:
                try:
                    country_of_geoname.append(geoname.country_name)
                except AttributeError:
                    # No country on this geoname; use its own name.
                    country_of_geoname.append(geoname.name)
            return Counter(country_of_geoname).most_common(1)[0][0]
        except (KeyError, IndexError):
            return ""
def main(in_file, out_file):
    """Annotate a text file and write the extracted entities as JSON.

    The file's lines are joined with single spaces and annotated with the
    geoname, date and resolved-keyword annotators.

    Args:
        in_file: Path of the text file to read.
        out_file: Path of the JSON file to write. The output object has
            the keys ``'location'``, ``'date'`` and ``'resolved_keyword'``.
    """
    # Read inside a context manager so the input handle is always closed.
    with open(in_file) as src:
        txt = ' '.join(src.readlines())

    adoc = AnnoDoc(txt)
    adoc.add_tiers(GeonameAnnotator())
    adoc.add_tiers(DateAnnotator())
    adoc.add_tiers(ResolvedKeywordAnnotator())

    # Locations and resolved keywords are serialized through the spans'
    # own ``to_dict``; date spans are serialized through ``str``.
    geo = [span.to_dict() for span in adoc.tiers['geonames'].spans]
    dates = [str(span) for span in adoc.tiers['dates'].spans]
    other = [span.to_dict() for span in adoc.tiers['resolved_keywords'].spans]

    d = {'location': geo, 'date': dates, 'resolved_keyword': other}
    with open(out_file, 'w') as dest:
        json.dump(d, dest)
def _annotate(self, text: str, entity: str) -> AnnoDoc:
    """Annotate ``text`` with the tier selected by ``entity``.

    Args:
        text: Text to wrap in an AnnoDoc.
        entity: ``"counts"`` for the count annotator or ``"dates"`` for
            the date annotator.

    Returns:
        The AnnoDoc with the selected annotator added.

    Raises:
        KeyError: If ``entity`` is neither ``"counts"`` nor ``"dates"``.
    """
    # Map to the classes rather than to instances so that only the
    # annotator actually requested gets constructed.
    annotator_classes = {"counts": CountAnnotator, "dates": DateAnnotator}
    # Look the key up first: an unknown entity fails before the document
    # is built.
    annotator_cls = annotator_classes[entity]
    annotated = AnnoDoc(text)
    annotated.add_tiers(annotator_cls())
    return annotated
def _annotate(text: str, to_optimize: str) -> AnnoDoc:
    """Annotate ``text`` with the tier named by ``to_optimize``.

    Args:
        text: Text to wrap in an AnnoDoc.
        to_optimize: ``'counts'`` for the count annotator or ``'dates'``
            for the date annotator.

    Returns:
        The AnnoDoc with the selected annotator added.

    Raises:
        KeyError: If ``to_optimize`` is neither ``'counts'`` nor ``'dates'``.
    """
    # Hold the classes, not instances: the annotator that is not asked
    # for is never constructed.
    annotator_classes = {'counts': CountAnnotator, 'dates': DateAnnotator}
    # Resolve the key before building the document so a bad key fails early.
    annotator_cls = annotator_classes[to_optimize]
    annotated = AnnoDoc(text)
    annotated.add_tiers(annotator_cls())
    return annotated