class Cached(Provider):
    cache: Dict[str, Any]
    name = 'cached'

    known_schemas = {
        # these assume the same tokenisation as the Doc
        "bio": lambda doc, annotation: offsets_from_biluo_tags(
            doc, iob_to_biluo(annotation)),
        "bilou": offsets_from_biluo_tags,
        "offsets": OFFSETS,
        "list_of_clusters": convert_clusters_to_offsets,
        # these provide their own tokenisation
        # annotation: List[Tuple[str, str]]
        "list_of_tuples_bio_flat": lambda doc, annotation: get_offsets(
            doc.text, annotation),
        # annotation: List[List[Tuple[str, str]]]
        "list_of_tuples_bio_stacked": lambda doc, annotation: get_offsets_from_sentences(
            doc.text, annotation),
        # annotation: Tuple[List[str], List[str]]
        "tuple_of_lists_flat": lambda doc, annotation: get_offsets(
            doc.text, zip(*annotation[:2])),
        # annotation: List[Tuple[List[str], List[str]]]
        "list_of_tuples_of_lists": lambda doc, annotation: get_offsets_from_sentences(
            doc.text,
            ((w, l) for t in annotation for w, l in zip(*t[:2]))),
        # annotation: Tuple[List[List[str]], List[List[str]]]
        "tuple_of_lists_of_lists": lambda doc, annotation: get_offsets_from_sentences(
            doc.text,
            ((w, l) for ws, ls in zip(*annotation[:2])
             for w, l in zip(ws, ls)))
        # TODO: BRAT
        # TODO: Pubmed
    }

    def __init__(self,
                 schema: Union[str, Callable[[Doc, Any], OffsetAnnotation]] = None,
                 getter=None,
                 path: str = None):
        self.cache = {}
        self.loaded = False
        if not schema:
            self.schema = OFFSETS
        elif schema in self.known_schemas:
            self.schema = Cached.known_schemas[schema]
        elif isinstance(schema, Callable):
            self.schema = schema
        else:
            self.schema = None
        self.getter = getter
        if path:
            self.load(path)

    @overrides
    def save(self, path: str):
        util.save_file(self.cache, path)

    # TODO: guess schema
    @overrides
    def load(self, path):
        self.cache = util.load_file(path)
        self.loaded = True

    @overrides
    def annotate_document(self, doc: Doc) -> OffsetAnnotation:
        if not self.loaded:
            raise ValueError("You forgot to load the cache!")
        annotations = self.cache.get(doc._.id, None)
        if annotations:
            if self.schema:
                if self.schema == OFFSETS:
                    return self.getter(annotations) if self.getter else annotations
                else:
                    return self.schema(
                        doc,
                        self.getter(annotations) if self.getter else annotations)
            else:
                logger.info(
                    f"no schema loaded for {self.__class__.__name__}, good luck!")
                return annotations

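# Usage sketch (illustrative only, not part of the original module): how the
# Cached provider might be driven end to end. Assumptions: spaCy is installed,
# the custom `doc._.id` extension is registered by the surrounding pipeline,
# and "cache.pkl" is a hypothetical path; the cache entries are stored in the
# plain "offsets" schema.
if __name__ == "__main__":
    import spacy
    from spacy.tokens import Doc

    # The provider looks documents up by the custom `id` extension.
    if not Doc.has_extension("id"):
        Doc.set_extension("id", default=None)

    nlp = spacy.blank("en")
    doc = nlp("I like cakes.")
    doc._.id = "doc-1"

    # Populate a cache in the offsets schema, save it, then reload it through
    # a fresh provider and annotate the document.
    provider = Cached(schema="offsets")
    provider.cache = {"doc-1": [(7, 12, "CAKE")]}
    provider.save("cache.pkl")            # hypothetical cache file

    cached = Cached(schema="offsets", path="cache.pkl")
    print(cached.annotate_document(doc))  # -> [(7, 12, "CAKE")]
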
def test_get_offsets_works_with_commas_in_between():
    text = "I like, wedding cake cakes."
    annotation = list(
        zip('I like , wedding cake cakes .'.split(),
            "O O O B-CAKE I-CAKE B-CAKE O".split()))
    assert get_offsets(text, annotation) == [(8, 20, "CAKE"), (21, 26, "CAKE")]

def test_get_offsets_works_with_funky_spacing():
    # the doubled/tripled spaces are the point of this test
    text = "I like  ,   wedding cake cake."
    annotation = list(
        zip('I like , wedding cake cake .'.split(),
            "O O O B-CAKE I-CAKE B-CAKE O".split()))
    assert get_offsets(text, annotation) == [(12, 24, "CAKE"), (25, 29, "CAKE")]

def test_get_offsets_works_with_last_tag():
    text = "I like cakes"
    annotation = list(zip(text.split(), "O O B-CAKE".split()))
    assert get_offsets(text, annotation) == [(7, 12, "CAKE")]

def test_get_offsets_works_with_last_longer_tags():
    text = "I like big cakes"
    annotation = list(zip(text.split(), "O O B-CAKE I-CAKE".split()))
    assert get_offsets(text, annotation) == [(7, 16, "CAKE")]

def test_get_offsets_works_with_consecutive_tags():
    text = "I like wedding cake cakes."
    annotation = list(
        zip('I like wedding cake cakes .'.split(),
            "O O B-CAKE I-CAKE B-CAKE O".split()))
    assert get_offsets(text, annotation) == [(7, 19, "CAKE"), (20, 25, "CAKE")]

def test_get_offsets_works_with_bio_tags():
    text = "I like big cakes."
    annotation = list(
        zip('I like big cakes .'.split(), "O O B-CAKE I-CAKE O".split()))
    assert get_offsets(text, annotation) == [(7, 16, "CAKE")]

def test_get_offsets_works_with_wrong_capitalisation():
    text = "I like cakes."
    annotation = list(zip('i like cakes .'.split(), "O O B-CAKE O".split()))
    assert get_offsets(text, annotation) == [(7, 12, "CAKE")]

def test_get_offsets_works_with_sane_text():
    text = "I like cakes."
    annotation = list(zip('I like cakes .'.split(), "O O B-CAKE O".split()))
    assert get_offsets(text, annotation) == [(7, 12, "CAKE")]

def test_offset_last_match_returns_position_of_last_token():
    # same extra-space text as the funky-spacing test, with a trailing 'troll'
    text = "I like  ,   wedding cake cake troll"
    annotation = list(
        zip('I like , wedding cake cake troll'.split(),
            "O O O B-CAKE I-CAKE B-CAKE O".split()))
    _, last_match = get_offsets(text, annotation, return_last_match=True)
    assert last_match == 35