def _process_mm_biomarker_result(spans: List[Span], tokens: List[DocToken]) -> List[Span]:
    """Build result spans for the cell counts and mutation found in ``spans``.

    Spans labelled 'MEASUREMENT' become cell-count spans: when two or more
    are present, the first is wrapped as AbnormalCells and the second as
    TotalCells (any further ones are ignored); a single one is wrapped as
    TotalCells. If at least one span labelled 'MM_BIOMARKER_MUTATION' is
    present, one MmBiomarkerMutation span is added whose text joins the
    ``token`` attribute of every entry of ``tokens`` from the first such
    span's ``offset`` to the end.

    Args:
        spans: Candidate spans; only their label, text, tokens and offset
            are read.
        tokens: Document tokens, sliced with the mutation span's offset.
            NOTE(review): this assumes ``offset`` is an index into
            ``tokens`` -- confirm against the caller.

    Returns:
        The newly created spans, cell-count spans first; an empty list when
        nothing matched.
    """
    measurements = [span for span in spans if span.label == 'MEASUREMENT']

    # Pair each measurement that is used with the entity type it maps to.
    if len(measurements) > 1:
        cell_counts = [
            (measurements[0], AbnormalCells),
            (measurements[1], TotalCells),
        ]
    elif len(measurements) == 1:
        cell_counts = [(measurements[0], TotalCells)]
    else:
        cell_counts = []

    built = []
    for source, entity_type in cell_counts:
        count_text = source.text
        built.append(
            Span(
                text=count_text,
                entity=entity_type.create(count_text),
                tokens=source.tokens,
            )
        )

    mutations = [span for span in spans if span.label == 'MM_BIOMARKER_MUTATION']
    if mutations:
        # The mutation text runs from the first mutation span to the end
        # of the supplied tokens.
        trailing_tokens = tokens[mutations[0].offset:]
        mutation_text = ' '.join(token.token for token in trailing_tokens)
        built.append(
            Span(
                text=mutation_text,
                entity=MmBiomarkerMutation.create(mutation_text),
                tokens=trailing_tokens,
            )
        )

    return built
def spans(doc, tokens):
    """Return four spans over the text "a", each covering ``tokens[:1]``.

    The spans differ only in their entity: names "A" and "B" under
    "LABEL_0", name "A" under "LABEL_1", and name "C" under "LABEL_0" with
    the synonym "a". Before returning, the function asserts that
    ``count_it`` reports 4 for the list.

    NOTE(review): this looks like a test fixture; no decorator is visible
    here -- confirm where it is registered.
    """
    entity_specs = (
        {"name": "A", "label": "LABEL_0"},
        {"name": "B", "label": "LABEL_0"},
        {"name": "A", "label": "LABEL_1"},
        {"name": "C", "label": "LABEL_0", "synonyms": ["a"]},
    )
    candidates = [
        Span(text="a", doc=doc, entity=Entity(**spec), tokens=tokens[:1])
        for spec in entity_specs
    ]
    assert 4 == count_it(candidates)
    return candidates
def sort_key(cls, span: Span):
    """Return the tuple used to order ``span`` relative to other spans.

    The tuple is compared element by element: the negated token count (so
    that, in an ascending sort, spans with more tokens come first), then
    the value returned by ``span.match_type()``, then the span's offset,
    and finally its label.
    """
    token_rank = -span.num_tokens
    match_rank = span.match_type()
    return token_rank, match_rank, span.offset, span.label
def sort_key(cls, span: Span):
    """Return the ordering key for ``span``.

    Keys are tuples compared element by element: negated token count,
    match type, offset, then label.
    """
    longest_first = -span.num_tokens  # longest wins
    match_quality = span.match_type()  # exact name > exact synonym > lower case
    # Offset and label only break remaining ties, keeping the order
    # deterministic.
    return (longest_first, match_quality, span.offset, span.label)
def _process_mm_biomarker(spans: List[Span]) -> List[Span]:
    """Merge ``spans`` into a single biomarker-result span.

    The span texts are joined with '/' and their tokens are concatenated in
    order. A Biomarker span is built from that text and tokens, and it is
    then wrapped by ``BiomarkerResult.create`` in a second span carrying
    the same text and tokens.

    Args:
        spans: Spans to merge; only their text and tokens are read.

    Returns:
        A one-element list holding the biomarker-result span, or an empty
        list when ``spans`` is empty.
    """
    if len(spans) == 0:
        return []

    joined_text = '/'.join(span.text for span in spans)

    all_tokens = []
    for span in spans:
        all_tokens.extend(span.tokens)

    biomarker = Span(
        text=joined_text,
        entity=Biomarker.create(joined_text),
        tokens=all_tokens,
    )
    biomarker_result = Span(
        text=joined_text,
        entity=BiomarkerResult.create(biomarker),
        tokens=all_tokens,
    )
    return [biomarker_result]
def is_keep(self, span: Span):
    """Return True when ``span`` should be kept.

    A span is kept as soon as one of these checks passes, tried in order:

    1. its text is longer than 3 characters;
    2. its text contains a digit or one of ``%<>=-+~`` (often measurements);
    3. its match type is neither a lowercase synonym nor a lowercase name.
    """
    if len(span.text) > 3:
        return True

    # Numbers or symbols in the text often indicate a measurement.
    if set(span.text) & set("1234567890%<>=-+~"):
        return True

    # Otherwise keep it only if it is not solely a lowercase match.
    return span.match_type() not in {
        SpanMatch.LowercaseSynonym,
        SpanMatch.LowercaseName,
    }
def _resolve_entity(self, prefix: Token, doc_tokens: List[DocToken]):
    """Resolve ``prefix`` to entities, falling back to shorter prefixes.

    ``self.resolver.resolve`` is called with the current prefix. Every
    entity it yields is recorded as a Span appended to ``self.spans``, and
    the search stops after the first prefix that yields anything. When a
    prefix yields nothing, the search retries with ``prefix.left_token``
    and with the last entry dropped from the token list, until the prefix
    is falsy.

    Args:
        prefix: Starting term; must expose ``left_token``.
        doc_tokens: Tokens covered by ``prefix``; the caller's list is
            never mutated (a shortened copy is made on each fallback).
    """
    term = prefix
    covered_tokens = doc_tokens

    while term:
        matched = False
        for entity in self.resolver.resolve(term=term):
            self.spans.append(
                Span(
                    text=term,
                    doc=self.doc,
                    entity=entity,
                    tokens=covered_tokens,
                )
            )
            matched = True

        if matched:
            break

        # Nothing resolved: retry with the shorter prefix to the left.
        term = term.left_token
        covered_tokens = covered_tokens[:-1]