def merge_top_n(chains):
    """
    Merge several tag chains into one, giving priority to earlier chains.

    A shallow copy of the first (most probable) chain is the starting
    point. Every following chain is split into groups with
    ``IobEncoder.iter_group`` and each group is copied into the result
    only if none of its positions is already tagged (i.e. differs from
    ``'O'``) in the result. Groups that overlap an existing entity,
    even partially, are skipped entirely.

    non-overlap

    >>> chains = [ ['B-PER', 'O'     ],
    ...            ['O'    , 'B-FUNC'] ]
    >>> merge_top_n(chains)
    ['B-PER', 'B-FUNC']

    partially overlap

    >>> chains = [ ['B-PER', 'I-PER', 'O'    ],
    ...            ['O'    , 'B-PER', 'I-PER'] ]
    >>> merge_top_n(chains)
    ['B-PER', 'I-PER', 'O']

    fully overlap

    >>> chains = [ ['B-PER', 'I-PER'],
    ...            ['B-ORG', 'I-ORG'] ]
    >>> merge_top_n(chains)
    ['B-PER', 'I-PER']
    """
    merged = copy.copy(chains[0])
    for candidate in chains[1:]:
        grouper = IobEncoder()
        # Feeding enumerate() makes each group a list of positions,
        # which index both ``merged`` and ``candidate`` below.
        for positions, _group_tag in grouper.iter_group(enumerate(candidate)):
            # Stop scanning at the first position that is already taken.
            if any(merged[pos] != 'O' for pos in positions):
                continue
            for pos in positions:
                merged[pos] = candidate[pos]
    return merged
def extract(self, bytes_data):
    """
    Extract named entities from binary HTML data ``bytes_data``.

    The tokens and tags produced by ``self.extract_raw`` are grouped
    with ``IobEncoder.group``; every group whose tag is not ``'O'`` is
    turned into an ``(entity_text, entity_type)`` pair, where the text
    comes from ``self.build_entity``. The pairs are passed through
    ``_drop_empty`` and its result is returned.
    """
    html_tokens, tags = self.extract_raw(bytes_data)
    grouped = IobEncoder.group(zip(html_tokens, tags))

    def _iter_entities():
        # Lazily build one pair per tagged group, skipping 'O' groups.
        for group_tokens, group_tag in grouped:
            if group_tag == 'O':
                continue
            yield self.build_entity(group_tokens), group_tag

    return _drop_empty(_iter_entities())
def extract(self, bytes_data):
    """
    Extract named entities from binary HTML data ``bytes_data``.

    Tokens and tags come from ``self.extract_raw`` and are grouped with
    ``IobEncoder.group``. For each group not tagged ``'O'`` the entity
    text is built by ``self.build_entity(tokens, tag)`` and paired with
    the tag as ``(entity_text, entity_type)``. The pairs are passed
    through ``_drop_empty`` and its result is returned.
    """
    html_tokens, tags = self.extract_raw(bytes_data)
    token_tag_pairs = zip(html_tokens, tags)
    grouped = IobEncoder.group(token_tag_pairs)

    # Kept as a generator so entities are built only as they are consumed.
    tagged_entities = (
        (self.build_entity(entity_tokens, entity_tag), entity_tag)
        for entity_tokens, entity_tag in grouped
        if entity_tag != 'O'
    )
    return _drop_empty(tagged_entities)
def __init__(self, tagset=None, sequence_encoder=None,
             text_tokenize_func=None, kill_html_tags=None,
             replace_html_tags=None, ignore_html_tags=None):
    """
    Store the loader configuration and precompile the tag regex.

    ``tagset`` and ``ignore_html_tags`` are copied into new sets when
    given; ``ignore_html_tags`` defaults to ``{'script', 'style'}`` and
    always additionally contains ``Comment``. ``text_tokenize_func``
    falls back to ``tokenize`` and ``sequence_encoder`` to a new
    ``IobEncoder`` when a falsy value is passed. ``kill_html_tags`` and
    ``replace_html_tags`` are stored as given.
    """
    if tagset is None:
        self.tagset = None
    else:
        self.tagset = set(tagset)

    if not text_tokenize_func:
        text_tokenize_func = tokenize
    self.text_tokenize_func = text_tokenize_func

    self.kill_html_tags = kill_html_tags
    self.replace_html_tags = replace_html_tags

    ignored = {'script', 'style'} if ignore_html_tags is None else set(ignore_html_tags)
    ignored.add(Comment)  # always ignore comments
    self.ignore_html_tags = ignored

    # FIXME: don't use shared instance of sequence encoder
    # because sequence encoder is stateful
    if not sequence_encoder:
        sequence_encoder = IobEncoder()
    self.sequence_encoder = sequence_encoder

    # Wrap the encoder's tag pattern so it only matches when delimited
    # by whitespace or the string boundaries.
    inner_pattern = self.sequence_encoder.token_processor.tag_re.pattern.strip()
    self._tag_re = re.compile(r"(^|\s)%s(\s|$)" % inner_pattern)