Example #1
0
def merge_top_n(chains):
    """
    Merge N tag chains into a single chain.

    The first (most probable) chain is used as the base; the remaining
    N-1 chains are merged into it one by one. An entity from a later
    chain is ignored if any of its tokens overlaps an entity already
    present in the result.

    non-overlap
    >>> chains = [ ['B-PER', 'O'     ],
    ...            ['O'    , 'B-FUNC'] ]

    >>> merge_top_n(chains)
    ['B-PER', 'B-FUNC']

    partially overlap
    >>> chains = [ ['B-PER', 'I-PER', 'O'    ],
    ...            ['O'    , 'B-PER', 'I-PER'] ]

    >>> merge_top_n(chains)
    ['B-PER', 'I-PER', 'O']

    fully overlap
    >>> chains = [ ['B-PER', 'I-PER'],
    ...            ['B-ORG', 'I-ORG'] ]

    >>> merge_top_n(chains)
    ['B-PER', 'I-PER']
    """
    ret = copy.copy(chains[0])
    for chain in chains[1:]:
        encoder = IobEncoder()

        for items, _tag in encoder.iter_group(enumerate(chain)):
            # Skip this entity if any of its positions is already tagged
            # in the result (i.e. it overlaps a more probable entity).
            if any(ret[item] != 'O' for item in items):
                continue

            for item in items:
                ret[item] = chain[item]
    return ret
Example #2
0
 def extract(self, bytes_data):
     """
     Extract named entities from binary HTML data ``bytes_data``.

     Returns a list of ``(entity_text, entity_type)`` tuples.
     """
     html_tokens, tags = self.extract_raw(bytes_data)
     token_tag_pairs = zip(html_tokens, tags)
     entities = (
         (self.build_entity(tokens), tag)
         for tokens, tag in IobEncoder.group(token_tag_pairs)
         if tag != 'O'
     )
     return _drop_empty(entities)
Example #3
0
 def extract(self, bytes_data):
     """
     Extract named entities from binary HTML data ``bytes_data``.

     Returns a list of ``(entity_text, entity_type)`` tuples.
     """
     html_tokens, tags = self.extract_raw(bytes_data)
     tagged_groups = IobEncoder.group(zip(html_tokens, tags))
     named = ((tokens, tag) for tokens, tag in tagged_groups if tag != 'O')
     return _drop_empty(
         (self.build_entity(tokens, tag), tag) for tokens, tag in named
     )
    def __init__(self, tagset=None, sequence_encoder=None,
                 text_tokenize_func=None, kill_html_tags=None,
                 replace_html_tags=None, ignore_html_tags=None):
        """Configure the extractor; every argument is optional."""
        # Restrict output to an explicit tag set only when one is given.
        self.tagset = None if tagset is None else set(tagset)
        self.text_tokenize_func = text_tokenize_func or tokenize
        self.kill_html_tags = kill_html_tags
        self.replace_html_tags = replace_html_tags

        if ignore_html_tags is None:
            self.ignore_html_tags = {'script', 'style'}
        else:
            self.ignore_html_tags = set(ignore_html_tags)
        # Comments are always ignored, regardless of the caller's choice.
        self.ignore_html_tags.add(Comment)

        # FIXME: don't use shared instance of sequence encoder
        # because sequence encoder is stateful
        self.sequence_encoder = sequence_encoder or IobEncoder()

        # Match the encoder's tag pattern only as a whitespace-delimited token.
        raw_pattern = self.sequence_encoder.token_processor.tag_re.pattern
        self._tag_re = re.compile(r"(^|\s)%s(\s|$)" % raw_pattern.strip())