def _make_token(self, start, end):
    '''
    Instantiates a Token from self._input_string[start:end]
    '''
    ## all Thrift strings must be encoded first
    tok_string = self._input_string[start:end].encode('utf-8')
    if only_whitespace.match(tok_string):
        ## drop any tokens with only whitespace
        return None
    tok = Token()
    tok.token = tok_string
    tok.token_num = self.tok_num
    if 'BYTES' in self.config['offset_types']:
        tok.offsets[OffsetType.BYTES] = Offset(
            type=OffsetType.BYTES,
            ## running byte index plus the UTF-8 length of the text before `start`
            first=self.byte_idx + len(self._input_string[:start].encode('utf-8')),
            length=len(tok_string),
            ## py2 and/or idiom: record the token string only when debugging offsets
            value=self.config['offset_debugging'] and tok_string or None,
        )
    if 'LINES' in self.config['offset_types']:
        tok.offsets[OffsetType.LINES] = Offset(
            type=OffsetType.LINES,
            first=self.line_idx,
            length=1,
            value=self.config['offset_debugging'] and tok_string or None,
        )
    self.tok_num += 1
    ## keep track of position within a sentence
    tok.sentence_pos = self.sent_pos
    self.sent_pos += 1
    return tok
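Note that the BYTES offset above counts UTF-8 bytes, not character positions, so it drifts away from the Python slice indices as soon as the input contains non-ASCII text. A minimal standalone sketch of that arithmetic (plain Python, made-up input string for illustration):

# -*- coding: utf-8 -*-
input_string = u'caf\u00e9 token'    # hypothetical input; u'\u00e9' is two bytes in UTF-8
start, end = 5, 10                   # character span covering the word 'token'

char_first = start                                        # character offset
byte_first = len(input_string[:start].encode('utf-8'))    # byte offset, as in _make_token

print(char_first)    # 5
print(byte_first)    # 6 -- one more, because u'\u00e9' encodes to two bytes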
Example 2
def test_entity_type():
    tok_per = Token(entity_type=EntityType.PER)
    tok_foo = Token(entity_type=EntityType.CUSTOM_TYPE,
                    custom_entity_type='foo')

    assert get_entity_type(tok_per) == 'PER'
    assert get_entity_type(tok_foo) == 'foo'
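get_entity_type itself is not part of this listing; the following is a sketch consistent with the two assertions above, assuming EntityType is the Thrift-generated enum importable from streamcorpus (as in the test) and that it exposes the usual _VALUES_TO_NAMES mapping. It is an illustration, not necessarily the library's actual implementation:

from streamcorpus import EntityType

def get_entity_type(tok):
    '''Return 'PER', 'ORG', ... for standard entity types, or the
    free-form custom_entity_type string for CUSTOM_TYPE tokens.'''
    if tok.entity_type == EntityType.CUSTOM_TYPE:
        return tok.custom_entity_type
    return EntityType._VALUES_TO_NAMES[tok.entity_type]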
Example 3
    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode('utf8')
                except Exception as exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start, end, exc_info=True)
                    sys.exit('failed to cope with %r in %r' % (sent_str[start:end], sent_str))
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' % label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                     tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
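The overlap check in make_sentences is pure byte arithmetic: find_le() returns the label whose BYTES offset starts at or before the token, and that label covers the token if its span extends past the token's first byte. The same test in isolation, with made-up offsets:

def label_covers_token(label_first, label_length, token_first):
    '''True if the label span [label_first, label_first + label_length)
    reaches past the first byte of a token starting at token_first,
    assuming the label is already known to start at or before the token
    (which is what find_le() guarantees above).'''
    return label_first + label_length > token_first

print(label_covers_token(100, 20, 110))    # True: token starts inside the label span
print(label_covers_token(100, 20, 125))    # False: token starts after the label ends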
Example 5
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'
    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
            Token(token='This'),
            Token(token='-LRB-big-RRB- dog'),
            Token(token='Jake'),
            Token(token='has'),
            Token(token='no'),
            Token(token=u'\u1F601'.encode('utf8')),
            Token(token='...'),
            Token(token='Teeth'),
        ])
    ]
    rating = Rating(
        annotator=Annotator(annotator_id=annotator_id),
        target=Target(target_id=target_id),
        mentions=['Big dog! Jake... ', u'\u1F601 Teeth'.encode('utf8')],
    )
    add_annotation(si, rating)
    aligner_data = dict(
        tagger_id=tagger_id,
        annotator_id=annotator_id,
    )

    multi_token_match(si, aligner_data)

    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
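The assertions above check individual tokens by index; an equivalent way to see what the aligner matched is to collect every token whose labels map was populated. A small follow-on sketch, assuming it runs right after the test body:

tokens = si.body.sentences[tagger_id][0].tokens
matched = [tok.token for tok in tokens if tok.labels]
print(matched)    # the bracketed-dog, 'Jake', and trailing-mention tokens, per the asserts above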
    assert si.version == Versions.v0_3_0, 'new streamcorpus collections should be built using the latest version'

    ## clean_visible is byte identical to clean_html, except all the
    ## tags are converted to whitespace, so byte offsets in one are valid in the other
    #input_html = si.body.clean_html = text.encode('utf8')
    clean_visible = si.body.clean_visible.decode('utf8')

    ## run the text through a tagger
    #tagger_output = my_tagger( clean_visible )
    
    ## to illustrate, here, we construct a single sentence of tokens
    ## with all the fields populated
    first_sentence = Sentence()
    first_sentence.tokens = [
        Token(
            token='The',
            ),
        Token(
            token='cat',
            ),
        Token(
            token='jumped',
            ),
        Token(
            token='over',
            ),
        Token(
            token='the',
            ),
        Token(
            token='car',