def multi_token_match(stream_item, aligner_data):
    '''
    Iterate through tokens looking for near-exact matches to the
    mention strings in stream_item.ratings[annotator_id][i].mentions.
    '''
    sentences = stream_item.body.sentences.get(aligner_data['tagger_id'])
    if not sentences:
        return
    ## construct a list of tuples, where the first part of each tuple
    ## is a tuple of cleansed strings, and the second part is the
    ## Token object from which it came.
    tokens = map(
        lambda tok: (cleanse(tok.token.decode('utf8')).split(' '), tok),
        itertools.chain(*[sent.tokens for sent in sentences]))
    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator, target=rating.target)

                num_tokens_matched = 0
                for tok in look_ahead_match(rating, tokens):
                    if aligner_data.get('update_labels'):
                        tok.labels.pop(annotator_id, None)
                    add_annotation(tok, label)
                    num_tokens_matched += 1

                if num_tokens_matched == 0:
                    logger.critical(
                        'failed multi_token_match %r:\n  mentions: %r\n  tokens: %r\n clean_html=%r',
                        stream_item.abs_url, rating.mentions, tokens,
                        stream_item.body.clean_html)
                else:
                    logger.debug('matched %d tokens for %r',
                                 num_tokens_matched, rating.target.target_id)
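## look_ahead_match is not shown in this example.  A minimal sketch of
## what such a helper might do, assuming `tokens` is the list of
## (cleansed_words, Token) pairs built above and that `cleanse` is the
## same normalizer used there -- an illustration of the idea, not the
## pipeline's actual implementation:
def look_ahead_match_sketch(rating, tokens):
    '''yield every Token covered by a full match of a rating.mentions string'''
    mention_word_lists = [
        cleanse(m if isinstance(m, unicode) else m.decode('utf8')).split(' ')
        for m in rating.mentions]
    for start in range(len(tokens)):
        for words in mention_word_lists:
            remaining = list(words)
            matched = []
            i = start
            while remaining and i < len(tokens):
                tok_words, tok = tokens[i]
                ## the token's cleansed words must be the next chunk of
                ## the mention's word sequence
                if remaining[:len(tok_words)] != tok_words:
                    break
                matched.append(tok)
                remaining = remaining[len(tok_words):]
                i += 1
            if matched and not remaining:
                for tok in matched:
                    yield tok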
def test_multi_token_match():
    si = make_stream_item(0, '')
    tagger_id = 'test_tagger'
    annotator_id = 'test_anno'
    target_id = 'test_target'
    si.body.sentences[tagger_id] = [
        Sentence(tokens=[
            Token(token='This'),
            Token(token='-LRB-big-RRB- dog'),
            Token(token='Jake'),
            Token(token='has'),
            Token(token='no'),
            Token(token=u'\U0001F601'.encode('utf8')),  # U+1F601, a four-byte emoji
            Token(token='...'),
            Token(token='Teeth'),
        ])
    ]
    rating = Rating(
        annotator=Annotator(annotator_id=annotator_id),
        target=Target(target_id=target_id),
        mentions=['Big dog! Jake... ', u'\U0001F601 Teeth'.encode('utf8')],
    )
    add_annotation(si, rating)
    aligner_data = dict(
        tagger_id=tagger_id,
        annotator_id=annotator_id,
    )

    multi_token_match(si, aligner_data)

    assert si.body.sentences[tagger_id][0].tokens[1].labels
    assert si.body.sentences[tagger_id][0].tokens[2].labels
    assert si.body.sentences[tagger_id][0].tokens[-3].labels
    assert si.body.sentences[tagger_id][0].tokens[-2].labels
    assert si.body.sentences[tagger_id][0].tokens[-1].labels
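## The assertions above only hold if cleanse() maps tagger escapes such
## as '-LRB-big-RRB- dog' and raw mention text such as 'Big dog! Jake... '
## onto the same lowercase words.  A rough sketch of such a normalizer
## (an assumption for illustration, not the actual streamcorpus cleanse):
import re
import string

_PENN_ESCAPES = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']',
                 '-LCB-': '{', '-RCB-': '}'}
_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))

def cleanse_sketch(text):
    '''lowercase, undo Penn Treebank bracket escapes, strip ASCII punctuation'''
    for escape, char in _PENN_ESCAPES.items():
        text = text.replace(escape, char)
    text = _PUNCT_RE.sub(' ', text.lower())
    ## collapse runs of whitespace so the result splits cleanly on ' '
    return ' '.join(text.split())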
def _offset_labels(stream_item, aligner_data, offset_type='BYTES'):
    ## get a set of tokens -- must have OffsetType.<offset_type> type offsets.

    offset_type = OffsetType._NAMES_TO_VALUES[offset_type]

    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## These next few steps are probably the most
    ## memory intensive, because they fully
    ## instantiate all the tokens.

    token_collection = SortedCollection(
        itertools.chain(*[sent.tokens for sent in sentences]),
        key=lambda tok: tok.offsets[offset_type].first)

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:

            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(offset_type)

            assert label_off.length == len(label_off.value)

            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #                         '\n'.join(hope_original.split('\n')[label_off.first:label_off.first+label_off.length]))

            #print 'tc %d %r' % (len(token_collection), token_collection._keys)
            #print 'label_off.first=%d, length=%d, value=%r' % (label_off.first, label_off.length, label_off.value)

            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)

            #print "find_le: ", token_collection.find_le(label_off.first)

            toks = list(toks)
            #print 'aligned tokens', toks

            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                assert tok.token is not None, tok.token

                if tok.token not in label_off.value:
                    sys.exit('%r not in %r' % \
                        ([(t.offsets[offset_type].first, t.token)
                          for t in toks],
                         label_off.value))
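## SortedCollection is not shown here.  A minimal sketch of the two
## lookups these aligners rely on (find_le and find_range), using bisect
## over tokens sorted by their first offset -- an illustration of the
## idea, not the actual SortedCollection recipe used by the pipeline:
import bisect

class SortedTokens(object):
    def __init__(self, tokens, key):
        self._items = sorted(tokens, key=key)
        self._keys = [key(tok) for tok in self._items]

    def find_le(self, k):
        '''return the rightmost item with key <= k; ValueError if none'''
        i = bisect.bisect_right(self._keys, k)
        if i:
            return self._items[i - 1]
        raise ValueError('no item with key <= %r' % (k,))

    def find_range(self, lo, hi):
        '''yield items whose key falls in the half-open range [lo, hi)'''
        i = bisect.bisect_left(self._keys, lo)
        while i < len(self._keys) and self._keys[i] < hi:
            yield self._items[i]
            i += 1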
def get_john_smith_tagged_by_lingpipe_without_labels_data(test_data_dir):
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')

    path = get_john_smith_tagged_by_lingpipe_path(test_data_dir)
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)

    o_chunk.flush()
    return fh.getvalue()
def get_john_smith_tagged_by_lingpipe_without_labels_data():
    fh = StringIO()
    o_chunk = Chunk(file_obj=fh, mode='wb')

    path = get_john_smith_tagged_by_lingpipe_path()
    for si in Chunk(path):
        for sentence in si.body.sentences['lingpipe']:
            for token in sentence.tokens:
                for labels in token.labels.values():
                    for label in labels:
                        label.offsets.update(token.offsets)
                        for offset in label.offsets.values():
                            offset.value = token.token
                        add_annotation(si.body, label)
                token.labels = dict()
        o_chunk.add(si)

    o_chunk.flush()
    return fh.getvalue()
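## The byte string returned above can be read straight back with Chunk
## via a file-like object, the same way it was written.  A usage sketch
## (count_body_labels is an illustrative helper, not part of the corpus
## code):
from StringIO import StringIO

def count_body_labels(chunk_data):
    '''count ContentItem-level labels in a serialized chunk blob'''
    num_labels = 0
    for si in Chunk(file_obj=StringIO(chunk_data), mode='rb'):
        for labels in si.body.labels.values():
            num_labels += len(labels)
    return num_labels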
def names_in_chains(stream_item, aligner_data):
    '''
    Convert doc-level Rating object into a Label, and add that Label
    to all Token in all coref chains identified by
    aligner_data["chain_selector"]

    :param stream_item: document that has a doc-level Rating to translate into token-level Labels.
    :param aligner_data: dict containing:
      chain_selector: ALL, ANY, or ANY_MULTI_TOKEN
      annotator_id: string to find at stream_item.Ratings[i].annotator.annotator_id

    If chain_selector==ALL, then only apply Label to chains in which
    all of the Rating.mentions strings appear as substrings within at
    least one of the Token.token strings.

    If chain_selector==ANY, then apply Label to chains in which any of
    the Rating.mentions strings appear as a substring within at least
    one of the Token.token strings.

    If chain_selector==ANY_MULTI_TOKEN, then apply Label to chains in which all
    the names in any of the Rating.mentions strings appear as a substring within at least
    one of the Token.token strings.
    '''
    chain_selector = aligner_data.get('chain_selector', '')
    assert chain_selector in _CHAIN_SELECTORS, \
        'chain_selector: %r not in %r' % (chain_selector, _CHAIN_SELECTORS.keys())

    ## convert chain_selector to a function
    chain_selector = _CHAIN_SELECTORS[chain_selector]

    ## make inverted index equiv_id --> (names, tokens)
    equiv_ids = make_chains_with_names(stream_item.body.sentences)

    for annotator_id, ratings in stream_item.ratings.items():
        if annotator_id == aligner_data['annotator_id']:
            for rating in ratings:
                label = Label(annotator=rating.annotator, target=rating.target)

                for eqid, (chain_mentions, chain_tokens) in equiv_ids.items():
                    if chain_selector(rating.mentions, chain_mentions):
                        ## apply the label
                        for tok in chain_tokens:
                            add_annotation(tok, label)
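## _CHAIN_SELECTORS is defined elsewhere.  Based on the docstring above,
## the ALL and ANY selectors might look roughly like this -- a sketch of
## the described behavior, not the pipeline's actual implementation:
def _all_selector(rating_mentions, chain_mentions):
    '''every rating mention appears as a substring of some chain token'''
    return all(
        any(mention in chain_tok for chain_tok in chain_mentions)
        for mention in rating_mentions)

def _any_selector(rating_mentions, chain_mentions):
    '''at least one rating mention appears as a substring of some chain token'''
    return any(
        mention in chain_tok
        for mention in rating_mentions
        for chain_tok in chain_mentions)

_CHAIN_SELECTORS_SKETCH = {'ALL': _all_selector, 'ANY': _any_selector}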
    def __call__(self, stream_item, context):
        '''
        Act as an incremental transform in the kba.pipeline
        '''
        ## right now, we only do clean_html
        assert self.config.get('require_clean_html', True)

        if stream_item.body and stream_item.body.clean_html:
            labels = self.make_labels(stream_item.body.clean_html,
                                      stream_item.body.clean_visible)
            if labels:
                if self.offset_type == OffsetType.LINES:
                    ## for LINES-type labels, must replace clean_html
                    ## with a new one that has newlines inserted
                    stream_item.body.clean_html = self.clean_html

                ## Remove any previous author labels
                stream_item.body.labels['author'] = []

                ## also add the new labels
                add_annotation(stream_item.body, *labels)
        return stream_item
def line_offset_labels(stream_item, aligner_data):
    ## get a set of tokens -- must have OffsetType.LINES in them.
    sentences = stream_item.body.sentences[aligner_data['tagger_id']]

    ## if labels on ContentItem, then make labels on Tokens
    for annotator_id in stream_item.body.labels:
        if annotator_id != aligner_data['annotator_id']:
            continue
        for label in stream_item.body.labels[annotator_id]:

            ## remove the offset from the label, because we are
            ## putting it into the token
            label_off = label.offsets.pop(OffsetType.LINES)

            assert label_off.length == len(label_off.value.split('\n'))
            #print 'L: %d\t%r\t%r' % (label_off.first, label_off.value,
            #    '\n'.join(hope_original.split('\n')[label_off.first:
            #         label_off.first+label_off.length]))

            ## These next few steps are probably the most
            ## memory intensive, because they fully
            ## instantiate all the tokens.
            token_collection = SortedCollection(
                itertools.chain(*[sent.tokens for sent in sentences]),
                key=lambda tok: tok.offsets[OffsetType.LINES].first)

            toks = token_collection.find_range(
                label_off.first, label_off.first + label_off.length)

            for tok in toks:
                add_annotation(tok, label)

                ## only for debugging
                if not tok.token or tok.token not in label_off.value:
                    sys.exit('%r not in %r' % \
                        ([(t.offsets[OffsetType.LINES].first, t.token)
                          for t in toks],
                         label_off.value))
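## A sketch of the minimal data this aligner expects: one token carrying
## an OffsetType.LINES offset and one ContentItem-level label whose
## LINES offset covers that line.  The tagger/annotator/target ids and
## the text are made up for illustration:
def make_line_aligned_example():
    si = make_stream_item(0, 'http://example.com/doc')
    tok = Token(token='Smith')
    tok.offsets[OffsetType.LINES] = Offset(
        type=OffsetType.LINES, first=3, length=1)
    si.body.sentences['example_tagger'] = [Sentence(tokens=[tok])]

    label = Label(annotator=Annotator(annotator_id='example_anno'),
                  target=Target(target_id='example_target'))
    label.offsets[OffsetType.LINES] = Offset(
        type=OffsetType.LINES, first=3, length=1,
        value='John Smith visited.')
    add_annotation(si.body, label)
    return si

## Running line_offset_labels(make_line_aligned_example(),
## dict(tagger_id='example_tagger', annotator_id='example_anno'))
## copies the label onto the 'Smith' token and strips its LINES offset.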
    def make_sentences(self, stream_item):
        'assemble Sentence and Token objects'
        self.make_label_index(stream_item)
        sentences = []
        token_num = 0
        new_mention_id = 0
        for sent_start, sent_end, sent_str in self._sentences(
                stream_item.body.clean_visible):
            assert isinstance(sent_str, unicode)
            sent = Sentence()
            sentence_pos = 0
            for start, end in self.word_tokenizer.span_tokenize(sent_str):
                try:
                    token_str = sent_str[start:end].encode('utf8')
                except Exception, exc:
                    logger.critical("died on sent_str[%d:%d].encode('utf8')",
                                    start,
                                    end,
                                    exc_info=True)
                    sys.exit('failed to cope with %r in %r' %
                             (sent_str[start:end], sent_str))
                tok = Token(
                    token_num=token_num,
                    token=token_str,
                    sentence_pos=sentence_pos,
                )
                tok.offsets[OffsetType.BYTES] = Offset(
                    type=OffsetType.BYTES,
                    first=sent_start + start,
                    length=end - start,
                )
                ## whitespace tokenizer will never get a token
                ## boundary in the middle of an 'author' label
                try:
                    #logger.debug('searching for %d in %r', sent_start + start, self.label_index._keys)
                    label = self.label_index.find_le(sent_start + start)
                except ValueError:
                    label = None
                if label:
                    off = label.offsets[OffsetType.BYTES]
                    if off.first + off.length > sent_start + start:
                        logger.info('overlapping label: %r' %
                                    label.target.target_id)
                        ## overlaps
                        streamcorpus.add_annotation(tok, label)
                        assert label.annotator.annotator_id in tok.labels

                        logger.info('adding label to tok: %r has %r',
                                    tok.token, label.target.target_id)

                        if label in self.label_to_mention_id:
                            mention_id = self.label_to_mention_id[label]
                        else:
                            mention_id = new_mention_id
                            new_mention_id += 1
                            self.label_to_mention_id[label] = mention_id

                        tok.mention_id = mention_id

                token_num += 1
                sentence_pos += 1
                sent.tokens.append(tok)
            sentences.append(sent)
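    ## make_label_index is not shown.  From its use above it evidently
    ## indexes the ContentItem-level labels by the first byte of their
    ## BYTES offsets so that find_le() can return the label starting at
    ## or before a given token.  A rough sketch of that idea, reusing
    ## SortedCollection as the other aligners do; keying on the 'author'
    ## labels is an assumption based on the comment about 'author'
    ## labels above:
    def make_label_index_sketch(self, stream_item):
        labels = stream_item.body.labels.get('author', [])
        self.label_index = SortedCollection(
            labels,
            key=lambda label: label.offsets[OffsetType.BYTES].first)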
def generate_john_smith_chunk(path_to_original):
    '''
    This _looks_ like a Chunk only in that it generates StreamItem
    instances when iterated upon.
    '''
    ## Every StreamItem has a stream_time property.  It usually comes
    ## from the document creation time.  Here, we assume the JS corpus
    ## was created at one moment at the end of 1998:
    creation_time = '1998-12-31T23:59:59.999999Z'
    correct_time = 915148799

    if not os.path.isabs(path_to_original):
        path_to_original = os.path.join(os.getcwd(), path_to_original)

    ## iterate over the files in the 35 input directories
    for label_id in range(35):

        dir_path = os.path.join(path_to_original, str(label_id))
        fnames = os.listdir(dir_path)
        fnames.sort()
        for fname in fnames:

            stream_item = streamcorpus.make_stream_item(
                creation_time,
                ## make up an abs_url
                os.path.join('john-smith-corpus', str(label_id), fname))

            if int(stream_item.stream_time.epoch_ticks) != correct_time:
                raise PipelineBaseException('wrong stream_time construction: %r-->%r != %r'\
                                            % (creation_time, stream_item.stream_time.epoch_ticks,
                                               correct_time))

            ## These docs came from the authors of the paper cited above.
            stream_item.source = 'bagga-and-baldwin'

            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing
            ## tricky in it, because we manually cleansed it.  To
            ## illustrate how we stick all strings into thrift, we
            ## convert this to unicode (which introduces no changes)
            ## and then encode it as utf-8, which also introduces no
            ## changes.  Thrift stores strings as 8-bit character
            ## strings.
            # http://www.mail-archive.com/[email protected]/msg00210.html
            body.clean_visible = unicode(raw_string).encode('utf8')

            ## attach the content_item to the stream_item
            stream_item.body = body

            stream_item.body.language = streamcorpus.Language(code='en',
                                                              name='ENGLISH')

            ## The authors also annotated the corpus
            anno = streamcorpus.Annotator()
            anno.annotator_id = 'bagga-and-baldwin'
            anno.annotation_time = stream_item.stream_time

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(
                target_id=str(label_id))  # must be string
            rating.contains_mention = True
            rating.mentions = ['john', 'smith']

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

            ## provide this stream_item to the pipeline
            yield stream_item
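## A usage sketch: stream the generated items into a chunk file on disk
## using the same Chunk calls as the other examples here (the output
## path is made up for illustration):
def write_john_smith_chunk(path_to_original, out_path='john-smith.sc'):
    '''write every generated StreamItem into a chunk file; return the count'''
    num = 0
    with open(out_path, 'wb') as fh:
        o_chunk = streamcorpus.Chunk(file_obj=fh, mode='wb')
        for si in generate_john_smith_chunk(path_to_original):
            o_chunk.add(si)
            num += 1
        o_chunk.flush()
    return num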
    def _make_stream_item(cls, path, metadata, abs_url, entities):
        '''
        Build a StreamItem for the file at path, attaching a doc-level
        Rating for each (entity, is_profile) pair in entities.
        '''
        ## Every StreamItem has a stream_time property.  It usually comes
        ## from the document creation time.
        creation_time = os.path.getctime(path)

        ## make stream item
        stream_item = streamcorpus.make_stream_item(creation_time, abs_url)

        stream_item.source = metadata.get('source')

        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        body.media_type = magic.from_file(path, mime=True)

        logger.info('opening %r', path)
        with open(path) as f:
            body.raw = f.read()

        ## attach the content_item to the stream_item
        stream_item.body = body

        ## annotations
        anno = streamcorpus.Annotator()
        anno.annotator_id = metadata['annotator_id']
        anno.annotation_time = stream_item.stream_time

        num_ratings = 0
        for entity, is_profile in entities:
            num_ratings += 1

            ## pull out target id and mention tokens
            target_id = str(entity['target_id'])

            ## build a Label for the doc-level label:
            rating = streamcorpus.Rating()
            rating.annotator = anno
            rating.target = streamcorpus.Target(target_id=target_id)
            rating.contains_mention = True

            if is_profile:
                rating.flags = [streamcorpus.FlagType.PROFILE]

            ## parse slots in yaml file
            slots = cls._parse_slots(entity['slots'])

            ## cleanse each slot value and use it as a separate mention
            rating.mentions = [
                cleanse(unicode(slot[1], 'utf-8')) for slot in slots
            ]

            ## put this one label in the array of labels
            streamcorpus.add_annotation(stream_item, rating)

        ## provide this stream_item to the pipeline
        logger.info('created StreamItem(num ratings=%d, abs_url=%r)',
                    num_ratings, stream_item.abs_url)
        return stream_item
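## _parse_slots is not shown.  From its use above it evidently returns
## (slot_name, value) pairs, where each value is a utf-8 byte string
## that gets cleansed into a mention.  A rough sketch, assuming the
## YAML 'slots' entry is a mapping of slot name to a value or a list
## of values (that input shape is an assumption):
def _parse_slots_sketch(slots):
    pairs = []
    for name, value in slots.items():
        values = value if isinstance(value, list) else [value]
        for v in values:
            if isinstance(v, unicode):
                v = v.encode('utf-8')
            pairs.append((name, v))
    return pairs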
    def _make_stream_item(self, dir_path, fname):

        ## could use dirpath as the label.  Instead, we illustrate
        ## using a TSV file to lookup the ground truth using the fname.
        assert fname in self.ground_truth, (dir_path, fname)

        ## "mention" is the name string from the text
        ## "target_id" is the label
        mention, target_id = self.ground_truth[fname]

        ## Every StreamItem has a stream_time property.  It usually comes
        ## from the document creation time.  Here, we assume the JS corpus
        ## was created at one moment at the end of 1998:
        creation_time = "1998-12-31T23:59:59.999999Z"

        stream_item = streamcorpus.make_stream_item(
            creation_time,
            ## make up an abs_url
            os.path.join("john-smith-corpus", target_id, fname),
        )

        ## These docs came from the authors of the paper cited above.
        stream_item.source = "bagga-and-baldwin"

        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        raw_string = open(os.path.join(dir_path, fname)).read()
        ## We know that this is already clean and has nothing
        ## tricky in it, because we manually cleansed it.  To
        ## illustrate how we stick all strings into thrift, we
        ## convert this to unicode (which introduces no changes)
        ## and then encode it as utf-8, which also introduces no
        ## changes.  Thrift stores strings as 8-bit character
        ## strings.
        # http://www.mail-archive.com/[email protected]/msg00210.html
        body.clean_visible = unicode(raw_string).encode("utf8")

        ## attach the content_item to the stream_item
        stream_item.body = body

        stream_item.body.language = streamcorpus.Language(code="en", name="ENGLISH")

        ## The authors also annotated the corpus
        anno = streamcorpus.Annotator()
        anno.annotator_id = "bagga-and-baldwin"
        anno.annotation_time = stream_item.stream_time

        ## build a Label for the doc-level label:
        rating = streamcorpus.Rating()
        rating.annotator = anno
        rating.target = streamcorpus.Target(target_id=target_id)
        rating.contains_mention = True

        ## heuristically split the mentions string on white space and
        ## use each token as a separate mention.  For other corpora,
        ## this might need to be more sophisticated.
        rating.mentions = map(cleanse, mention.decode("utf8").split())

        ## put this one label in the array of labels
        streamcorpus.add_annotation(stream_item, rating)

        ## provide this stream_item to the pipeline
        return stream_item
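    ## self.ground_truth is not shown.  Based on the comments above it
    ## maps each file name to a (mention, target_id) pair loaded from a
    ## TSV file.  A rough sketch of such a loader (the column order is
    ## an assumption for illustration):
    def load_ground_truth_sketch(self, tsv_path):
        '''populate self.ground_truth as {fname: (mention, target_id)}'''
        import csv
        self.ground_truth = {}
        with open(tsv_path, 'rb') as fh:
            for row in csv.reader(fh, delimiter='\t'):
                if not row:
                    continue
                fname, mention, target_id = row[0], row[1], row[2]
                self.ground_truth[fname] = (mention, target_id)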