Example 1
def _dump_ratings(fpaths, annotator_ids=[], include_header=False):
    '''
    Read in streamcorpus.Chunk files and print all Rating objects as
    tab-separated values

    :param fpaths: iterator over file paths to Chunks

    :param annotator_ids: if present, only print Rating objects from
    one of these annotators
    '''
    print '\t'.join(
        ['annotator_id', 'target_id', 'stream_id', 'num_mentions', 'mentions'])
    for fpath in fpaths:
        for si in Chunk(path=fpath, mode='rb'):
            for annotator_id, ratings in si.ratings.items():
                if annotator_ids and annotator_id not in annotator_ids:
                    ## skip ratings not created by one of
                    ## annotator_ids
                    continue
                for rating in ratings:
                    assert rating.annotator.annotator_id == annotator_id, \
                        (rating.annotator.annotator_id, annotator_id)
                    print '\t'.join([
                        annotator_id,
                        rating.target.target_id,
                        si.stream_id,
                        str(len(rating.mentions)),
                        json.dumps(rating.mentions),
                    ])
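All of these examples appear to come from a single command-line module and rely on a few module-level globals. A minimal sketch of that shared setup, assuming the streamcorpus package is installed; the concrete message_class and token_attrs values below are illustrative assumptions, not the original module's definitions:

import collections
import json
import sys

from streamcorpus import Chunk, OffsetType, StreamItem

## the examples reference these globals; the values here are
## assumptions chosen for illustration
message_class = StreamItem
token_attrs = ['token_num', 'token', 'sentence_pos', 'lemma', 'pos',
               'entity_type', 'mention_id', 'equiv_id', 'parent_id']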
Example 2
def _dump(fpath, args):
    '''
    Read in a streamcorpus.Chunk file and print each StreamItem to
    stdout; by default the bulky body fields are stripped so that
    only metadata is shown
    '''
    if args.count:
        num_stream_items = 0
        ## stream_ids that have at least one labeled token
        num_labeled_stream_items = set()

    global message_class
    for num, si in enumerate(
            Chunk(path=fpath, mode='rb', message=message_class)):
        if args.limit and num >= args.limit:
            break

        if args.count:
            num_stream_items += 1
            if not args.labels_only:
                continue

        if args.labels_only:
            if si.body.sentences:
                for tagger_id in si.body.sentences:
                    for sent in si.body.sentences[tagger_id]:
                        if args.count and si.stream_id in num_labeled_stream_items:
                            break
                        for tok in sent.tokens:
                            if tok.labels:
                                if args.count:
                                    num_labeled_stream_items.add(si.stream_id)
                                    break
                                else:
                                    print si.stream_id, tok.token.decode(
                                        'utf8').encode('utf8'), tok.labels

        elif args.show_content_field and si.body:
            print getattr(si.body, args.show_content_field)

        else:
            if not args.show_all:
                ## by default strip the bulky fields so only metadata prints
                si.body = None
                if getattr(si, 'other_content', None):
                    for oc in si.other_content:
                        si.other_content[oc] = None
            print si

    if args.count:
        print '%d\t%d\t%s' % (num_stream_items, len(num_labeled_stream_items),
                              fpath)
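_dump expects an argparse-style namespace with count, limit, labels_only, show_all, and show_content_field attributes. One way to drive it; the flag spellings here are assumptions, only the attribute names are dictated by the code above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--count', action='store_true')
parser.add_argument('--limit', type=int, default=0)
parser.add_argument('--labels-only', action='store_true')
parser.add_argument('--show-all', action='store_true')
parser.add_argument('--show-content-field', default=None)
args = parser.parse_args(['--count', '--labels-only'])
_dump('corpus.sc', args)  ## hypothetical chunk file name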
Example 3
def _show_fields(fpaths, fields):
    '''
    Read in streamcorpus.Chunk files and display each field specified
    in 'fields'
    '''
    global message_class
    for fpath in fpaths:
        for si in Chunk(path=fpath, mode='rb', message=message_class):
            for field in fields:
                prop = si
                for prop_name in field.split('.'):
                    prop = getattr(prop, prop_name, None)
                    if not prop: break
                if prop:
                    print field, prop
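Each entry in 'fields' is a dotted attribute path resolved one getattr at a time, so nested StreamItem properties work directly. A sketch; the file name is hypothetical:

_show_fields(['corpus.sc'], ['stream_id', 'abs_url', 'body.clean_visible'])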
Example 4
def _dump_tokens(fpaths, annotator_ids=[]):
    '''
    Read in streamcorpus.Chunk files and print all tokens in a fixed
    order that enables diffing.

    :param fpaths: iterator over file paths to Chunks

    :param annotator_ids: if present, only print tokens with labels
    from one of these annotators
    '''
    global message_class
    print '\t'.join(token_attrs + ['stream_id', 'labels'])
    for fpath in fpaths:
        for si in Chunk(path=fpath, mode='rb', message=message_class):
            if not si.body:
                print 'no body: %s' % si.stream_id
                continue
            if not si.body.sentences:
                print 'no body.sentences: %s' % si.stream_id
                continue
            ## sort tagger_ids for deterministic, diffable output
            tagger_ids = sorted(si.body.sentences.keys())
            for tagger_id in tagger_ids:
                for sent in si.body.sentences[tagger_id]:
                    for tok in sent.tokens:
                        vals = []
                        for attr in token_attrs:
                            val = getattr(tok, attr)
                            if isinstance(val, str):
                                ## round-trip through unicode to verify
                                ## the bytes are valid UTF-8
                                vals.append(val.decode('utf8').encode('utf8'))
                            else:
                                vals.append(repr(val))

                        vals += [si.stream_id]

                        target_ids = []
                        for labels in tok.labels.values():
                            for label in labels:
                                target_ids.append(label.target.target_id)
                        vals += [','.join(target_ids)]

                        line = '\t'.join(vals)
                        found_annotator = False
                        for annotator_id in tok.labels:
                            if annotator_id in annotator_ids:
                                found_annotator = True
                                break
                        if found_annotator or not annotator_ids:
                            print line
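Since tagger_ids are sorted and every token becomes one tab-separated line, dumps of two chunk files can be compared with an ordinary diff. A sketch; the file and annotator names are hypothetical:

_dump_tokens(['old-corpus.sc'], annotator_ids=['some-annotator'])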
Example 5
def _find(fpaths, stream_id):
    '''
    Read in streamcorpus.Chunk files and, if any StreamItem's
    stream_id matches stream_id, print its body.raw to stdout
    '''
    global message_class
    sys.stderr.write('hunting for %r\n' % stream_id)
    for fpath in fpaths:
        for si in Chunk(path=fpath, mode='rb', message=message_class):
            if si.stream_id == stream_id:
                if si.body and si.body.raw:
                    print si.body.raw
                    sys.exit()
                elif si.body:
                    sys.exit('Found %s without si.body.raw' % stream_id)
                else:
                    sys.exit('Found %s without si.body' % stream_id)
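stream_ids in streamcorpus combine an epoch timestamp with a document hash, so a lookup might look like this; the id and file name below are made up:

_find(['corpus.sc'], '1328057420-772addaa19be6a8ee11d1910b7d0e20b')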
Example 6
def _find_missing_labels(fpaths, annotator_ids, component):
    '''
    Read in streamcorpus.Chunk files and report, for each StreamItem,
    whether any token carries a label from one of annotator_ids; when
    none is found, print the body field named by 'component' (or the
    stream_id itself)
    '''
    global message_class
    for fpath in fpaths:
        for si in Chunk(path=fpath, mode='rb', message=message_class):
            if not si.body:
                print 'no body on %s %r' % (si.stream_id, si.abs_url)
                continue
            if not si.body.raw:
                print 'no body.raw on %s %r' % (si.stream_id, si.abs_url)
                continue

            found_annotator = False
            for tagger_id in si.body.sentences:
                for sent in si.body.sentences[tagger_id]:
                    for tok in sent.tokens:
                        ## tok.labels maps annotator_id to lists of labels
                        for annotator_id in tok.labels:
                            if annotator_id in annotator_ids:
                                found_annotator = True
                                break
                        if found_annotator:
                            break
                    if found_annotator:
                        break
                if found_annotator:
                    break
            ## either we found_annotator or read all tokens
            if found_annotator:
                print '## success with %s' % si.stream_id
            else:
                print 'failed to find annotator_id in %r for %s' % (
                    annotator_ids, si.stream_id)
                if component == 'stream_id':
                    print si.stream_id
                else:
                    print getattr(si.body, component)
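Here 'component' names an attribute of si.body to print for items that lack the annotator, or the special value 'stream_id'. A sketch; the path and annotator id are hypothetical:

_find_missing_labels(['corpus.sc'], ['some-annotator'], 'clean_visible')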
Example 7
def _stats(fpaths):
    '''
    Read streamcorpus.Chunk files and print one tab-separated row of
    corpus statistics per file; assumes each StreamItem carries
    si.source_metadata['google'] with a JSON 'MENTION' record
    '''
    global message_class
    keys = [
        'stream_ids', 'num_targets_from_google', 'raw', 'raw_has_targs',
        'raw_has_wp', 'media_type', 'clean_html', 'clean_has_targs',
        'clean_has_wp', 'clean_visible', 'labelsets', 'labels',
        'labels_has_targs', 'sentences', 'tokens', 'at_least_one_label'
    ]
    print '\t'.join(keys)
    for fpath in fpaths:
        #print fpath
        sys.stdout.flush()
        c = collections.Counter()
        labels = collections.Counter()
        for num, si in enumerate(
                Chunk(path=fpath, mode='rb', message=message_class)):
            #print si.stream_id
            sys.stdout.flush()
            c['stream_ids'] += 1
            if si.body:
                target_ids = [
                    rec['target_id'] for rec in json.loads(
                        si.source_metadata['google'])['MENTION']
                ]
                c['num_targets_from_google'] += len(target_ids)
                c['raw'] += int(bool(si.body.raw))
                c['raw_has_targs'] += sum(
                    map(lambda targ: int(bool(targ in repr(si.body.raw))),
                        target_ids))
                c['raw_has_wp'] += int(bool('wikipedia.org' in si.body.raw))
                c['media_type'] += int(bool(si.body.media_type))
                c['clean_html'] += int(bool(si.body.clean_html))
                if si.body.clean_html:
                    c['clean_has_targs'] += sum(
                        map(
                            lambda targ: int(
                                bool(targ in si.body.clean_html.decode('utf8'))
                            ), target_ids))
                    c['clean_has_wp'] += int(
                        bool('wikipedia.org' in si.body.clean_html))

                c['clean_visible'] += int(bool(si.body.clean_visible))
                c['labelsets'] += len(si.body.labelsets)
                c['labels'] += sum(
                    map(lambda labelset: len(labelset.labels),
                        si.body.labelsets))
                label_targs = []
                for labelset in si.body.labelsets:
                    label_targs += [
                        label.target_id for label in labelset.labels
                    ]
                c['labels_has_targs'] += sum(
                    map(lambda targ: int(bool(targ in label_targs)),
                        target_ids))
                _labels = collections.Counter()
                for sentences in si.body.sentences.values():
                    c['sentences'] += len(sentences)
                    for sent in sentences:
                        for tok in sent.tokens:
                            c['tokens'] += 1
                            ## tok.labels maps annotator_id to lists of labels
                            for annotator_id, tok_labels in tok.labels.items():
                                _labels[annotator_id] += len(tok_labels)
                #print _labels
                labels += _labels
                c['at_least_one_label'] += int(bool(_labels))
                sys.stdout.flush()

        print '\t'.join([str(c[k]) for k in keys] +
                        ['%s:%d' % it for it in labels.items()])
        #print json.dumps( c, indent=4 )
        #print json.dumps( labels, indent=4 )
        sys.stdout.flush()
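Note that _stats only works on corpora where every StreamItem carries si.source_metadata['google'] holding a JSON object with a 'MENTION' list; on other corpora it will raise. A driver is just this; the file names are hypothetical:

_stats(['corpus-part-1.sc', 'corpus-part-2.sc'])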
Example 8
def verify_offsets(fpaths):
    '''
    Read in streamcorpus.Chunk files and verify that the 'value'
    property in each offset matches the actual text at that offset.

    :param fpaths: iterator over file paths to Chunks
    '''
    global message_class
    for fpath in fpaths:
        print fpath
        num_valid_line_offsets = 0
        num_valid_byte_offsets = 0
        num_valid_label_offsets = 0
        for si in Chunk(path=fpath, mode='rb', message=message_class):
            if not si.body:
                print 'no body: %s' % si.stream_id
                continue
            if not si.body.sentences:
                print 'no body.sentences: %s' % si.stream_id
                continue
            for tagger_id in si.body.sentences:
                for sent in si.body.sentences[tagger_id]:
                    for tok in sent.tokens:
                        if OffsetType.BYTES in tok.offsets:
                            off = tok.offsets[OffsetType.BYTES]

                            text = getattr(si.body, off.content_form)
                            val = text[off.first:off.first + off.length]

                            if off.value and val != off.value:
                                window = 20
                                print 'ERROR:  %r != %r in %r' % (
                                    off.value, val,
                                    text[off.first - window:off.first +
                                         off.length + window])

                            else:
                                num_valid_byte_offsets += 1

                            for labels in tok.labels.values():
                                for label in labels:
                                    if OffsetType.BYTES in label.offsets:
                                        ## get the offset from the label, and compare the value
                                        off_label = label.offsets[
                                            OffsetType.BYTES]
                                        if off_label.value and val != off_label.value:
                                            window = 20
                                            print 'ERROR:  %r != %r in %r' % (
                                                off_label.value, val,
                                                text[off.first -
                                                     window:off.first +
                                                     off.length + window])
                                        else:
                                            num_valid_label_offsets += 1

                        if OffsetType.LINES in tok.offsets:
                            off = tok.offsets[OffsetType.LINES]

                            text = getattr(si.body, off.content_form)

                            def get_val(text, start, end):
                                return '\n'.join(text.splitlines()[start:end])

                            val_lines = get_val(text, off.first,
                                                off.first + off.length)

                            if not off.value:
                                print 'UNKNOWN: .value not provided in offset'
                                continue

                            elif off.value and off.value not in val_lines:
                                window = 3
                                print 'ERROR:  %r != %r in %r' % (
                                    off.value, val_lines,
                                    get_val(text, off.first - window,
                                            off.first + off.length + window))

                            else:
                                num_valid_line_offsets += 1

                            for labels in tok.labels.values():
                                for label in labels:
                                    if OffsetType.LINES in label.offsets:
                                        ## get the offset from the label, and compare the value
                                        off_label = label.offsets[
                                            OffsetType.LINES]
                                        if off_label.value and off_label.value not in val_lines:
                                            window = 3
                                            print 'ERROR:  %r != %r in %r' % (
                                                off_label.value, val_lines,
                                                get_val(text, off.first - window,
                                                        off.first + off.length + window))
                                        else:
                                            num_valid_label_offsets += 1

        print '''
num_valid_byte_offsets: %d
num_valid_line_offsets: %d
num_valid_label_offsets: %d
''' % (num_valid_byte_offsets, num_valid_line_offsets, num_valid_label_offsets)
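The BYTES check above boils down to slicing the named content field with first and length; a tiny self-contained illustration with made-up values:

text = 'John Smith visited Rome.'
first, length = 5, 5  ## a hypothetical BYTES offset into the text
assert text[first:first + length] == 'Smith'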