Example 1
def check_spans(doc_fn, tag_fn, options):
    """Verify that each span's stored text matches the document text.

    Reads documents from doc_fn and spans from tag_fn, compares the slice
    doc.text[span.start:span.end+1] (offsets are end-inclusive) against
    span.text, and prints any mismatches plus a summary of errors.
    """
    doc_count, span_count, mismatches = 0, 0, 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(tag_fn, 'r', options) as tag_f:
            span_reader = SpanReader(tag_f)
            for doc in doc_reader:
                for span in span_reader.document_spans(doc.id):
                    # Offsets are end-inclusive, hence the +1 on the slice.
                    doc_span_text = doc.text[span.start:span.end + 1]
                    if doc_span_text != span.text:
                        dt, st = safe_str(doc_span_text), safe_str(span.text)
                        # Fixed quoting: the closing quote belongs right
                        # after the span text, not after the span repr.
                        print(f'text mismatch in {doc.id}: "{dt}" '
                              f'vs "{st}": {span}')
                        mismatches += 1
                    span_count += 1
                doc_count += 1
                if doc_count % 10000 == 0:
                    print(
                        f'processed {doc_count} documents '
                        f'({span_count} spans)',
                        file=sys.stderr)
            # Recompute the total from the reader's line index so it also
            # covers span lines not consumed by the per-document loops.
            span_count, errors = span_reader.iter.index - 1, span_reader.errors
            if span_reader.current_doc_id() is not None:
                print(f'ERROR: extra lines in {tag_fn}')
            if mismatches or errors:
                print(f'Checked {span_count} spans, found {errors} errors '
                      f'and {mismatches} mismatches')
            else:
                print(f'OK, checked {span_count} spans')
Example 2
def cut_tags(doc_fn, tag_fn, out_fn, options):
    """Filter spans in tag_fn against documents cut per options.

    For documents whose offset map is None the span lines are copied
    through verbatim; otherwise spans are parsed, mapped with the offset
    map, and those removed by the mapping are dropped. Writes to out_fn
    and reports removed/total counts on stderr.
    """
    removed, total = 0, 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(tag_fn, 'r', options) as tag_f:
            span_reader = SpanReader(tag_f, no_type_mapping=True)
            with open_file(out_fn, 'w', options) as out_f:
                for doc_idx, doc in enumerate(doc_reader):
                    offset_map = get_offset_map(doc, options)
                    if offset_map is None:
                        # no-op, quick copy without parsing
                        # (lines carry their own newline, hence end='')
                        for span in span_reader.document_lines(doc.id):
                            print(span, end='', file=out_f)
                            total += 1
                    else:
                        # need to parse, map and filter
                        spans = list(span_reader.document_spans(doc.id))
                        mapped = apply_offset_map(spans, offset_map)
                        removed += len(spans) - len(mapped)
                        total += len(spans)
                        for span in mapped:
                            print(span, file=out_f)
                    if (doc_idx+1) % 100000 == 0:
                        print(f'processed {doc_idx+1} documents',
                              file=sys.stderr)
    # Guard the ratio: total is zero when the input held no spans
    # (the original raised ZeroDivisionError in that case).
    ratio = removed / total if total else 0.0
    print(f'removed {removed}/{total} spans ({ratio:.1%})',
          file=sys.stderr)
Example 3
def char_to_byte_offsets(doc_fn, tag_fn, options):
    """Rewrite span character offsets as byte offsets, printing to stdout.

    Stops after options.max_docs documents when that limit is set.
    """
    processed = 0
    with open(doc_fn, encoding=options.encoding) as doc_f, \
            open(tag_fn, encoding=options.encoding) as tag_f:
        doc_reader = DocReader(doc_f)
        span_reader = SpanReader(tag_f)
        for doc in doc_reader:
            if options.max_docs and processed >= options.max_docs:
                break
            if char_and_byte_offsets_are_identical(doc.text, options):
                # Trivial mapping: pass the span lines through verbatim
                # (they carry their own newline, hence end='').
                for line in span_reader.document_lines(doc.id):
                    print(line, end='')
            else:
                # Non-trivial mapping: translate each span's offsets.
                offset_map = make_offset_map(doc.text, options)
                for span in span_reader.document_spans(doc.id):
                    span.start = offset_map[span.start]
                    # End offsets are inclusive: map the position after the
                    # last character, then step back one byte.
                    span.end = offset_map[span.end + 1] - 1
                    print(span)
            processed += 1
            if processed % 10000 == 0:
                print(f'processed {processed} documents', file=sys.stderr)
Example 4
def cut_documents(doc_fn, out_fn, options):
    """Apply cut_document to each document and write the results to out_fn.

    Reports on stderr how many documents were cut out of the total.
    """
    cut_count, doc_count = 0, 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(out_fn, 'w', options) as out_f:
            for doc in doc_reader:
                cut_count += cut_document(doc, options)
                print(doc, file=out_f)
                doc_count += 1
                if doc_count % 100000 == 0:
                    print(f'processed {doc_count} documents',
                          file=sys.stderr)
    # Use the true document count as the denominator. The original used
    # the last 0-based loop index, which was off by one, raised NameError
    # on empty input and ZeroDivisionError for a single document.
    ratio = cut_count / doc_count if doc_count else 0.0
    print(f'cut {cut_count}/{doc_count} documents ({ratio:.1%})',
          file=sys.stderr)
Example 5
def convert_to_standoff(doc_fn, tag_fn, out_dir, options):
    """Convert each document and its spans to standoff files in out_dir.

    Spans are read with source information enabled. Conversion failures
    are logged with the document id and then re-raised.
    """
    # Removed unused local constant NOTE_TYPE ('AnnotatorNotes'): nothing
    # in this function referenced it.
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(tag_fn, 'r', options) as tag_f:
            # Read spans that include source information
            span_reader = SpanReader(tag_f, source=True)
            for doc in doc_reader:
                spans = list(span_reader.document_spans(doc.id))
                try:
                    convert_single(doc, spans, out_dir, options)
                except Exception as e:
                    # Record which document failed before propagating.
                    error(f'failed to convert {doc.id}: {e}')
                    raise
Example 6
def compare_spans(doc_fn, tag_fns, names, doc_out, tag_out, options):
    """Pairwise-compare spans from several tag files over shared documents.

    Spans from each tag file are validated, filtered and deduplicated,
    then every pair of sources is compared per document; aggregate stats
    are saved via save_results. Documents selected by the comparison (and
    passing the optional sampling rate) are echoed to doc_out/tag_out.
    """
    if names is None:
        names = tag_fns
    doc_count = 0
    stats = Stats(names)
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        tag_fs = [open_file(tag_fn, 'r', options) for tag_fn in tag_fns]
        # The tag handles are opened outside any with-block; close them in
        # a finally so they are released even if comparison raises
        # (the original leaked them).
        try:
            span_readers = [
                SpanReader(tag_f, source=name)
                for tag_f, name in zip(tag_fs, names)
            ]
            for doc in doc_reader:
                if options.max_docs and doc_count >= options.max_docs:
                    break
                spans = [r.document_spans(doc.id) for r in span_readers]
                spans = [validate_spans(doc.id, doc.text, s) for s in spans]
                spans = [filter_spans(s, options) for s in spans]
                spans = [deduplicate_spans(s, options) for s in spans]
                selected_for_output = False
                for i in range(len(spans)):
                    for j in range(i + 1, len(spans)):
                        doc_stats = compare_document_spans(doc.id, names[i],
                                                           names[j], spans[i],
                                                           spans[j], options)
                        stats.add_stats(doc_stats)
                        if select_document_for_output(doc, doc_stats, options):
                            selected_for_output = True

                if (selected_for_output and
                    (options.sample is None or
                     random.random() < options.sample)):
                    print(doc, file=doc_out)
                    for s in (s for sp in spans for s in sp):
                        print(s, file=tag_out)

                doc_count += 1
                if doc_count % 10000 == 0:
                    print(f'processed {doc_count} documents',
                          file=sys.stderr,
                          flush=True)
                if (options.save_interval
                        and doc_count % options.save_interval == 0):
                    # Periodic checkpoint so long runs survive interruption.
                    save_results(options.output, stats, options)
                    doc_out.flush()
                    tag_out.flush()
                    stats.trim()
        finally:
            for tag_f in tag_fs:
                tag_f.close()

    save_results(options.output, stats, options)
Example 7
def filter_documents(doc_fn, out_fn, ids, options):
    """Copy documents whose id is in ids from doc_fn to out_fn.

    Reports on stderr how many documents were written out of the total.
    """
    out_count, doc_count = 0, 0
    with open_file(doc_fn, 'r', options) as doc_f:
        doc_reader = DocReader(doc_f)
        with open_file(out_fn, 'w', options) as out_f:
            for doc in doc_reader:
                if doc.id in ids:
                    print(doc, file=out_f, flush=True)
                    out_count += 1
                doc_count += 1
                if doc_count % 100000 == 0:
                    print(f'processed {doc_count}, output {out_count}',
                          file=sys.stderr)
    # Use the true document count as the denominator. The original used
    # the last 0-based loop index, which was off by one, raised NameError
    # on empty input and ZeroDivisionError for a single document.
    ratio = out_count / doc_count if doc_count else 0.0
    print(f'output {out_count}/{doc_count} documents ({ratio:.1%})',
          file=sys.stderr)