Example #1
0
def main():
    """Compare reference PERSON annotations with the scrubber's markings.

    Loads the JSON document at ``args.inpf``, reads its ``"signal"`` text and
    the ``"annots"`` of the annotation set whose ``"type"`` is ``"PERSON"``,
    then walks the text line by line.  For each non-empty line the
    document-level annotation offsets are shifted to line-local offsets and
    passed, together with the markings returned by ``Scrubber.dry_clean``,
    to ``check_converage``.  Finally the document counters (names defined
    outside this function) are printed.
    """
    # Use a context manager so the input file handle is always closed.
    with open(args.inpf) as inp_file:
        inp_json = json.load(inp_file)

    text = inp_json["signal"]
    annots = []
    for aset in inp_json["asets"]:
        if aset["type"] == "PERSON":
            # No early exit: if several PERSON sets exist, the last one wins.
            annots = aset["annots"]
    # Work on a sorted copy (ordered by start offset) so the parsed JSON
    # structure is left untouched.
    annots = sorted(annots, key=lambda k: k[0])

    text = text.encode("utf-8").decode("utf-8")
    text = text.strip()
    # After splitting on "\n" each line keeps the two "^^" placeholder
    # characters where "\r\n" used to be, so len(line) still counts two
    # characters for the line break -- presumably to keep the annotation
    # offsets aligned with the original CRLF text (TODO confirm).
    text = text.replace("\r\n", "^^\n")

    annot_count = len(annots)
    next_annot = 0  # index of the first annotation not yet assigned to a line
    processed_length = 0  # summed length of the lines already handled
    for line in text.split('\n'):
        if not line:
            continue
        script_annots = Scrubber.dry_clean(line)[1]
        if next_annot >= annot_count:
            # NOTE(review): once every reference annotation is consumed the
            # remaining lines are never passed to check_converage -- confirm
            # that this is intended.
            continue
        line_len = len(line)
        line_annots = []
        # Collect every pending annotation that ends within this line,
        # translating document offsets into line-local offsets.
        while next_annot < annot_count:
            start = annots[next_annot][0] - processed_length
            end = annots[next_annot][1] - processed_length
            if start > line_len or end > line_len:
                break
            line_annots.append([start, end])
            next_annot += 1
        processed_length += line_len
        line_annots = merge_consecutive_markings(line_annots)
        script_annots = merge_consecutive_markings(script_annots)
        check_converage(line, line_annots, script_annots)

    # Single-argument parenthesised form: same output as the Python 2 print
    # statement, and also valid when print is a function.
    print("Total number of docs: %s" % total_number_of_docs)
    print("Docs with no PIIs / all PIIs detected: %s" % docs_with_all_pii_detected)
    print("Docs with only partially missed PIIs: %s" % docs_with_partially_detected_pii)
    print("Docs with fully missed PIIs: %s" % docs_with_full_missed_pii)