def main():
    """Compare reference PERSON annotations with Scrubber output, line by line.

    Reads the JSON document at ``args.inpf``, takes its ``"signal"`` text and
    the ``"annots"`` list of the ``"PERSON"`` entry in ``"asets"`` (if several
    entries have that type, the last one wins), then walks the text one line
    at a time.  For each non-empty line it collects the reference annotations
    that fall inside the line (as offsets relative to the line), runs
    ``Scrubber.dry_clean`` on the line, merges consecutive markings in both
    lists and hands them to ``check_converage``.  Finally it prints four
    summary counters.

    Relies on names defined outside this function: ``args``, ``json``,
    ``Scrubber``, ``merge_consecutive_markings``, ``check_converage`` and the
    four ``docs_*`` / ``total_number_of_docs`` counters (presumably
    module-level values updated by ``check_converage`` -- confirm).
    """
    # Use a context manager so the input file handle is always closed.
    with open(args.inpf) as inp_file:
        inp_json = json.load(inp_file)

    text = inp_json["signal"]

    annots = []
    for aset in inp_json["asets"]:
        if aset["type"] == "PERSON":
            annots = aset["annots"]
    # Each annotation is indexed as [start, end, ...]; process them in order
    # of their start offset.
    annots.sort(key=lambda k: k[0])

    text = text.encode("utf-8").decode("utf-8")
    text = text.strip()
    # After splitting on "\n" below, every CRLF-terminated line keeps two
    # placeholder characters ("^^") where the two characters "\r\n" were.
    # NOTE(review): presumably this keeps the running offset in
    # `processed_length` aligned with annotation offsets computed against the
    # original CRLF text -- confirm.
    text = text.replace("\r\n", "^^\n")

    current_annot = annots[0] if annots else None
    processed_length = 0

    for line in text.split('\n'):
        if not line:
            continue

        # Run the scrubber before the early exit below, as the original did,
        # in case the call has side effects.
        script_annots = Scrubber.dry_clean(line)[1]

        # Once the reference annotations are exhausted the remaining lines
        # are not checked at all.
        if not current_annot:
            continue

        line_len = len(line)
        line_annots = []

        # Consume every pending annotation whose offsets, made relative to
        # the start of this line, do not extend past the end of the line.
        while current_annot:
            start = current_annot[0] - processed_length
            end = current_annot[1] - processed_length
            if start > line_len or end > line_len:
                break
            annots.pop(0)
            line_annots.append([start, end])
            current_annot = annots[0] if annots else None

        processed_length += len(line)

        line_annots = merge_consecutive_markings(line_annots)
        script_annots = merge_consecutive_markings(script_annots)
        check_converage(line, line_annots, script_annots)

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid on Python 3.
    print("Total number of docs: %s" % total_number_of_docs)
    print("Docs with no PIIs / all PIIs detected: %s" % docs_with_all_pii_detected)
    print("Docs with only partially missed PIIs: %s" % docs_with_partially_detected_pii)
    print("Docs with fully missed PIIs: %s" % docs_with_full_missed_pii)