Example #1
0
def get_cls_list(html_f):
    """
    Convert an html file to xml under /tmp and parse that xml into a list
    :param html_f: Path of the input html file
    :return: The result of xml2list on the generated xml, [(cls, bb, score)]
    """
    out_dir = '/tmp'
    htmlfile2xml(html_f, out_dir)
    # The xml is looked up under the html file's name minus its last five
    # characters (the ".html" suffix).
    stem = os.path.basename(html_f)[:-5]
    xml_path = os.path.join(out_dir, stem) + '.xml'
    return xml2list(xml_path)
Example #2
0
def visualize_xml(xml_dir, img_dir, output_dir):
    """
    Draw the boxes stored in every xml file of xml_dir onto the matching png
    :param xml_dir: Directory scanned for '*.xml' files
    :param img_dir: Directory holding a '<name>.png' for each '<name>.xml'
    :param output_dir: Directory the annotated '<name>.png' files are written to
    """
    pattern = os.path.join(xml_dir, '*.xml')
    for xml_path in glob.glob(pattern):
        # Strip the four-character ".xml" suffix to get the shared base name
        png_file = os.path.basename(xml_path)[:-4] + '.png'
        page = Image.open(os.path.join(img_dir, png_file))
        pixels = np.array(page.convert('RGB'))
        # Only the second field of each parsed entry (the coordinates) is drawn
        boxes = [entry[1] for entry in xml2list(xml_path)]
        destination = os.path.join(output_dir, png_file)
        draw_cc(pixels, boxes, write_img_p=destination)
Example #3
0
 def convert_to_html(xml_f):
     """
     Parse one xml file, merge its Table and Figure groups and emit html
     :param xml_f: File name of the xml, joined onto the enclosing `xml` path
     """
     parsed = xml2list(os.path.join(xml, xml_f))
     # Classes that the table merge is allowed to merge over
     mergeable = ['Figure', 'Section Header', 'Page Footer', 'Page Header']
     grouped = group_cls(parsed,
                         'Table',
                         do_table_merge=True,
                         merge_over_classes=mergeable)
     grouped = group_cls(grouped, 'Figure')
     # Same name as the xml with the last four characters swapped for ".png"
     png_name = f'{xml_f[:-4]}.png'
     pdf_name = FILE_NAME.search(png_name).group(1)
     images_dir = os.path.join(f'{tmp}', 'images')
     if pdf_name in unicodes:
         page_unicodes = unicodes[pdf_name]
     else:
         page_unicodes = None
     list2html(grouped, png_name, images_dir, html, page_unicodes)
Example #4
0
def load_gt(xml_dir, identifier):
    """
    Read the ground truth annotations of a single XML document
    :param xml_dir: directory containing the xml files
    :param identifier: document name without the ".xml" extension
    :return: (BBoxes wrapping a tensor of the parsed boxes, class list)
    """
    entries = xml2list(os.path.join(xml_dir, f"{identifier}.xml"))
    if len(entries) != 0:
        # Each parsed entry unpacks into a class and its box
        cls_list, boxes = zip(*entries)
    else:
        # Nothing parsed: fall back to one placeholder class and a zero box
        cls_list, boxes = [0], [[0, 0, 0, 0]]
    return BBoxes(torch.tensor(boxes), "xyhw"), cls_list
Example #5
0
def load_data(input_dir, classes):
    """
    Build training features and targets from the html/target pairs in input_dir
    :param input_dir: directory with 'html/<name>.html' predictions and
        'target/<name>.xml' ground truth files
    :param classes: passed through to get_target and get_feat_vec_train
    :return: (features array, targets array)
    """
    features, targets = [], []
    html_pattern = os.path.join(input_dir, "html/*.html")
    for html_path in glob.glob(html_pattern):
        predictions = process_html(html_path)
        stem, _ = os.path.splitext(os.path.basename(html_path))
        xml_path = os.path.join(input_dir, "target/{}.xml".format(stem))
        ground_truth = xml2list(xml_path)

        matches = match_lists(predictions, ground_truth)
        for prediction in predictions:
            target = get_target(prediction, matches, classes)
            # A target of -1 means this prediction is left out of the data set
            if target == -1:
                continue
            targets.append(target)
            features.append(
                get_feat_vec_train(prediction, predictions, classes))
    return np.array(features), np.array(targets)
Example #6
0
def ingest_file(path):
    """
    Load an XML file into a dataframe with one row per parsed entry
    :param path: path to XML file
    :return: dataframe with columns [label, x0, x1, y0, y1, score]
    """
    entries = xml2list(path)
    # Each entry is read as (label, coordinates, score)
    labels = [entry[0] for entry in entries]
    boxes = [entry[1] for entry in entries]
    scores = [float(entry[2]) for entry in entries]

    columns = {"label": labels}
    # Column name -> position inside the coordinate sequence
    for column, position in (("x0", 0), ("x1", 2), ("y0", 1), ("y1", 3)):
        columns[column] = [box[position] for box in boxes]
    columns["score"] = scores
    return pd.DataFrame(columns)
Example #7
0
def convert_to_html(xpath, img_dir):
    """
    Parse an xml file and write its html rendering into 'html2'
    :param xpath: path of the xml file
    :param img_dir: image directory handed to list2html
    """
    objects = xml2list(xpath)
    print(objects)
    # Image name: the xml base name with its last four characters replaced
    png_name = f'{os.path.basename(xpath)[:-4]}.png'
    print(png_name)
    list2html(objects, png_name, img_dir, 'html2')
Example #8
0
def run_evaluate(predict_dir,
                 target_dir,
                 output_dir,
                 img_dir=None,
                 simi=False,
                 thres=0):
    """
    Compare predicted XML annotations with ground truth XML and print metrics.

    Every file name found in predict_dir is also loaded from target_dir; both
    files are parsed with xml2list and paired with match_lists. Each
    prediction is tagged 'background', 'localization', 'correct', 'similar'
    or 'other'. The function prints area-based bounding box
    precision/recall/F1, class-level precision/recall/F1 and a class count
    table, then hands the precision/recall points and ROC points to
    make_p_r_curve and make_roc_chart.

    :param predict_dir: directory of prediction XML files
    :param target_dir: directory holding a ground truth file with the same
        name as each prediction file
    :param output_dir: directory passed to make_p_r_curve / make_roc_chart and
        used for the annotated images when img_dir is given
    :param img_dir: optional directory of '<name>.png' images; when not None
        the predicted boxes are drawn on the image and saved to output_dir
    :param simi: when True, a prediction whose class shares an entry of
        similar_class_sets with its target is tagged 'correct' instead of
        'similar'
    :param thres: threshold compared against the overlap value (named iou)
        that match_lists returns with each matched target
    :return: list of (prediction, category) pairs whose category is not
        'correct'
    """
    # (prediction, category) for every prediction over all files
    fp_list = []
    # (predicted class, target class) pairs feeding the class-level metrics
    classification_p_list = []
    # Running area totals over all files for the bounding box metrics
    total_intersection = 0
    total_prediction = 0
    total_gt = 0
    for predict_f in os.listdir(predict_dir):
        predict_path = os.path.join(predict_dir, predict_f)
        # The ground truth file is expected under the same file name
        target_path = os.path.join(target_dir, predict_f)
        predict_list = xml2list(predict_path)
        target_list = xml2list(target_path)
        if img_dir is not None:
            # Image name: prediction file name minus its last four characters
            img_p = os.path.join(img_dir, predict_f[:-4] + '.png')
            img = Image.open(img_p)
            for predict in predict_list:
                p_cls, p_bb, p_score = predict
                # NOTE(review): every coordinate is shifted by -5 before
                # drawing; the reason is not visible here - confirm.
                p_bb = [x - 5 for x in p_bb]
                d = ImageDraw.Draw(img)
                d.rectangle(p_bb, outline=color_classes[p_cls])
                # NOTE(review): the image is re-saved after every rectangle,
                # and not at all when predict_list is empty.
                img.save(os.path.join(output_dir,
                                      f'{predict_f[:-4] + ".png"}'))

        list_map = match_lists(predict_list, target_list)
        # target box -> list of predicted boxes matched to it (this file only)
        tbb_map = {}
        for predict in predict_list:
            p_cls, p_bb, p_score = predict
            # NOTE(review): the parsed score is replaced by 0.1 before the
            # lookup key is built; presumably match_lists keys its result the
            # same way - confirm.
            p_score = 0.1
            p_bb = tuple(p_bb)
            matched_target = list_map[(p_cls, p_bb, p_score)]
            if matched_target is None:
                # No target matched this prediction
                fp_list.append((predict, 'background'))
                continue
            t, iou = matched_target
            t_cls, t_bb, t_score = t
            t_bb = tuple(t_bb)
            #t_cls = ICDAR_convert[t_cls]
            if t_bb in tbb_map:
                tbb_map[t_bb].append(p_bb)
            else:
                tbb_map[t_bb] = [p_bb]
            if p_cls == t_cls:
                if iou < thres:
                    # Right class, but overlap below the threshold
                    fp_list.append((predict, 'localization'))
                    continue
                fp_list.append((predict, 'correct'))
                classification_p_list.append((p_cls, t_cls))
                continue
            else:
                # Wrong class: counted for the class metrics only when the
                # overlap reaches the threshold
                if iou >= thres:
                    classification_p_list.append((p_cls, t_cls))
                # Check whether both classes share an entry of
                # similar_class_sets
                sim = False
                for s in similar_class_sets:
                    if p_cls in s and t_cls in s:
                        sim = True
                        break
                if sim:
                    if simi:
                        fp_list.append((predict, 'correct'))
                    else:
                        fp_list.append((predict, 'similar'))
                else:
                    fp_list.append((predict, 'other'))
        # Area totals for this file
        page_intersection = 0
        page_prediction = 0
        page_gt = 0
        for t_bb in tbb_map:
            for prediction in tbb_map[t_bb]:
                # Boxes are indexed as (x_left, y_top, x_right, y_bottom)
                x_left = max(t_bb[0], prediction[0])
                y_top = max(t_bb[1], prediction[1])
                x_right = min(t_bb[2], prediction[2])
                y_bottom = min(t_bb[3], prediction[3])
                # NOTE(review): the product is not clamped at zero, so a pair
                # of boxes that do not overlap still contributes a (possibly
                # negative) value.
                intersection_area = (x_right - x_left) * (y_bottom - y_top)
                page_intersection += intersection_area
                page_prediction += (prediction[2] - prediction[0]) * (
                    prediction[3] - prediction[1])
            # Each target box is counted once, however many predictions hit it
            page_gt += (t_bb[2] - t_bb[0]) * (t_bb[3] - t_bb[1])
        total_intersection += page_intersection
        total_prediction += page_prediction
        total_gt += page_gt

    # NOTE(review): the divisions below are unguarded and raise
    # ZeroDivisionError when a total (or precision + recall) is zero.
    print('Bounding box Precision')
    bb_precision = total_intersection / total_prediction
    print(bb_precision)
    print('--------')
    print('Bounding box Recall')
    bb_recall = total_intersection / total_gt
    print(bb_recall)
    print('--------')
    print('Bounding box F1')
    bb_f1 = 2 * bb_precision * bb_recall / (bb_precision + bb_recall)
    print(bb_f1)
    print('---------')

    # class_counts[predicted class][target class] -> number of such pairs
    class_counts = {}
    for p in classification_p_list:
        p_cls, t_cls = p
        if p_cls not in class_counts:
            class_counts[p_cls] = {}
        if t_cls in class_counts[p_cls]:
            class_counts[p_cls][t_cls] += 1
        else:
            class_counts[p_cls][t_cls] = 1
    # Per-class precision: same-class pairs over all pairs predicted as p_cls
    class_precisions = {}
    all_tp = 0
    all_denom = 0
    for p_cls in class_counts:
        tp = 0
        fp = 0
        for t_cls in class_counts[p_cls]:
            if p_cls == t_cls:
                tp = class_counts[p_cls][t_cls]
            else:
                fp += class_counts[p_cls][t_cls]
        denom = tp + fp
        all_tp += tp
        all_denom += denom
        # A message string is stored instead of a number when denom is zero
        class_precisions[
            p_cls] = tp / denom if denom != 0 else 'No false positives or true positives found'
    print('All class precision')
    all_precision = all_tp / all_denom
    print(all_precision)
    print('-----------------')
    # Per-class recall. Only classes that occur as a predicted class are
    # visited, because the loop runs over the keys of class_counts.
    all_tp = 0
    all_denom = 0
    class_recalls = {}
    print('DEBUG')
    for p_cls in class_counts:
        print(class_counts)
        tp = class_counts[p_cls][p_cls] if p_cls in class_counts[p_cls] else 0
        # False negatives: pairs predicted as another class whose target
        # class is p_cls
        fn = 0
        for p2_cls in class_counts:
            if p2_cls == p_cls:
                continue
            if p_cls in class_counts[p2_cls]:
                fn += class_counts[p2_cls][p_cls]
        denom = tp + fn
        all_tp += tp
        all_denom += denom
        # A message string is stored instead of a number when denom is zero
        class_recalls[
            p_cls] = tp / denom if denom != 0 else 'No false negatives or true positives found'

    print('All class recall')
    all_recall = all_tp / all_denom
    print(all_recall)
    print('--------------')

    print('All class F1')
    all_f1 = 2 * all_precision * all_recall / (all_precision + all_recall)
    print(all_f1)
    print('--------------')

    print('Class recalls')
    print(class_recalls)
    print('------------')
    print('Class precisions')
    print(class_precisions)
    print('------------')

    class_f1 = {}
    for cl in class_recalls:
        rec = class_recalls[cl]
        prec = class_precisions[cl]
        # A string recall is the "nothing found" message; print it and skip
        # the class. Only the recall is checked for the message string.
        if type(rec) == str:
            print(f'Class: {cl}')
            print(rec)
            continue
        if rec + prec == 0:
            class_f1[cl] = 0
            continue
        class_f1[cl] = 2 * rec * prec / (rec + prec)
    print('Class F1s')
    print(class_f1)
    print('-------------')

    print('Class counts')

    print(class_counts)
    # Table with predicted classes as columns and target classes as rows,
    # plus a per-row 'Total' column
    df = pd.DataFrame(class_counts)
    df = df.fillna(value=0)
    df['Total'] = df.sum(axis=1)
    print(df[sorted(df.columns)])
    print('------------')

    # Walk fp_list in order, accumulating 'correct' (tp) and other (fp)
    # counts into ROC points and precision/recall points.
    tp_num = 0
    fp_num = 0
    current_class = None
    roc_tp = [0]
    roc_fp = [0]
    p_r_curve = []
    # NOTE(review): the first item only initialises current_class and is
    # skipped by `continue` without being counted; current_class is never
    # updated afterwards - confirm this is intended.
    for p in fp_list:
        predict, category = p
        is_tp = category == 'correct'
        if is_tp:
            if current_class is None:
                current_class = True
                continue
            if not current_class:
                roc_tp.append(tp_num)
                roc_fp.append(fp_num)
            tp_num += 1
        else:
            if current_class is None:
                current_class = False
                continue
            if current_class:
                roc_tp.append(tp_num)
                roc_fp.append(fp_num)
            fp_num += 1
        precision = tp_num / (tp_num + fp_num)
        p_r_curve.append((precision, tp_num))
    roc_tp.append(tp_num)
    roc_fp.append(fp_num)
    # Turn the running tp count of each point into a fraction of the final
    # count. NOTE(review): raises ZeroDivisionError when tp_num is 0 and
    # p_r_curve is non-empty.
    p_r_curve = [(x, y / tp_num) for x, y in p_r_curve]
    # For the 11 levels 0.0, 0.1, ..., 1.0 keep the highest precision among
    # points whose recall fraction is strictly above the level
    max_ps = []
    for i in range(11):
        chk_num = i / 10
        m_p = 0
        for x, y in p_r_curve:
            if y <= chk_num:
                continue
            if x > m_p:
                m_p = x
        max_ps.append(m_p)
    # NOTE(review): mAP is computed but neither printed nor returned.
    mAP = sum(max_ps) / len(max_ps)

    # Split the points into a precision tuple and a recall tuple.
    # NOTE(review): uz[0] raises IndexError when p_r_curve is empty.
    uz = list(zip(*p_r_curve))
    make_p_r_curve(uz[0], uz[1], output_dir)
    normalized_tp = [x / tp_num for x in roc_tp]
    # The ROC chart is only produced when at least one non-correct item was
    # counted
    if fp_num > 0:
        normalized_fp = [x / fp_num for x in roc_fp]
        make_roc_chart(normalized_tp, normalized_fp, output_dir)

    filtered_fp_list = [fp for fp in fp_list if fp[1] != 'correct']
    print(f'True Positives: {tp_num}')
    print(f'False Positives: {fp_num}')
    return filtered_fp_list
Example #9
0
def convert_annotations(xml_dir, output_dir):
    """
    Rewrite every xml annotation file of xml_dir into output_dir.

    'Table Note', 'Figure Note' and 'Abstract' objects are written as
    'Body Text'. Each 'Equation label' is paired with the closest 'Equation'
    to its left on the same row, and the pair is written as one object under
    the equation's class; labels without such an equation are dropped.
    :param xml_dir: directory scanned for '*.xml' files
    :param output_dir: directory the converted '<name>.xml' files are saved to
    """
    body_text_aliases = ('Table Note', 'Figure Note', 'Abstract')
    for xml_path in glob.glob(os.path.join(xml_dir, '*.xml')):
        stem = os.path.basename(xml_path)[:-4]
        annotations = xml2list(xml_path)
        writer = Writer(f'{stem}.png', 1920, 1920)

        # Split the objects into equation labels, equations and the rest
        labels = [a for a in annotations if a[0] == 'Equation label']
        equations = [a for a in annotations if a[0] == 'Equation']
        others = [
            a for a in annotations if not (a in equations or a in labels)
        ]
        for other in others:
            if other[0] in body_text_aliases:
                cls_name = 'Body Text'
            else:
                cls_name = other[0]
            writer.addObject(cls_name, *other[1])

        print(labels)
        print('---')
        print(equations)
        print(xml_path)

        # Coordinates are in (tl_x, tl_y, br_x, br_y) form.
        # Maps (label class, label coordinates) -> the equation paired with it
        paired = {}
        for label in labels:
            label_box = label[1]
            mid_y = label_box[1] + int((label_box[3] - label_box[1]) / 2)
            # Equations whose vertical extent contains the label's midpoint
            row = [eq for eq in equations if eq[1][1] <= mid_y <= eq[1][3]]

            # Horizontal gap from each equation's right edge to the label's
            # left edge; equations that are not to the left count as infinite
            gaps = [label_box[0] - eq[1][2] for eq in row]
            gaps = [gap if gap >= 0 else float('inf') for gap in gaps]
            if len(gaps) == 0:
                continue
            nearest = min(gaps)
            # A label with no equation to its left, or only ones more than
            # 700 away, is dropped
            if nearest == float('inf') or nearest > 700:
                continue
            pick = next(
                (i for i, gap in enumerate(gaps) if gap == nearest), None)
            assert pick is not None
            partner = row[pick]
            # Remove the paired equation from the pool of unpaired equations
            equations = [eq for eq in equations if eq != partner]
            paired[(label[0], tuple(label_box))] = partner

        # Equations that stayed unpaired are written unchanged
        for eq in equations:
            writer.addObject(eq[0], *eq[1])

        for label_key, eq in paired.items():
            print(eq)
            print(label_key)
            print('----')
            label_box = label_key[1]
            eq_box = eq[1]
            # Box from the equation's left edge to the label's right edge,
            # covering the vertical extent of both
            merged = [
                eq_box[0],
                min(eq_box[1], label_box[1]),
                label_box[2],
                max(eq_box[3], label_box[3]),
            ]
            assert merged[3] > merged[1]
            assert merged[2] > merged[0]
            writer.addObject(eq[0], *merged)
        writer.save(os.path.join(output_dir, f'{stem}.xml'))