コード例 #1
0
ファイル: Detector.py プロジェクト: cheungzq/WebParser
    def nodedist(self,n1,n2):
        """Compute the pairwise distance features between nodes ``n1`` and ``n2``.

        Each node is walked upwards with ``getparent()`` until an element
        whose attributes contain ``'predict'`` is reached.  The two resulting
        root-to-node chains are compared level by level on their tags, and
        several size- and geometry-based measures are derived from the
        ``size``, ``left`` and ``width`` attributes.

        Args:
            n1: First element.  Must expose ``attrib`` (with ``size``,
                ``left`` and ``width`` parseable by ``int``) and
                ``getparent()``.
            n2: Second element, same requirements as ``n1``.

        Returns:
            A ``Detector.DistComposer`` with these attributes set:

            * ``local_position_dist`` --
              ``sqrt((coversize - subsize) / (totalsize - subsize + 1))``
              where each term is the sum of the two chains' ``size``
              attributes at the 'predict' element, at the last compared
              level whose tags still matched, and at the nodes themselves.
            * ``local_down_dist`` -- first item returned by
              ``findBackbone.tree_edit_distance(n1, n2)`` divided by
              ``subsize``.
            * ``nop_dist`` -- ``1 - overlap`` where ``overlap`` is the
              horizontal overlap of the two ``[left, left + width]``
              intervals divided by the mean width.
            * ``left_diff`` -- absolute difference of the ``left`` values.
            * ``same_line_seperate`` -- 1 when both walks end on the same
              'predict' element, 0 otherwise.
            * ``width_ratio`` -- ``max(width) / min(width) - 1``.

        Raises:
            AttributeError: if a walk runs past the tree root without
                meeting an element carrying ``'predict'``.
            KeyError: if a required attribute is missing.
            ZeroDivisionError: if both ``size`` values sum to zero, both
                widths sum to zero, or the smaller width is zero.
        """
        # Function-scope import: the original relied on ``scipy.sqrt``, a
        # deprecated NumPy alias that recent SciPy releases no longer expose.
        import math

        # chains[k] runs from the nearest ancestor-or-self carrying 'predict'
        # down to the node itself.
        chains = []
        for node in (n1, n2):
            chain = [node]
            while 'predict' not in chain[-1].attrib:
                chain.append(chain[-1].getparent())
            chain.reverse()
            chains.append(chain)
        chain1, chain2 = chains

        same_line_seperate = 1 if chain1[0] == chain2[0] else 0

        # Deepest level (within the shorter chain) reached before the first
        # tag mismatch; stays 0 when even the first level differs.
        depth = 0
        for level, (elem1, elem2) in enumerate(zip(chain1, chain2)):
            if elem1.tag != elem2.tag:
                break
            depth = level

        totalsize = int(chain1[0].attrib['size']) + int(chain2[0].attrib['size'])
        coversize = int(chain1[depth].attrib['size']) + int(chain2[depth].attrib['size'])
        subsize = int(n1.attrib['size']) + int(n2.attrib['size'])
        # NOTE(review): assumes coversize >= subsize (an ancestor's 'size' is
        # presumably at least its descendant's) so the ratio is non-negative.
        local_position_dist = math.sqrt((coversize-subsize)/(totalsize-subsize+1))

        local_down_dist = findBackbone.tree_edit_distance(n1,n2)[0]/subsize

        # Parse the geometry attributes once instead of on every use.
        left1 = int(n1.attrib['left'])
        left2 = int(n2.attrib['left'])
        width1 = int(n1.attrib['width'])
        width2 = int(n2.attrib['width'])

        left_diff = left1 - left2
        maxleft = max(left1, left2)
        minright = min(left1 + width1, left2 + width2)
        # Negative when the two horizontal intervals do not intersect.
        overlap = 2*(minright - maxleft)/(width1 + width2)
        nop_dist = 1-overlap

        dc = Detector.DistComposer()
        dc.local_position_dist = local_position_dist
        dc.local_down_dist = local_down_dist
        dc.nop_dist = nop_dist
        dc.left_diff = abs(left_diff)
        dc.same_line_seperate = same_line_seperate
        dc.width_ratio = max(width1, width2)/min(width1, width2)-1
        return dc
コード例 #2
0
ファイル: locateTitleLine.py プロジェクト: cheungzq/WebParser
def locateTitleLine(xmldir,max_runner_up,threshold):
    """Evaluate backbone-based title-line detection over ``xmldir/*.xml``.

    For every XML file the first element (below the root) labelled
    ``LABEL['LIST']`` is taken as the list region.  A backbone is built from
    ``findBackbone.findPrinciple`` / ``findBackbone.findBackbone``; when its
    collected labels equal those of the list region, each child of the region
    whose edit script against the backbone contains no ``'d'`` operation is
    counted as a predicted title line.  Predictions are compared with the
    children actually labelled ``LABEL['TITLE_LINE']`` and the aggregated
    confusion counts are printed.

    Args:
        xmldir: Directory searched (non-recursively) for ``*.xml`` files.
        max_runner_up: Passed through to ``findBackbone.findPrinciple``.
        threshold: Passed through to ``findBackbone.findPrinciple``.

    Returns:
        None.  All results are written to standard output.
    """
    filelist = glob.glob(path.join(xmldir,'*.xml'))
    parser = etree.XMLParser(recover=True)

    truepositive = 0
    falsepositive = 0
    truenegative = 0
    falsenegative = 0
    num_err_fn = 0
    num_err_fp = 0
    num_err = 0
    num_total = 0
    num_backsufficient = 0
    errfids = {}

    for num, xmlfile in enumerate(filelist):
        tree = etree.parse(xmlfile,parser=parser)
        # Relative './/' form: an absolute '//' path on an ElementTree is
        # rewritten to this by the library anyway, but with a FutureWarning.
        listregion = tree.find('.//*[@label="%s"]' % LABEL['LIST'])
        if num % 200 == 0:
            # Progress indicator: fraction of files visited so far.
            print(num/len(filelist))
        if listregion is None:
            continue
        lines = findBackbone.findPrinciple(listregion,max_runner_up,threshold)
        truelines = listregion.findall('*[@label="%s"]' % LABEL['TITLE_LINE'])

        tp = 0
        fp = 0

        try:
            backbone = findBackbone.findBackbone(lines)
        except Exception:
            # Best effort: report the file and score it with zero predictions.
            print('Error:',xmlfile)
        else:
            if collectLabel(backbone) == collectLabel(listregion):
                num_backsufficient += 1

                for line in listregion:
                    dist,ops = findBackbone.tree_edit_distance(backbone,line)
                    acts = {op[0] for op in ops}
                    # No 'd' operation in the edit script: the line is
                    # predicted to be a title line.
                    if 'd' not in acts:
                        if line.attrib['label'] == LABEL['TITLE_LINE']:
                            tp += 1
                        else:
                            fp += 1
            else:
                # Backbone labels differ from the region's: log the file and
                # count the failure against the site id on the root element.
                print(xmlfile)
                fid = tree.getroot().attrib['fid']
                errfids[fid] = errfids.get(fid,0) + 1

        positive = len(truelines)
        fn = positive - tp
        tn = len(listregion) - (tp+fp) - fn
        num_total += 1

        # Per-list error tallies.  These counters were previously initialised
        # and printed but never updated, so the report always showed 0.
        if fp:
            num_err_fp += 1
        if fn:
            num_err_fn += 1
        if fp or fn:
            num_err += 1

        falsepositive += fp
        truepositive += tp
        truenegative += tn
        falsenegative += fn

    print('true positive:', truepositive)
    print('false positive:', falsepositive)
    print('true negative:', truenegative)
    print('false negative:', falsenegative)
    print('num of list false positive:', num_err_fp)
    print('num of list false negative:', num_err_fn)
    print('num of list with err:', num_err)
    print('total numer of list:', num_total)
    print('num of sufficent backbone:',num_backsufficient)
    print('num of err sites:',len(errfids))
    print('err num in each site:',errfids)