Example #1
0
    def processWithTemplate(self, table, dr):
        """
            Label ``table`` with a template anchored on the name column, then
            run the extraction.

            Steps performed, in order:
              1. ``table.buildNDARRAY()`` is called.
              2. A coarse calibration template covering the leftmost columns
                 is built and applied with ``labelTable``.
              3. ``self.findNameColumn(table)`` gives the reference column
                 index; every column slice of the extraction template is
                 expressed relative to it.
              4. ``self.extractData(table, dr, <template>)`` is called.

            table: table object to label.
            dr: record object; its fields are looked up by name with
                ``getFieldByName``.

            Returns ``dr`` (the object received as argument).
        """

        def span(first, last):
            # (row slice, column slice); rows always start at index 1,
            # presumably to skip a header row -- TODO confirm
            return (slice(1, None), slice(first, last))

        table.buildNDARRAY()
        get_field = dr.getFieldByName

        # fuzzy calibration pass over the leftmost columns
        calibration_pattern = [
            (span(0, 4),
             ['abp_names', 'names_aux', 'numbering', 'religion'],
             [get_field('lastname'),
              get_field('firstname'),
              get_field('religion')]),
            (span(1, 4),
             ['abp_profession', 'religion'],
             [get_field('occupation'),
              get_field('religion')]),
        ]
        calibration = tableTemplateClass()
        calibration.buildFromPattern(calibration_pattern)
        calibration.labelTable(table)

        # NOTE(review): sibling methods call findNameColumn(table, dr);
        # here it is called with the table only -- confirm the signature.
        name_col = self.findNameColumn(table)

        # Column slices below partly overlap on purpose or by accident
        # (e.g. +4..+6 and +5..+7); kept exactly as they were.
        extraction_pattern = [
            (span(name_col, name_col + 1),
             ['abp_names', 'names_aux', 'numbering', 'religion'],
             [get_field('lastname'),
              get_field('firstname'),
              get_field('religion')]),
            (span(name_col + 1, name_col + 2),
             ['abp_profession', 'religion'],
             [get_field('occupation'),
              get_field('religion')]),
            (span(name_col + 2, name_col + 3),
             ['abp_location'],
             [get_field('location')]),
            (span(name_col + 3, name_col + 4),
             ['abp_family'],
             [get_field('situation')]),
            (span(name_col + 4, name_col + 6),
             ['abp_deathreason', 'artz'],
             [get_field('deathreason'),
              get_field('doktor')]),
            (span(name_col + 5, name_col + 7),
             ['abp_dates', 'abp_year'],
             [get_field('MonthDayDateGenerator'),
              get_field('deathDate'),
              get_field('deathYear')]),
            (span(name_col + 6, name_col + 8),
             ['abp_dates', 'abp_year', 'abp_location'],
             [get_field('burialDate'),
              get_field('deathYear'),
              get_field('burialLocation')]),
            (span(name_col + 8, name_col + 10),
             ['abp_age', 'abp_ageunit'],
             [get_field('age'),
              get_field('ageUnit')]),
        ]
        # An alternative fixed-position template for tables with 12 or more
        # columns, and entries for 'priester' / 'notes', were only ever
        # present as disabled code and are not applied.

        self.extractData(table, dr, extraction_pattern)

        return dr
Example #2
0
    def processDeathWithTemplate(self, table, dr):
        """
            Label a death-record ``table`` with a template anchored on the
            name column, then run the extraction.

            A coarse calibration template is applied to the leftmost columns
            first; ``self.findNameColumn(table, dr)`` then returns the
            reference column index from which all column slices of the
            extraction template are offset. The template is finally handed
            to ``self.extractData``.

            table: table object to label.
            dr: record object; its fields are looked up by name with
                ``getFieldByName``.

            Returns ``dr`` (the object received as argument).
        """

        def span(first, last):
            # (row slice, column slice); rows always start at index 1,
            # presumably to skip a header row -- TODO confirm
            return (slice(1, None), slice(first, last))

        table.buildNDARRAY()
        get_field = dr.getFieldByName

        # fuzzy calibration pass over the leftmost columns
        calibration_pattern = [
            (span(0, 4),
             ['abp_names', 'names_aux', 'numbering', 'religion'],
             [get_field('lastname'),
              get_field('firstname'),
              get_field('religion')]),
            (span(1, 4),
             ['abp_profession', 'religion'],
             [get_field('occupation'),
              get_field('religion')]),
        ]
        calibration = tableTemplateClass()
        calibration.buildFromPattern(calibration_pattern)
        calibration.labelTable(table)

        name_col = self.findNameColumn(table, dr)
        if self.bDebug:
            print("=============", name_col)

        # Several column slices overlap (e.g. +5..+9 and +6..+9); they are
        # kept exactly as they were.
        extraction_pattern = [
            # name column: no tagger names are listed for this entry
            (span(name_col, name_col + 1),
             [],
             [get_field('lastname'),
              get_field('firstname'),
              get_field('religion')]),
            (span(name_col + 1, name_col + 2),
             ['abp_profession', 'religion'],
             [get_field('occupation'),
              get_field('religion')]),
            (span(name_col + 2, name_col + 3),
             ['abp_location'],
             [get_field('location')]),
            (span(name_col + 3, name_col + 4),
             ['abp_family'],
             [get_field('situation')]),
            (span(name_col + 4, name_col + 6),
             ['abp_deathreason', 'artz'],
             [get_field('deathreason'),
              get_field('doktor')]),
            (span(name_col + 5, name_col + 9),
             ['abp_dates', 'abp_year'],
             [get_field('MonthDayDateGenerator'),
              get_field('deathDate'),
              get_field('deathYear')]),
            (span(name_col + 6, name_col + 9),
             ['abp_dates', 'abp_year', 'abp_location'],
             [get_field('burialDate'),
              get_field('deathYear'),
              get_field('burialLocation')]),
            (span(name_col + 8, name_col + 10),
             ['abp_age', 'abp_ageunit'],
             [get_field('age'),
              get_field('ageUnit')]),
            # open-ended: everything from column +9 to the last column
            (span(name_col + 9, None),
             ['abp_priester'],
             [get_field('priest')]),
        ]

        self.extractData(table, dr, extraction_pattern)

        return dr
Example #3
0
    def extractData(self, table, myRecord, lTemplate):
        """
            Apply ``lTemplate`` to ``table``, tag the cell fields and collect
            the table rows as record candidates.

            Steps performed, in order:
              1. If ``lTemplate`` is None, return None immediately.
              2. Build a ``tableTemplateClass`` from the pattern and label
                 the table with it.
              3. For every non-None field of every cell: run the field's
                 taggers on the cell, extract the labels, and when something
                 was extracted store the offset and the values on the field.
              4. Add every row of the table as a candidate of ``myRecord``
                 and rank the candidates.

            table: table object (cells, rows and page are read from it).
            myRecord: record object receiving the candidate rows.
            lTemplate: template pattern, or None to do nothing.

            The visible code returns None on every path.
        """
        if lTemplate is None:
            return None

        # turn the raw pattern into a template object and label the table
        labeller = tableTemplateClass()
        labeller.buildFromPattern(lTemplate)
        labeller.labelTable(table)

        # tag the fields attached to each cell by the template
        for cell in table.getCells():
            if cell.getFields() != [] and self.bDebug:
                print(table.getPage(), cell.getIndex(), cell.getFields(),
                      cell.getContent())
            for field in cell.getFields():
                if field is None:
                    continue
                tagged = field.applyTaggers(cell)
                labelled = field.extractLabel(tagged)
                if labelled != []:
                    # per the original notes each entry is presumably
                    # (offset, value, label, score); items 1 and 3 are kept
                    pairs = [(entry[1], entry[3]) for entry in labelled]
                    field.setOffset(tagged[0])
                    field.setValue(pairs)
                    if self.bDebug:
                        print('found:', field, field.getValue())

        # record level: every row (headers included) becomes a candidate
        for row in table.getRows():
            myRecord.addCandidate(row)

        myRecord.rankCandidates()

        lcand = myRecord.getCandidates()
Example #4
0
    def processWeddingWithTemplate(self, table, wr):
        """
            Label a wedding-record ``table`` with a template anchored on the
            reference column, then run the extraction.

            Expected column layout, as described by the original notes
            (German terms kept in parentheses):
              - groom (Bräutigam): name + religion + occupation, location,
                parents (Eltern: name, occupation, "von" location, "und"
                name, "geb.", occupation, location, situation),
                birthdate, location
              - bride (Braut): same sub-structure as the groom
              - priest (Pfarrer)
              - name, occupation, location, und/ü
              - kind of wedding (dispens)
              - licence (date)

            A coarse calibration template is applied to the leftmost columns
            first; ``self.findNameColumn(table, wr)`` then returns the
            reference column index from which the column slices of the
            extraction template are offset. The template is finally handed
            to ``self.extractData``.

            table: table object to label.
            wr: wedding record object; its fields are looked up by name with
                ``getFieldByName``.

            Returns ``wr`` (the object received as argument).
        """
        table.buildNDARRAY()

        # The record is the ``wr`` parameter; every lookup goes through it.
        get_field = wr.getFieldByName

        # fuzzy calibration pass over the leftmost columns
        lTemplateIECAL = [
            ((slice(1, None), slice(0, 4)),
             ['abp_names', 'names_aux', 'numbering', 'religion'], [
                 get_field('lastname'),
                 get_field('firstname'),
                 get_field('religion')
             ]),
            ((slice(1, None), slice(1, 4)), ['abp_profession', 'religion'],
             [get_field('occupation'),
              get_field('religion')])
        ]

        #detect empty left columns ?
        template = tableTemplateClass()
        template.buildFromPattern(lTemplateIECAL)
        template.labelTable(table)

        iRef = self.findNameColumn(table, wr)
        if self.bDebug: print("=============", iRef)

        # One entry per column, offset from iRef; no tagger names are listed
        # for any entry (empty second element).
        lTemplateIE = [
            ((slice(1, None), slice(iRef, iRef + 1)), [],
             [get_field('weddingDate')]),
            # columns +1..+6: presumably the groom block -- TODO confirm
            ((slice(1, None), slice(iRef + 1, iRef + 2)), [], [
                get_field('lastname'),
                get_field('firstname'),
                get_field('religion')
            ]),
            ((slice(1, None), slice(iRef + 2, iRef + 3)), [],
             [get_field('occupation'),
              get_field('religion')]),
            ((slice(1, None), slice(iRef + 3, iRef + 4)), [],
             [get_field('location')]),
            ((slice(1, None), slice(iRef + 4, iRef + 5)), [], [
                get_field('lastname'),
                get_field('firstname'),
                get_field('lastname'),
                get_field('firstname')
            ]),
            ((slice(1, None), slice(iRef + 5, iRef + 6)), [],
             [get_field('situation')]),
            ((slice(1, None), slice(iRef + 6, iRef + 7)), [],
             [get_field('Date')]),
            # columns +7..+12: presumably the bride block -- TODO confirm
            ((slice(1, None), slice(iRef + 7, iRef + 8)), [], [
                get_field('lastname'),
                get_field('firstname'),
                get_field('religion')
            ]),
            ((slice(1, None), slice(iRef + 8, iRef + 9)), [],
             [get_field('occupation'),
              get_field('religion')]),
            ((slice(1, None), slice(iRef + 9, iRef + 10)), [],
             [get_field('location')]),
            ((slice(1, None), slice(iRef + 10, iRef + 11)), [], [
                get_field('lastname'),
                get_field('firstname'),
                get_field('lastname'),
                get_field('firstname')
            ]),
            ((slice(1, None), slice(iRef + 11, iRef + 12)), [],
             [get_field('situation')]),
            ((slice(1, None), slice(iRef + 12, iRef + 13)), [],
             [get_field('Date')]),
            ((slice(1, None), slice(iRef + 13, iRef + 14)), [],
             [get_field('priest')]),
            ((slice(1, None), slice(iRef + 14, iRef + 15)), [], [
                get_field('lastname'),
                get_field('firstname'),
                get_field('lastname'),
                get_field('firstname'),
                get_field('location')
            ]),
            # NOTE(review): this is the only entry with an absolute column
            # range; all others are relative to iRef. Kept as it was --
            # confirm whether iRef + 15 .. iRef + 16 was intended.
            ((slice(1, None), slice(15, 16)), [], [get_field('notes')])
        ]

        self.extractData(table, wr, lTemplateIE)

        return wr