def run_extract_cmd(targetfield,
                    inputfile=None,
                    crop=False,
                    where='',
                    limit=1,
                    page=1,
                    psm=7,
                    outputpng=False,
                    printcmd=False,
                    outputcrop=False,
                    outputchop=False,
                    whiteThreshold=97,
                    validfile=None):
    """Run one extraction pass per entry of ``st.extract_params`` and validate it.

    For every parameter set in ``st.extract_params`` the function merges the
    set with the arguments given here (the arguments win on key clashes),
    calls ``rp.run_cmd`` with the merged keywords, and then pipes the pass's
    output file through ``validate.py`` via a shell command.

    The first pass reads ``inputfile``; every later pass reads the file that
    the previous pass's validation step wrote (``invalidfile``).  On the last
    pass ``outputpng``, ``outputcrop`` and ``outputchop`` are forced to True.

    Args:
        targetfield: Field name; used to build the output, invalid and
            default valid file names.
        inputfile: Input file handed to the first pass only.
        crop, where, limit, page, psm, outputpng, printcmd, outputcrop,
        outputchop, whiteThreshold: Forwarded unchanged to ``rp.run_cmd``
            on every pass.
        validfile: File passed as argument to ``validate.py``.  Defaults to
            ``<st.raw_data_dir><targetfield>.tsv``.

    Returns:
        None.
    """
    # Command-line level parameters; these override same-named keys of each
    # parameter set from ``st.extract_params``.
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    # Materialise once so the number of passes is known up front.
    param_sets = list(st.extract_params)
    last_index = len(param_sets) - 1
    invalidfile = None

    for i, p in enumerate(param_sets):
        # Copy-then-update works on Python 2 and 3 alike (concatenating
        # ``.items()`` only works on Python 2).
        all_p = dict(p)
        all_p.update(cl_params)

        # Pass 0 starts from the caller's input; later passes re-process
        # what the previous validation step wrote.
        all_p['inputfile'] = inputfile if i == 0 else invalidfile

        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        # Compare against the number of passes, not the number of keys in
        # the current parameter set.
        if i == last_index:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)

        # NOTE(review): presumably validate.py emits the rows that failed
        # validation against ``validfile`` -- confirm against validate.py.
        osutil.getStdoutFromCmd(
            '''cat %s | python %svalidate.py %s  > %s ''' %
            (all_p['outfile'], st.python_bin, validfile, invalidfile),
            shell=True)
def run_extract_cmd(targetfield, inputfile=None, crop=False, where='', limit=1, page=1, psm=7, outputpng=False, printcmd=False, outputcrop=False, outputchop=False, whiteThreshold=97, validfile=None):
    """Chain extraction passes over ``st.extract_params``, validating each one.

    Each entry of ``st.extract_params`` is merged with the keyword values
    supplied to this function (the supplied values take precedence) and
    handed to ``rp.run_cmd``.  After each pass the pass's output file is
    piped through ``validate.py`` with a shell command whose stdout is
    redirected to a per-pass "invalid" file.

    Pass 0 uses ``inputfile`` as its input; every subsequent pass uses the
    "invalid" file written after the preceding pass.  The final pass always
    runs with ``outputpng``, ``outputcrop`` and ``outputchop`` set to True.

    Args:
        targetfield: Field name used in the generated file names.
        inputfile: Input for the first pass.
        crop, where, limit, page, psm, outputpng, printcmd, outputcrop,
        outputchop, whiteThreshold: Passed through to ``rp.run_cmd``.
        validfile: Argument for ``validate.py``; when None it becomes
            ``<st.raw_data_dir><targetfield>.tsv``.

    Returns:
        None.
    """
    # Values given by the caller; applied on top of every parameter set.
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    # Take a list copy so the total number of passes can be computed.
    passes = list(st.extract_params)
    final_pass = len(passes) - 1
    invalidfile = None

    for i, p in enumerate(passes):
        # dict() + update() behaves the same on Python 2 and Python 3;
        # adding two ``.items()`` results fails on Python 3.
        all_p = dict(p)
        all_p.update(cl_params)

        if i == 0:
            all_p['inputfile'] = inputfile
        else:
            # Feed the previous pass's validation output back in.
            all_p['inputfile'] = invalidfile

        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        # The last pass is identified by the number of parameter sets, not
        # by the number of keys inside the current set.
        if i == final_pass:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)

        # NOTE(review): presumably validate.py prints the rows that do not
        # validate against ``validfile`` -- confirm against validate.py.
        osutil.getStdoutFromCmd('''cat %s | python %svalidate.py %s  > %s ''' % (all_p['outfile'], st.python_bin, validfile, invalidfile), shell=True)
Beispiel #3
0
 def test_invoice_cropped(self):
     """Check the text ``rp.run_cmd`` prints for the uncropped invoice field.

     Runs ``rp.run_cmd`` with ``crop`` disabled and ``psm`` 3, captures
     everything written to ``sys.stdout`` and compares the stripped, sorted
     output lines with the expected ones.
     """
     # NOTE(review): the method name says "cropped" but the parameters use
     # crop=False / 'invoice_uncropped' -- confirm which one is intended.
     expectedOut = ['', '', 'is: Midwest Communications & Media Attention: Accounts Payable 2015 Roundwyck Lane Powell, OH 43065']
     expectedOut.sort()

     params = {
         'size': 54,
         'resolution': 398,
         'targetfield': 'invoice_uncropped',
         'where': "KOCO-TV_14043097411984",
         'limit': 1,
         'page': 2,
         'noinfo': True,
         'crop': False,
         'psm': 3,
     }
     with patch('sys.stdout', new=BytesIO()) as cap_stdout:
         rp.run_cmd(**params)
         out = cap_stdout.getvalue()

     # Strip before sorting so surrounding whitespace in the captured
     # output cannot change the order being compared.
     out_list = sorted(o.strip() for o in out.split('\n'))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(out_list, expectedOut)
Beispiel #4
0
    def test_invoice(self):
        """Check the text ``rp.run_cmd`` prints for the cropped invoice field.

        Runs ``rp.run_cmd`` with ``crop`` enabled and ``psm`` 7, captures
        everything written to ``sys.stdout`` and compares the stripped,
        sorted output lines with the expected ones.
        """
        expectedOut = ['06104114 - 06/23/14', '', 'HIETT 4 CORP COMMISS', '', 'Hiean/Corporation Commi:', '1196633', '105126114 - 06120114', '1196633-1', 'W7196822', '', '', '']
        expectedOut.sort()

        params = {
            'size': 54,
            'resolution': 398,
            'targetfield': 'invoice',
            'where': "KOCO-TV_14043097411984",
            'limit': 1,
            'page': 2,
            'noinfo': True,
            'crop': True,
            'psm': 7,
        }
        with patch('sys.stdout', new=BytesIO()) as cap_stdout:
            rp.run_cmd(**params)
            out = cap_stdout.getvalue()

        # Strip before sorting so surrounding whitespace in the captured
        # output cannot change the order being compared.
        out_list = sorted(o.strip() for o in out.split('\n'))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(out_list, expectedOut)