def run_extract_cmd(targetfield, inputfile=None, crop=False, where='',
                    limit=1, page=1, psm=7, outputpng=False, printcmd=False,
                    outputcrop=False, outputchop=False, whiteThreshold=97,
                    validfile=None):
    '''Run every parameter set in st.extract_params, validating after each.

    For each entry of ``st.extract_params`` (a dict of keyword arguments for
    ``rp.run_cmd``) the entry is merged with the arguments given here (the
    arguments given here win on key clashes), ``rp.run_cmd`` is called, and
    the resulting file is piped through ``validate.py``.

    The first pass reads ``inputfile``; every later pass reads the file that
    the previous pass's ``validate.py`` run wrote to ``st.invalid_data_dir``
    (presumably the rows that failed validation -- confirm in validate.py).

    On the last pass ``outputpng``, ``outputcrop`` and ``outputchop`` are
    forced to True.

    targetfield    -- field name; also used to build all file names.
    inputfile      -- input for the first pass, passed through to run_cmd.
    validfile      -- argument handed to validate.py; defaults to
                      ``<st.raw_data_dir><targetfield>.tsv``.
    All remaining keyword arguments are forwarded unchanged to rp.run_cmd.

    NOTE(review): this function is defined twice in this file; if both
    definitions are at module scope the later one takes effect.
    '''
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    passes = list(st.extract_params)
    # The original compared the index against len(p), i.e. the number of
    # keys in the current parameter dict, not the number of passes.
    last_pass = len(passes) - 1

    invalidfile = None
    for i, p in enumerate(passes):
        # Copy-then-update works on both Python 2 and 3 and keeps the
        # original precedence: explicit arguments override the preset.
        all_p = dict(p)
        all_p.update(cl_params)

        # Later passes only retry what the previous pass left unvalidated.
        all_p['inputfile'] = inputfile if i == 0 else invalidfile
        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        if i == last_pass:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)
        osutil.getStdoutFromCmd(
            '''cat %s | python %svalidate.py %s > %s ''' %
            (all_p['outfile'], st.python_bin, validfile, invalidfile),
            shell=True)
def checkKnownFields(tokens, field):
    '''Replace token values with the spelling found in a validation file.

    ``tokens`` maps a token string to its resolved value, or to None when it
    is still unresolved.  For each entry a search pattern is built from the
    value (or from the key when the value is None) in which every ``l``,
    ``i`` and ``@`` becomes the character class ``[li]``.  All patterns are
    chained as ``grep -w`` filters over ``validation/validate_<field>.tsv``.
    If anything survives the filters, each token is overwritten in place with
    the first substring of the grep output that its pattern matches.

    Returns None; ``tokens`` is modified in place.

    NOTE(review): token text and ``field`` are interpolated into a shell
    command without quoting -- confirm callers never pass untrusted text.
    '''
    if not tokens:
        # Nothing to resolve; avoid spawning a pipeline whose output would
        # be discarded.
        return

    # Freeze the key order once so patterns and keys stay aligned (the
    # original relied on Python 2's list-returning dict.items()).
    keys = list(tokens.keys())

    patterns = []
    for key in keys:
        source = key if tokens[key] is None else tokens[key]
        fuzzy = source.replace('l', '@')
        fuzzy = fuzzy.replace('i', '@')
        fuzzy = fuzzy.replace('@', '[li]')
        patterns.append(fuzzy.lower())

    greps = 'cat validation/validate_%s.tsv' % field
    greps += ''.join(''' | grep -w '%s' ''' % p for p in patterns)

    match = osutil.getStdoutFromCmd(greps, shell=True).strip()
    if match == '':
        return

    for key, pattern in zip(keys, patterns):
        matches = re.findall(pattern, match)
        # grep and Python's re use different regex dialects, so a line grep
        # accepted may still yield nothing here; keep the old value then
        # instead of raising IndexError.
        if matches:
            tokens[key] = matches[0]
def checkUrl(tokens, file_id):
    '''Resolve unresolved tokens against validation/validate_url.tsv.

    ``tokens`` maps a token string to its resolved value, or to None when it
    is still unresolved.  Only unresolved tokens are looked up.  Each one is
    turned into a pattern in which every ``l``, ``i`` and ``@`` becomes
    ``[li]``.  awk then selects the lines of ``validation/validate_url.tsv``
    that match ``file_id`` and every pattern; a token containing ``/`` is
    split at the slashes into separate awk conditions.  If awk prints
    anything, each unresolved token is set in place to the first substring of
    that output matched by its pattern.

    Returns None; ``tokens`` is modified in place.

    NOTE(review): token text and ``file_id`` are interpolated into a shell
    command without quoting -- confirm callers never pass untrusted text.
    '''
    # Freeze the key order once so patterns and keys stay aligned (the
    # original relied on Python 2's list-returning dict.items()).
    keys = list(tokens.keys())

    search_patterns = []  # per key: Python regex, or None if already known
    awk_patterns = []     # unresolved tokens only, slash-split for awk
    for key in keys:
        if tokens[key] is not None:
            search_patterns.append(None)
            continue
        fuzzy = key.replace('l', '@')
        fuzzy = fuzzy.replace('i', '@')
        fuzzy = fuzzy.replace('@', '[li]')
        # The '/ && /' separator only has meaning inside the awk program.
        # The original also fed it to re.findall, so a token containing '/'
        # never matched and matches[0] raised IndexError.
        search_patterns.append(fuzzy.lower())
        awk_patterns.append(fuzzy.replace('/', '/ && /'))

    if not awk_patterns:
        # Every token is already resolved; the awk condition would be the
        # match-everything '//' and its output would be ignored.
        return

    awk_tokens = '/%s/' % '/ && /'.join(awk_patterns)
    match = osutil.getStdoutFromCmd(
        '''awk '/%s/ && %s' validation/validate_url.tsv ''' %
        (file_id, awk_tokens.lower()),
        shell=True)
    match = match.strip()
    if match == '':
        return

    for key, pattern in zip(keys, search_patterns):
        if pattern is None:
            continue
        matches = re.findall(pattern, match)
        # awk and Python's re can disagree; leave the token unresolved
        # rather than crash when nothing is found.
        if matches:
            tokens[key] = matches[0]
def run_extract_cmd(targetfield, inputfile=None, crop=False, where='',
                    limit=1, page=1, psm=7, outputpng=False, printcmd=False,
                    outputcrop=False, outputchop=False, whiteThreshold=97,
                    validfile=None):
    '''Run every parameter set in st.extract_params, validating after each.

    For each entry of ``st.extract_params`` (a dict of keyword arguments for
    ``rp.run_cmd``) the entry is merged with the arguments given here (the
    arguments given here win on key clashes), ``rp.run_cmd`` is called, and
    the resulting file is piped through ``validate.py``.

    The first pass reads ``inputfile``; every later pass reads the file that
    the previous pass's ``validate.py`` run wrote to ``st.invalid_data_dir``
    (presumably the rows that failed validation -- confirm in validate.py).

    On the last pass ``outputpng``, ``outputcrop`` and ``outputchop`` are
    forced to True.

    targetfield    -- field name; also used to build all file names.
    inputfile      -- input for the first pass, passed through to run_cmd.
    validfile      -- argument handed to validate.py; defaults to
                      ``<st.raw_data_dir><targetfield>.tsv``.
    All remaining keyword arguments are forwarded unchanged to rp.run_cmd.

    NOTE(review): this function is defined twice in this file; if both
    definitions are at module scope the later one takes effect.
    '''
    cl_params = {
        'limit': limit,
        'page': page,
        'crop': crop,
        'psm': psm,
        'targetfield': targetfield,
        'where': where,
        'outputpng': outputpng,
        'outputcrop': outputcrop,
        'outputchop': outputchop,
        'printcmd': printcmd,
        'whiteThreshold': whiteThreshold,
    }

    if validfile is None:
        validfile = "%s%s.tsv" % (st.raw_data_dir, targetfield)

    passes = list(st.extract_params)
    # The original compared the index against len(p), i.e. the number of
    # keys in the current parameter dict, not the number of passes.
    last_pass = len(passes) - 1

    invalidfile = None
    for i, p in enumerate(passes):
        # Copy-then-update works on both Python 2 and 3 and keeps the
        # original precedence: explicit arguments override the preset.
        all_p = dict(p)
        all_p.update(cl_params)

        # Later passes only retry what the previous pass left unvalidated.
        all_p['inputfile'] = inputfile if i == 0 else invalidfile
        all_p['outfile'] = "%s%s_%i.tsv" % (st.to_validate_dir, targetfield, i)
        invalidfile = "%s%s_%i.tsv" % (st.invalid_data_dir, targetfield, i)

        if i == last_pass:
            all_p['outputpng'] = True
            all_p['outputcrop'] = True
            all_p['outputchop'] = True

        rp.run_cmd(**all_p)
        osutil.getStdoutFromCmd(
            '''cat %s | python %svalidate.py %s > %s ''' %
            (all_p['outfile'], st.python_bin, validfile, invalidfile),
            shell=True)
def checkCommonTokensFuzzy(tokens):
    '''Fill in unresolved tokens from validation/common_tokens.tsv.

    ``tokens`` maps a token string to its resolved value, or to None when it
    is still unresolved.  For every unresolved token a pattern is built in
    which each ``l``, ``i`` and ``@`` becomes the character class ``[li]``;
    the lower-cased pattern is handed to ``grep -x -m 1`` (whole-line match,
    first hit only).  A non-empty result is stored as the token's value.

    Returns None; ``tokens`` is modified in place.
    '''
    # Only the entry being processed is ever written, so deciding up front
    # which tokens need a lookup is equivalent to checking during the loop.
    unresolved = [name for name in tokens.keys() if tokens[name] is None]

    for name in unresolved:
        fuzzy = ''.join('[li]' if ch in 'li@' else ch for ch in name)
        command = '''grep -x -m 1 '%s' validation/common_tokens.tsv ''' % fuzzy.lower()
        found = osutil.getStdoutFromCmd(command, shell=True).strip()
        if found != '':
            tokens[name] = found
def run_cmd(resolution, targetfield, size=100, inputfile=None, outfile=None,
            crop=False, where='', limit=1, page=1, psm=7, rotation=None,
            language='engarial', outputpng=False, maxprocs=8, printcmd=False,
            limitIMthreads=True, median=None, sharpen=None, textcleaner=None,
            nudge=None, bbox=False, targettext=None, outputcrop=False,
            outputchop=False, threshold=False, deskew=None, cuneiform=False,
            whiteThreshold=None, noinfo=False, dryrun=None):
    '''Extract fields from a PDF using GNU Parallel

    -o <str>, --outfile=<str>
    -s <int>, --size=<int>
    -r <int>, --resolution=<int>
    '''
    # NOTE(review): the docstring above looks like command-line option
    # metadata, so its text is left as found; confirm before rewording it.
    #
    # Builds one shell pipeline and either prints it (printcmd) or runs it
    # through osutil.getStdoutFromCmd and prints any non-blank output:
    #
    #   row source | parallel 'gs | convert [| cropbox | convert]
    #                          [| textcleaner] [| threshold] | OCR
    #                          | tr/sed [| sed addinfo]' [> outfile]
    #
    # Row source: `cat inputfile` when inputfile is given, otherwise pq.py
    # printParallelParams filtered by where/limit/resolution/targetfield.
    # {1}..{8} are GNU parallel replacement strings for the tab-separated
    # input columns (--colsep); {1} is the file handed to gs -f.
    #
    # Options (None / False / default means "leave that stage out"):
    #   size            resize percentage; 100 adds no -resize.
    #   median, sharpen, rotation
    #                   extra arguments to the first `convert`.
    #   crop            add cropbox.py plus the chopping `convert`.
    #   deskew, whiteThreshold
    #                   arguments of the chopping `convert` (need crop).
    #   nudge           string such as "(l,r,d,u)" of four ints; shifts the
    #                   ghostscript page translation.
    #   textcleaner     0, 1 or 2 selects a textcleaner preset; any other
    #                   value adds nothing.
    #   threshold       add `threshold.py gaussian 40`.
    #   cuneiform       OCR with cuneiform instead of tesseract303.
    #   psm, language   tesseract303 arguments.
    #   bbox, targettext
    #                   append `hocrCoords.py <targettext>` to the OCR step.
    #   noinfo          skip the sed step that appends page, parameters and
    #                   input columns to each output line.
    #   outputpng, outputcrop, outputchop
    #                   tee intermediate images into /tmp.
    #   maxprocs, dryrun, limitIMthreads
    #                   parallel --max-procs, parallel --dryrun, and
    #                   ImageMagick `-limit thread 1`.
    #
    # NOTE(review): this function is defined twice in this file; if both
    # definitions are at module scope the later one takes effect.
    #
    # WARNING: most command fragments are filled in with `% locals()`, so
    # the names of the parameters and locals below are part of the format
    # strings.  Renaming one silently breaks the command.
    python_bin = st.python_bin

    chop_top = 50
    chop_bottom = 20
    chop_right = 1
    chop_left = 0
    (l, r, d, u) = (0, 0, 0, 0)

    limitIM = ''
    resize_arg = ''
    median_arg = ''
    sharpen_arg = ''
    rotation_arg = ''
    deskew_arg = ''
    whiteThreshold_arg = ''
    cropbox = ''
    convert_crop = ''
    textcleaner_arg = ''
    threshold_arg = ''
    output_file = ''
    dryrun_arg = ''

    # Human-readable summary of the settings; appended to every output row.
    extract_params = (
        'resolution %(resolution)s size %(size)s median %(median)s '
        'sharpen %(sharpen)s textcleaner %(textcleaner)s nudge %(nudge)s '
        'deskew %(deskew)s threshold %(threshold)s cuneiform %(cuneiform)s'
    ) % locals()

    if limitIMthreads:
        limitIM = '-limit thread 1'
    if size != 100:
        resize_arg = '-resize %i%%' % size
    if median is not None:
        median_arg = '-median %d' % median
    if sharpen is not None:
        sharpen_arg = '-sharpen %s' % sharpen
    if rotation is not None:
        # ImageMagick's option is -rotate; the original emitted -rotation,
        # which convert does not recognise.
        rotation_arg = '-rotate %i' % rotation
    if deskew is not None:
        deskew_arg = '-deskew %i' % deskew
    if whiteThreshold is not None:
        whiteThreshold_arg = '-white-threshold %i%%' % whiteThreshold
    if nudge is not None:
        # Drop the surrounding brackets, then read four comma-separated ints.
        nudgestrs = nudge.strip()[1:-1].split(',')
        (l, r, d, u) = map(int, nudgestrs)
    if dryrun is not None:
        dryrun_arg = '--dryrun'

    # The leading single quote opens the command string given to parallel;
    # it is closed by the quote appended to addinfo below.
    ghostscript = (
        ''''gs -q -dSAFER -sDEVICE=png16m '''
        '''-dFirstPage=%(page)i -dLastPage=%(page)i '''
        '''-g{2}x{3} -r%(resolution)s -o - '''
        '''-c "<</Install {-$((%(r)i-%(l)i+{4})) -$((%(u)i-%(d)i+{5})) translate}>> setpagedevice" '''
        '''-f "{1}" '''
    ) % locals()

    if inputfile is None:
        pdfpaths = '''python %(python_bin)spq.py printParallelParams --where="%(where)s" --limit="%(limit)s" --resolution="%(resolution)s" --targetfield="%(targetfield)s"''' % locals()
    else:
        pdfpaths = '''cat %(inputfile)s''' % locals()

    parallel = '''| parallel %(dryrun_arg)s --no-run-if-empty --colsep '\\t' --max-procs=%(maxprocs)i --ungroup''' % locals()

    convert_resize = '''| convert %(limitIM)s %(median_arg)s %(sharpen_arg)s %(rotation_arg)s %(resize_arg)s - - ''' % locals()
    if outputpng:
        convert_resize += ''' | tee /tmp/{1/.}_{8}.png'''

    if crop:
        cropbox = '''| python -u %(python_bin)scropbox.py {6} {7}''' % locals()
        convert_crop = (
            '''| convert %(limitIM)s '''
            '''-gravity North -chop 0x%(chop_top)s%% '''
            '''-gravity East -chop %(chop_right)ix0%% '''
            '''-gravity West -chop %(chop_left)ix0%% '''
            '''-gravity South -chop 0x%(chop_bottom)s%% '''
            '''-bordercolor white -border 4x4 '''
            '''%(deskew_arg)s %(whiteThreshold_arg)s - -'''
        ) % locals()

    # NOTE(review): the source's indentation was lost; these two checks are
    # assumed to be independent of `crop` (like every other option here).
    # Confirm against the original file.
    if outputcrop:
        cropbox += ''' | tee /tmp/{1/.}_crop_{8}.png'''
    if outputchop:
        convert_crop += ''' | tee /tmp/{1/.}_chop_{8}.png'''

    if textcleaner is not None:
        if textcleaner == 0:
            textcleaner_arg = '| textcleaner -T -s 1 png:- png:-'
        elif textcleaner == 1:
            textcleaner_arg = '| textcleaner -T -e normalize png:- png:-'
        elif textcleaner == 2:
            textcleaner_arg = '| textcleaner -T -s 1 -e normalize png:- png:-'

    if threshold:
        threshold_arg = '| python threshold.py gaussian 40'

    if not cuneiform:
        ocr = '''| tesseract303 - - -psm %(psm)s -lang=%(language)s {8} 2> /dev/null | python %(python_bin)snoinput.py''' % locals()
    else:
        ocr = (
            '''| cuneiform --singlecolumn - -o /tmp/{1/.}_cuneiform.txt '''
            '''> /dev/null 2> /dev/null '''
            '''|| touch /tmp/{1/.}_cuneiform.txt '''
            '''&& cat /tmp/{1/.}_cuneiform.txt '''
            '''| python %(python_bin)snoinput.py'''
        ) % locals()

    # Squeeze newlines to spaces, then append a newline at the end of the
    # (now single) line.
    remove_nl = ''' | tr -s "\\n" " " | sed "s/$/\\n/g" '''

    if bbox:
        ocr += '''| python hocrCoords.py %(targettext)s''' % locals()

    addinfo = ''
    if not noinfo:
        # Backslashes are doubled so Python passes \( \+ \) \1 \t to sed
        # verbatim instead of relying on unrecognised escape sequences.
        addinfo = (
            ''' | sed -e "s|\\(.\\+\\)$|\\1\\t{8}\\t%(page)i\\t%(extract_params)s'''
            '''\\t{1}\\t{2}\\t{3}\\t{4}\\t{5}\\t{6}\\t{7}\\t{8}|g" '''
        ) % locals()
    addinfo += '\''  # quote for end of parallel section

    if outfile is not None:
        output_file = '''> %(outfile)s ''' % locals()

    cmd = ' '.join([pdfpaths, parallel, ghostscript, convert_resize, cropbox,
                    convert_crop, textcleaner_arg, threshold_arg, ocr,
                    remove_nl, addinfo, output_file])

    if printcmd:
        print(cmd)
    else:
        cmdout = osutil.getStdoutFromCmd(cmd, shell=True)
        if cmdout.strip() != '':
            # Function-call form prints the same text on Python 2 and 3.
            print(cmdout)

    # remove cuneiform output files
    # NOTE(review): assumed to run on every call (indentation was lost in
    # the source); it removes every /tmp/*cuneiform.txt, not only files
    # created by this invocation.
    filelist = glob.glob("/tmp/*cuneiform.txt")
    for f in filelist:
        os.remove(f)
def run_cmd(resolution, targetfield, size=100, inputfile=None, outfile=None,
            crop=False, where='', limit=1, page=1, psm=7, rotation=None,
            language='engarial', outputpng=False, maxprocs=8, printcmd=False,
            limitIMthreads=True, median=None, sharpen=None, textcleaner=None,
            nudge=None, bbox=False, targettext=None, outputcrop=False,
            outputchop=False, threshold=False, deskew=None, cuneiform=False,
            whiteThreshold=None, noinfo=False, dryrun=None):
    '''Extract fields from a PDF using GNU Parallel

    -o <str>, --outfile=<str>
    -s <int>, --size=<int>
    -r <int>, --resolution=<int>
    '''
    # NOTE(review): the docstring above looks like command-line option
    # metadata, so its text is left as found; confirm before rewording it.
    #
    # Builds one shell pipeline and either prints it (printcmd) or runs it
    # through osutil.getStdoutFromCmd and prints any non-blank output:
    #
    #   row source | parallel 'gs | convert [| cropbox | convert]
    #                          [| textcleaner] [| threshold] | OCR
    #                          | tr/sed [| sed addinfo]' [> outfile]
    #
    # Row source: `cat inputfile` when inputfile is given, otherwise pq.py
    # printParallelParams filtered by where/limit/resolution/targetfield.
    # {1}..{8} are GNU parallel replacement strings for the tab-separated
    # input columns (--colsep); {1} is the file handed to gs -f.
    #
    # Options (None / False / default means "leave that stage out"):
    #   size            resize percentage; 100 adds no -resize.
    #   median, sharpen, rotation
    #                   extra arguments to the first `convert`.
    #   crop            add cropbox.py plus the chopping `convert`.
    #   deskew, whiteThreshold
    #                   arguments of the chopping `convert` (need crop).
    #   nudge           string such as "(l,r,d,u)" of four ints; shifts the
    #                   ghostscript page translation.
    #   textcleaner     0, 1 or 2 selects a textcleaner preset; any other
    #                   value adds nothing.
    #   threshold       add `threshold.py gaussian 40`.
    #   cuneiform       OCR with cuneiform instead of tesseract303.
    #   psm, language   tesseract303 arguments.
    #   bbox, targettext
    #                   append `hocrCoords.py <targettext>` to the OCR step.
    #   noinfo          skip the sed step that appends page, parameters and
    #                   input columns to each output line.
    #   outputpng, outputcrop, outputchop
    #                   tee intermediate images into /tmp.
    #   maxprocs, dryrun, limitIMthreads
    #                   parallel --max-procs, parallel --dryrun, and
    #                   ImageMagick `-limit thread 1`.
    #
    # NOTE(review): this function is defined twice in this file; if both
    # definitions are at module scope the later one takes effect.
    #
    # WARNING: most command fragments are filled in with `% locals()`, so
    # the names of the parameters and locals below are part of the format
    # strings.  Renaming one silently breaks the command.
    python_bin = st.python_bin

    chop_top = 50
    chop_bottom = 20
    chop_right = 1
    chop_left = 0
    (l, r, d, u) = (0, 0, 0, 0)

    limitIM = ''
    resize_arg = ''
    median_arg = ''
    sharpen_arg = ''
    rotation_arg = ''
    deskew_arg = ''
    whiteThreshold_arg = ''
    cropbox = ''
    convert_crop = ''
    textcleaner_arg = ''
    threshold_arg = ''
    output_file = ''
    dryrun_arg = ''

    # Human-readable summary of the settings; appended to every output row.
    extract_params = (
        'resolution %(resolution)s size %(size)s median %(median)s '
        'sharpen %(sharpen)s textcleaner %(textcleaner)s nudge %(nudge)s '
        'deskew %(deskew)s threshold %(threshold)s cuneiform %(cuneiform)s'
    ) % locals()

    if limitIMthreads:
        limitIM = '-limit thread 1'
    if size != 100:
        resize_arg = '-resize %i%%' % size
    if median is not None:
        median_arg = '-median %d' % median
    if sharpen is not None:
        sharpen_arg = '-sharpen %s' % sharpen
    if rotation is not None:
        # ImageMagick's option is -rotate; the original emitted -rotation,
        # which convert does not recognise.
        rotation_arg = '-rotate %i' % rotation
    if deskew is not None:
        deskew_arg = '-deskew %i' % deskew
    if whiteThreshold is not None:
        whiteThreshold_arg = '-white-threshold %i%%' % whiteThreshold
    if nudge is not None:
        # Drop the surrounding brackets, then read four comma-separated ints.
        nudgestrs = nudge.strip()[1:-1].split(',')
        (l, r, d, u) = map(int, nudgestrs)
    if dryrun is not None:
        dryrun_arg = '--dryrun'

    # The leading single quote opens the command string given to parallel;
    # it is closed by the quote appended to addinfo below.
    ghostscript = (
        ''''gs -q -dSAFER -sDEVICE=png16m '''
        '''-dFirstPage=%(page)i -dLastPage=%(page)i '''
        '''-g{2}x{3} -r%(resolution)s -o - '''
        '''-c "<</Install {-$((%(r)i-%(l)i+{4})) -$((%(u)i-%(d)i+{5})) translate}>> setpagedevice" '''
        '''-f "{1}" '''
    ) % locals()

    if inputfile is None:
        pdfpaths = '''python %(python_bin)spq.py printParallelParams --where="%(where)s" --limit="%(limit)s" --resolution="%(resolution)s" --targetfield="%(targetfield)s"''' % locals()
    else:
        pdfpaths = '''cat %(inputfile)s''' % locals()

    parallel = '''| parallel %(dryrun_arg)s --no-run-if-empty --colsep '\\t' --max-procs=%(maxprocs)i --ungroup''' % locals()

    convert_resize = '''| convert %(limitIM)s %(median_arg)s %(sharpen_arg)s %(rotation_arg)s %(resize_arg)s - - ''' % locals()
    if outputpng:
        convert_resize += ''' | tee /tmp/{1/.}_{8}.png'''

    if crop:
        cropbox = '''| python -u %(python_bin)scropbox.py {6} {7}''' % locals()
        convert_crop = (
            '''| convert %(limitIM)s '''
            '''-gravity North -chop 0x%(chop_top)s%% '''
            '''-gravity East -chop %(chop_right)ix0%% '''
            '''-gravity West -chop %(chop_left)ix0%% '''
            '''-gravity South -chop 0x%(chop_bottom)s%% '''
            '''-bordercolor white -border 4x4 '''
            '''%(deskew_arg)s %(whiteThreshold_arg)s - -'''
        ) % locals()

    # NOTE(review): the source's indentation was lost; these two checks are
    # assumed to be independent of `crop` (like every other option here).
    # Confirm against the original file.
    if outputcrop:
        cropbox += ''' | tee /tmp/{1/.}_crop_{8}.png'''
    if outputchop:
        convert_crop += ''' | tee /tmp/{1/.}_chop_{8}.png'''

    if textcleaner is not None:
        if textcleaner == 0:
            textcleaner_arg = '| textcleaner -T -s 1 png:- png:-'
        elif textcleaner == 1:
            textcleaner_arg = '| textcleaner -T -e normalize png:- png:-'
        elif textcleaner == 2:
            textcleaner_arg = '| textcleaner -T -s 1 -e normalize png:- png:-'

    if threshold:
        threshold_arg = '| python threshold.py gaussian 40'

    if not cuneiform:
        ocr = '''| tesseract303 - - -psm %(psm)s -lang=%(language)s {8} 2> /dev/null | python %(python_bin)snoinput.py''' % locals()
    else:
        ocr = (
            '''| cuneiform --singlecolumn - -o /tmp/{1/.}_cuneiform.txt '''
            '''> /dev/null 2> /dev/null '''
            '''|| touch /tmp/{1/.}_cuneiform.txt '''
            '''&& cat /tmp/{1/.}_cuneiform.txt '''
            '''| python %(python_bin)snoinput.py'''
        ) % locals()

    # Squeeze newlines to spaces, then append a newline at the end of the
    # (now single) line.
    remove_nl = ''' | tr -s "\\n" " " | sed "s/$/\\n/g" '''

    if bbox:
        ocr += '''| python hocrCoords.py %(targettext)s''' % locals()

    addinfo = ''
    if not noinfo:
        # Backslashes are doubled so Python passes \( \+ \) \1 \t to sed
        # verbatim instead of relying on unrecognised escape sequences.
        addinfo = (
            ''' | sed -e "s|\\(.\\+\\)$|\\1\\t{8}\\t%(page)i\\t%(extract_params)s'''
            '''\\t{1}\\t{2}\\t{3}\\t{4}\\t{5}\\t{6}\\t{7}\\t{8}|g" '''
        ) % locals()
    addinfo += '\''  # quote for end of parallel section

    if outfile is not None:
        output_file = '''> %(outfile)s ''' % locals()

    cmd = ' '.join([pdfpaths, parallel, ghostscript, convert_resize, cropbox,
                    convert_crop, textcleaner_arg, threshold_arg, ocr,
                    remove_nl, addinfo, output_file])

    if printcmd:
        print(cmd)
    else:
        cmdout = osutil.getStdoutFromCmd(cmd, shell=True)
        if cmdout.strip() != '':
            # Function-call form prints the same text on Python 2 and 3.
            print(cmdout)

    # remove cuneiform output files
    # NOTE(review): assumed to run on every call (indentation was lost in
    # the source); it removes every /tmp/*cuneiform.txt, not only files
    # created by this invocation.
    filelist = glob.glob("/tmp/*cuneiform.txt")
    for f in filelist:
        os.remove(f)