def check_line(x):
    """Check a line to see if it still has weights.

    Input is a row from a dataframe (series). Return edited series.

    The row is re-parsed only when all of the following hold:

    * the padded name contains an ``r_sym`` match or a literal '%';
    * ``cent_wt`` is not the blank string;
    * either ``int(cent_wt)`` equals 100, or the matched symbol is followed
      (after optional whitespace/digits) by '%' in the name while
      ``cent_wt`` is above 50 and its string form has no decimal point.

    Each ``r_num`` match that contains '.' or '%' is then removed from the
    name and turned into a weight entry. The entries are passed through
    ``fix_dict`` and ``chem_format``, and the first formatted record is
    written onto a copy of the row.

    Parameters
    ----------
    x : pandas.Series
        Row providing the 'name', 'cas', 'cent_wt' and 'ci_color' fields.

    Returns
    -------
    pandas.Series
        A copy of ``x``; updated with the re-parsed fields when at least one
        weight was extracted, otherwise an unmodified copy.
    """
    # Pad with spaces; presumably the module-level patterns rely on
    # surrounding whitespace -- confirm against r_sym / r_num.
    r = ' ' + x['name'] + ' '
    xnew = x.copy()

    r_search = re.search(r_sym, r)
    if not (r_search or '%' in r) or x['cent_wt'] == '':
        return xnew

    cent_int = int(x['cent_wt'])
    if cent_int != 100:
        # The matched symbol is literal text: escape it before embedding it
        # in a new pattern so regex metacharacters in the match cannot
        # change the pattern's meaning or make it fail to compile.
        sym_before_pct = r_search and re.search(
            re.escape(r_search.group(0)) + r'[\s\d]*[\%]', r)
        # Use the integer value already computed above so that a numeric
        # string in cent_wt does not raise TypeError on comparison with 50.
        whole_over_50 = cent_int > 50 and '.' not in str(x['cent_wt'])
        if not (sym_before_pct and whole_over_50):
            return xnew

    d = {}
    for gwt in re.finditer(r_num, r):
        full_match = gwt.group(0)
        # Only numbers that look like weights (decimal or percent) count.
        if '.' not in full_match and '%' not in full_match:
            continue
        gp = [g for g in gwt.groups() if g is not None]
        if len(gp) > 2:
            print('WT Error: ' + str(gp))
            continue
        # One captured value -> single weight, two -> 'low-high' range.
        wt = '-'.join(str(g) for g in gp).replace(' ', '')

        # Each entry removes only its own match from the original name.
        new_name = re.sub(r'\s+', ' ', r.replace(full_match, ' ')).strip()
        print('---- ' + str(x.name) + ' ----')
        print(r + ' ---> ' + new_name)

        dtemp = {'cas': x['cas'], 'wt': wt}
        if not (pd.isna(x['ci_color']) or x['ci_color'] == ''):
            dtemp['ci_color'] = x['ci_color']
        d[new_name] = dtemp

    if d:
        # Only the first formatted record is applied to the row.
        dnew = chem_format(fix_dict(d))[0]
        for key, val in dnew.items():
            xnew[key] = val
    return xnew
def fix_range(x):
    """Fix entire range not being picked up.

    Handles names that end with the lower bound of a range followed by a
    dangling separator ('to', '-', '<', '=' or a two-character combination
    of the latter), where the other bound is already stored in ``cent_wt``.
    The trailing number and separator are removed from the name, a
    'low-cent_wt' weight string is built, and the result is run through
    ``fix_dict`` and ``chem_format``; the first formatted record is written
    onto a copy of the row. A trailing '- secret' marker is detached before
    matching and re-attached to the cleaned name.

    Parameters
    ----------
    x : pandas.Series
        Row providing the 'name', 'cas', 'cent_wt' and 'ci_color' fields.

    Returns
    -------
    pandas.Series
        ``x`` itself (not a copy) when ``cent_wt`` is blank; otherwise a
        copy of ``x``, updated only when the dangling-range pattern matched.
    """
    # 11963, 11962, 55824  (presumably example row ids -- not verifiable here)
    if x['cent_wt'] == '':
        return x
    r = ' ' + x['name'] + ' '
    cwt = x['cent_wt']

    # handle secret at end
    secret_tail = r'\s*(?:- secret)\s?$'
    secret = re.search(secret_tail, r) is not None
    if secret:
        r = re.sub(secret_tail, '', r).strip()

    xnew = x.copy()
    gwt = re.search(r_num2, r)
    if not (re.search(r_sym, r) or '%' in r):
        return xnew
    # Only proceed when the second capture group is empty (presumably the
    # missing upper bound of the range -- confirm against r_num2).
    if not gwt or gwt.group(2) is not None:
        return xnew

    # make sure full match is at the end, ends with - or to or something.
    # The matched number is literal text, so escape it: otherwise '.' acts
    # as a wildcard and characters such as '(' or '+' can raise re.error.
    r_end = re.compile(
        re.escape(gwt.group(0)) + r'\s*(?:to|[\-\<\=]{1,2})\s*$')
    match_end = r_end.search(r)
    if not match_end:
        return xnew

    low = str(gwt.group(1)).strip(' <>=')
    wt = (low + '-' + str(cwt)).replace(' ', '')
    new_name = r.replace(match_end.group(0), '').strip()
    if secret:
        new_name = new_name + ' - secret'

    dtemp = {'cas': x['cas'], 'wt': wt}
    if not (pd.isna(x['ci_color']) or x['ci_color'] == ''):
        dtemp['ci_color'] = x['ci_color']

    # Only the first formatted record is applied to the row.
    dnew = chem_format(fix_dict({new_name: dtemp}))[0]
    for key, val in dnew.items():
        xnew[key] = val

    print('---- ' + str(x.name) + ' ----')
    print(r + ' / ' + str(cwt) + ' ---> ' + new_name + ' / ' +
          str(xnew['min_wt']) + ' / ' + str(xnew['max_wt']))
    return xnew
def pdf_extract(fname, folder, tcomb, do_OCR=True, all_OCR=False,
                zipFile=None, fname_list=None):
    """Take a filename and return chemical info.

    Sorts the document with ``pdf_sort``, runs the section / fallback /
    label searches on whatever was produced, and merges the results into
    one table plus a small log record.

    Parameters
    ----------
    fname : sequence
        Its first element is used as the file name in the output table,
        the log record and the log messages; the whole object is forwarded
        to ``pdf_sort``.
    folder, do_OCR, all_OCR, zipFile, fname_list
        Forwarded unchanged to ``pdf_sort``.
    tcomb
        Forwarded unchanged to the name-search helpers.

    Returns
    -------
    tuple
        ``(df_comb, dinfo)``: the combined DataFrame (with a leading
        'filename' column) and a dict with the keys 'filename', 'OCR',
        'split', 'debug' and 'num_found'.
    """
    f = fname[0]
    comb = pdf_sort(fname, folder, do_OCR, all_OCR, zipFile, fname_list)

    # Layout of comb as used here:
    #   [0] text and pdf where the important parts were parsed
    #   [1] text and pdf where the parsing failed
    #   [2] images of labels to parse
    #   [3]..[9] step0_fail, step1_fail, step1_success, not_pdf, not_sds,
    #            too_3or4, no_3or4 -- not used in this function
    #   [10] needs_ocr, [11] failed_files, [12] split_pdfs
    needs_ocr, failed_files, split_pdfs = comb[10], comb[11], comb[12]
    to_sec, to_old, to_label = comb[0], comb[1], comb[2]

    sections = []      # which kind(s) of input were processed
    name_hits = []     # results of the name searches
    cas_hits = []      # CAS numbers found without a name search
    label_hits = []    # results of the label search

    # Each of these dicts is expected to hold a single item.
    for key, val in to_sec.items():
        sections.append('sec')
        chemicals = fun_chemicals(key, val)
        chemicals_add = fun_chemicals_add(key, val, chemicals)
        sec_search = fun_sec_search(key, val, tcomb)
        sec_search_wide = fun_wide_search(key, val, tcomb)
        name_hits = (name_hits
                     + [j for i in sec_search for j in i]
                     + sec_search_wide)
        cas_hits = (cas_hits
                    + [j for i in chemicals for j in i]
                    + chemicals_add)

    for key, val in to_old.items():
        sections.append('old')
        chemicals_old = fun_chemicals_old(key, val)
        old_search = fun_wide_search(key, val, tcomb)
        name_hits = name_hits + old_search
        cas_hits = cas_hits + chemicals_old

    for key, val in to_label.items():
        sections.append('label')
        label_hits = label_hits + fun_label_search(key, val, tcomb)

    # Names: expand every hit through fix_dict, then tabulate.
    name_hits = [j for i in name_hits for j in fix_dict(i)]
    n_named = len(name_hits)
    n_other = len(pd.unique(cas_hits)) + len(pd.unique(label_hits))
    logging.debug('%s: %s chems to df_search, %s other.',
                  f, str(n_named), str(n_other))
    df_search = pd.DataFrame(chem_format(name_hits)).drop_duplicates()

    blank = {
        'name': '',
        'cas': '',
        'min_wt': '',
        'cent_wt': '',
        'max_wt': '',
        'ci_color': '',
    }

    # CAS numbers not already present in the name-search table.
    if 'cas' in df_search.columns:
        seen_cas = df_search['cas'].values
        new_cas = [c for c in pd.unique(cas_hits) if c not in seen_cas]
    else:
        new_cas = list(pd.unique(cas_hits))
    to_add_df = pd.DataFrame([{**blank, 'cas': c} for c in new_cas])

    # Label names not already present in the name-search table.
    if 'name' in df_search.columns:
        seen_names = df_search['name'].values
        new_labels = [lab for lab in pd.unique(label_hits)
                      if lab not in seen_names]
    else:
        new_labels = list(pd.unique(label_hits))
    label_records = chem_format(new_labels)
    to_add_label_df = pd.DataFrame(
        [{**blank, 'name': n} for n in label_records])

    # Combine the three tables.
    df_comb = pd.concat([df_search, to_add_df, to_add_label_df])
    df_comb = df_comb.reset_index(drop=True)
    if len(df_comb) == 0:
        # Placeholder row so the frame carries the expected columns.
        df_comb = pd.DataFrame(blank, index=[0])

    def _row_flag(row):
        # 0 for a row whose summed cells strip to '', 1 otherwise.
        return 0 if row.sum().strip() == '' else 1

    keep = df_comb.apply(_row_flag, axis=1) == 1
    df_comb = df_comb.loc[keep]
    if n_named == 0 and n_other != 0 and len(df_comb) == 0:
        logging.warning('%s: Check documents for removed chemicals.', f)
    df_comb.insert(0, 'filename', f)

    # File info for the log.
    dinfo = {
        'filename': f,
        'OCR': len([i for i in needs_ocr if f in i]) > 0,
        'split': len([i for i in split_pdfs if f in i]) > 0,
    }
    if f in failed_files:
        dinfo['debug'] = 'failed'
    elif len(sections) == 0:
        dinfo['debug'] = 'missing'
    else:
        dinfo['debug'] = ','.join(list(pd.unique(sections)))
    if len(sections) > 1:
        print(sections)
        logging.warning('%s: Sorted into multiple sections.', f)
    dinfo['num_found'] = len(df_comb)

    logging.debug('%s: OCR=%s, split=%s, section=%s', dinfo['filename'],
                  str(dinfo['OCR']), str(dinfo['split']), dinfo['debug'])
    logging.info('%s: %s chemicals found.', dinfo['filename'],
                 str(dinfo['num_found']))
    return df_comb, dinfo