コード例 #1
0
def _cent_wt_as_number(value):
    """Return ``value`` as a finite float, or None if it is not one.

    Accepts anything ``float()`` accepts (ints, floats, numeric strings).
    Non-numeric strings, None, NaN and infinities all yield None.
    """
    import math

    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if math.isfinite(number) else None


def check_line(x):
    """Check a line to see if its name still has weights embedded in it.

    Parameters
    ----------
    x : pandas.Series
        A row from a dataframe. The keys 'name', 'cent_wt', 'cas' and
        'ci_color' are read, and ``x.name`` is printed as the row label.

    Returns
    -------
    pandas.Series
        A copy of ``x``. When weights are pulled out of the name, the
        copy is updated with every key/value pair of
        ``chem_format(fix_dict(...))[0]``; otherwise it is unchanged.

    Notes
    -----
    A row is only processed when the name contains a ``r_sym`` match or
    a '%' sign and 'cent_wt' is either

    * 100 once truncated to an int, or
    * above 50, written without a decimal point, and the ``r_sym``
      match is followed in the name by optional digits/whitespace and
      a '%'.

    A 'cent_wt' that is empty or cannot be read as a finite number
    leaves the row unchanged instead of raising.
    """
    r = ' ' + x['name'] + ' '
    xnew = x.copy()

    raw_wt = x['cent_wt']
    if raw_wt == '':
        return xnew

    # Parse once so numeric strings compare safely (str > int raises in
    # Python 3) and decimal strings do not break int().
    cent = _cent_wt_as_number(raw_wt)
    if cent is None:
        return xnew

    r_search = re.search(r_sym, r)
    if not (r_search or '%' in r):
        return xnew

    if int(cent) != 100:
        # The matched symbol is literal text from the name, so escape it
        # before reusing it inside a pattern.
        sym_then_pct = r_search and re.search(
            re.escape(r_search.group(0)) + r'[\s\d]*[\%]', r)
        whole_over_half = cent > 50 and '.' not in str(raw_wt)
        if not (sym_then_pct and whole_over_half):
            return xnew

    d = {}
    for gwt in re.finditer(r_num, r):
        full_match = gwt.group(0)
        # only matches that contain a decimal point or a percent sign
        # are treated as weights
        if '.' not in full_match and '%' not in full_match:
            continue

        gp = [i for i in gwt.groups() if i is not None]
        if len(gp) > 2:
            print('WT Error: ' + str(gp))
            continue
        # zero groups -> '', one -> 'a', two -> 'a-b'
        wt = '-'.join(str(g) for g in gp).replace(' ', '')

        # NOTE: each match is removed from the original name on its own,
        # so several matches produce several candidate names.
        new_name = r.replace(full_match, ' ')
        new_name = re.sub(r'\s+', ' ', new_name).strip()

        print('---- ' + str(x.name) + ' ----')
        print(r + ' ---> ' + new_name)

        dtemp = {'cas': x['cas'], 'wt': wt}
        if not (pd.isna(x['ci_color']) or x['ci_color'] == ''):
            dtemp['ci_color'] = x['ci_color']
        d[new_name] = dtemp

    if d:
        dfix = fix_dict(d)
        dnew = chem_format(dfix)[0]

        for key, val in dnew.items():
            xnew[key] = val

    return xnew
コード例 #2
0
def fix_range(x):
    """Fix entire range not being picked up.

    Handles names that end in a dangling lower bound such as
    ``'... 10 -'`` or ``'... 10 to'``: that bound is joined with the
    row's 'cent_wt' into a ``low-high`` weight and the row is
    re-formatted.

    Parameters
    ----------
    x : pandas.Series
        A row from a dataframe. The keys 'name', 'cent_wt', 'cas' and
        'ci_color' are read, and ``x.name`` is printed as the row label.

    Returns
    -------
    pandas.Series
        ``x`` itself when 'cent_wt' is the empty string. Otherwise a
        copy of ``x``, updated with every key/value pair of
        ``chem_format(fix_dict(...))[0]`` when a dangling range is
        found, and unchanged when it is not.
    """
    # example rows noted by the original author: 11963, 11962, 55824
    # (presumably row indices -- confirm)
    if x['cent_wt'] == '':
        return x
    r = ' ' + x['name'] + ' '
    cwt = x['cent_wt']

    # handle secret at end: strip it for matching, re-append it later
    secret_pattern = r'\s*(?:- secret)\s?$'
    secret = bool(re.search(secret_pattern, r))
    if secret:
        r = re.sub(secret_pattern, '', r).strip()

    xnew = x.copy()
    gwt = re.search(r_num2, r)
    has_marker = re.search(r_sym, r) or '%' in r
    if not (has_marker and gwt and gwt.group(2) is None):
        return xnew

    # make sure full match is at the end, ends with - or to or something.
    # The match is literal text from the name, so escape it; otherwise
    # the '.' in e.g. '1.5' would act as a wildcard.
    full_match = gwt.group(0)
    r_end = re.compile(
        re.escape(full_match) + r'\s*(?:to|[\-\<\=]{1,2})\s*$')
    match_end = re.search(r_end, r)
    if not match_end:
        return xnew

    wt = str(gwt.group(1)).strip(' <>=') + '-' + str(cwt)
    wt = wt.replace(' ', '')

    new_name = r.replace(match_end.group(0), '').strip()
    if secret:
        new_name = new_name + ' - secret'

    dtemp = {'cas': x['cas'], 'wt': wt}
    if not (pd.isna(x['ci_color']) or x['ci_color'] == ''):
        dtemp['ci_color'] = x['ci_color']

    dfix = fix_dict({new_name: dtemp})
    dnew = chem_format(dfix)[0]

    for key, val in dnew.items():
        xnew[key] = val

    print('---- ' + str(x.name) + ' ----')
    print(r + ' / ' + str(cwt) + ' ---> ' + new_name + ' / ' +
          str(xnew['min_wt']) + ' / ' + str(xnew['max_wt']))

    return xnew
コード例 #3
0
def pdf_extract(fname,
                folder,
                tcomb,
                do_OCR=True,
                all_OCR=False,
                zipFile=None,
                fname_list=None):
    """Take a filename and return chemical info.

    The file is sorted with ``pdf_sort`` and each of the three result
    groups (parsed sections, unparsed text, label images) is searched.
    Everything found is merged into a single table.

    Parameters
    ----------
    fname : sequence
        Passed through to ``pdf_sort``; ``fname[0]`` is used as the
        file's name in the output and in log messages.
    folder, do_OCR, all_OCR, zipFile, fname_list
        Passed through to ``pdf_sort`` unchanged.
    tcomb
        Passed through to the section, wide and label search helpers.

    Returns
    -------
    tuple
        ``(df_comb, dinfo)``. ``df_comb`` is a DataFrame whose first
        column is 'filename'. ``dinfo`` is a dict with the keys
        'filename', 'OCR', 'split', 'debug' and 'num_found'.
    """
    f = fname[0]
    comb = pdf_sort(fname, folder, do_OCR, all_OCR, zipFile, fname_list)

    # Log-related output. Positions 3-9 (step0_fail, step1_fail,
    # step1_success, not_pdf, not_sds, too_3or4, no_3or4) are not used.
    needs_ocr, failed_files, split_pdfs = comb[10], comb[11], comb[12]

    # Data for processing:
    #   to_sec   - text and pdf where the important parts were parsed
    #   to_old   - text and pdf where the parsing failed
    #   to_label - images of labels to parse
    to_sec, to_old, to_label = comb[0], comb[1], comb[2]

    sc = []      # which kinds of input were seen ('sec', 'old', 'label')
    named = []   # name-based search results
    casno = []   # CAS numbers
    nlabel = []  # names read from labels

    def blank_row(name='', cas=''):
        """Build one output record with only name and/or cas filled."""
        return {
            'name': name,
            'cas': cas,
            'min_wt': '',
            'cent_wt': '',
            'max_wt': '',
            'ci_color': ''
        }

    # Search for information. Each of these loops is 1 item long.
    for key, val in to_sec.items():
        sc.append('sec')

        chemicals = fun_chemicals(key, val)
        chemicals_add = fun_chemicals_add(key, val, chemicals)
        sec_search = fun_sec_search(key, val, tcomb)
        sec_search_wide = fun_wide_search(key, val, tcomb)

        # sec_search and chemicals are nested one level deeper than the
        # "wide"/"add" results, so flatten them before joining
        named = named + [hit for group in sec_search for hit in group] \
            + sec_search_wide
        casno = casno + [cas for group in chemicals for cas in group] \
            + chemicals_add

    for key, val in to_old.items():
        sc.append('old')

        chemicals_old = fun_chemicals_old(key, val)
        old_search = fun_wide_search(key, val, tcomb)

        named = named + old_search
        casno = casno + chemicals_old

    for key, val in to_label.items():
        sc.append('label')

        nlabel = nlabel + fun_label_search(key, val, tcomb)

    # Aggregate names
    named = [fixed for entry in named for fixed in fix_dict(entry)]
    chemct1 = len(named)
    chemct2 = len(pd.unique(casno)) + len(pd.unique(nlabel))
    logging.debug('%s: %s chems to df_search, %s other.', f, str(chemct1),
                  str(chemct2))
    df_search = pd.DataFrame(chem_format(named))
    df_search.drop_duplicates(inplace=True)

    # Aggregate cas: keep only numbers the name search did not yield
    unique_cas = list(pd.unique(casno))
    if 'cas' in df_search.columns:
        known_cas = df_search['cas'].values
        casno = [cas for cas in unique_cas if cas not in known_cas]
    else:
        casno = unique_cas
    to_add_df = pd.DataFrame([blank_row(cas=cas) for cas in casno])

    # Aggregate label info: same filtering, by name
    unique_labels = list(pd.unique(nlabel))
    if 'name' in df_search.columns:
        known_names = df_search['name'].values
        unique_labels = [
            label for label in unique_labels if label not in known_names
        ]
    nlabel = chem_format(unique_labels)
    to_add_label_df = pd.DataFrame(
        [blank_row(name=label) for label in nlabel])

    # Combine
    df_comb = pd.concat([df_search, to_add_df, to_add_label_df])
    df_comb = df_comb.reset_index(drop=True)
    if len(df_comb) == 0:
        df_comb = pd.DataFrame(blank_row(), index=[0])

    # Drop rows whose fields concatenate to nothing but whitespace
    row_flags = df_comb.apply(
        lambda row: 1 if row.sum().strip() != '' else 0, axis=1)
    df_comb = df_comb.loc[row_flags == 1]

    if len(df_comb) == 0 and chemct1 == 0 and chemct2 != 0:
        logging.warning('%s: Check documents for removed chemicals.', f)

    df_comb.insert(0, 'filename', f)

    # Create file info for log
    was_ocr = any(f in entry for entry in needs_ocr)
    was_split = any(f in entry for entry in split_pdfs)

    if f in failed_files:
        debug_state = 'failed'
    elif not sc:
        debug_state = 'missing'
    else:
        debug_state = ','.join(list(pd.unique(sc)))
        if len(sc) > 1:
            print(sc)
            logging.warning('%s: Sorted into multiple sections.', f)

    dinfo = {
        'filename': f,
        'OCR': was_ocr,
        'split': was_split,
        'debug': debug_state,
        'num_found': len(df_comb),
    }

    logging.debug('%s: OCR=%s, split=%s, section=%s', dinfo['filename'],
                  str(dinfo['OCR']), str(dinfo['split']), dinfo['debug'])
    logging.info('%s: %s chemicals found.', dinfo['filename'],
                 str(dinfo['num_found']))

    return df_comb, dinfo