Example #1
0
def make_trinomial_instances_df(doc_dir):
    """Scan the ``.txt`` files under *doc_dir* for trinomial-like tokens.

    Walks *doc_dir* recursively, reads every file whose name ends in
    ``.txt`` and collects each distinct token made of one or two digits,
    two or more capital letters and one or more digits. Every token is
    split with ``TrinomialManage.parse_trinomial``; tokens that start with
    ``0`` or whose state number is outside 1-50 are discarded. A progress
    line is printed for every row that is kept.

    Args:
        doc_dir: Root directory to search.

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename`` (base name of
        the file, not its full path), ``pos_trinomial`` (the matched
        token), ``state_num`` (the parsed ``state`` part as ``int``),
        ``region_abbr`` (the parsed ``county`` part) and ``site_number``
        (the parsed ``site`` part). There is one row per distinct token
        per file, in order of first appearance within the file.
    """
    columns = [
        'filename',
        'pos_trinomial',
        'state_num',
        'region_abbr',
        'site_number',
    ]
    # Compiled once up front; no capture groups, so findall() yields the
    # matched strings themselves rather than tuples of groups.
    trinomial_re = re.compile(r'\b[0-9]{1,2}[A-Z]{2,}[0-9]{1,}\b')

    tri_man = TrinomialManage()
    # NOTE(review): presumably makes the parser strip leading zeros from
    # the parsed parts -- confirm against TrinomialManage.
    tri_man.remove_prepended_zeros = True

    # Rows are gathered in a plain list and turned into a DataFrame once
    # at the end; enlarging a DataFrame row by row copies it every time.
    rows = []
    for subdir, _dirs, files in os.walk(doc_dir):
        for filename in files:
            if not filename.endswith('.txt'):
                continue
            filepath = os.path.join(subdir, filename)
            with open(filepath, 'r') as file_obj:
                content = file_obj.read()

            # De-duplicate per file while keeping first-seen order, so the
            # resulting rows come out in a reproducible order.
            seen = set()
            for trinomial in trinomial_re.findall(content):
                if trinomial in seen:
                    continue
                seen.add(trinomial)

                if trinomial.startswith('0'):
                    # not a trinomial
                    continue
                tri_parts = tri_man.parse_trinomial(trinomial)
                state = int(tri_parts['state'])
                if state < 1 or state > 50:
                    # not a state, skip
                    continue
                rows.append([
                    filename,
                    trinomial,
                    state,
                    tri_parts['county'],
                    tri_parts['site'],
                ])
                # len(rows) is the 1-based running count of rows found.
                print('[{}] Found {} in {} ({}, {}, {})'.format(
                        len(rows),
                        trinomial,
                        filename,
                        state,
                        tri_parts['county'],
                        tri_parts['site'],
                    )
                )

    # dtype=object stores the values as the plain Python objects collected
    # above instead of letting pandas infer a dtype per column.
    return pd.DataFrame(rows, columns=columns, dtype=object)