Example #1
def config_builder(tmp):
    '''
    CLI to generate a config file/dict
    Inputs
        - tmp (str or dict): name of a template, or a template dictionary object
    Output
        - dict: field names and fill values for the form, as key-value pairs
    '''
    template = get_template(tmp) if type(tmp) is str else tmp
    user_input = {}

    print("\n######\n## Form builder\n######\n")
    for key, attrs in template['fields'].items():

        #query form - case_no excludes all else
        if 'case_no' in user_input and tmp == 'query':
            break

        pstring = f"\nEnter {key}"
        pstring += f" ({attrs['help']})" if attrs.get('help') else ''

        # Create choice variable if choice present
        var_type = click.Choice(
            attrs['choices']) if attrs.get('choices') else attrs['type']
        # Get the user input
        val = click.prompt(pstring,
                           default=attrs.get('default'),
                           type=var_type,
                           show_choices=True)

        if val:
            user_input[key] = val

    while True:
        if click.confirm("\nDo you want to save this configuration?"):
            fpath = click.prompt("Filepath (.json)")
            fpath = std_path(fpath).resolve()
            if fpath.exists():
                print('File already exists')
            else:
                with open(fpath, 'w+', encoding="utf-8") as wfile:
                    json.dump(user_input, wfile, indent=2)
                break
        else:
            break
    return user_input
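A minimal usage sketch (hypothetical; assumes a 'query' template is registered with get_template and that click is available):

# Hypothetical usage -- build a query config interactively, save it, and reuse the dict
config = config_builder('query')
print(config.get('case_no'))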
Example #2
def load_case(fpath, html=False, recap_orig=False):
    '''
    Loads the case given its filepath

    input:
        fpath (string/Path): a path relative to the project root
        html (bool): whether to return the html (only works for Pacer, not Recap)
        recap_orig (bool): whether to return the original recap file, rather than the mapped

    output:
        the json of the case (or html if html is True)
    '''
    # Standardise across Windows/OSX and make it a Path object
    fpath = std_path(fpath)

    # Absolute path to json file
    if settings.PROJECT_ROOT.name in fpath.parts:
        # Path already contains the project root, so treat it as absolute
        jpath = fpath
    else:
        jpath = settings.PROJECT_ROOT / fpath

    if html:
        hpath = get_pacer_html(jpath)
        if hpath:
            with open(settings.PROJECT_ROOT / hpath, 'r', encoding='utf-8') as rfile:
                return rfile.read()
        else:
            raise FileNotFoundError('HTML file not found')
    else:
        with open(jpath, encoding="utf-8") as rfile:
            jdata = json.load(rfile)
        jdata['case_id'] = ftools.clean_case_id(jdata['case_id'])

        # TODO: recap orig- include recap orig
        # if recap_orig:
        #     if 'recap' in jdata['source']:
        #         # recap_id = #This needs to be in case json
        #         return json.load(open(settings.RECAP_PATH/f"{recap_id}.json", encoding="utf-8"))

        return jdata
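A short usage sketch (the path shown is hypothetical; load_case resolves it against settings.PROJECT_ROOT):

# Hypothetical usage -- load the parsed json for a case, then its raw Pacer html
case = load_case('data/pacer/ilnd/json/1-16-cv-00001.json')   # hypothetical path
print(case['case_id'], case['filing_date'])
html_text = load_case('data/pacer/ilnd/json/1-16-cv-00001.json', html=True)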
Example #3
def generate_unique_filepaths(outfile=None, nrows=None):
    '''
    Create a list of unique filepaths for all case json in the PACER folder and export to .csv
    Inputs:
        - outfile (str or Path) - the output file name (.csv), relative to the project root; if None, no file is written
        - nrows (int) - no. of cases to use (for testing)
    Outputs:
        DataFrame of file metadata (also written to outfile if provided)
    '''
    import pandas as pd
    tqdm.pandas()

    case_jsons = [court_dir.glob('json/*.json') for court_dir in settings.PACER_PATH.glob('*')
                    if court_dir.is_dir()]

    file_iter = chain(*case_jsons)

    df = convert_filepaths_list(file_iter=file_iter, nrows=nrows)

    #Write the file
    if outfile:
        df.to_csv(std_path(outfile))

    return df
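Usage sketch (the output filename is hypothetical):

# Hypothetical usage -- index every case json under PACER_PATH and export the metadata
df = generate_unique_filepaths(outfile='data/unique_filepaths.csv')   # hypothetical outfile
sample_df = generate_unique_filepaths(nrows=100)   # small sample for testing, nothing written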
Example #4
def is_recap(fpath):
    '''Determine if a case is a recap case based on the filepath'''
    return 'recap' in std_path(fpath).parts
Example #5
def _clean_fpath_(x):
    p = std_path(x)
    if settings.PROJECT_ROOT.name in p.parts:
        return str(p.relative_to(settings.PROJECT_ROOT))
    else:
        return str(p)
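A quick illustration of the intended behaviour (paths and the project-root name are hypothetical):

# Hypothetical examples: paths inside the project tree become relative, others pass through
# _clean_fpath_('/home/user/project/data/pacer/ilnd/json/case.json') -> 'data/pacer/ilnd/json/case.json'
# _clean_fpath_('some/other/path/case.json')                         -> 'some/other/path/case.json'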
Example #6
def convert_filepaths_list(infile=None, outfile=None, file_iter=None, nrows=None):
    '''
    Convert the list of unique filepaths into a DataFrame with metadata and exports to csv

    Inputs:
        - infile (str or Path) - the input file, relative to the project root, expects csv with an 'fpath' column
        - outfile (str or Path) - the output file name (.csv) relative to the project root, if None doesn't write to file
        - file_iter (iterable) - list of filepaths, bypasses infile and reads list directly
        - nrows (int) - number of rows, if none then all
    Outputs:
        DataFrame of file metadata (also written to outfile if provided)
    '''

    # Map of keys to functions that extract their values (avoids keeping separate list of keys/property names)
    #c: case json, f: filepath
    dmap = {
        'court': lambda c,f: c['download_court'] if is_recap(f) else Path(f).parent.parent.name,
        'year': lambda c,f: c['filing_date'].split('/')[-1],
        'filing_date': lambda c,f: c['filing_date'],
        'terminating_date': lambda c,f: c.get('terminating_date'),
        'case_id': lambda c,f: ftools.clean_case_id(c['case_id']),
        'case_type': lambda c,f: c['case_type'],
        'nature_suit': lambda c,f: dei.nos_matcher(c['nature_suit'], short_hand=True) or '',
        'judge': lambda c,f: jf.clean_name(c.get('judge')),
        'recap': lambda c,f: 'recap' in c['source'],
        'is_multi': lambda c,f: c['is_multi'],
        'is_mdl': lambda c,f: c['is_mdl'],
        'mdl_code': lambda c,f: c['mdl_code'],
        'has_html': lambda c,f: 'pacer' in c['source'],
        'source': lambda c,f: c['source']

    }

    properties = list(dmap.keys())

    def get_properties(fpath):
        ''' Get the year, court and type for the case'''
        case = load_case(fpath)
        return tuple(dmap[key](case,fpath) for key in properties)

    # Load fpaths from list or else from infile
    if file_iter is not None:
        if nrows:
            paths = [next(file_iter) for _ in range(nrows)]
        else:
            paths = file_iter
        # Build dataframe of paths relative to PROJECT_ROOT
        df = pd.DataFrame(paths, columns=['fpath'])
    elif infile:
        # Read in text file of filepath names
        df = pd.read_csv(std_path(infile), names=['fpath'], nrows=nrows)
    else:
        raise ValueError("Must provide either 'infile' or 'file_iter'")

    # Convert filepath to relative format
    def _clean_fpath_(x):
        p = std_path(x)
        if settings.PROJECT_ROOT.name in p.parts:
            return str(p.relative_to(settings.PROJECT_ROOT))
        else:
            return str(p)

    df.fpath = df.fpath.apply(lambda x: _clean_fpath_(x))
    # Build the metadata columns from the case properties

    # Only do progress bar if it's more than 1
    if len(df) > 1:
        print('\nExtracting case properties...')
        properties_vector = df.fpath.progress_map(get_properties)
    else:
        properties_vector = df.fpath.map(get_properties)
    prop_cols = zip(*properties_vector)

    # Insert new columns, taking names from ordering of properties
    for i, new_col in enumerate(prop_cols):
        df[properties[i]] = new_col

    # Set UCID index
    df['ucid'] = ucid(df.court, df.case_id)  # previously passed series=True, not sure why that was here
    df = df.set_index('ucid')

    # Judge matching
    jmap = jf.unique_mapping(df.judge.unique())
    df.judge = df.judge.map(jmap)

    columns = properties.copy()
    columns.insert(2,'fpath')

    if outfile:
        df[columns].to_csv(std_path(outfile))
    return df[columns]
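Usage sketch (file names hypothetical):

# Hypothetical usage -- rebuild the metadata table from a previously exported list of paths
df = convert_filepaths_list(infile='data/unique_filepaths.csv',    # hypothetical infile
                            outfile='data/case_metadata.csv',      # hypothetical outfile
                            nrows=1000)
print(df.groupby('court').size())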
Example #7
def remap_recap_data(recap_fpath=None, rjdata=None):
    '''
    Remap a recap case file into the standard parsed-case format

    Inputs:
        - recap_fpath (str or Path): path to the recap json file (used if rjdata is not given)
        - rjdata (dict): the recap json data, if already loaded
    Output:
        - dict: the case data, keyed like a parsed Pacer case
    '''

    def standardize_date(tdate):
        '''y-m-d to m/d/y'''
        if not tdate:
            return None
        try:
            y,m,d = tdate.split('-')
            return '/'.join([m, d, y])
        except AttributeError:
            return None

    def get_recap_docket(court, docket_entries):
        '''
        Remap the recap docket
        Inputs:
            - court (str): the court abbreviation
            - docket_entries (list): the value from the 'docket_entries' key in recap
        Output:
            - list of docket entries same as parsed format
        '''

        def get_doc_links(row):
            ''' Get links to documents (most rows don't have attachments, some do)'''
            documents = {}
            for doc in row.get('recap_documents', []):

                # Recap encodes document_type=1 for line doc and document_type=2 for attachment
                if doc.get('document_type') == 1:
                    ind = 0
                elif doc.get('document_type') == 2 and doc.get('attachment_number', False):
                    ind = int(doc['attachment_number'])
                else:
                    # Fallback option, use doc_id
                    ind = f"_{doc['pacer_doc_id']}"

                document_data = {
                    'url': ftools.get_pacer_url(court,'doc_link') + '/' + str(doc['pacer_doc_id']), 'span': {},
                    **{f"recap_{k}": doc[k] for k in ('page_count','filepath_ia', 'filepath_local', 'description', 'is_available')}
                }
                documents[ind] = document_data
            return documents

        rows = [
            {'date_filed': standardize_date(row['date_filed']),
             'ind': row['entry_number'],
             'docket_text': row['description'],
             'documents': get_doc_links(row),
             'edges': None
            }
            for row in docket_entries
        ]
        return rows

    #Load the data
    try:
        if not rjdata:
            recap_fpath = std_path(recap_fpath)
            jpath = settings.PROJECT_ROOT / recap_fpath
            rjdata = json.load(open(jpath, encoding="utf-8"))
    except:
        print(f"Error loading file {recap_fpath}")
        return {}
    #Get the termination date
    tdate = standardize_date(rjdata['date_terminated'])
    case_status = 'closed' if tdate else 'open'

    # parties/counts
    case_type = rjdata['docket_number'].split('-')[1]
    is_cr = bool(case_type == 'cr')
    parties = {'plaintiff':{}, 'defendant':{}, 'bk_party':{}, 'other_party':{}, 'misc':{}}
    pending_counts, terminated_counts, complaints = ({},{},{}) if is_cr else (None,None,None)

    for party in rjdata['parties']:
        name = party['name']
        extra_info, terminating_date = parse_extra_info(party['extra_info'], from_recap=True)

        # lawyer dictionary
        lawyer_dict = {}
        is_pro_se = 'PRO SE' in str(party)
        if not is_pro_se and 'attorneys' in party.keys():
            for lawyer in party['attorneys']:
                is_lead = 'LEAD ATTORNEY' in str(lawyer)
                is_pro_hac = 'PRO HAC VICE' in str(lawyer)
                info = lawyer['contact_raw']
                office = info.split('\n')[0]
                addtl_info = {}
                if 'Designation' in info:
                    addtl_info['designation'] = re.search('Designation: ([A-Za-z \'\-]{1,100})', info).group(1)
                if any(x in info for x in ['Trial Bar Status', 'Trial bar Status']): # this seems to appear only in ILND
                    addtl_info['trial_bar_status'] = re.search('tatus: ([A-Za-z \'\-]{1,100})', info).group(1)
                elif 'Bar Status' in info:
                    addtl_info['bar_status'] = re.search('tatus: ([A-Za-z \'\-]{1,100})', info).group(1)
                lawyer_dict[lawyer['name']] = {'office':office,'is_lead_attorney':is_lead,'is_pro_hac_vice':is_pro_hac,'additional_info':addtl_info}

        # role titles, terminating date, extra info
        with open(settings.ROLE_MAPPINGS, 'r') as f:
            mappings = json.load(f)
        for pt in party['party_types']:
            role = pt['name']

            # sometimes these fields vary from party name to party name
            local_extra_info, local_terminating_date = parse_extra_info(pt['extra_info'], from_recap=True)
            extra_info = extra_info+local_extra_info if extra_info and local_extra_info and (
                local_extra_info[0] not in extra_info) else local_extra_info or extra_info
            date_delta = difference_in_dates(local_terminating_date, terminating_date)
            # Keep whichever terminating date is later
            terminating_date = local_terminating_date if date_delta and date_delta>0 else terminating_date

            if role not in mappings.keys():
                party_title = role
                party_type = 'misc'
            else:
                party_title = mappings[role]['title']
                party_type = mappings[role]['type']
            dicti = {name: {'roles':[party_title], 'counsel':(lawyer_dict or None), 'is_pro_se':is_pro_se, 'terminating_date':terminating_date, 'extra_info':extra_info}}
            parties[party_type] = update_party(parties[party_type], name, dicti)

        # criminal counts
        if is_cr:
            criminal_counts = [count for pt in party['party_types'] for count in pt['criminal_counts']]
            pending, terminated = [], []
            for cc in criminal_counts:
                cc_parsed = [cc['name'], cc['disposition']]
                if 'dismissed' in cc['disposition'].lower(): # pretty coarse heuristic, maybe update later
                    terminated.append(cc_parsed)
                else:
                    pending.append(cc_parsed)
            if len(pending) > 0:
                pending_counts[name] = pending
            if len(terminated) > 0:
                terminated_counts[name] = terminated

    # Convert the data
    fdata = {
        'bankruptcy_parties':parties['bk_party'],
        'case_flags': '',
        'case_id': ftools.clean_case_id(rjdata['docket_number']),
        'case_name': rjdata['case_name'],
        'case_status': case_status,
        'case_type': case_type,
        'cause': rjdata['cause'],
        'complaints': complaints,
        'defendants': parties['defendant'],
        'docket': get_recap_docket(rjdata['court'], rjdata['docket_entries']) ,
        'download_court': rjdata['court'],
        'filing_date': standardize_date(rjdata['date_filed']),
        'header_case_id':None,
        'judge': rjdata['assigned_to_str'],
        'jurisdiction': rjdata['jurisdiction_type'],
        'jury_demand': rjdata['jury_demand'],
        'lead_case_id':None,
        'misc_participants':parties['misc'],
        'monetary_demand':None,
        'nature_suit': rjdata['nature_of_suit'],
        'other_court':None,
        'other_parties':parties['other_party'],
        'pacer_case_id':rjdata['pacer_case_id'],
        'pending_counts':pending_counts,
        'plaintiffs': parties['plaintiff'],
        'referred_judge': rjdata['referred_to_str'],
        'terminated_counts':terminated_counts,
        'terminating_date': tdate,
        'source':'recap',
        'ucid': ucid(rjdata['court'], ftools.clean_case_id(rjdata['docket_number'])),
        # MDL/Multi keys
        **{k:None for k in ['mdl_code', 'mdl_id_source','is_mdl', 'is_multi']},
        # Billing keys
        **{k:None for k in ['billable_pages', 'cost','n_docket_reports',]},
        # Scraper things
        **{k:None for k in ['download_timestamp', 'download_url', 'docket_available', 'member_case_key']}
    }
    return fdata
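Usage sketch (the recap path is hypothetical; it is resolved against settings.PROJECT_ROOT):

# Hypothetical usage -- normalise a recap docket into the parsed Pacer case format
case = remap_recap_data('data/recap/12345.json')   # hypothetical path
print(case['ucid'], case['case_status'], len(case['docket']))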
Example #8
def docket_aggregator(fpaths, outfile=None):
    '''
    Build a docket report from multiple dockets for same case, outputs new html(dl)

    Inputs:
        - fpaths (list): a list of paths to docket htmls (in chronological order)
            the order supplied will be order of table in output (uses last one as base docket)
        - outfile (str or Path): output html file path
    Output:
        - soup (bs4 object) - the aggregated html as a soup object
        - extra (dict): a dictionary of extra data
    '''
    from bs4 import BeautifulSoup

    def _hash_row(tr):
        ''' Create a hash from the first 20 characters of each cell (date, #, docket_text)'''
        val = ''
        for cell in tr.select('td'):
            val += cell.text[:20]
        return hash(val)

    rows = []

    # Extra data to be returned (from non-htmls)
    extra = {
        'recap_docket': [],
    }

    for fpath in fpaths:
        fpath = Path(fpath)
        if fpath.suffix == '.html':

            with open(fpath, 'r', encoding='utf-8') as rfile:
                soup = BeautifulSoup(rfile.read(), "html.parser")

            tables = soup.select('table')
            docket_table = None
            if len(tables) >= 2:
                docket_table = tables[-2]
                if dei.is_docket_table(docket_table):
                    rows.extend(docket_table.select('tr')[1:])

        # Assuming json implies recap
        elif fpath.suffix == '.json':
            # Grab the recap docketlines (cribbing the relevant code from remap_recap_data so we don't need to call that function)
            try:
                recap_fpath = std_path(fpath)
                jpath = settings.PROJECT_ROOT / recap_fpath
                rjdata = json.load(open(jpath, encoding="utf-8"))
                extra['recap_docket'].extend(
                    dtools.get_recap_docket(rjdata['court'],
                                            rjdata['docket_entries']))
                extra['recap_id'] = rjdata['id'] or None
            except:
                print(f"Error loading file {recap_fpath}")
                extra['recap_id'] = None
            # rjdata = dtools.remap_recap_data(fpath)
            # extra['recap_docket'].extend(rjdata['docket'])
            # extra['recap_id'] = rjdata.get('recap_id')

    # Check whether an empty docket table needs to be created (otherwise the newest docket's table is used as the base)
    if not dei.is_docket_table(docket_table):
        replace_tag, after_tag = None, None
        docket_table = build_empty_docket_table(soup)
        # Find "There are proceedings" text
        try:
            replace_tag = soup.select('h2[align="center"]')[-1]
        except:
            try:
                # Fallback: use last horizontal line
                replace_tag = soup.select('hr')[-1]
            except:
                # Double-fallback: insert at the very bottom
                after_tag = soup.select('table')[-1]
        if replace_tag:
            replace_tag.replace_with(docket_table)
        else:
            after_tag.insert_after(docket_table)

    # Insert all the rows
    header_row = docket_table.select_one('tr')
    docket_table.clear()
    docket_table.append(header_row)

    hashes = []

    for row in rows:
        rhash = _hash_row(row)
        if rhash not in hashes:
            docket_table.append(row)
            hashes.append(rhash)

    if outfile:
        with open(outfile, 'w', encoding="utf-8") as wfile:
            wfile.write(str(soup))

    return soup, extra
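Usage sketch (paths hypothetical; supply the dockets in chronological order so the last html serves as the base):

# Hypothetical usage -- merge an older html docket, a recap json and the latest html docket
soup, extra = docket_aggregator(
    ['data/pacer/ilnd/html/old_docket.html',     # hypothetical paths
     'data/recap/12345.json',
     'data/pacer/ilnd/html/new_docket.html'],
    outfile='aggregated_docket.html')
print(extra.get('recap_id'), len(extra['recap_docket']))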