Example #1
def make_ucid_and_weak(docket, office, district, case_type):
    '''
    Make the ucid and weak_ucid from IDB data. Can take str values for a single row,
    or Series inputs, in which case the outputs are also Series.

    Inputs:
        - docket (str or Series): idb 'docket' looks like 1600123 for year=16, case_no=00123
        - office (str or Series): idb office #
        - district (str or Series): court abbreviation e.g. 'ilnd'
        - case_type (str): 'cv' or 'cr'

    Outputs:
        - ucid (str or Series): looks like 'ilnd;;1:16-cv-00123'
        - ucid_weak (str or Series): ucid with office removed, looks like 'ilnd;;16-cv-00123'
    '''
    # Find the ucid and weak_ucid
    if isinstance(docket, pd.Series):
        case_year = docket.str.slice(0, 2)
        case_no = docket.str.slice(2)
    else:
        case_year = docket[:2]
        case_no = docket[2:]

    ucid = dtools.ucid_from_scratch(district, office, case_year, case_type,
                                    case_no)
    ucid_weak = dtools.get_ucid_weak(ucid)
    return ucid, ucid_weak
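
A minimal usage sketch (not from the original module), assuming pandas is imported as pd and that dtools from the source repo is importable; the expected outputs follow the formats described in the docstring.

import pandas as pd

# Hypothetical single-row usage (values are illustrative, not from the source)
ucid, ucid_weak = make_ucid_and_weak('1600123', '1', 'ilnd', 'cv')
# ucid      -> 'ilnd;;1:16-cv-00123'   (per the docstring format)
# ucid_weak -> 'ilnd;;16-cv-00123'     (office prefix removed)

# Vectorised usage: pass Series in, get Series back
dockets = pd.Series(['1600123', '1700045'])
offices = pd.Series(['1', '3'])
courts = pd.Series(['ilnd', 'cand'])
ucids, ucids_weak = make_ucid_and_weak(dockets, offices, courts, 'cv')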
Example #2
def idb_merge(idb_data_file,
              case_type,
              preloaded_idb_data_file=None,
              dframe=None):
    '''
    Merge dataframe of cases with idb data

    Inputs:
        - idb_data_file (str or Path): the idb csv file to use e.g. 'cv10to19.csv'
        - case_type (str): the case type ('cv' or 'cr') of the cases in the idb file provided
        - preloaded_idb_data_file (DataFrame): a preloaded IDB dataframe, e.g. if the caller has already run load_idb_csv
        - dframe (DataFrame): a table of case files to merge, instead of using the full unique files table
    Outputs:
        - final (DataFrame): the merged table
        - match_rate (float): the proportion of original case files that matched against the IDB
    '''
    if dframe is None:
        dff = dtools.load_unique_files_df()
        dff = dff[dff.case_type.eq(case_type)].copy()
    else:
        dff = dframe.copy()
    N = dff.shape[0]
    print(f"\n{N:,} SCALES cases provided")

    # Make ucid available as a column (it may be the index) and keep a copy to re-index on after merging
    dff.reset_index(inplace=True)
    dff['ucid_copy'] = dff['ucid'].copy()

    if preloaded_idb_data_file is not None:
        df_idb = preloaded_idb_data_file
    else:
        print(f'Loading idb file: {idb_data_file}...')
        df_idb = load_idb_csv(idb_data_file,
                              case_type=case_type,
                              cols=BARE_MIN_COLS)
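    # Keep only the earliest-filed IDB row per ucid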
    df_idb.sort_values(['ucid', 'filedate'], inplace=True)
    df_idb.drop_duplicates('ucid', keep='first', inplace=True)

    # Stage 1 (matching on ucid)
    print('STAGE 1: matching on ucid...')
    matched_mask = dff.ucid.isin(df_idb.ucid)
    matched_ucids = dff.ucid[matched_mask]
    keepcols = [
        'fpath', 'case_type', 'filing_date', 'terminating_date', 'source',
        *[x.lower() for x in BARE_MIN_COLS]
    ]
    # *[x.lower() for x in get_recap_idb_cols(case_type)] ]
    if 'nos_subtype' in dff.columns:
        keepcols.append('nos_subtype')

    # Make table of data merged on ucid
    print('STAGE 1: merging...')
    merged_ucid = dff[matched_mask].merge(df_idb, how='inner', left_on='ucid', right_on='ucid')\
        .set_index('ucid_copy')[keepcols]
    print(
        f'STAGE 1: {{matched:{sum(matched_mask):,}, unmatched:{sum(~matched_mask):,} }}'
    )

    # Reduce dff to unmatched
    dff = dff[~matched_mask].copy()
    # Create weak ucid
    dff['ucid_weak'] = dtools.get_ucid_weak(dff.ucid)

    # Remove matched from df_idb and reduce to weak_ucid match
    print('STAGE 2: matching on weak_ucid...')
    df_idb = df_idb[~df_idb.ucid.isin(matched_ucids)
                    & df_idb.ucid_weak.isin(dff.ucid_weak)]

    # Stage 2 (matching on ucid_weak and filing date)
    print('STAGE 2: merging...')
    merged_weak = dff.merge(df_idb, how="inner", left_on=['ucid_weak','filing_date'],
                             right_on=['ucid_weak', 'filedate'])\
                             .set_index('ucid_copy')[keepcols]
    matched_stage2 = merged_weak.shape[0]
    print(
        f"STAGE 2: {{matched:{matched_stage2:,}, unmatched:{sum(~matched_mask) - matched_stage2:,} }}"
    )

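    # Combine the Stage 1 and Stage 2 matches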
    final = pd.concat([merged_ucid, merged_weak])
    del dff, df_idb

    match_rate = final.shape[0] / N
    print(f"Overall match rate: {match_rate :.2%}")

    return final, match_rate
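
A minimal usage sketch (the csv name comes from the docstring example; not from the original module), showing both the default path and reuse of a preloaded IDB table.

# Merge the SCALES unique-files table against the civil IDB extract
final, match_rate = idb_merge('cv10to19.csv', case_type='cv')

# If the IDB csv is already loaded, pass it in to skip re-parsing (hypothetical reuse)
# df_idb = load_idb_csv('cv10to19.csv', case_type='cv', cols=BARE_MIN_COLS)
# final, match_rate = idb_merge('cv10to19.csv', 'cv', preloaded_idb_data_file=df_idb)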
Example #3
def split_txt(old_file,
              out_dir,
              case_type,
              year_lb=0,
              nrows=None,
              year_var='DOCKET'):
    '''
    Cut one of the large .txt tab-delimited IDB datasets into multiple csv files, by year.

    Inputs:
        - old_file (str or Path): the .txt file to be split
        - out_dir (str or Path): the output directory for the new csv files
        - case_type ('cv' or 'cr')
        - year_lb (int): lower bound on year; rows with a filedate year below this are skipped
        - nrows (int): max number of rows to write (for testing small samples)
        - year_var ('DOCKET' or 'FILEDATE'): which IDB variable to take the split year from
    '''

    # Create directory if it doesn't exist
    out_dir = Path(out_dir).resolve()
    if not out_dir.exists():
        out_dir.mkdir()

    with open(old_file, 'r', encoding='ISO-8859-1') as rfile:
        # Get the column headers from the first line
        columns = rfile.readline().rstrip('\n').split('\t')
        ind_filedate = columns.index('FILEDATE')
        write_count = 0

        # Session dictionary to map year to open csv writers
        session = {}

        # Iterate line-by-line rather than loading the whole (large) file into memory
        for line in rfile:
            # Extract the data in the line
            row = line.rstrip('\n').split('\t')
            if len(row) != len(columns):
                # Error, skip row
                continue

            # Filter by year lower bound
            file_year = int(row[ind_filedate].split('/')[-1])
            if file_year < year_lb:
                continue

            if year_var == 'FILEDATE':
                split_year = file_year

            elif year_var == 'DOCKET':
                # Use the year from the DOCKET variable e.g. 1600001 -> 16
                ind_docket = columns.index('DOCKET')
                split_year = row[ind_docket][:2]
            else:
                raise ValueError("`year_var` must be in ('FILEDATE','DOCKET')")

            # Check whether we already have a csv writer for this year; if not, open one
            if split_year not in session:
                filepath = out_dir / f"{case_type}{split_year}.csv"
                session[split_year] = {
                    'file': open(filepath, 'w', encoding="utf-8", newline='\n')
                }
                session[split_year]['writer'] = csv.writer(
                    session[split_year]['file'])
                # Write the header row for this new file
                session[split_year]['writer'].writerow(
                    ['ucid', 'ucid_weak', *columns])

            # Find the ucid and weak_ucid
            data = {
                k: row[columns.index(k)]
                for k in ['DOCKET', 'DISTRICT', 'OFFICE']
            }
            case_year = data['DOCKET'][:2]
            case_no = data['DOCKET'][2:]
            court = IDB_COLS['DISTRICT']['conv'](data['DISTRICT'])
            ucid = dtools.ucid_from_scratch(court, data['OFFICE'], case_year,
                                            case_type, case_no)
            ucid_weak = dtools.get_ucid_weak(ucid)

            # Write the new row, which is (ucid, ucid_weak, <<original row data>>)
            session[split_year]['writer'].writerow([ucid, ucid_weak, *row])

            write_count += 1
            if nrows:
                if write_count >= nrows:
                    break

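    # Close all the per-year output files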
    for v in session.values():
        v['file'].close()
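
A minimal usage sketch (the input file name and output directory are assumptions): split a raw tab-delimited IDB export into per-year csv files keyed on the DOCKET year.

# Hypothetical file names; writes files like idb_by_year/cv16.csv whose header is
# ['ucid', 'ucid_weak', *original IDB columns]
split_txt('cv10to19_raw.txt', out_dir='idb_by_year', case_type='cv', year_var='DOCKET')

# For a quick test run, cap the number of rows written
split_txt('cv10to19_raw.txt', 'idb_by_year', 'cv', nrows=1000)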