def build_seal_idx():
    '''
    Build a text file with a list of indexes (from the unique files df)
    of cases that contain the words 'seal/redact/protective/restricted'
    '''
    seal_idx = []
    dff = dt.load_unique_files_df()

    for i, row in tqdm(dff.iterrows(), total=dff.shape[0]):
        case = dt.load_case(row.fpath)
        if dof.find_pattern(case.get('docket', []), RE_WIDE_NET, rlim=RLIM):
            seal_idx.append(i)
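    # Hedged sketch of the write-out step the docstring describes: one index
    # per line. SEAL_IDX_FILE is a hypothetical module-level constant, not
    # confirmed by the original.
    with open(SEAL_IDX_FILE, 'w', encoding='utf-8') as wfile:
        wfile.write('\n'.join(str(idx) for idx in seal_idx))
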
def load_sealed_df(file):
    ''' Load an output file from this script'''
    df = pd.read_csv(file)
    for col in ('is_multi', 'is_mdl'):
        df[col] = df[col].fillna(False)

    # Deal with binary variables stored as 0/1
    bool_cols = list(df.columns[df.columns.get_loc('seal_motion'):])
    df = df.astype({col: bool for col in bool_cols})

    dff = dt.load_unique_files_df()
    df.insert(6, 'source', df.ucid.map(dff.source))
    return df
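
# Example usage (a hedged sketch; 'sealed_data.csv' is a hypothetical output
# file previously written by main() below):
#
#   sealed_df = load_sealed_df('sealed_data.csv')
#   sealed_df.groupby('court')['seal_motion'].mean()
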
def main(outfile, sample_n, year_inp, court_inp, nos_inp, allow_non_matches):
    '''
    Process all of the courts to build dataset
    '''
    # Load the unique files table
    files_df = dt.load_unique_files_df()

    # Filter to the relevant "seal" cases
    files_df = filter_cases_seal(files_df).copy()

    if year_inp:
        files_df = files_df[files_df.year==year_inp].copy()
        print(f'Running only on cases from {year_inp}, reduced dataset to {len(files_df):,} cases')

    if court_inp:
        files_df = files_df[files_df.court==court_inp].copy()
        print(f'Running only on cases from {court_inp}, reduced dataset to {len(files_df):,} cases')

    if nos_inp:
        files_df = files_df[files_df.nature_suit.fillna('').str.startswith( str(nos_inp) )].copy()
        print(f'Running only on cases with nature of suit {nos_inp}, reduced dataset to {len(files_df):,} cases')

    # If sample size specified, run on random subset
    if sample_n:
        files_df = files_df.sample(sample_n).copy()
        print(f'Running on random subset of size {sample_n:,}')

    print(f'Processing {len(files_df):,} cases...\n')

    # Build the csv file line-by-line
    out_file_name = outfile
    col_names = ['court', 'judge', 'case_id', 'ucid', 'line_ind','fpath', 'case_type',
                 'nature_suit','text', 'date_docket_line', 'days_from_filing',
                 'is_multi','is_mdl','mdl_code', *pats.keys()]
    w_count = 0  # Keep count of lines written

    with open(out_file_name, 'w', encoding="utf-8", newline='') as wfile:
        writer = csv.writer(wfile)
        writer.writerow(col_names)

        # Iterate through all relevant files
        for i, row in tqdm(files_df.iterrows(), total=len(files_df), desc="Files Processed"):

            case = dt.load_case(row.fpath)
            if 'docket' not in case:
                continue

            # Skip empty dockets and malformed ones (a nested list where
            # dict-like entries are expected)
            if not case['docket'] or isinstance(case['docket'][0], list):
                tqdm.write(str(i))
                continue
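
# Example invocation (a hedged sketch; every argument value below is
# hypothetical, chosen only to illustrate the signature):
#
#   main('sealed_docket_lines.csv', sample_n=None, year_inp=2016,
#        court_inp=None, nos_inp=830, allow_non_matches=False)
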
def build_df_dur(df, year=2016):
    '''
    Build table with relevant case duration data (need to open each case to verify latest date)
    Inputs:
        - df (pd.DataFrame): the main docketline level dataframe of sealed data
        - year (int): restrict to cases filed in the given year
    '''

    dff = dtools.load_unique_files_df()
    # Get the subset of cases from unique files table that are patent cases
    cases_pat = dff[dff.nature_suit.eq(PATENT_NOS) & dff.year.eq(year)
                    & ~dff.is_multi.eq(True)].copy()
    cases_pat['is_txed'] = cases_pat.court.eq('txed')

    duration = []
    for ucid, row in tqdm(cases_pat.iterrows(), total=cases_pat.shape[0]):
        case = dtools.load_case(row.fpath)
        if not case.get('docket'):
            continue
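        # Hedged sketch of the elided remainder: take the latest parseable
        # docket-entry date as the end of case activity and measure duration
        # from the filing date. The 'date' key on docket entries is an
        # assumption, not confirmed by the original.
        entry_dates = pd.to_datetime(
            [entry.get('date') for entry in case['docket']], errors='coerce')
        if entry_dates.isna().all():
            continue
        latest = entry_dates.max()
        filed = pd.to_datetime(row.filing_date)
        duration.append({'ucid': ucid, 'is_txed': row.is_txed,
                         'days': (latest - filed).days})

    return pd.DataFrame(duration)
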
def idb_merge(idb_data_file,
              case_type,
              preloaded_idb_data_file=None,
              dframe=None):
    '''
    Merge dataframe of cases with idb data

    Inputs
        - idb_data_file (str or Path): the idb csv file to use e.g. 'cv10to19.csv'
        - case_type (str): the case type ('cv' or 'cr') of the cases in the idb file provided
        - preloaded_idb_data_file (DataFrame): specify a preloaded IDB dataframe, e.g. if the consumer has already called load_idb_csv
        - dframe (DataFrame): specify table of case files, instead of using all of unique files table
    Outputs
        - final (DataFrame): the merged table
        - match_rate (float): the fraction of the original case files matched against the IDB
    '''
    if dframe is None:
        dff = dtools.load_unique_files_df()
        dff = dff[dff.case_type.eq(case_type)].copy()
    else:
        dff = dframe.copy()
    N = dff.shape[0]
    print(f"\n{N:,} SCALES cases provided")

    # Make sure there's a ucid column
    dff.reset_index(inplace=True)
    dff['ucid_copy'] = dff['ucid'].copy()

    if preloaded_idb_data_file is not None:
        df_idb = preloaded_idb_data_file
    else:
        print(f'Loading idb file: {idb_data_file}...')
        df_idb = load_idb_csv(idb_data_file,
                              case_type=case_type,
                              cols=BARE_MIN_COLS)
    df_idb.sort_values(['ucid', 'filedate'], inplace=True)
    df_idb.drop_duplicates('ucid', keep='first', inplace=True)

    # Stage 1 (matching on ucid)
    print(f'STAGE 1: matching on ucid...')
    matched_mask = dff.ucid.isin(df_idb.ucid)
    matched_ucids = dff.ucid[matched_mask]
    keepcols = [
        'fpath', 'case_type', 'filing_date', 'terminating_date', 'source',
        *[x.lower() for x in BARE_MIN_COLS]
    ]
    # *[x.lower() for x in get_recap_idb_cols(case_type)] ]
    if 'nos_subtype' in dff.columns:
        keepcols.append('nos_subtype')

    # Make table of data merged on ucid
    print(f'STAGE 1: merging...')
    merged_ucid = dff[matched_mask].merge(df_idb, how='inner', left_on='ucid', right_on='ucid')\
        .set_index('ucid_copy')[keepcols]
    print(
        f'STAGE 1: {{matched:{sum(matched_mask):,}, unmatched:{sum(~matched_mask):,} }}'
    )

    # Reduce dff to unmatched
    dff = dff[~matched_mask].copy()
    # Create weak ucid
    dff['ucid_weak'] = dtools.get_ucid_weak(dff.ucid)

    # Remove matched from df_idb and reduce to weak_ucid match
    print(f'STAGE 2: matching on weak_ucid...')
    df_idb = df_idb[~df_idb.ucid.isin(matched_ucids)
                    & df_idb.ucid_weak.isin(dff.ucid_weak)]

    # Stage 2 (matching on ucid_weak and filing date)
    print(f'STAGE 2: merging...')
    merged_weak = dff.merge(df_idb, how="inner", left_on=['ucid_weak','filing_date'],
                             right_on=['ucid_weak', 'filedate'])\
                             .set_index('ucid_copy')[keepcols]
    matched_stage2 = merged_weak.shape[0]
    print(
        f"STAGE 2 {{matched:{matched_stage2:,}, unmatched:{sum(~matched_mask) -matched_stage2 :,} }}"
    )

    final = pd.concat([merged_ucid, merged_weak])
    del dff, df_idb

    match_rate = final.shape[0] / N
    print(f"Overall match rate: {match_rate :.2%}")

    return final, match_rate
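
# Example usage (a hedged sketch; 'cv10to19.csv' is the example file named in
# the docstring):
#
#   merged, rate = idb_merge('cv10to19.csv', case_type='cv')
#   print(f'{rate:.2%} of SCALES civil cases matched against the IDB')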