def build_seal_idx():
    '''
    Build a text file with a list of indexes (from unique files df) of cases
    that contain the words 'seal/redact/protective/restricted'
    '''
    seal_idx = []
    dff = dt.load_unique_files_df()
    for i, row in tqdm(dff.iterrows(), total=dff.shape[0]):
        case = dt.load_case(row.fpath)
        if dof.find_pattern(case.get('docket', []), RE_WIDE_NET, rlim=RLIM):
            seal_idx.append(i)

    # The tail of this function was truncated in source; a minimal completion,
    # assuming one index per line ('seal_idx.txt' is a hypothetical filename):
    with open('seal_idx.txt', 'w', encoding='utf-8') as wfile:
        wfile.writelines(f'{idx}\n' for idx in seal_idx)
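# Usage sketch (illustrative, not from the original script): read back the
# hypothetical 'seal_idx.txt' written above and slice the unique files table
# down to the flagged cases.
def _example_load_seal_idx(seal_idx_file='seal_idx.txt'):
    ''' Illustrative only: recover the cases flagged by build_seal_idx '''
    with open(seal_idx_file, encoding='utf-8') as rfile:
        idxs = [line.strip() for line in rfile if line.strip()]
    dff = dt.load_unique_files_df()
    return dff.loc[idxs]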
def load_sealed_df(file):
    ''' Load an output file from this script '''
    df = pd.read_csv(file)
    for col in ('is_multi', 'is_mdl'):
        df[col] = df[col].fillna(False)

    # Deal with binary variables stored as 0/1
    bool_cols = list(df.columns[df.columns.get_loc('seal_motion'):])
    df = df.astype({col: bool for col in bool_cols})

    dff = dt.load_unique_files_df()
    df.insert(6, 'source', df.ucid.map(dff.source))
    return df
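# Usage sketch (illustrative): load a previously written output file and
# tabulate sealing activity by court. The filename and the 'seal_motion'
# column are assumptions based on the code above.
def _example_sealed_summary(path='sealed_data.csv'):
    ''' Illustrative only: share of flagged docket lines per court '''
    df = load_sealed_df(path)
    return df.groupby('court')['seal_motion'].mean().sort_values(ascending=False)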
def main(outfile, sample_n, year_inp, court_inp, nos_inp, allow_non_matches):
    ''' Process all of the courts to build the dataset '''

    # Gather the filepaths csv
    files_df = dt.load_unique_files_df()

    # Filter down to the relevant "seal" cases
    files_df = filter_cases_seal(files_df).copy()

    if year_inp:
        files_df = files_df[files_df.year == year_inp].copy()
        print(f'Running only on cases from {year_inp}, reduced dataset to {len(files_df):,} cases')

    if court_inp:
        files_df = files_df[files_df.court == court_inp].copy()
        print(f'Running only on cases from {court_inp}, reduced dataset to {len(files_df):,} cases')

    if nos_inp:
        files_df = files_df[files_df.nature_suit.fillna('').str.startswith(str(nos_inp))].copy()
        print(f'Running only on cases with nature of suit {nos_inp}, reduced dataset to {len(files_df):,} cases')

    # If a sample size is specified, run on a random subset
    if sample_n:
        files_df = files_df.sample(sample_n).copy()
        print(f'Running on random subset of size {sample_n:,}')

    print(f'Processing {len(files_df):,} cases...\n')

    # Build the csv file line-by-line
    out_file_name = outfile
    col_names = ['court', 'judge', 'case_id', 'ucid', 'line_ind', 'fpath',
                 'case_type', 'nature_suit', 'text', 'date_docket_line',
                 'days_from_filing', 'is_multi', 'is_mdl', 'mdl_code', *pats.keys()]

    w_count = 0  # Keep count of lines written
    with open(out_file_name, 'w+', encoding="utf-8") as wfile:
        writer = csv.writer(wfile)
        writer.writerow(col_names)

        # Iterate through all relevant files
        for i, row in tqdm(files_df.iterrows(), total=len(files_df), desc="Files Processed"):
            case = dt.load_case(row.fpath)

            if 'docket' not in case.keys():
                continue
            # Skip cases where the docket is a list of lists (e.g. consolidated dockets)
            if type(case['docket'][0]) == list:
                tqdm.write(str(i))
                continue
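# Invocation sketch (illustrative): how main might be called directly, e.g. to
# build the dataset for 2016 patent cases (NOS 830) in a single court. All
# argument values here are assumptions; the real script presumably parses them
# from the command line.
# main(outfile='sealed_2016_ilnd.csv', sample_n=None, year_inp=2016,
#      court_inp='ilnd', nos_inp=830, allow_non_matches=False)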
def build_df_dur(df, year=2016):
    '''
    Build table with relevant case duration data
    (need to open each case to verify the latest date)

    Inputs:
        - df (pd.DataFrame): the main docket-line-level dataframe of sealed data
        - year (int): the filing year to restrict to
    '''
    dff = dtools.load_unique_files_df()

    # Get the subset of cases from the unique files table that are patent cases
    cases_pat = dff[dff.nature_suit.eq(PATENT_NOS) & dff.year.eq(year) & ~dff.is_multi.eq(True)].copy()
    cases_pat['is_txed'] = cases_pat.court.eq('txed')

    duration = []
    for ucid, row in tqdm(cases_pat.iterrows(), total=cases_pat.shape[0]):
        case = dtools.load_case(row.fpath)
        if not case.get('docket'):
            continue
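# Duration sketch (illustrative): the truncated loop above presumably collects
# per-case durations. This helper shows one way to compute a duration in days
# for a single loaded case, assuming docket entries carry a 'date_filed' field
# and the case JSON has a 'filing_date'; both field names are assumptions.
def _example_case_duration_days(case):
    ''' Illustrative only: days from filing to the latest docket entry '''
    dates = pd.to_datetime([entry.get('date_filed') for entry in case['docket']], errors='coerce')
    filed = pd.to_datetime(case.get('filing_date'))
    return (dates.max() - filed).days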
def idb_merge(idb_data_file, case_type, preloaded_idb_data_file=None, dframe=None):
    '''
    Merge dataframe of cases with idb data

    Inputs:
        - idb_data_file (str or Path): the idb csv file to use e.g. 'cv10to19.csv'
        - case_type (str): the case type ('cv' or 'cr') of the cases in the idb file provided
        - preloaded_idb_data_file (DataFrame): a preloaded IDB dataframe,
            e.g. if the consumer has already called load_idb_csv
        - dframe (DataFrame): a table of case files to use instead of the full unique files table
    Outputs:
        - final (DataFrame): the merged table
        - match_rate (float): the proportion of original casefiles matched against idb
    '''
    if dframe is None:
        dff = dtools.load_unique_files_df()
        dff = dff[dff.case_type.eq(case_type)].copy()
    else:
        dff = dframe.copy()

    N = dff.shape[0]
    print(f"\n{N:,} SCALES cases provided")

    # Make sure there's a ucid column
    dff.reset_index(inplace=True)
    dff['ucid_copy'] = dff['ucid'].copy()

    if preloaded_idb_data_file is not None:
        df_idb = preloaded_idb_data_file
    else:
        print(f'Loading idb file: {idb_data_file}...')
        df_idb = load_idb_csv(idb_data_file, case_type=case_type, cols=BARE_MIN_COLS)

    df_idb.sort_values(['ucid', 'filedate'], inplace=True)
    df_idb.drop_duplicates('ucid', keep='first', inplace=True)

    # Stage 1 (matching on ucid)
    print('STAGE 1: matching on ucid...')
    matched_mask = dff.ucid.isin(df_idb.ucid)
    matched_ucids = dff.ucid[matched_mask]

    keepcols = ['fpath', 'case_type', 'filing_date', 'terminating_date', 'source',
                *[x.lower() for x in BARE_MIN_COLS]]
    # *[x.lower() for x in get_recap_idb_cols(case_type)] ]
    if 'nos_subtype' in dff.columns:
        keepcols.append('nos_subtype')

    # Make table of data merged on ucid
    print('STAGE 1: merging...')
    merged_ucid = dff[matched_mask].merge(df_idb, how='inner', left_on='ucid', right_on='ucid')\
                    .set_index('ucid_copy')[keepcols]
    print(f'STAGE 1: {{matched: {sum(matched_mask):,}, unmatched: {sum(~matched_mask):,}}}')

    # Reduce dff to the unmatched cases
    dff = dff[~matched_mask].copy()

    # Create weak ucid
    dff['ucid_weak'] = dtools.get_ucid_weak(dff.ucid)

    # Remove matched rows from df_idb and reduce to weak_ucid matches
    print('STAGE 2: matching on weak_ucid...')
    df_idb = df_idb[~df_idb.ucid.isin(matched_ucids) & df_idb.ucid_weak.isin(dff.ucid_weak)]

    # Stage 2 (matching on ucid_weak and filing date)
    print('STAGE 2: merging...')
    merged_weak = dff.merge(df_idb, how="inner", left_on=['ucid_weak', 'filing_date'],
                            right_on=['ucid_weak', 'filedate'])\
                     .set_index('ucid_copy')[keepcols]
    matched_stage2 = merged_weak.shape[0]
    print(f"STAGE 2: {{matched: {matched_stage2:,}, unmatched: {sum(~matched_mask) - matched_stage2:,}}}")

    final = pd.concat([merged_ucid, merged_weak])
    del dff, df_idb

    match_rate = final.shape[0] / N
    print(f"Overall match rate: {match_rate:.2%}")

    return final, match_rate
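# Usage sketch (illustrative): run the two-stage merge on civil cases. The
# input filename comes from the docstring above; the threshold and the output
# path are hypothetical.
def _example_idb_merge():
    ''' Illustrative only: merge civil cases against an IDB extract '''
    final, match_rate = idb_merge('cv10to19.csv', case_type='cv')
    if match_rate < 0.9:  # arbitrary threshold for illustration
        print('Low match rate: check the court/year coverage of the IDB extract')
    final.to_csv('idb_merged_cv.csv')  # hypothetical output path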