def _read_in(timePeriod, source):
    """Load the first monthly file whose path contains `timePeriod`.

    :return: DataFrame of monthly payments, or None if no file matches.
    """
    for path in glob.glob(configs[source]['sample_monthly_dir'] + '*.txt'):
        if timePeriod in path:
            print "Loading File: {}".format(path)
            df_payments = load_data(
                path=path,
                columns=monthlyFileColList,
                nrows=None,
                date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
                error_bad_lines=True)
            return df_payments
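# A minimal, self-contained sketch of the path-matching rule _read_in relies
# on: the first file whose name contains the requested time period is loaded.
# The file names and the 'Q12006' tag below are hypothetical examples, not
# actual sample data.
def _demo_read_in_matching():
    paths = [
        'sample_svcg_Q12005.txt',
        'sample_svcg_Q22005.txt',
        'sample_svcg_Q12006.txt',
    ]
    timePeriod = 'Q12006'
    for path in paths:
        if timePeriod in path:
            print "Would load: {}".format(path)
            break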
source = 'freddie'
census_source = 'acs'

if __name__ == "__main__":
    if not os.path.exists(configs[source]['figure_dir']):
        os.makedirs(configs[source]['figure_dir'])

    # Determine files to read in
    if sample_run:
        # pandas.io.common.CParserError workaround (issue 16364)
        df_origination = load_data(
            path=(configs[source]['sample_single_dir'] +
                  configs[source]['sample_single_file']),
            columns=originationFileColList,
            nrows=None,
            date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
            error_bad_lines=True,
            engine='c')
    # Otherwise load and append every file
    else:
        df_origination = pd.DataFrame()
        for path in glob.glob(configs[source]['sample_single_dir'] + '*.txt'):
            print "Loading File: {}".format(path)
            df_origination = pd.concat([
                df_origination,
                load_data(path=path,
                          columns=originationFileColList,
                          nrows=None,
                          date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
                          error_bad_lines=True)
            ], axis=0)
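# Growing df_origination with pd.concat inside the loop copies the
# accumulated frame on every iteration. A sketch of the usual alternative,
# assuming load_data returns one DataFrame per file: collect the pieces in a
# list and concatenate once.
def _load_all_origination_files():
    frames = [
        load_data(path=path,
                  columns=originationFileColList,
                  nrows=None,
                  date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
                  error_bad_lines=True)
        for path in glob.glob(configs[source]['sample_single_dir'] + '*.txt')
    ]
    return pd.concat(frames, axis=0)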
def lender_CreditScoreSelection():
    """Flag loans whose credit score is a 3-sigma outlier within its MSA.

    :return: loan-level DataFrame with MSA credit-score statistics and
        over/under outlier indicator columns.
    """
    # Load in from CSV
    if sample_run:
        df_origination = load_data(
            path=configs[source]['sample_dir'] + configs[source]['sample_file'],
            columns=originationFileColList,
            date_col_fmt_dict={'firstPaymentDate': '%Y%m'})
    else:
        df_origination = pd.DataFrame()
        for path in glob.glob(configs[source]['sample_single_dir'] + '*.txt'):
            print "Loading File: {}".format(path)
            df_origination = pd.concat([
                df_origination,
                load_data(path=path,
                          columns=originationFileColList,
                          nrows=None,
                          date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
                          error_bad_lines=True)
            ], axis=0)

    df_origination.loc[:, 'creditScore'] = \
        pd.to_numeric(df_origination['creditScore'])

    # Credit Score Spread by MSA
    df_msa_to_score = df_origination.groupby(by=['MSA'], as_index=False).agg(
        {'creditScore': [np.mean, np.std, np.max, np.min, np.var]})
    df_msa_to_score = compress_columns(df_msa_to_score)
    df_msa_to_score.rename(columns={
        'mean': 'MSACreditScore_mean',
        'std': 'MSACreditScore_std',
        'amax': 'MSACreditScore_max',
        'amin': 'MSACreditScore_min',
        'var': 'MSACreditScore_var'
    }, inplace=True)

    # Loan level (safety merge: pandas does not warn if a left join adds rows)
    pre_len = len(df_origination)
    df_loan = pd.merge(df_origination, df_msa_to_score, how='left', on=['MSA'])
    if len(df_loan) != pre_len:
        raise Exception("Merge of type 'left' has added rows unexpectedly.")

    # Outlier over: more than 3 standard deviations above the MSA mean
    msk = (df_loan['creditScore'] >=
           (df_loan['MSACreditScore_mean'] +
            (df_loan['MSACreditScore_std'] * 3)))
    df_loan['creditScoreOutlier_over'] = 0
    df_loan.loc[msk, 'creditScoreOutlier_over'] = 1

    # Outlier under: more than 3 standard deviations below the MSA mean
    msk = (df_loan['creditScore'] <=
           (df_loan['MSACreditScore_mean'] -
            (df_loan['MSACreditScore_std'] * 3)))
    df_loan['creditScoreOutlier_under'] = 0
    df_loan.loc[msk, 'creditScoreOutlier_under'] = 1

    df_loan['year'] = df_loan.loc[:, 'firstPaymentDate'].dt.year

    return df_loan
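# A self-contained illustration of the 3-sigma rule applied above, on made-up
# credit scores for a single hypothetical MSA (uses the module-level pandas
# import):
def _demo_outlier_flag():
    scores = pd.Series([640, 645, 650, 650, 655, 660] * 4 + [950])
    mu, sigma = scores.mean(), scores.std()
    flags = pd.DataFrame({
        'creditScore': scores,
        'outlier_over': (scores >= mu + 3 * sigma).astype(int),
        'outlier_under': (scores <= mu - 3 * sigma).astype(int),
    })
    print flags.loc[flags['outlier_over'] == 1, :]  # flags the 950 score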
def lender_PerformanceByMSA():
    """Compare each lender's average credit score in an MSA to the MSA average.

    :return: lender-by-MSA DataFrame with a z-score of the lender's mean
        credit score against the MSA distribution.
    """
    # Load in from CSV
    if sample_run:
        df_origination = load_data(
            path=(configs[source]['sample_single_dir'] +
                  configs[source]['sample_single_file']),
            columns=originationFileColList,
            date_col_fmt_dict={'firstPaymentDate': '%Y%m'})
    else:
        df_origination = pd.DataFrame()
        for path in glob.glob(configs[source]['sample_single_dir'] + '*.txt'):
            print "Loading File: {}".format(path)
            df_origination = pd.concat([
                df_origination,
                load_data(path=path,
                          columns=originationFileColList,
                          nrows=None,
                          date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
                          error_bad_lines=True)
            ], axis=0)

    # Credit Score Spread by MSA
    df_msa_to_score = df_origination.groupby(by=['MSA'], as_index=False).agg(
        {'creditScore': [np.mean, np.std, np.max, np.min, np.var]})
    df_msa_to_score = compress_columns(df_msa_to_score)
    df_msa_to_score.rename(columns={
        'mean': 'MSACreditScore_mean',
        'std': 'MSACreditScore_std',
        'amax': 'MSACreditScore_max',
        'amin': 'MSACreditScore_min',
        'var': 'MSACreditScore_var'
    }, inplace=True)

    # Lender selection of Credit Score
    lender = df_origination.groupby(by=['servicerName', 'MSA'],
                                    as_index=False).agg(
        {'creditScore': [np.mean, 'count']})
    lender = compress_columns(lender)
    lender.rename(columns={
        'mean': 'LenderMSACreditScore_mean',
        'count': 'totalMortgages'
    }, inplace=True)
    lender = pd.merge(lender, df_msa_to_score, how='left', on=['MSA'])

    # Z-score of lender average in MSA compared to MSA average
    lender['lenderAvg_zScore'] = (
        (lender['LenderMSACreditScore_mean'] -
         lender['MSACreditScore_mean']) /
        lender['MSACreditScore_std'])

    return lender
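# A self-contained sketch of the lender-vs-MSA z-score computed above, with
# hypothetical servicers and scores in one made-up MSA:
def _demo_lender_zscore():
    toy = pd.DataFrame({
        'servicerName': ['A', 'A', 'B', 'B', 'B'],
        'MSA': [10180] * 5,
        'creditScore': [700, 720, 640, 650, 660],
    })
    msa_mean = toy['creditScore'].mean()
    msa_std = toy['creditScore'].std()
    lender_mean = toy.groupby('servicerName')['creditScore'].mean()
    # How far each lender's average sits from the MSA average, in MSA
    # standard deviations
    print (lender_mean - msa_mean) / msa_std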
def update_bins(dest_dir, source, duration, qtrYears):
    """Roll new quarterly files into the existing loan bins.

    :param dest_dir: directory holding the bin CSVs and bin_mapping.csv
    :param source: data source key into configs
    :param duration: 'single' or 'monthly'
    :param qtrYears: quarter/year tags identifying which files to read
    :return:
    """
    # Duration / File Type
    if duration == 'single':
        fileType = 'sample_single_dir'
    elif duration == 'monthly':
        fileType = 'sample_monthly_dir'
    else:
        raise ValueError("Unknown duration: {}".format(duration))

    # Try reading in the map; start a fresh one if it does not exist yet
    try:
        df_map = pd.read_csv(dest_dir + 'bin_mapping.csv')
    except IOError:
        df_map = pd.DataFrame(columns=[
            'loanSeqNumber', 'binNumber', 'binOccupancy', 'maxBinOccupancy'
        ])

    # Load in all files
    for path in glob.glob(configs[source][fileType] + '*.txt'):
        # Skip files that are not in the desired quarters
        if not any([qtrYear in path for qtrYear in qtrYears]):
            continue

        # Process file
        print "Loading File: {}".format(path)
        currFile = load_data(
            path=path,
            columns=monthlyFileColList,
            nrows=None,
            date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
            error_bad_lines=True)
        print "{} loans in file".format(len(set(currFile['loanSeqNumber'])))

        # ---- ---- ---- ---- ---- ----
        # Split the file into loans that are continuing and loans that are new
        print "Updating bin structure: {}".format(path)

        # Update bins in which we can already find loans from the new file
        old_new_msk = currFile['loanSeqNumber'].isin(
            set(df_map['loanSeqNumber']))
        df = currFile.loc[old_new_msk, :]

        # Iterate over bins we already have and append
        affected_bins = list(set(
            df_map.loc[df_map['loanSeqNumber'].isin(
                set(df['loanSeqNumber'])), 'binNumber']))
        for b_ in affected_bins:
            print "{} :: Updating bins with new loan data".format(
                dt.datetime.now().strftime("%m_%d_%Y"))
            df_bin = pd.read_csv(dest_dir + 'bin_{}.csv'.format(str(b_)))
            binLoans = list(set(
                df_map.loc[df_map['binNumber'] == b_, 'loanSeqNumber']))
            df_bin = pd.concat(
                [df_bin, df.loc[df['loanSeqNumber'].isin(binLoans), :]],
                axis=0)
            df_bin.to_csv(dest_dir + 'bin_{}.csv'.format(str(b_)),
                          index=False)

        # ---- ---- ---- ---- ---- ----
        # Save loan information for loans that cannot be found in
        # pre-existing bins
        df = currFile.loc[~old_new_msk, :]
        newLoans = list(set(df['loanSeqNumber']))

        # Check if there is a bin not yet full
        open_bin = df_map.loc[
            df_map['binOccupancy'] < df_map['maxBinOccupancy'], :]
        open_bin = open_bin.loc[:, ['binNumber', 'binOccupancy']].\
            drop_duplicates(inplace=False)
        assert len(list(open_bin['binNumber'])) <= 1

        # ---- ---- ----
        # Fill up a non-full bin before making new ones
        if len(list(open_bin['binNumber'])) == 1:
            print "{} :: Updating any non-full bin with new loans".format(
                dt.datetime.now().strftime("%m_%d_%Y"))
            curr_bin = list(open_bin['binNumber'])[0]
            open_spots = (configs[source]['maxBinOccupancy'] -
                          list(open_bin['binOccupancy'])[0])
            cut = df.loc[df['loanSeqNumber'].isin(newLoans[:open_spots]), :]

            # Read in the bin, append the new loans and write it back
            df_bin = pd.read_csv(dest_dir + 'bin_{}.csv'.format(str(curr_bin)))
            df_bin = pd.concat([df_bin, cut], axis=0)
            df_bin.to_csv(dest_dir + 'bin_{}.csv'.format(str(curr_bin)),
                          index=False)
            df = df.loc[~(df['loanSeqNumber'].isin(newLoans[:open_spots])), :]

            # Record the newly binned loans in the map and show the bin as full
            df_map = df_map.append(
                pd.DataFrame({
                    'loanSeqNumber': list(set(cut['loanSeqNumber'])),
                    'binNumber': [
                        curr_bin for i in list(set(cut['loanSeqNumber']))
                    ],
                    'binOccupancy': [
                        configs[source]['maxBinOccupancy']
                        for i in list(set(cut['loanSeqNumber']))
                    ],
                    'maxBinOccupancy': [
                        configs[source]['maxBinOccupancy']
                        for i in list(set(cut['loanSeqNumber']))
                    ]
                }))
            df_map.loc[df_map['binNumber'] == curr_bin, 'binOccupancy'] = \
                configs[source]['maxBinOccupancy']

        # Send the rest of the loans to a new bin
        try:
            curr_bin = max(df_map['binNumber']) + 1
        except ValueError:
            curr_bin = 0

        # ---- ---- ----
        # Create new bin, update map and save
        open_spots = configs[source]['maxBinOccupancy']
        remaining_loans = list(set(df['loanSeqNumber']))
        while len(remaining_loans) > 0:
            print "{} :: Created new bin {} for remaining loans".format(
                dt.datetime.now().strftime("%m_%d_%Y"), str(curr_bin))
            loan_chunk = remaining_loans[:open_spots]
            cut = df.loc[df['loanSeqNumber'].isin(loan_chunk), :]
            cut.to_csv(dest_dir + 'bin_{}.csv'.format(str(curr_bin)),
                       index=False)

            # Update df_map
            df_map = df_map.append(
                pd.DataFrame({
                    'loanSeqNumber': list(set(cut['loanSeqNumber'])),
                    'binNumber': [
                        curr_bin for i in list(set(cut['loanSeqNumber']))
                    ],
                    'binOccupancy': [
                        configs[source]['maxBinOccupancy']
                        for i in list(set(cut['loanSeqNumber']))
                    ],
                    'maxBinOccupancy': [
                        configs[source]['maxBinOccupancy']
                        for i in list(set(cut['loanSeqNumber']))
                    ]
                }))

            # Update current bin
            curr_bin += 1

            # Get new list of remaining loans
            remaining_loans = [
                x for x in remaining_loans if x not in loan_chunk
            ]
            print (" " * 15) + "{} loans remaining".format(
                len(remaining_loans))

    df_map.to_csv(dest_dir + 'bin_mapping.csv', index=False)
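# A self-contained sketch of the bin-filling arithmetic update_bins relies
# on: top up any open bin first, then spill the remainder into new
# fixed-size bins (toy loan IDs and a hypothetical capacity of 3):
def _demo_bin_fill():
    max_occupancy = 3
    open_spots = 1  # e.g. one slot left in the current open bin
    new_loans = ['L1', 'L2', 'L3', 'L4', 'L5']
    print "Filled open bin with: {}".format(new_loans[:open_spots])
    remaining, curr_bin = new_loans[open_spots:], 1
    while remaining:
        chunk, remaining = remaining[:max_occupancy], remaining[max_occupancy:]
        print "bin_{} gets: {}".format(curr_bin, chunk)
        curr_bin += 1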
def create_bins(dest_dir, source, duration, chunk=10000):
    """Split all loans into fixed-size bin CSVs and build the loan-to-bin map.

    :param dest_dir:
    :param source:
    :param duration:
    :param chunk: number of loans per bin
    :return:
    """
    # Duration / File Type
    if duration == 'single':
        fileType = 'sample_single_dir'
    elif duration == 'monthly':
        fileType = 'sample_monthly_dir'
    else:
        raise ValueError("Unknown duration: {}".format(duration))

    # Load in all files
    df_origination = pd.DataFrame()
    for path in glob.glob(configs[source][fileType] + '*.txt'):
        # Process file
        print "Loading File: {}".format(path)
        df_origination = pd.concat([
            df_origination,
            load_data(path=path,
                      columns=monthlyFileColList,
                      nrows=None,
                      date_col_fmt_dict={'firstPaymentDate': '%Y%m'},
                      error_bad_lines=True)
        ], axis=0)

    # Mapping; start a fresh one if none exists yet. Column names match the
    # map read back by update_bins.
    try:
        df_map = pd.read_csv(dest_dir + 'bin_mapping.csv')
    except IOError:
        df_map = pd.DataFrame(columns=[
            'loanSeqNumber', 'binNumber', 'binOccupancy', 'maxBinOccupancy'
        ])

    # Iterate over all loanSeqNumbers found in the files, one chunk per bin
    loanNumbers = list(set(df_origination['loanSeqNumber']))
    bin_number = 1
    for s in range(0, len(loanNumbers), chunk):
        group = loanNumbers[s:s + chunk]

        # Filter and save to bin CSV
        msk = df_origination['loanSeqNumber'].isin(group)
        df_curr = df_origination.loc[msk, :]
        df_curr.to_csv(dest_dir + 'bin_{}.csv'.format(str(bin_number)),
                       index=False)

        # Append to mapping
        binLoans = list(set(df_curr['loanSeqNumber']))
        df_map = df_map.append(
            pd.DataFrame({
                'loanSeqNumber': binLoans,
                'binNumber': [bin_number for i in binLoans],
                'binOccupancy': [len(binLoans) for i in binLoans],
                # Bin capacity, assumed equal to chunk
                'maxBinOccupancy': [chunk for i in binLoans]
            }))

        # Update iterable counts
        bin_number += 1

    # Save map back to file
    df_map.to_csv(dest_dir + 'bin_mapping.csv', index=False)
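# A hedged usage sketch for the two binning entry points above; the
# destination directory and quarter tags are hypothetical:
#
#     create_bins(dest_dir='/tmp/bins/', source='freddie',
#                 duration='monthly', chunk=10000)
#     update_bins(dest_dir='/tmp/bins/', source='freddie',
#                 duration='monthly', qtrYears=['Q32005'])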