# Standard-library and third-party imports used by the functions below.
# Project-local modules (config / con, and the generator modules m, t, ds, dc)
# are imported as in the original repository and are not repeated here.
import ast
import glob
import math
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import entropy


def main():
    start_time = time.time()
    print("Reading input parameters..")
    configParams = config.readMainConfigs()
    config.setGlobals(configParams)
    N = configParams[config.mode]['n']
    companyIDSeed = configParams[config.mode]['company_id_seed']

    # Generate metadata for N companies
    m.generateMetadata(N, companyIDSeed)
    print("Metadata generation successful! Check output file at " + config.metadataFile)

    # Generate transactions from the metadata
    t.generateTransactionData(companyIDSeed)
    end_time = time.time()
    print("Execution time: ", end_time - start_time)

    # Summarize the generated data
    start_time = time.time()
    ds.summarizeData()
    end_time = time.time()
    print("Execution time: ", end_time - start_time)

    # The sanity check can be run separately:
    # dc.sanityCheck()
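# A minimal entry-point sketch (an assumption; the original repository may
# invoke main() differently):
if __name__ == '__main__':
    main()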
def postProcessor():
    # Read config files
    print('Reading config parameters for post-processing')
    try:
        mainConfig = con.readMainConfigs()
        con.setGlobals(mainConfig)
        configParams = con.readConfigFile(con.paramConfigFile)
        configDistributions = con.readConfigFile(con.distributionsFile)
        modeparams = mainConfig[con.mode]
    except Exception:
        sys.exit('Error in reading config files.. Make sure you have not made any mistake in editing parameters')

    if con.mode != 'M':
        sys.exit('BTS is not operating in mixing mode. Please change the mode in mainConfig.ini file to M.')

    tran_info_path = os.path.join(modeparams['mixed_data'], modeparams['traninfo_folder'])
    metadata_file = os.path.join(modeparams['mixed_data'], 'combinedMetadata.csv')

    print('Reading combined metadata file..')
    metadata_df = pd.read_csv(metadata_file)
    print('Metadata read for ', len(metadata_df), ' companies.')

    print('Reading vertices and edges..')
    # Vertices and edges are sharded across several CSV files; read and concatenate them
    all_files_v = glob.glob(os.path.join(tran_info_path, 'vertices*.csv'))
    v_set = pd.concat((pd.read_csv(f) for f in all_files_v), ignore_index=True)

    all_files_e = glob.glob(os.path.join(tran_info_path, 'edge*.csv'))
    e_set = pd.concat((pd.read_csv(f) for f in all_files_e), ignore_index=True)

    acc_under_study = list(metadata_df.loc[:, 'acc_no'])
    print('Number of edges read = ', len(e_set))
    print('Number of vertices read = ', len(v_set))

    if modeparams['remove_edges'] == 'Y':
        print('Removing edges for outside accounts')
        # Keep only the sub-graph induced by the accounts under study
        v_set_under_study = v_set.loc[v_set.vertex.isin(acc_under_study), ['vertex']]
        e_set_under_study = e_set.loc[e_set.src_acc.isin(acc_under_study)
                                      & e_set.dest_acc.isin(acc_under_study)]
        # Write reduced vertices and edges
        v_set_under_study.to_csv(os.path.join(modeparams['mixed_data'], 'reducedVertices.csv'), index=False)
        e_set_under_study.to_csv(os.path.join(modeparams['mixed_data'], 'reducedEdges.csv'), index=False)
    else:
        print('Collapsing edges for outside accounts')
        # Collapse all outside accounts into a single dummy 'outside' vertex
        v_set_under_study = v_set.loc[v_set.vertex.isin(acc_under_study), ['vertex']].reset_index(drop=True)
        v_set_under_study.loc[len(v_set_under_study), 'vertex'] = 'outside'
        e_set_under_study = e_set.loc[e_set.src_acc.isin(acc_under_study)
                                      | e_set.dest_acc.isin(acc_under_study)].copy()
        e_set_under_study.loc[e_set_under_study.src_acc.isin(acc_under_study)
                              & ~e_set_under_study.dest_acc.isin(acc_under_study), 'dest_acc'] = 'outside'
        e_set_under_study.loc[e_set_under_study.dest_acc.isin(acc_under_study)
                              & ~e_set_under_study.src_acc.isin(acc_under_study), 'src_acc'] = 'outside'
        # Sum amounts and transaction counts over the collapsed parallel edges
        e_set_under_study = e_set_under_study.groupby(by=['src_acc', 'dest_acc'], as_index=False).agg(
            {'amount': 'sum', 'n_tran': 'sum'})
        e_set_under_study.columns = ['src_acc', 'dest_acc', 'amount', 'n_tran']
        # Write collapsed vertices and edges
        v_set_under_study.to_csv(os.path.join(modeparams['mixed_data'], 'collapsedVertices.csv'), index=False)
        e_set_under_study.to_csv(os.path.join(modeparams['mixed_data'], 'collapsedEdges.csv'), index=False)

    print('Number of edges reduced to ', len(e_set_under_study))
    print('Number of vertices reduced to ', len(v_set_under_study))
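# A minimal sketch of consuming the collapsed edge list as a weighted directed
# graph. networkx is not a dependency of this code, and the helper and default
# path below are hypothetical illustrations only.
import networkx as nx

def loadCollapsedGraph(edges_csv='collapsedEdges.csv'):  # hypothetical helper
    edges_df = pd.read_csv(edges_csv)
    return nx.from_pandas_edgelist(edges_df, source='src_acc', target='dest_acc',
                                   edge_attr=['amount', 'n_tran'],
                                   create_using=nx.DiGraph())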
def summarizeData():
    # Read config files
    print('Reading config parameters for data summary')
    try:
        mainConfig = con.readMainConfigs()
        con.setGlobals(mainConfig)
        configParams = con.readConfigFile(con.paramConfigFile)
        configDistributions = con.readConfigFile(con.distributionsFile)
        modeparams = mainConfig[con.mode]
        distributions = configDistributions['distributions']
        commonParams = configParams['common']
        home_location = commonParams['home_location']
    except Exception:
        print('Error in reading config files.. Make sure you have not made any mistake in editing parameters')

    # Read metadata
    metadata_df = pd.DataFrame()
    try:
        metadata_df = pd.read_csv(con.metadataFile,
                                  converters={'in_amt_wt': ast.literal_eval,
                                              'supply_amt_wt': ast.literal_eval,
                                              'utility_proportions': ast.literal_eval,
                                              'inside_customers': ast.literal_eval,
                                              'inside_suppliers': ast.literal_eval,
                                              'utility_accs': ast.literal_eval},
                                  index_col='index')
    except Exception:
        print('Error in reading metadata')

    metadata_df['industry_sector'] = metadata_df.industry_sector.astype('str')
    company_acc = metadata_df['acc_no']
    N = len(metadata_df)
    print('Metadata for ', N, ' companies is read successfully!')

    # Load transaction data in chunks and prepare a partial summary per chunk
    allFiles = glob.glob(os.path.join(con.tranData, '*.csv'))
    n_files = len(allFiles)
    if n_files == 0:
        print('No transaction data found!! Make sure trandatapath parameter is set appropriately.')
    else:
        chunk_size = 8
        n_iterations = math.ceil(n_files / chunk_size)

        # Prepare folders for the summarized data
        companyDataPath = modeparams['company_data_path']
        summaryDataPath = os.path.join(companyDataPath, 'SummaryData')
        tranInfoPath = os.path.join(companyDataPath, 'TranInfo')
        if not os.path.exists(summaryDataPath):
            os.makedirs(summaryDataPath)
        if not os.path.exists(tranInfoPath):
            os.makedirs(tranInfoPath)

        for i in range(n_iterations):
            # Read chunk
            start_index = i * chunk_size
            end_index = start_index + chunk_size - 1 if i < n_iterations - 1 else n_files - 1
            print('Files getting read from ', start_index, end_index)
            tran_data_df = readTransactionData(con.tranData, start_index, end_index)

            # Summarize chunk
            partial_summary_df = metadata_df[['acc_no']]
            partial_summary_df, partial_traninfo_df = prepareSummary(partial_summary_df, tran_data_df, home_location)

            # Write partial summaries to disk
            partial_summary_df.to_csv(os.path.join(summaryDataPath, 'partialSummary' + str(i) + '.csv'))
            partial_traninfo_df.to_csv(os.path.join(tranInfoPath, 'partialTranInfo' + str(i) + '.csv'))
        print('Partial summary files are written..')

        # Prepare final summary
        summarized_df = metadata_df[['acc_no', 'annual_revenue']]
        summarized_df = combinePartialSummaries(summaryDataPath, summarized_df)
        combinedFiles_df_grouped = combineTranInfoFiles(tranInfoPath)

        # Count clients and expense accounts per company, both in total and
        # within the set of companies under study
        acc_no_list = list(metadata_df.loc[:, 'acc_no'])
        for_clients_total = combinedFiles_df_grouped.loc[
            (combinedFiles_df_grouped.src_acc != 'CASHC')
            & (combinedFiles_df_grouped.dest_acc.isin(acc_no_list))]
        for_clients_total_grouped = for_clients_total.groupby(by='dest_acc', as_index=False).agg(
            {'src_acc': pd.Series.nunique})
        for_clients_total_grouped.columns = ['acc_no', 'n_clients_total']
        summarized_df = summarized_df.merge(for_clients_total_grouped, on='acc_no', how='left')

        for_clients_within = for_clients_total.loc[for_clients_total.src_acc.isin(acc_no_list)]
        for_clients_within = for_clients_within.groupby(by='dest_acc', as_index=False).agg(
            {'src_acc': pd.Series.nunique})
        for_clients_within.columns = ['acc_no', 'n_clients_within']
        summarized_df = summarized_df.merge(for_clients_within, on='acc_no', how='left')

        for_exp_acc_total = combinedFiles_df_grouped.loc[
            (~combinedFiles_df_grouped.dest_acc.isin(['CASHS', 'CASHU']))
            & (combinedFiles_df_grouped.src_acc.isin(acc_no_list))]
        for_exp_acc_total_grouped = for_exp_acc_total.groupby(by='src_acc', as_index=False).agg(
            {'dest_acc': pd.Series.nunique})
        for_exp_acc_total_grouped.columns = ['acc_no', 'n_exp_acc_total']
        summarized_df = summarized_df.merge(for_exp_acc_total_grouped, on='acc_no', how='left')

        for_exp_acc_within = for_exp_acc_total.loc[for_exp_acc_total.dest_acc.isin(acc_no_list)]
        for_exp_acc_within = for_exp_acc_within.groupby(by='src_acc', as_index=False).agg(
            {'dest_acc': pd.Series.nunique})
        for_exp_acc_within.columns = ['acc_no', 'n_exp_acc_within']
        summarized_df = summarized_df.merge(for_exp_acc_within, on='acc_no', how='left')

        summarized_df.to_csv(os.path.join(summaryDataPath, 'finalSummary.csv'), index=False)

        print('First level data summarization for sanity check is successful! Check out summary file at ', summaryDataPath)
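# readTransactionData and prepareSummary are defined elsewhere in the repository.
# A minimal sketch of what readTransactionData presumably does -- reading the
# transaction CSVs with indices start_index..end_index and concatenating them --
# is given below as an assumption, not the original implementation:
def readTransactionData(tran_data_path, start_index, end_index):
    # Assumes the chunk indices refer to the sorted glob order of the CSV files
    files = sorted(glob.glob(os.path.join(tran_data_path, '*.csv')))
    frames = [pd.read_csv(f) for f in files[start_index:end_index + 1]]
    return pd.concat(frames, ignore_index=True)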
def mixer():
    # Read config files
    print('Reading config parameters for mixing')
    try:
        mainConfig = con.readMainConfigs()
        con.setGlobals(mainConfig)
        configParams = con.readConfigFile(con.paramConfigFile)
        configDistributions = con.readConfigFile(con.distributionsFile)
        modeparams = mainConfig[con.mode]
    except Exception:
        print('Error in reading config files.. Make sure you have not made any mistake in editing parameters')

    # Read both metadata files and join them
    print('Reading both metadata files..')
    metadata_df = pd.DataFrame()
    for file in con.metadataFile:
        df = pd.read_csv(file,
                         converters={'in_amt_wt': ast.literal_eval,
                                     'supply_amt_wt': ast.literal_eval,
                                     'utility_proportions': ast.literal_eval,
                                     'inside_customers': ast.literal_eval,
                                     'inside_suppliers': ast.literal_eval,
                                     'utility_accs': ast.literal_eval},
                         index_col='index')
        metadata_df = pd.concat([metadata_df, df])

    # Write the joint metadata file
    combined_metadata_path = os.path.join(con.mixedData, 'combinedMetadata.csv')
    metadata_df.to_csv(combined_metadata_path, index=False)
    print('Metadata is combined successfully! Metadata file can be accessed at ', combined_metadata_path)

    # Combine tranInfo files from both data sets
    print('Combining transaction information files')
    combinedFiles_df_grouped = pd.DataFrame()
    for path in con.tranInfo:
        df = old_combineTranInfoFiles(path)
        combinedFiles_df_grouped = pd.concat([combinedFiles_df_grouped, df])

    # Derive vertices and edges
    mixed_data_path = os.path.join(modeparams['mixed_data'], modeparams['traninfo_folder'])
    if not os.path.exists(mixed_data_path):
        os.makedirs(mixed_data_path)
    src_acc = set(combinedFiles_df_grouped.loc[:, 'src_acc'])
    dest_acc = set(combinedFiles_df_grouped.loc[:, 'dest_acc'])
    v = src_acc.union(dest_acc)
    v_df = pd.DataFrame(data=list(v), columns=['vertex'])

    # Write vertices in shards of 50000 rows
    file_size_v = 50000
    n_files_v = math.ceil(len(v_df) / file_size_v)
    for i in range(n_files_v):
        start_index = i * file_size_v
        end_index = start_index + file_size_v
        vSet = v_df[start_index:end_index]
        vSet.to_csv(os.path.join(mixed_data_path, 'vertices' + str(i) + '.csv'), index=False)

    # Write edges in shards of 50000 rows
    file_size = 50000
    n_files = math.ceil(len(combinedFiles_df_grouped) / file_size)
    for i in range(n_files):
        start_index = i * file_size
        end_index = start_index + file_size
        edgeSet = combinedFiles_df_grouped[start_index:end_index]
        edgeSet.to_csv(os.path.join(mixed_data_path, 'edges' + str(i) + '.csv'), index=False)

    print('Transaction data combined successfully! Vertices and edges can be found at ', mixed_data_path)
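# A minimal sketch of the mixing pipeline in 'M' mode. The order is inferred
# from the file dependencies: mixer() writes the sharded vertices/edges files
# that postProcessor() later reads back.
def runMixingPipeline():  # hypothetical convenience wrapper, not in the original code
    mixer()            # combine metadata and tranInfo files from both data sets
    postProcessor()    # reduce or collapse edges for accounts outside the study set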
def sanityCheck():
    # Read config files
    print('Reading config parameters')
    try:
        mainConfig = con.readMainConfigs()
        con.setGlobals(mainConfig)
        configParams = con.readConfigFile(con.paramConfigFile)
        configDistributions = con.readConfigFile(con.distributionsFile)
        modeparams = mainConfig[con.mode]
        distributions = configDistributions['distributions']
        commonParams = configParams['common']
        home_location = commonParams['home_location']
    except Exception:
        print(sys.exc_info()[0], ' error occurred!')
        print('Error in reading config files.. Make sure you have not made any mistake in editing parameters')
        sys.exit(1)

    # Read metadata
    try:
        metadata_df = pd.read_csv(con.metadataFile,
                                  converters={'in_amt_wt': ast.literal_eval,
                                              'supply_amt_wt': ast.literal_eval,
                                              'utility_proportions': ast.literal_eval,
                                              'inside_customers': ast.literal_eval,
                                              'inside_suppliers': ast.literal_eval,
                                              'utility_accs': ast.literal_eval},
                                  index_col='index')

        # Read summary
        companyDataPath = modeparams['company_data_path']
        summaryDataPath = os.path.join(companyDataPath, 'SummaryData', 'finalSummary.csv')
        summary_Df = pd.read_csv(summaryDataPath)
        tranInfoPath = os.path.join(companyDataPath, 'TranInfo')
        edgeList = pd.read_csv(os.path.join(tranInfoPath, 'edges.csv'))
        vertexList = pd.read_csv(os.path.join(tranInfoPath, 'vertices.csv'))
        N_tran_total = np.sum(edgeList['n_tran'])
        group_id_list = list(metadata_df.groupby('group_id').groups.keys())
        n_accounts = len(vertexList)
        n_edges = len(edgeList)
        n_groups = len(group_id_list)
    except Exception:
        print(sys.exc_info()[0], ' error occurred!')
        print('Error in reading metadata')
        sys.exit(1)

    metadata_df['industry_sector'] = metadata_df.industry_sector.astype('str')
    company_acc = metadata_df['acc_no']
    N = len(metadata_df)
    print('Metadata for ', N, ' companies is read successfully!')

    # Save data-set level counts to a metafile
    metafile_path = os.path.join(con.additionalFiles, 'metafile.txt')
    with open(metafile_path, 'w') as meta_file:
        meta_file.write('Data set type :' + con.mode)
        meta_file.write('\nNumber of companies :' + str(N))
        meta_file.write('\nTotal transactions generated : ' + str(N_tran_total))
        meta_file.write('\nNumber of vertices : ' + str(n_accounts))
        meta_file.write('\nNumber of edges (connections) : ' + str(n_edges))
        meta_file.write('\nNumber of groups : ' + str(n_groups))

    # Read sanity check file: one row per parameter whose generated distribution
    # is compared against its expected distribution
    sanityCheck_df = pd.read_csv('sanityCheck.csv',
                                 converters={'param_bins': ast.literal_eval,
                                             'distribution': ast.literal_eval})
    n_checks = len(sanityCheck_df)
    sanityCheck_df['KL_divergence'] = [0.0] * n_checks
    sanityCheck_df['status'] = ['F'] * n_checks
    for i in range(n_checks):
        param_name = sanityCheck_df.loc[i, 'parameter']
        print('Checking for parameter ', param_name)
        check_type = sanityCheck_df.loc[i, 'type']
        if check_type == 'm':
            # Categorical parameter: compare the configured distribution with
            # the observed share of companies per category
            param_dist = commonParams[param_name]
            paramValue = {k: v for k, v in sorted(param_dist.items())}
            expected_dist = np.array(list(paramValue.values()))
            param_grp = metadata_df.groupby(by=param_name, as_index=False).comp_id.agg('count')
            param_grp = param_grp.sort_values(by=param_name)
            param_grp['comp_id'] = param_grp['comp_id'] / N
            actual_dist = param_grp['comp_id'].to_numpy()
            KLD = entropy(expected_dist, actual_dist)
        else:
            # Numeric parameter: bin the values and compare histograms
            bins = sanityCheck_df.loc[i, 'param_bins']
            distribution = sanityCheck_df.loc[i, 'distribution']
            if param_name in metadata_df.columns:
                values = metadata_df.loc[:, param_name]
            else:
                values = summary_Df.loc[:, param_name]
            fig1 = plt.figure(1)
            ax1 = fig1.add_subplot(111)
            arr = ax1.hist(values, bins, facecolor='red', alpha=0.5, edgecolor='brown')
            observed_distribution = list(arr[0] / N)
            KLD = entropy(distribution, observed_distribution)

        sanityCheck_df.loc[i, 'KL_divergence'] = KLD
        # The check passes when the KL divergence is below 0.05
        if abs(KLD * 100) < 5.0:
            sanityCheck_df.loc[i, 'status'] = 'P'
            print('KL divergence is ', KLD, ' Status - Successful')
        else:
            print('KL divergence is ', KLD, ' Status - Failed')
    sanityCheck_df.to_csv(os.path.join(con.additionalFiles, 'sanityCheckReport.csv'))

    ## Visualizations
    # Industry sector
    fig1 = plt.figure(1)
    sector_grps = metadata_df.groupby(by='industry_sector', as_index=False).agg({'comp_id': 'count'})
    industry_sector_names = list(sector_grps.loc[:, 'industry_sector'])
    industry_sectors = np.arange(len(industry_sector_names))
    cnt_per_sector = list(sector_grps.loc[:, 'comp_id'])
    ax1 = fig1.add_subplot(111)
    ax1.bar(industry_sectors, cnt_per_sector, label='Number of companies per sector',
            facecolor='green', edgecolor='gray')
    ax1.set_xticks(industry_sectors)
    ax1.set_xticklabels(industry_sector_names, rotation=45)
    ax1.set_title('No. of companies per sector', fontsize=10)
    ax1.set_xlabel('Industry sector', fontsize=8)
    ax1.set_ylabel('No. of companies', fontsize=8)
    fig1.savefig(os.path.join(con.additionalFiles, 'SectorwiseNoCompanies.png'))

    # Employee base
    fig2 = plt.figure(2)
    emp_base_grps = metadata_df.groupby(by='employee_base', as_index=False).agg({'comp_id': 'count'})
    emp_base_names = list(emp_base_grps.loc[:, 'employee_base'])
    base_ticks = np.arange(len(emp_base_names))
    cnt_per_size = list(emp_base_grps.loc[:, 'comp_id'])
    ax1 = fig2.add_subplot(111)
    ax1.bar(base_ticks, cnt_per_size, label='Number of companies per base size',
            facecolor='green', edgecolor='gray')
    ax1.set_xticks(base_ticks)
    ax1.set_xticklabels(emp_base_names, rotation=45)
    ax1.set_title('No. of companies per base size', fontsize=10)
    ax1.set_xlabel('Employee base sizes', fontsize=8)
    ax1.set_ylabel('No. of companies', fontsize=8)
    fig2.savefig(os.path.join(con.additionalFiles, 'EmployeeBaseDistribution.png'))

    # Number of employees
    fig3 = plt.figure(3)
    num_employees = metadata_df.loc[:, 'num_employees']
    bins = [0, 50, 100, 200, 500, 2000]
    ax1 = fig3.add_subplot(111)
    arr = ax1.hist(num_employees, bins, facecolor='lightblue', edgecolor='gray')
    ax1.set_xlabel('Number of employees', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of number of employees', fontsize=10)
    # Annotate each bar with its percentage share
    for i in range(len(arr[0])):
        plt.text(arr[1][i], arr[0][i], str(arr[0][i] / N * 100))
    max_no = max(num_employees)
    min_no = min(num_employees)
    text = 'Max. = ' + str(max_no) + '\nMin. = ' + str(min_no)
    plt.text(0, 40, text, fontsize=10)
    fig3.savefig(os.path.join(con.additionalFiles, 'NumEmployees.png'))

    # Employee profitability
    fig4 = plt.figure(4)
    emp_profitability = metadata_df.loc[:, 'rev_per_emp_per_month']
    bins = [1e3, 1e4, 1e5, 1e6, 1e7, 1e8]
    ax1 = fig4.add_subplot(111)
    arr = ax1.hist(emp_profitability, bins, facecolor='lightblue', edgecolor='gray')
    ax1.set_xlabel('Employee profitability in Rs.', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of employee profitability', fontsize=10)
    ax1.set_xscale('log')
    for i in range(len(arr[0])):
        plt.text(arr[1][i], arr[0][i], str(arr[0][i] / N * 100))
    max_no = max(emp_profitability)
    min_no = min(emp_profitability)
    text = 'Max. = ' + str(max_no) + '\nMin. = ' + str(min_no)
    plt.text(1e3, 100, text, fontsize=10)
    fig4.savefig(os.path.join(con.additionalFiles, 'EmployeeProfitability.png'))

    # Annual revenue
    fig5 = plt.figure(5)
    annual_rev = summary_Df.loc[:, 'generated_annual_revenue']
    bins = [1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12]
    ax1 = fig5.add_subplot(111)
    arr = ax1.hist(annual_rev, bins, facecolor='lightblue', edgecolor='gray')
    ax1.set_xlabel('Generated annual revenue by BTS in Rs.', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of annual revenue', fontsize=10)
    ax1.set_xscale('log')
    for i in range(len(arr[0])):
        plt.text(arr[1][i], arr[0][i], str(arr[0][i] / N * 100))
    max_revenue = max(annual_rev)
    min_revenue = min(annual_rev)
    text = 'Max. = ' + str(max_revenue) + '\nMin. = ' + str(min_revenue)
    plt.text(1e5, 100, text, fontsize=10)
    fig5.savefig(os.path.join(con.additionalFiles, 'AnnualRevenue.png'))

    # Deviation in annual revenue
    fig6 = plt.figure(6)
    dev_annual_rev = summary_Df.loc[:, 'revenue_deviation_per']
    bins = 11
    ax1 = fig6.add_subplot(111)
    arr = ax1.hist(dev_annual_rev, bins, facecolor='lightblue', edgecolor='gray')
    ax1.set_xlabel('Deviation in generated annual revenue in Rs.', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of deviation in generated annual revenue', fontsize=10)
    for i in range(len(arr[0])):
        plt.text(arr[1][i], arr[0][i], str(arr[0][i] / N * 100))
    max_deviation = max(dev_annual_rev)
    text = 'Max. = ' + str(max_deviation)
    plt.text(arr[1][0], 100, text, fontsize=10)
    fig6.savefig(os.path.join(con.additionalFiles, 'AnnualRevenueDeviation.png'))

    # Generated expenses
    fig7 = plt.figure(7)
    generated_expenses = summary_Df.loc[:, 'generated_expenses']
    bins = [1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13]
    ax1 = fig7.add_subplot(111)
    arr = ax1.hist(generated_expenses, bins, facecolor='lightblue', edgecolor='gray')
    ax1.set_xlabel('Generated expenses in Rs.', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of generated expenses', fontsize=10)
    ax1.set_xscale('log')
    for i in range(len(arr[0])):
        plt.text(arr[1][i], arr[0][i], str(arr[0][i]))
    max_expenses = max(generated_expenses)
    min_expenses = min(generated_expenses)
    text = 'Max. = ' + str(max_expenses) + '\nMin. = ' + str(min_expenses)
    plt.text(1e5, 100, text, fontsize=10)
    fig7.savefig(os.path.join(con.additionalFiles, 'GeneratedExpenses.png'))

    # Profit/loss
    fig8 = plt.figure(8)
    profit_loss_per = summary_Df.loc[:, 'profit_loss_per']
    bins = [-100, -75, -50, -25, 0, 10, 20, 30, 100]
    ax1 = fig8.add_subplot(111)
    arr = ax1.hist(profit_loss_per, bins, facecolor='red', alpha=0.5, edgecolor='brown')
    ax1.set_xlabel('Net profit/loss in %', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of net profit/loss', fontsize=10)
    for i in range(len(arr[0])):
        ax1.text(arr[1][i], arr[0][i], str(arr[0][i]))
    max_profit = max(profit_loss_per)
    max_loss = abs(min(profit_loss_per))
    text = ('Maximum profit = ' + str(max_profit) + '%'
            + '\nMaximum loss = ' + str(max_loss) + '%' + '\nN=' + str(N))
    ax1.text(-100, 40, text, fontsize=12)
    fig8.savefig(os.path.join(con.additionalFiles, 'ProfitLoss.png'))

    # Percentage of incoming transactions from within the set
    fig9 = plt.figure(9)
    summary_Df['i_tran_within_percentage'] = summary_Df['i_tran_within_percentage'].fillna(0)
    i_tran_within = summary_Df.loc[:, 'i_tran_within_percentage']
    bins = 11
    ax1 = fig9.add_subplot(111)
    arr = ax1.hist(i_tran_within, bins, facecolor='red', alpha=0.5, edgecolor='brown')
    ax1.set_xlabel('Percentage of incoming transactions from within set', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of percentage of incoming transactions from within set', fontsize=10)
    for i in range(len(arr[0])):
        ax1.text(arr[1][i], arr[0][i], str(arr[0][i]))
    max_per = max(i_tran_within)
    min_per = min(i_tran_within)
    text = ('Maximum = ' + str(max_per) + '%' + '\nMinimum = ' + str(min_per) + '%'
            + '\nN=' + str(N))
    ax1.text(0, 40, text, fontsize=12)
    fig9.savefig(os.path.join(con.additionalFiles, 'PercentageTranWithin.png'))

    # Percentage of outgoing transactions to within the set
    fig10 = plt.figure(10)
    summary_Df['o_tran_within_percentage'] = summary_Df['o_tran_within_percentage'].fillna(0)
    o_tran_within = summary_Df.loc[:, 'o_tran_within_percentage']
    bins = [0, 10, 20, 30, 40, 50, 70, 80, 90, 100]
    ax1 = fig10.add_subplot(111)
    arr = ax1.hist(o_tran_within, bins, facecolor='red', alpha=0.5, edgecolor='brown')
    ax1.set_xlabel('Percentage of outgoing transactions to within set', fontsize=8)
    ax1.set_ylabel('Number of companies', fontsize=8)
    ax1.set_title('Distribution of percentage of outgoing transactions to companies within set', fontsize=10)
    for i in range(len(arr[0])):
        ax1.text(arr[1][i], arr[0][i], str(arr[0][i]))
    max_per = max(o_tran_within)
    min_per = min(o_tran_within)
    text = ('Maximum = ' + str(max_per) + '%' + '\nMinimum = ' + str(min_per) + '%'
            + '\nN=' + str(N))
    ax1.text(0, 40, text, fontsize=12)
    fig10.savefig(os.path.join(con.additionalFiles, 'PercentageTranWithinOutgoing.png'))
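# A small self-contained illustration of the KL-divergence check used in
# sanityCheck() above. The distributions are made-up example values, not
# project data:
def klCheckDemo():  # hypothetical demo, not part of the original pipeline
    expected = [0.25, 0.25, 0.25, 0.25]   # configured distribution
    observed = [0.24, 0.26, 0.25, 0.25]   # generated distribution
    kld = entropy(expected, observed)     # scipy.stats.entropy(p, q) = KL(p || q)
    status = 'P' if abs(kld * 100) < 5.0 else 'F'
    print('Status', status, '; KL divergence =', kld)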