def generate_scatterplots(config_params, outfolder, mean_control_profile, raw_dataset, lowess_dataset, deviation_dataset, scaled_dev_dataset):
    """Draw per-condition scatterplots for every stage of the data processing.

    One set of plots is produced per column of ``raw_dataset[2]``; the actual
    drawing is delegated to ``make_individual_plots`` and ``make_2x2_plot``.
    Output goes to ``<outfolder>/scatterplots``, which is created if missing.

    Parameters:
        config_params: mapping of configuration strings. Keys read here:
            'scatter_label_scheme' ('0', '1' or '2'), 'scatter_label_cutoff',
            'scatter_condition_columns' and 'scatter_strain_columns'.
        outfolder: parent directory for the 'scatterplots' folder.
        mean_control_profile: x-axis values, passed through to the plotters.
        raw_dataset, lowess_dataset, deviation_dataset, scaled_dev_dataset:
            indexable datasets. Element [0] holds the strain ids, [1] the
            condition ids and [2] a 2-D matrix with one column per condition
            (only raw_dataset's [0] and [1] are read).

    Labeling schemes:
        '0': no strain is labeled (every label is the empty string).
        '1': label strains whose absolute scaled deviation exceeds the cutoff.
        '2': label strains whose rank by descending absolute scaled deviation
             (ties share the smallest rank) is at most the cutoff.

    Raises:
        AssertionError: if the labeling configuration is invalid. The checks
            are raised explicitly so they still run under ``python -O``.
    """
    scatter_outfolder = os.path.join(outfolder, 'scatterplots')
    if not os.path.isdir(scatter_outfolder):
        os.makedirs(scatter_outfolder)

    # Validate the labeling configuration up front, before any table is read.
    label_scheme = config_params['scatter_label_scheme']
    label_cutoff = None
    if label_scheme != '0':
        if not label_scheme.isdigit():
            raise AssertionError('"scatter_label_scheme" must be an integer!')
        try:
            label_cutoff = float(config_params['scatter_label_cutoff'])
        except ValueError:
            raise AssertionError("'scatter_label_cutoff' must be a float!")
        if label_scheme not in ('1', '2'):
            raise AssertionError('"scatter_label_scheme" must be one of [0, 1, 2]!')

    # Load in sample table
    sample_table = get_sample_table(config_params)

    # Load in barcode table
    barcode_table = get_barcode_table(config_params)

    # Get formatted condition names
    condition_ids = raw_dataset[1]
    if config_params['scatter_condition_columns'] == '':
        condition_fmt_string = 'screen_name,expt_id'
    else:
        condition_fmt_string = config_params['scatter_condition_columns']
    condition_ids_final = customize_conditions(condition_ids, sample_table, condition_fmt_string)

    # If labeling was requested, get the formatted strain names
    strain_ids_custom = None
    if label_scheme != '0':
        strain_ids = raw_dataset[0]
        strain_fmt_string = config_params['scatter_strain_columns']
        if strain_fmt_string == '':
            raise AssertionError('scatter_strain_columns must be specified if scatter_label_scheme is not "0"')
        strain_ids_custom = customize_strains(strain_ids, barcode_table, strain_fmt_string)

    # Neither value changes between columns, so look them up once.
    verbose = get_verbosity(config_params) >= 2
    n_strains = raw_dataset[0].shape[0]

    # Loop over the columns of the datasets and make plots!
    for i in range(raw_dataset[2].shape[1]):

        # Extract the profiles to work with (raw counts go to log scale)
        raw_prof = np.log(raw_dataset[2][:, i].squeeze())
        lowess_prof = lowess_dataset[2][:, i].squeeze()
        deviation_prof = deviation_dataset[2][:, i].squeeze()
        scaled_dev_prof = scaled_dev_dataset[2][:, i].squeeze()

        if verbose:
            print(raw_prof)
            print(lowess_prof)
            print(deviation_prof)
            print(scaled_dev_prof)

        if label_scheme == '0':
            # Scheme 0 means "no labels". Previously nothing was assigned
            # here and the plotting calls below failed with a NameError.
            # NOTE(review): an object array of '' mirrors the array-like
            # labels of schemes 1/2 -- confirm the plotters accept it.
            strain_ids_final = np.array([''] * n_strains, dtype=object)
        else:
            # Decide which strains to label from the final (scaled) profile
            abs_scaled = np.abs(scaled_dev_prof)
            if label_scheme == '1':
                strains_to_label_inds = abs_scaled > label_cutoff
            else:
                strains_to_label_inds = rankdata(-abs_scaled, method='min') <= label_cutoff
            if verbose:
                print(np.sum(strains_to_label_inds))
            # Work on a copy so blanking labels never touches the master list
            strain_ids_final = strain_ids_custom.copy()
            strain_ids_final[np.invert(strains_to_label_inds)] = ''
            if verbose:
                print(strain_ids_final[strains_to_label_inds])

        # Functions to plot both individual versions of the data processing
        # plots and a combined version!
        make_individual_plots(mean_control_profile, raw_prof, lowess_prof, deviation_prof, scaled_dev_prof, strain_ids_final, condition_ids_final[i], config_params, scatter_outfolder)
        make_2x2_plot(mean_control_profile, raw_prof, lowess_prof, deviation_prof, scaled_dev_prof, strain_ids_final, condition_ids_final[i], config_params, scatter_outfolder)
def generate_scatterplots(config_params, outfolder, mean_control_profile, raw_dataset, lowess_dataset, deviation_dataset, scaled_dev_dataset):
    """Save one 2x2 scatterplot PDF per condition.

    For every column of ``raw_dataset[2]`` a four-panel figure is drawn with
    ``mean_control_profile`` on the x axis and, on the y axes, the log of the
    raw values, the lowess values, the deviation values and the scaled
    deviation values. Each figure is written to
    ``<outfolder>/scatterplots/<condition name>.pdf``.

    The strain-label configuration ('scatter_label_scheme',
    'scatter_label_cutoff', 'scatter_strain_columns') is validated and the
    per-condition labels are computed, but this function does not draw them.
    """
    scatter_outfolder = os.path.join(outfolder, 'scatterplots')
    if not os.path.isdir(scatter_outfolder):
        os.makedirs(scatter_outfolder)

    scheme = config_params['scatter_label_scheme']

    # The cutoff only matters when some labeling scheme is switched on
    if scheme != '0':
        assert scheme.isdigit(), '"scatter_label_scheme" must be an integer!'
        try:
            float(config_params['scatter_label_cutoff'])
        except ValueError:
            assert False, "'scatter_label_cutoff' must be a float!"

    # Sample and barcode annotation tables
    sample_table = get_sample_table(config_params)
    barcode_table = get_barcode_table(config_params)

    # Condition names, formatted from the requested (or default) columns
    requested_columns = config_params['scatter_condition_columns']
    condition_fmt_string = 'screen_name,expt_id' if requested_columns == '' else requested_columns
    condition_ids_final = customize_conditions(raw_dataset[1], sample_table, condition_fmt_string)

    # Strain names are only formatted when a labeling scheme is active
    if scheme in ['1', '2']:
        strain_fmt_string = config_params['scatter_strain_columns']
        assert strain_fmt_string != '', 'scatter_strain_columns must be specified if scatter_label_scheme is not "0"'
        strain_ids_custom = customize_strains(raw_dataset[0], barcode_table, strain_fmt_string)
    elif scheme != '0':
        assert False, '"scatter_label_scheme" must be one of [0, 1, 2]!'

    # One figure per dataset column
    for i in range(raw_dataset[2].shape[1]):

        raw_prof = np.log(raw_dataset[2][:, i].squeeze())
        lowess_prof = lowess_dataset[2][:, i].squeeze()
        deviation_prof = deviation_dataset[2][:, i].squeeze()
        scaled_dev_prof = scaled_dev_dataset[2][:, i].squeeze()

        if get_verbosity(config_params) >= 2:
            for profile in (raw_prof, lowess_prof, deviation_prof, scaled_dev_prof):
                print(profile)

        # Pick the strains to label from the scaled profile and blank the rest
        if scheme in ('1', '2'):
            cutoff = float(config_params['scatter_label_cutoff'])
            if scheme == '1':
                keep = np.abs(scaled_dev_prof) > cutoff
            else:
                keep = rankdata(-np.abs(scaled_dev_prof), method='min') <= cutoff
            strain_ids_final = strain_ids_custom.copy()
            strain_ids_final[~keep] = ''

        # Lay out the four panels in row-major order
        fig, axes = plt.subplots(2, 2, sharex=True)
        panels = [
            (axes[0, 0], raw_prof, 'Read counts'),
            (axes[0, 1], lowess_prof, 'Lowess-normalized read counts'),
            (axes[1, 0], deviation_prof, 'Deviation from normalized counts'),
            (axes[1, 1], scaled_dev_prof, 'z-score'),
        ]
        for axis, profile, _ in panels:
            axis.scatter(mean_control_profile, profile)
        for axis, _, y_label in panels:
            axis.set_ylabel(y_label)

        pdf_name = '{}.pdf'.format(condition_ids_final[i])
        plt.savefig(os.path.join(scatter_outfolder, pdf_name), bbox_inches='tight')
        plt.close()
def main(dataset, dataset_file, table, val_name, strain_table_f, strain_columns, sample_table_f, condition_columns, verbosity):
    """Export a (strains, conditions, matrix) dataset as tab-delimited text.

    The output folder is ``dataset_file`` (made absolute) with its trailing
    '.dump.gz' removed; it is created if it does not exist.

    Parameters:
        dataset: 3-item sequence ``(strains, conditions, matrix)``. Each
            strain and each condition is a 2-item sequence of strings;
            ``matrix`` is 2-D with one row per strain, one column per
            condition.
        dataset_file: path of the dump file the dataset came from.
        table: if falsy, write 'strains.txt', 'conditions.txt' and
            'matrix.txt.gz'; if truthy, write a single long-format
            'data_table.txt.gz' with one row per (strain, condition) pair.
        val_name: name of the value column of the long table; ``None``
            means 'value'.
        strain_table_f, strain_columns: if ``strain_table_f`` is not None the
            strain table is read and customized strain names are built.
        sample_table_f, condition_columns, verbosity: accepted but not used
            by this function.
    """
    strains, conditions, matrix = dataset

    # Strip the suffix only from the END of the path. A plain str.replace
    # also rewrote directory components that contained '.dump.gz', sending
    # the output to the wrong folder.
    suffix = '.dump.gz'
    full_fname = os.path.abspath(dataset_file)
    if full_fname.endswith(suffix):
        out_folder = full_fname[:-len(suffix)]
    else:
        # Legacy behavior for names that do not end with the suffix.
        out_folder = full_fname.replace(suffix, '')
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    # Here is where the script determines what modifications to make to the
    # row/column labels.
    if strain_table_f is not None:
        strain_table = read_sample_table(strain_table_f)
        # NOTE(review): the customized names are computed but never written
        # to any output file; the call is kept so a bad table or column spec
        # still fails here. Confirm the intended use.
        custom_strains = clus_wrap.customize_strains(strains, strain_table, strain_columns)

    if not table:
        _write_matrix_files(out_folder, strains, conditions, matrix)
    else:
        _write_long_table(out_folder, strains, conditions, matrix, val_name)


def _write_matrix_files(out_folder, strains, conditions, matrix):
    """Write the row names, column names and matrix as three separate files."""
    strain_fname = os.path.join(out_folder, 'strains.txt')
    cond_fname = os.path.join(out_folder, 'conditions.txt')
    mat_fname = os.path.join(out_folder, 'matrix.txt.gz')

    # Write out the strains!
    with open(strain_fname, 'wt') as strain_f:
        strain_f.write('Strain_ID\tBarcode\n')
        for strain in strains:
            strain_f.write('\t'.join(strain) + '\n')

    # Write out the conditions!
    with open(cond_fname, 'wt') as cond_f:
        cond_f.write('screen_name\texpt_id\n')
        for cond in conditions:
            cond_f.write('\t'.join(cond) + '\n')

    # Write out the matrix!
    with gzip.open(mat_fname, 'wb') as mat_f:
        np.savetxt(mat_f, matrix, delimiter='\t')


def _write_long_table(out_folder, strains, conditions, matrix, val_name):
    """Write the matrix as one long table, one row per (strain, condition)."""
    if val_name is None:
        val_name = 'value'

    values = np.asarray(matrix)
    n_rows, n_cols = values.shape

    # Object arrays keep the labels exactly as given (no dtype coercion).
    strain_ids = np.array([strains[i][0] for i in range(n_rows)], dtype=object)
    barcodes = np.array([strains[i][1] for i in range(n_rows)], dtype=object)
    screen_names = np.array([conditions[j][0] for j in range(n_cols)], dtype=object)
    expt_ids = np.array([conditions[j][1] for j in range(n_cols)], dtype=object)

    # Row-major expansion: each strain is repeated once per condition and the
    # condition list is cycled once per strain, matching values.reshape(-1).
    long_table = pd.DataFrame({'Strain_ID': np.repeat(strain_ids, n_cols),
                               'Barcode': np.repeat(barcodes, n_cols),
                               'screen_name': np.tile(screen_names, n_rows),
                               'expt_id': np.tile(expt_ids, n_rows),
                               val_name: values.reshape(-1)})

    # Reorder the data frame columns!
    long_table = long_table.reindex(columns=['Strain_ID', 'Barcode', 'screen_name', 'expt_id', val_name])

    # Let pandas open and compress the file itself: handing it a binary gzip
    # handle fails on Python 3 with older pandas releases.
    tab_fname = os.path.join(out_folder, 'data_table.txt.gz')
    long_table.to_csv(tab_fname, sep='\t', header=True, index=False, compression='gzip')