Python customize_strains Exemples, cluster_dataset_wrappers.customize_strains Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : counts_to_zscores.py Projet : RussianImperialScott/BEAN-counter

def generate_scatterplots(config_params, outfolder, mean_control_profile, raw_dataset, lowess_dataset, deviation_dataset, scaled_dev_dataset):
    """Draw the per-condition scatterplots for every column of the datasets.

    For each column of the data matrices, the raw (log-transformed), lowess,
    deviation and scaled-deviation profiles are extracted and handed to
    make_individual_plots() and make_2x2_plot(), together with the strain
    labels selected by the configured labelling scheme.

    Parameters:
        config_params: mapping of configuration strings. Keys read here:
            'scatter_label_scheme' ('0' = no labels, '1' = label strains
            whose absolute scaled deviation exceeds the cutoff, '2' = label
            strains whose rank by absolute scaled deviation is within the
            cutoff), 'scatter_label_cutoff', 'scatter_condition_columns'
            and 'scatter_strain_columns'.
        outfolder: directory under which 'scatterplots' is created.
        mean_control_profile: passed through unchanged to the plot helpers.
        raw_dataset, lowess_dataset, deviation_dataset, scaled_dev_dataset:
            indexable datasets; element [2] is the 2D data matrix, and for
            raw_dataset elements [0] and [1] are the strain and condition
            identifiers.

    Raises:
        AssertionError: if the labelling configuration is invalid.
    """

    scatter_outfolder = os.path.join(outfolder, 'scatterplots')
    if not os.path.isdir(scatter_outfolder):
        os.makedirs(scatter_outfolder)

    # Validate the labelling options up front. AssertionError is raised
    # explicitly (rather than via `assert`) so the checks survive `python -O`.
    label_scheme = config_params['scatter_label_scheme']
    if label_scheme != '0':
        if not label_scheme.isdigit():
            raise AssertionError('"scatter_label_scheme" must be an integer!')
        try:
            float(config_params['scatter_label_cutoff'])
        except ValueError:
            raise AssertionError("'scatter_label_cutoff' must be a float!")

    # Load in sample table
    sample_table = get_sample_table(config_params)

    # Load in barcode table
    barcode_table = get_barcode_table(config_params)

    # Get formatted condition names
    condition_ids = raw_dataset[1]
    if config_params['scatter_condition_columns'] == '':
        condition_fmt_string = 'screen_name,expt_id'
    else:
        condition_fmt_string = config_params['scatter_condition_columns']
    condition_ids_final = customize_conditions(condition_ids, sample_table, condition_fmt_string)

    # Prepare the strain labels according to the labelling scheme.
    blank_labels = None
    strain_ids_custom = None
    label_cutoff = None
    if label_scheme == '0':
        # No labelling requested: every strain gets an empty label. Without
        # this, the plotting calls below would hit an undefined name.
        # NOTE(review): assumes the plot helpers accept an array of empty
        # strings, like the arrays produced for schemes 1 and 2 - confirm.
        blank_labels = np.array([''] * raw_dataset[2].shape[0], dtype=str)
    elif label_scheme in ('1', '2'):
        strain_ids = raw_dataset[0]
        strain_fmt_string = config_params['scatter_strain_columns']
        if strain_fmt_string == '':
            raise AssertionError('scatter_strain_columns must be specified if scatter_label_scheme is not "0"')
        strain_ids_custom = customize_strains(strain_ids, barcode_table, strain_fmt_string)
        # Loop-invariant, so convert once instead of once per column.
        label_cutoff = float(config_params['scatter_label_cutoff'])
    else:
        raise AssertionError('"scatter_label_scheme" must be one of [0, 1, 2]!')

    # Verbosity does not change between columns, so look it up only once.
    verbose = get_verbosity(config_params) >= 2

    # Loop over the columns of the datasets and make plots!
    for i in range(raw_dataset[2].shape[1]):

        # Extract the profiles to work with
        raw_prof = np.log(raw_dataset[2][:, i].squeeze())
        lowess_prof = lowess_dataset[2][:, i].squeeze()
        deviation_prof = deviation_dataset[2][:, i].squeeze()
        scaled_dev_prof = scaled_dev_dataset[2][:, i].squeeze()

        if verbose:
            print(raw_prof)
            print(lowess_prof)
            print(deviation_prof)
            print(scaled_dev_prof)

        # If labeling is to occur, get which strains to label from the final profile
        if label_scheme == '0':
            strain_ids_final = blank_labels
        else:
            abs_scaled_dev = np.abs(scaled_dev_prof)
            if label_scheme == '1':
                # Scheme 1: threshold on the absolute scaled deviation.
                strains_to_label_inds = abs_scaled_dev > label_cutoff
            else:
                # Scheme 2: threshold on the rank (1 = most extreme).
                strains_to_label_inds = rankdata(-abs_scaled_dev, method = 'min') <= label_cutoff
            if verbose:
                print(np.sum(strains_to_label_inds))
            # Work on a copy so blanking labels for this column does not
            # affect the labels used for the next one.
            strain_ids_final = strain_ids_custom.copy()
            strain_ids_final[np.invert(strains_to_label_inds)] = ''
            if verbose:
                print(strain_ids_final[strains_to_label_inds])

        # Functions to plot both individual versions of the data processing
        # plots and a combined version!
        make_individual_plots(mean_control_profile, raw_prof, lowess_prof, deviation_prof, scaled_dev_prof,
                              strain_ids_final, condition_ids_final[i], config_params, scatter_outfolder)

        make_2x2_plot(mean_control_profile, raw_prof, lowess_prof, deviation_prof, scaled_dev_prof,
                      strain_ids_final, condition_ids_final[i], config_params, scatter_outfolder)

Exemple #2

0

Afficher le fichier

Fichier : counts_to_zscores.py Projet : monprin/BEAN-counter

def generate_scatterplots(config_params, outfolder, mean_control_profile, raw_dataset, lowess_dataset, deviation_dataset, scaled_dev_dataset):
    """Save one 2x2 scatterplot PDF per condition (column) of the datasets.

    Each figure plots mean_control_profile against four profiles taken from
    the same column: log raw counts, lowess-normalized counts, deviations and
    scaled deviations. Files are written to '<outfolder>/scatterplots', named
    after the formatted condition identifier.

    The strain-label selection for schemes '1' and '2' is still computed, but
    the labels are not drawn on the figure.
    """

    plot_dir = os.path.join(outfolder, 'scatterplots')
    if not os.path.isdir(plot_dir):
        os.makedirs(plot_dir)

    scheme = config_params['scatter_label_scheme']

    # Sanity-check the labelling options whenever labelling is requested.
    if scheme != '0':
        assert scheme.isdigit(), '"scatter_label_scheme" must be an integer!'
        try:
            float(config_params['scatter_label_cutoff'])
        except ValueError:
            assert False, "'scatter_label_cutoff' must be a float!"

    # Tables used to build the human-readable identifiers.
    sample_table = get_sample_table(config_params)
    barcode_table = get_barcode_table(config_params)

    # Formatted condition names; fall back to the default columns when the
    # configuration leaves the option empty.
    condition_columns = config_params['scatter_condition_columns']
    if condition_columns == '':
        condition_columns = 'screen_name,expt_id'
    condition_ids_final = customize_conditions(raw_dataset[1], sample_table, condition_columns)

    # Formatted strain names are only needed when a labelling scheme is on.
    if scheme in ('1', '2'):
        strain_columns = config_params['scatter_strain_columns']
        assert strain_columns != '', 'scatter_strain_columns must be specified if scatter_label_scheme is not "0"'
        strain_ids_custom = customize_strains(raw_dataset[0], barcode_table, strain_columns)
    elif scheme != '0':
        assert False, '"scatter_label_scheme" must be one of [0, 1, 2]!'

    n_conditions = raw_dataset[2].shape[1]
    for col in range(n_conditions):

        # Pull out this condition's profile from each processing stage.
        raw_prof = np.log(raw_dataset[2][:, col].squeeze())
        lowess_prof = lowess_dataset[2][:, col].squeeze()
        deviation_prof = deviation_dataset[2][:, col].squeeze()
        scaled_dev_prof = scaled_dev_dataset[2][:, col].squeeze()

        if get_verbosity(config_params) >= 2:
            for prof in (raw_prof, lowess_prof, deviation_prof, scaled_dev_prof):
                print(prof)

        # Decide which strains would be labelled, based on the final profile.
        if scheme == '1':
            label_mask = np.abs(scaled_dev_prof) > float(config_params['scatter_label_cutoff'])
        elif scheme == '2':
            ranks = rankdata(-np.abs(scaled_dev_prof), method = 'min')
            label_mask = ranks <= float(config_params['scatter_label_cutoff'])
        if scheme in ('1', '2'):
            # Currently unused by the plotting code below.
            strain_ids_final = strain_ids_custom.copy()
            strain_ids_final[np.invert(label_mask)] = ''

        # One figure with four panels sharing the x axis.
        fig, axes = plt.subplots(2, 2, sharex = True)
        panels = (
            (axes[0, 0], raw_prof, 'Read counts'),
            (axes[0, 1], lowess_prof, 'Lowess-normalized read counts'),
            (axes[1, 0], deviation_prof, 'Deviation from normalized counts'),
            (axes[1, 1], scaled_dev_prof, 'z-score'),
        )

        # Draw scatterplots
        for panel, profile, _ in panels:
            panel.scatter(mean_control_profile, profile)

        # Add individual y labels
        for panel, _, y_label in panels:
            panel.set_ylabel(y_label)

        pdf_path = os.path.join(plot_dir, '{}.pdf'.format(condition_ids_final[col]))
        plt.savefig(pdf_path, bbox_inches = 'tight')
        plt.close()

Exemple #3

0

Afficher le fichier

Fichier : dataset_to_text.py Projet : monprin/BEAN-counter

def main(dataset, dataset_file, table, val_name, strain_table_f, strain_columns, sample_table_f, condition_columns, verbosity):
    """Write a (strains, conditions, matrix) dataset out as text files.

    The output folder is the absolute path of dataset_file with a trailing
    '.dump.gz' removed; it is created if it does not exist.

    Parameters:
        dataset: 3-tuple (strains, conditions, matrix). Each strain and each
            condition is a 2-item sequence of strings; matrix is a 2D array
            with one row per strain and one column per condition.
        dataset_file: path of the dataset dump the data came from.
        table: if false, write 'strains.txt', 'conditions.txt' and
            'matrix.txt.gz'; if true, write one long-format
            'data_table.txt.gz' with one row per matrix cell.
        val_name: name of the value column in table mode ('value' if None).
        strain_table_f: optional strain table file; when given it is read and
            customize_strains() is called with strain_columns.
        strain_columns: column specification passed to customize_strains().
        sample_table_f, condition_columns, verbosity: accepted but not used
            by this function.
    """

    strains, conditions, matrix = dataset

    # Strip only a trailing '.dump.gz'. A plain str.replace() would also
    # rewrite any parent directory whose name happens to contain the suffix.
    full_fname = os.path.abspath(dataset_file)
    dump_suffix = '.dump.gz'
    if full_fname.endswith(dump_suffix):
        out_folder = full_fname[:-len(dump_suffix)]
    else:
        out_folder = full_fname
    if not os.path.isdir(out_folder):
        os.makedirs(out_folder)

    # Here is where the script determines what modifications to make to the row/columns labels
    if strain_table_f is not None:
        strain_table = read_sample_table(strain_table_f)
        # NOTE(review): the customized names are computed but not yet written
        # to any output below - confirm whether they should replace `strains`.
        custom_strains = clus_wrap.customize_strains(strains, strain_table, strain_columns)

    # If the table boolean is not True, write out three files: matrix, rownames, and colnames
    if not table:
        strain_fname = os.path.join(out_folder, 'strains.txt')
        cond_fname = os.path.join(out_folder, 'conditions.txt')
        mat_fname = os.path.join(out_folder, 'matrix.txt.gz')

        # Write out the strains!
        with open(strain_fname, 'wt') as strain_f:
            strain_f.write('Strain_ID\tBarcode\n')
            for strain in strains:
                strain_f.write('\t'.join(strain) + '\n')

        # Write out the conditions!
        with open(cond_fname, 'wt') as cond_f:
            cond_f.write('screen_name\texpt_id\n')
            for cond in conditions:
                cond_f.write('\t'.join(cond) + '\n')

        # Write out the matrix!
        with gzip.open(mat_fname, 'wb') as mat_f:
            np.savetxt(mat_f, matrix, delimiter = '\t')

    # If the table boolean is True, then reshape into a table and write everything out as one table
    else:
        if val_name is None:
            val_name = 'value'

        # Enumerate the matrix cells in row-major order; every output column
        # is built from this same sequence so the rows stay aligned.
        cells = [(i, j) for i in range(matrix.shape[0]) for j in range(matrix.shape[1])]
        column_order = ['Strain_ID', 'Barcode', 'screen_name', 'expt_id', val_name]

        # Create the data frame! (Kept in its own name so the `table` flag
        # is not overwritten.)
        data_table = pd.DataFrame({
            'Strain_ID': [strains[i][0] for i, _ in cells],
            'Barcode': [strains[i][1] for i, _ in cells],
            'screen_name': [conditions[j][0] for _, j in cells],
            'expt_id': [conditions[j][1] for _, j in cells],
            val_name: [matrix[i, j] for i, j in cells],
        })

        # Reorder the data frame columns!
        data_table = data_table.reindex(columns = column_order)

        # Write the table out to file!
        tab_fname = os.path.join(out_folder, 'data_table.txt.gz')
        with gzip.open(tab_fname, 'wb') as tab_f:
            data_table.to_csv(tab_f, sep = '\t', header = True, index = False)