Beispiel #1
0
def sort_results(rvcf_input_file_path, taxon_table_file_path, transform,
                 r_sqr_median_cutoff, stability_cutoff, snp_count, no_tables,
                 extra_columns):
    """Print the best SNPs (by ``rsq_median``) together with their selected taxa.

    The tab-separated file at ``rvcf_input_file_path`` is read and ordered by
    its ``rsq_median`` column, highest first.  For each of the first
    ``snp_count`` rows the per-taxon scores stored in that row are walked from
    highest to lowest until one falls below ``stability_cutoff``; every taxon
    visited before that point is printed and, unless ``no_tables`` is truthy,
    a tab-separated per-sample table is written to standard output.

    Parameters:
        rvcf_input_file_path: path of the tab-separated input table.  The code
            reads the columns ``rsq_median``, ``GENE``, ``ID``, ``CHROM``,
            ``REF``, ``ALT`` and one column per taxon-table index label.
        taxon_table_file_path: path handed to ``read_taxon_file``.
        transform: forwarded to ``read_taxon_file`` as ``transform=``.
        r_sqr_median_cutoff: threshold used only for the count that is
            reported up front; it does not filter the rows processed.
        stability_cutoff: taxa whose score is below this value are skipped
            (iteration stops at the first such taxon).
        snp_count: maximum number of rows to process.
        no_tables: when truthy, suppress the per-taxon table output.
        extra_columns: falsy, or a comma-separated string of additional input
            column names to append to each per-taxon table.

    Returns:
        None.  All results are printed.
    """
    print('plotting {} SNPs from {}'.format(snp_count, rvcf_input_file_path))

    # Load the input table and order it by rsq_median, best first.
    rvcf_df = pd.read_csv(rvcf_input_file_path, sep='\t')
    ranked_df = rvcf_df.sort_values(by='rsq_median', ascending=False)

    # Informational only: how many rows exceed the r-squared cutoff.
    above_cutoff_df = ranked_df[ranked_df['rsq_median'] > r_sqr_median_cutoff]
    print('{} SNPs with r_sqr > {:5.3f}'.format(len(above_cutoff_df), r_sqr_median_cutoff))

    taxon_table_df = read_taxon_file(taxon_table_file_path, transform=transform)

    for row_i in range(len(ranked_df)):
        if row_i >= snp_count:
            break

        # Double brackets keep this a one-row DataFrame rather than a Series.
        snp_df = ranked_df.iloc[[row_i]]
        aligned_snp_df, aligned_taxa_df = align_snp_and_taxa(
            snp_df,
            taxon_table_df
        )

        # The taxon table's index labels name the score columns of the SNP
        # row; transposing gives one row per taxon with a single score column.
        taxon_scores_df = snp_df.loc[:, taxon_table_df.index].transpose()
        score_column = taxon_scores_df.columns[0]
        sorted_taxon_scores_df = taxon_scores_df.sort_values(by=score_column, ascending=False)

        snp_row = snp_df.iloc[0]
        p_df_list = []
        print('{} {} {:5.3f}'.format(snp_row.GENE, snp_row.ID, snp_row.rsq_median))
        summary_line = '{}\t{}\t'.format(snp_row.GENE, snp_row.ID)

        for selected_taxon, score_row in sorted_taxon_scores_df.iterrows():
            selected_taxon_score = score_row.iloc[0]
            # Scores are in descending order, so the first one below the
            # cutoff means no later taxon can qualify.
            if selected_taxon_score < stability_cutoff:
                break

            # Drop a leading 'Root;' from the taxon name for display.
            taxon_name = selected_taxon[5:] if selected_taxon.startswith('Root;') else selected_taxon
            print('  {:5.3f} {}'.format(selected_taxon_score, taxon_name))
            summary_line += '{}, '.format(taxon_name)

            # Genotype strings indexed by variant allele count (0, 1, 2).
            gts = [
                snp_row.REF + snp_row.REF,
                snp_row.REF + snp_row.ALT,
                snp_row.ALT + snp_row.ALT,
            ]
            aligned_snp_value_list = aligned_snp_df.values.flatten().tolist()
            n_samples = aligned_snp_df.shape[1]
            data_dict = {
                'chromosome': [snp_row.CHROM] * n_samples,
                'snp_id': [snp_row.ID] * n_samples,
                'gene': [snp_row.GENE] * n_samples,
                'taxon': [selected_taxon] * n_samples,
                'abundance': aligned_taxa_df[selected_taxon].values.tolist(),
                'variant_allele_count': [str(int(v)) for v in aligned_snp_value_list],
                'genotype': [gts[int(v)] for v in aligned_snp_value_list],
                'sample_id': aligned_snp_df.columns,
            }
            columns_to_display = ['abundance', 'variant_allele_count', 'genotype', 'sample_id']
            if extra_columns:
                for extra_column in extra_columns.split(','):
                    # A scalar value is repeated for every row of the table.
                    data_dict[extra_column] = snp_row[extra_column]
                    columns_to_display.append(extra_column)

            p_df = pd.DataFrame(data_dict)
            p_df_list.append(p_df)
            if not no_tables:
                p_df[columns_to_display].to_csv(
                    sys.stdout,
                    sep='\t'
                )

        # Assemble the inputs for a stacked bar plot of all selected taxa.
        # NOTE(review): nothing below is consumed within this function as
        # shown; the plotting call itself is not present here.
        if p_df_list:
            file_name = 'stacked_bar_plot_selected_taxa_{}_{}.pdf'.format(
                snp_row.GENE,
                snp_row.ID
            )
            p_df = pd.concat(p_df_list, axis=0)
            # After concatenation the index repeats once per taxon
            # (0..n-1, 0..n-1, ...); replace it with one continuous range.
            p_df.index = range(len(p_df))
            stacked_bar_title = '{}\n{}'.format(snp_row.GENE, snp_row.ID)
Beispiel #2
0
def box_bar_lasso_lars_cv_C_stability_selection_features(
        rvcf_input_file_path, taxon_table_file_path, transform, plot_output_dir_path, stability_cutoff, snp_count):
    """Write box plots and a stacked bar plot for the taxa selected per SNP.

    The tab-separated file at ``rvcf_input_file_path`` is read and ordered by
    its ``rsq_median`` column, highest first.  For each of the first
    ``snp_count`` rows the per-taxon scores stored in that row are walked from
    highest to lowest until one falls below ``stability_cutoff``.  For every
    taxon visited before that point the box-plot function obtained from
    ``get_taxon_abundance_box_plot()`` is called with a PDF path inside
    ``plot_output_dir_path``.  If at least one taxon was selected for a SNP,
    the function obtained from ``get_taxon_abundance_stacked_bar_plot()`` is
    called once with the data of all selected taxa.

    WARNING: an existing ``plot_output_dir_path`` is removed recursively
    before it is recreated.

    Parameters:
        rvcf_input_file_path: path of the tab-separated input table.  The code
            reads the columns ``rsq_median``, ``GENE``, ``ID``, ``CHROM``,
            ``REF``, ``ALT`` and one column per taxon-table index label.
        taxon_table_file_path: path handed to ``read_taxon_file``.
        transform: forwarded to ``read_taxon_file`` as ``transform=``.
        plot_output_dir_path: directory that receives the PDF files; deleted
            first if it already exists.
        stability_cutoff: taxa whose score is below this value are skipped
            (iteration stops at the first such taxon).
        snp_count: maximum number of rows to process.

    Returns:
        None.
    """
    print('plotting {} SNPs from {}'.format(snp_count, rvcf_input_file_path))

    # Start from an empty output directory on every run.
    if os.path.exists(plot_output_dir_path):
        print('deleting old plots')
        shutil.rmtree(plot_output_dir_path)
    os.makedirs(plot_output_dir_path)

    # Read the rvcf file and sort by rsq_median, best first.
    df = pd.read_csv(rvcf_input_file_path, sep='\t')
    sorted_rsq_best_medians_df = df.sort_values(by='rsq_median', ascending=False)

    taxon_table_df = read_taxon_file(taxon_table_file_path, transform=transform)

    # These are proxies for R functions.
    taxon_abundance_box_plot = get_taxon_abundance_box_plot()
    taxon_abundance_stacked_bar_plot = get_taxon_abundance_stacked_bar_plot()

    for row_i in range(sorted_rsq_best_medians_df.shape[0]):
        if row_i >= snp_count:
            break

        # Double brackets keep this a one-row DataFrame rather than a Series.
        snp_df = sorted_rsq_best_medians_df.iloc[[row_i]]
        aligned_snp_df, aligned_taxa_df = align_snp_and_taxa(
            snp_df,
            taxon_table_df
        )

        # The taxon table's index labels name the score columns of the SNP
        # row; transposing gives one row per taxon with a single score column.
        taxon_scores_df = snp_df.loc[:, taxon_table_df.index].transpose()
        # DataFrame.sort() no longer exists in pandas (removed in 0.20);
        # sort_values() is the replacement and is what the rest of this
        # function already uses.
        sorted_taxon_scores_df = taxon_scores_df.sort_values(
            by=taxon_scores_df.columns[0],
            ascending=False
        )

        snp_row = snp_df.iloc[0]
        p_df_list = []
        selected_taxon_names = []
        for i, (selected_taxon, selected_taxon_row) in enumerate(sorted_taxon_scores_df.iterrows()):
            selected_taxon_score = selected_taxon_row.iloc[0]
            # Scores are in descending order, so the first one below the
            # cutoff means no later taxon can qualify.
            if selected_taxon_score < stability_cutoff:
                break

            # Trim 'Root;' from the front of the taxon name for display.
            if selected_taxon.startswith('Root;'):
                taxon_name = selected_taxon[5:]
            else:
                taxon_name = selected_taxon
            selected_taxon_names.append(taxon_name)

            # One box-plot PDF per (SNP, selected taxon) pair.
            r_pdf_file_path = os.path.join(
                plot_output_dir_path,
                'best_taxa_{}_{}_{}_boxplot_{}.pdf'.format(
                    row_i,
                    snp_row.GENE,
                    snp_row.ID,
                    i
                )
            )

            # Genotype strings indexed by variant allele count (0, 1, 2).
            gts = [
                snp_row.REF + snp_row.REF,  # 0
                snp_row.REF + snp_row.ALT,  # 1
                snp_row.ALT + snp_row.ALT   # 2
            ]
            aligned_snp_value_list = aligned_snp_df.values.flatten().tolist()
            sample_count = aligned_snp_df.shape[1]
            abundances = aligned_taxa_df[selected_taxon].values.tolist()
            allele_counts = [str(int(v)) for v in aligned_snp_value_list]
            genotypes = [gts[int(v)] for v in aligned_snp_value_list]

            # pandas copy of the data, kept for the stacked bar plot below.
            p_df = pd.DataFrame({
                'chromosome': [snp_row.CHROM] * sample_count,
                'snp_id': [snp_row.ID] * sample_count,
                'gene': [snp_row.GENE] * sample_count,
                'taxon': [selected_taxon] * sample_count,
                'abundance': abundances,
                'variant_allele_count': allele_counts,
                'gt': genotypes
            })
            p_df_list.append(p_df)

            # R data frame handed to the box-plot function.
            r_df = rpy2.robjects.vectors.DataFrame({
                'abundance': rpy2.robjects.FloatVector(abundances),
                'variant_allele_count': rpy2.robjects.StrVector(allele_counts),
                'genotype': rpy2.robjects.StrVector(genotypes)
            })
            print(taxon_name)
            print(r_df)
            taxon_abundance_box_plot(
                r_df,
                r_pdf_file_path,
                '{} (score: {:4.3f})'.format(snp_row.GENE, selected_taxon_score),
                '{} {}'.format(snp_row.GENE, snp_row.ID),
                selected_taxon
            )

        # Summary line: gene, SNP id, then the selected taxa separated by
        # ', '.  Joining the names (instead of slicing a trailing ', ' off)
        # keeps the SNP id intact when no taxon was selected.
        print('{}\t{}\t{}'.format(snp_row.GENE, snp_row.ID, ', '.join(selected_taxon_names)))

        # Save a stacked bar plot covering all selected taxa of this SNP.
        if p_df_list:
            file_name = 'stacked_bar_plot_selected_taxa_{}_{}.pdf'.format(
                snp_row.GENE,
                snp_row.ID
            )
            stacked_bar_plot_file_path = os.path.join(plot_output_dir_path, file_name)
            p_df = pd.concat(p_df_list, axis=0)
            # After concatenation the index repeats once per taxon
            # (0..n-1, 0..n-1, ...); replace it with one continuous range.
            p_df.index = range(p_df.shape[0])
            r_all_df = rpy2.robjects.vectors.DataFrame({
                'abundance': rpy2.robjects.FloatVector(p_df['abundance'].values.tolist()),
                'variant_allele_count': rpy2.robjects.StrVector([str(int(v)) for v in p_df['variant_allele_count'].values]),
                'taxon': rpy2.robjects.StrVector(p_df['taxon']),
                'gene': rpy2.robjects.StrVector(p_df['gene']),
                'genotype': rpy2.robjects.StrVector(p_df['gt'])
            })
            stacked_bar_title = '{}\n{}'.format(snp_row.GENE, snp_row.ID)
            taxon_abundance_stacked_bar_plot(
                r_all_df,
                stacked_bar_plot_file_path,
                stacked_bar_title,
                '{} {}'.format(snp_row.GENE, snp_row.ID),
                'median abundance'
            )