예제 #1
0
    def setUp(self,):
        """Build the four elution fixtures ``elution`` .. ``elution4``.

        Each fixture is an ``eff.Elut`` instance whose ``df`` attribute is a
        float DataFrame indexed by ``ids`` with the columns ``frac1`` ..
        ``frac4``.
        """
        fraction_names = ['frac1', 'frac2', 'frac3', 'frac4']

        def make_elution(ids, fraction_values):
            # ``fraction_values`` holds one list per fraction column (in
            # ``fraction_names`` order), each with one value per id.
            #
            # The ids are supplied as the index instead of as a regular
            # column followed by ``set_index('ids')``: with ``dtype=float``
            # a string column cannot be cast, which recent pandas releases
            # reject instead of silently keeping it as object.
            data = dict(zip(fraction_names, fraction_values))
            df = pd.DataFrame(data,
                              index=pd.Index(ids, name='ids'),
                              columns=fraction_names,
                              dtype=float)
            elution = eff.Elut()
            elution.df = df
            return elution

        self.elution = make_elution(
            ['a', 'b', 'c', 'd', 'e'],
            [[1.0, 2.0, 0.0, 0.0, 0.0],
             [0.0, 0.0, 3.0, 4.0, 0.0],
             [2.0, 1.0, 0.0, 0.0, 7.0],
             [0.0, 0.0, 6.0, 8.0, 10.0]])

        self.elution2 = make_elution(
            ['a', 'b', 'c', 'd'],
            [[1.0, 1.0, 0.0, 2.0],
             [0.0, 0.0, 1.0, 4.0],
             [2.0, 1.0, 0.0, 0.0],
             [0.0, 0.0, 6.0, 10.0]])

        self.elution3 = make_elution(
            ['a', 'b', 'c', 'd'],
            [[1.0, 1.0, 0.0, 1.0],
             [0.0, 0.0, 0.0, 0.0],
             [0.0, 0.0, 1.0, 0.0],
             [0.0, 0.0, 0.0, 0.0]])

        # Same shape as elution2 but with disjoint ids and larger counts.
        self.elution4 = make_elution(
            ['e', 'f', 'g', 'h'],
            [[10.0, 10.0, 0.0, 20.0],
             [0.0, 0.0, 10.0, 40.0],
             [20.0, 10.0, 0.0, 0.0],
             [0.0, 0.0, 60.0, 10.0]])
예제 #2
0
def _add_named_feature(feature_df, feature_series, name):
    """Label ``feature_series`` as ``name`` and join it onto ``feature_df``."""
    feature_series.name = name
    return join_feature(feature_df, feature_series)


def main():
    """Command-line entry point: compute difference features for two elutions.

    Loads every file given with ``--elution_files``, thresholds it, and
    computes the features requested with ``--features`` from the first two
    elutions. Entries whose id contains ``--contaminate_tag`` are dropped
    before the zscore / FDR features are calculated. The resulting table is
    sorted (descending) by the first requested feature that is present as a
    column and is written to ``--output_file`` as CSV, or printed to stdout
    when no output file is given.
    """
    parser = argparse.ArgumentParser(
        description=
        "Calculate difference features between two fractionation experiments")
    parser.add_argument("--elution_files",
                        action="store",
                        nargs='+',
                        dest="elution_files",
                        required=True,
                        help="Elution files (.elut)")
    parser.add_argument(
        "--features",
        action="store",
        nargs='+',
        dest="features",
        required=False,
        default=['diffrac'],
        help=
        "Features to calculate: diffrac (L1-norm of difference) diffrac_percent diffrac_normalized pearsonr poisson mean_abundance emd zscore sliding_zscore fdr_correct sliding_fdr_correct"
    )
    parser.add_argument(
        "--annotated_list",
        action="store",
        dest="annotated_list",
        required=False,
        default=None,
        help=
        "Filename of annotated ids, used for calculating zscores from compliment of list, default=None"
    )
    parser.add_argument("--contaminate_tag",
                        action="store",
                        dest="contaminate_tag",
                        required=False,
                        default='CONTAMINANT',
                        help="Filters entries with tag, default=CONTAMINANT")
    parser.add_argument(
        "--use_gmm",
        action="store_true",
        dest="use_gmm",
        required=False,
        default=False,
        help=
        "Fit sliding window distributions to Gaussian Mixture Model and use largest gaussian for calculating zscore, default=False"
    )
    parser.add_argument(
        "--log_transform",
        action="store_true",
        dest="log_transform",
        required=False,
        default=False,
        help=
        "Use the log transform of the diffrac score to calculate sliding zscore, default=False"
    )
    parser.add_argument(
        "--window_size",
        action="store",
        type=int,
        dest="window_size",
        required=False,
        default=100,
        help="Window size to use for calculating sliding zscore, default=100")
    parser.add_argument(
        "--output_file",
        action="store",
        dest="out_filename",
        required=False,
        default=None,
        help="Filename of output file, default=None which prints to stdout")

    args = parser.parse_args()

    elutions = []
    for efile in args.elution_files:
        elut = eff.Elut()
        elut.load(efile, format='tsv')
        elut.threshold(thresh=1)
        elutions.append(elut)

    # All features compare the first two elutions; with fewer than two there
    # is nothing to compute, so say so instead of exiting silently.
    if len(elutions) < 2:
        print("WARNING: at least two elution files are required, nothing calculated")
        return

    first, second = elutions[0], elutions[1]
    feature_df = pd.DataFrame()

    if 'diffrac' in args.features:
        feature_df = _add_named_feature(
            feature_df,
            calc_diffrac(first, second, normalize_totalCounts=False),
            'diffrac')
    if 'diffrac_percent' in args.features:
        feature_df = _add_named_feature(
            feature_df,
            calc_diffrac(first, second, percent_totalCounts=True),
            'diffrac_percent')
    if 'diffrac_normalized' in args.features:
        feature_df = _add_named_feature(
            feature_df,
            calc_diffrac(first, second, normalize_totalCounts=True),
            'diffrac_normalized')
    if 'emd' in args.features:
        feature_df = _add_named_feature(feature_df,
                                        calc_emd(first, second),
                                        'emd')
    if 'pearsonr' in args.features:
        feature_df = _add_named_feature(
            feature_df,
            calc_correlation(
                first,
                second,
                correlation_func=lambda x, y: stats.pearsonr(x, y)[0]),
            'pearsonr')
    if 'poisson' in args.features:
        # TODO: poisson feature is accepted on the command line but has no
        # implementation yet.
        print("WARNING: poisson not implemented")
    if 'mean_abundance' in args.features:
        feature_df = _add_named_feature(feature_df,
                                        calc_mean_abundance(first, second),
                                        'mean_abundance')

    if args.annotated_list is not None:
        #kdrew: add in training labels
        annotated_df = pd.read_table(args.annotated_list,
                                     header=None,
                                     names=['annotated'])
        # Build the lookup once; testing membership against the raw array
        # would rescan every annotated id for every feature row.
        annotated_ids = set(annotated_df['annotated'].values)
        feature_df['annotated'] = [
            i in annotated_ids for i in feature_df.index
        ]

    print(len(feature_df))
    try:
        # Use the user-supplied tag (previously parsed but ignored) and match
        # it literally; na=False keeps the mask boolean for missing ids.
        is_contaminant = feature_df.index.str.contains(args.contaminate_tag,
                                                       regex=False,
                                                       na=False)
    except AttributeError:
        # Raised when the index has no string accessor (e.g. an empty or
        # non-string index), in which case nothing can be filtered.
        print("No contaminants")
    else:
        feature_df = feature_df[~is_contaminant]

    print(len(feature_df))

    if 'zscore' in args.features:
        if 'diffrac_normalized' not in args.features:
            #kdrew: calculating diffrac_normalized
            # The guard and the note above both refer to the normalized score,
            # so compute that one (the previous code computed the
            # un-normalized 'diffrac' here).
            # NOTE(review): presumably calc_zscore reads 'diffrac_normalized'
            # - confirm against its definition.
            feature_df = _add_named_feature(
                feature_df,
                calc_diffrac(first, second, normalize_totalCounts=True),
                'diffrac_normalized')
        feature_df = _add_named_feature(feature_df,
                                        calc_zscore(feature_df),
                                        'zscore')

    if 'sliding_zscore' in args.features:
        feature_df = _add_named_feature(
            feature_df,
            calc_sliding_zscore(feature_df,
                                window=args.window_size,
                                use_gmm=args.use_gmm,
                                log_transform=args.log_transform),
            'sliding_zscore')

    if 'fdr_correct' in args.features:
        fdr_df = calc_fdr_correct(feature_df)
        feature_df = join_feature(feature_df, fdr_df)

    if 'sliding_fdr_correct' in args.features:
        sliding_fdr_df = calc_sliding_fdr_correct(feature_df)
        feature_df = join_feature(feature_df, sliding_fdr_df)

    # Sort by the first requested feature that actually produced a column;
    # features such as 'poisson' add none, and sorting by a missing column
    # would raise KeyError after all the work above is done.
    sort_column = next(
        (name for name in args.features if name in feature_df.columns), None)
    if sort_column is not None:
        feature_df = feature_df.sort_values(sort_column, ascending=False)

    if args.out_filename is not None:
        feature_df.to_csv(args.out_filename)
    else:
        print(feature_df)