Ejemplo n.º 1
0
 def NCPHIPzscores_RLM(self):
     """Z-score every sample against negative controls using robust regression.

     1. Fit ``std ~ median`` on the negative-control table returned by
        ``self.select_NC()``; the ``pred_std`` column of the frame returned
        by ``RLM()`` is the per-row divisor (non-positive values become 1).
     2. For each sample fit ``phip ~ median``; the ``pred_phip`` column of
        the returned frame is the expected value for that sample.
     3. z = (observed - expected) / divisor, rounded to one decimal.

     Returns:
         DataFrame of z-scores with the index and columns of
         ``self.indf[self.par['sample_names']]``.  The same frame is
         exported through ``myDataframe.basic(...).export_df``.
     """
     ##1: regressed std of negative controls
     print('\tLinear regression of std~median of negative controls')
     NC_df = self.select_NC()
     # RLM() returns (data frame, fit); only the data frame is needed here
     NC_df, _ = myRegression.linear(NC_df, ('median', 'std'), self.par['dir_stat']).RLM()
     # work on a copy so the masked assignment never writes back into NC_df
     # (avoids chained assignment / SettingWithCopyWarning)
     divisor = NC_df['pred_std'].copy()
     divisor.loc[divisor <= 0] = 1

     ##2: regressed PHIP values against negative controls
     print('\tLinear regression of sample-specific medians of negative controls')
     sdf = self.indf[self.par['sample_names']].copy()
     med_df = pd.DataFrame(np.zeros(shape=sdf.shape), columns=list(sdf), index=list(sdf.index))
     zscores_df = pd.DataFrame(np.zeros(shape=sdf.shape), columns=list(sdf), index=list(sdf.index))
     for sample in self.par['sample_names']:
         sample_dir = self.par['dir_result'] + sample + '/'
         phip_df = pd.DataFrame({'median': NC_df['median'], 'phip': sdf[sample]})
         # fit robust linear model of this sample against the control medians
         phip_df, _ = myRegression.linear(phip_df, ('median', 'phip'), sample_dir).RLM()
         # expected value per row, then residual scaled by the regressed std
         med_df[sample] = phip_df['pred_phip']
         zscores_df[sample] = (sdf[sample] - med_df[sample]) / divisor

     ##3: round and export
     zscores_df = np.round(zscores_df, 1)
     myDataframe.basic(zscores_df).export_df(self.outfile, self.par['zscore_threshold'], self.index_label)
     return zscores_df
Ejemplo n.º 2
0
    def sample_zscores(self):
        """Column-wise z-scores of ``self.indf`` centred on the median.

        For every column the median and standard deviation are computed from
        the strictly positive entries only; each entry of the column is then
        transformed to ``(value - median) / std``.  When the standard
        deviation is not positive the plain difference ``value - median`` is
        used instead.  Results are rounded to one decimal, exported through
        ``myDataframe.basic(...).export_df`` and returned.
        """
        def median_zscores(x):
            # statistics are taken over positive values only
            positives = x[x > 0]
            center = np.median(positives)
            spread = np.std(positives)
            residuals = x - center
            # Return a Series (not a list) so DataFrame.apply keeps the
            # original row labels in the result.
            if spread > 0:
                return residuals / float(spread)
            return residuals

        #calculate z-scores
        zscores_df = self.indf.apply(median_zscores, axis=0)
        zscores_df = np.round(zscores_df, 1)
        #export
        myDataframe.basic(zscores_df).export_df(self.outfile, self.par['zscore_threshold'], self.index_label)
        return zscores_df
##########
#end
Ejemplo n.º 3
0
 def permute_taxon_blast(self, hits_num):
     """Permutation null model of taxon hit counts for ``hits_num`` peptides.

     The permuted counts are cached in ``<dir_out>/<hits_num>.txt``: when
     that file exists it is read back, otherwise the permutations are run
     and the table is written.  Each permutation column selects the
     permuted peptides from ``self.par['binary_aln_df']``, collapses them
     with ``unispecie`` and stores the per-taxon sums plus the similarity
     tag.

     Args:
         hits_num: number of peptides drawn per permutation (also the
             cache file name).

     Returns:
         Series with, per row of the counts table, the rounded mean of the
         floored permuted counts.
     """
     print('permutation of viral blast:{}\t{}'.format(self.par['type'], hits_num))
     counts_df = pd.DataFrame()
     outfile = '{}{}.txt'.format(myIO.dir_os(self.par['dir_out']).create_dir(), hits_num)
     if os.path.isfile(outfile):
         print('Read file: ', outfile)
         counts_df = pd.read_csv(outfile, header=0, index_col=0, sep="\t", low_memory=False)
     else:
         aln_df = self.par['binary_aln_df']
         #1: permutated peptides
         pep_names = list(aln_df.index)
         pep_df = myList.basic(pep_names).permute_list(self.par['permutation_times'], hits_num)
         #2: permutation based on the non-overlapped hits num
         for col, perm_pep in pep_df.items():
             # label-based row selection (.ix was removed in pandas 1.0)
             perm_zb = aln_df.loc[perm_pep]
             p_collapse_zb, p_sim_tag = myDataframe.basic(perm_zb).unispecie(self.par['sim_threshold'])
             counts_df[col] = p_collapse_zb.apply(sum, axis=0) + p_sim_tag
         #export
         counts_df.to_csv(outfile, sep='\t', header=True, index_label=self.par['type'])
     #combine permuted counts: mean of floored counts across permutations
     perm_mean = counts_df.apply(lambda x: np.mean(np.floor(x)), axis=1).round()
     return perm_mean
Ejemplo n.º 4
0
    def init_analysis(self):
        """Populate ``self.par`` with everything the analysis needs.

        Steps, in order:
          1. If ``file_annotation`` is configured, load the annotation table
             (``annot_df``) and the protein-to-peptide mapping
             (``dict_pro_pep``); when the annotation path contains
             ``'VirScan'`` also compute ``dependent_pep``.
          2. Check or build the bowtie index.
          3. Replace ``self.par`` with the result of ``export_sample_info()``
             and, when ``group1`` has an ``'NC'`` entry, split the samples
             into ``NC_samples`` and ``phip_samples``.
          4. Read the reference FASTA into ``self.par['ref_dict']``.
        """
        #1: read annotation file
        if 'file_annotation' in self.par:
            annot_file = self.par['file_annotation']
            self.par['annot_df'] = myDataframe.basic().annot_df(annot_file)
            #genome annotation: associations of protein-peptides
            self.par['dict_pro_pep'] = myCommon.basic(
                self.par).protein_peptides()
            #virus only: peptides that depend on each other (shared aa stretch)
            if 'VirScan' in annot_file:
                self.par['dependent_pep'] = myCommon.basic(
                    self.par).taxon_dependent_peptides()

        #2: check bowtie or build bowtie index
        myAlign.alignment(self.par).build_bowtie_index()

        #3: sample info
        self.par = myParallel.samples(self.par).export_sample_info()
        #samples of negative controls
        group1 = self.par['group1']
        if 'NC' in group1:
            nc_samples = group1['NC'].split(',')
            nc_lookup = set(nc_samples)
            self.par['NC_samples'] = nc_samples
            # Keep the order of sample_names (a plain set difference would
            # give an arbitrary order); dict.fromkeys drops duplicates.
            self.par['phip_samples'] = list(dict.fromkeys(
                s for s in self.par['sample_names'] if s not in nc_lookup))
            print('\nNumber of negative Controls (Beads only): ',
                  len(self.par['NC_samples']))
            # report the non-control samples, not all samples
            print('Number of PhIP samples: ',
                  len(self.par['phip_samples']))

        #read reference sequence file (*.fa); the id list is not needed here
        ref_dict, _ = myGenome.genome(self.par['file_ref_fa']).read_fa()
        self.par['ref_dict'] = ref_dict
Ejemplo n.º 5
0
 def NCPHIPzscores_linear(self):
     """Z-score every sample against negative controls using linear regression.

     1. Fit ``std ~ median`` on the negative-control rows whose median is
        positive and below the 99th percentile.
     2. For each sample fit ``phip ~ NC`` (control median) on rows with a
        positive control median and a sample value below that sample's 99th
        percentile; the ``pred_phip`` column of the fit output is stored as
        the expected value.
     3. z = (observed - expected) / ``NC_df['pred_std']`` (zeros replaced by
        1), rounded to one decimal, exported via ``export_df`` and returned.
     """
     ##1: regressed std of negative controls
     print('\tLinear regression of std ~ median of negative controls')
     sdf = self.indf[self.par['sample_names']].copy()
     #select df with control samples
     NC_df = self.select_NC()
     #print NC_df
     #remove rows with a zero median and the top 1% of medians (outliers)
     median99 = np.percentile(NC_df['median'],99)
     NC_regress = NC_df.loc[(NC_df['median']>0)&(NC_df['median']<median99),:].copy()
     #print NC_regress
     #fit linear model
     # NOTE(review): `lm` is never read afterwards; the call is kept for
     # whatever side effects linear() has -- confirm.
     lm = myRegression.linear(NC_regress, ('median','std'), self.par['dir_stat']).linear()
     #print 'Linear regression:', lm['params']
     
     #2:regressed PHIP values against negative controls
     print('\tLinear regression of medians of phip sample ~ medians of negative controls')
     reg_df = pd.DataFrame(np.zeros(shape=sdf.shape), columns=list(sdf), index=list(sdf.index))
     for sample in self.par['sample_names']:
         subdf = pd.DataFrame({'NC': NC_df['median'], 'phip':sdf[sample]})
         # 99th percentile of this sample's values (name is historical)
         median99 = np.percentile(subdf['phip'], 99)
         #remove top 1% phipseq values
         sub_regress = subdf.loc[(subdf['NC']>0)&(subdf['phip']<median99),:]
         #linear regression
         #print sample
         #print sub_regress
         sample_dir = self.par['dir_result'] + sample + '/'
         phip_lm = myRegression.linear(sub_regress, ('NC', 'phip'), sample_dir).linear()
         #predicted phipRC; assignment aligns on the index of reg_df
         reg_df[sample] = phip_lm['df']['pred_phip']
     #3:
     print('\t calculate z scores against NC:')
     #RC minus regress RC
     residuals_df = sdf - reg_df
     #then divided by regressed std of negative control in rows
     # NOTE(review): 'pred_std' is read from NC_df (the select_NC() result),
     # not from the regression output `lm` -- confirm that column exists there.
     # NOTE(review): pd.Series(...) does not copy, so the assignment below may
     # also modify NC_df depending on the pandas version.
     divisor=pd.Series(NC_df['pred_std'])
     divisor[divisor==0]=1
     zscores_df=residuals_df.div(divisor, axis=0)
     zscores_df=np.round(zscores_df,1)
     #print residuals_df.ix[1:4, 8:10]
     
     #export
     myDataframe.basic(zscores_df).export_df(self.outfile, self.par['zscore_threshold'], self.index_label)
     #
     return zscores_df
Ejemplo n.º 6
0
 def NC_zscores(self):
     """Z-scores of all samples from a linear ``std ~ median`` control model.

     The negative-control table from ``self.select_NC()`` is trimmed to rows
     whose median is positive and below the 99th percentile, and a linear
     model of ``std`` on ``median`` is fitted on it.  The fitted
     ``pred_std`` and ``median`` columns are inserted as the first two
     columns of ``self.indf`` (this modifies ``self.indf`` in place), and
     every row is passed to ``sigAnalysis(...).Z_test``.

     Returns:
         DataFrame of z-scores whose columns are the original columns of
         ``self.indf``; also exported via ``export_df``.
     """
     #select df with control samples
     df0 = self.select_NC()
     #remove 0 median and highest outliers
     # (the original referenced an undefined name here -> NameError)
     median_99 = np.percentile(df0['median'], 99)
     df1 = df0.loc[(df0['median'] > 0) & (df0['median'] < median_99), :].copy()
     #fit linear model
     lm = myRegression.linear(df1, ('median', 'std'), self.par['dir_stat']).linear()

     print('calculate z scores:')
     # after the two inserts the column order is: median, predicted_std, samples...
     self.indf.insert(0, 'predicted_std', lm['df']['pred_std'])
     self.indf.insert(0, 'median', lm['df']['median'])

     def row_zscores(row):
         # positional access: integer keys on a labelled Series are not
         # reliably positional across pandas versions
         return sigAnalysis(row.iloc[2:]).Z_test(row.iloc[0], row.iloc[1])['zscores']

     zscores_df = self.indf.apply(row_zscores, axis=1)
     zscores_df.columns = list(self.indf)[2:]
     #export
     myDataframe.basic(zscores_df).export_df(self.outfile, self.par['zscore_threshold'], self.index_label)
     return zscores_df
Ejemplo n.º 7
0
 def NCPHIPzscores_PN(self):
     """Z-scores using a std model taken from ``self.NC_whole_std()``.

     1. ``self.NC_whole_std()`` supplies a table with ``median``/``std``
        columns and a fitted model whose ``predict`` takes ``logmedian``.
        A robust ``std ~ median`` fit of this dataset's own negative
        controls is also run (its return values are not used here).
     2. Per sample, ``phip ~ mean`` (mean of the negative controls) is
        fitted; non-positive predictions are set to NaN, log10-transformed
        and fed to the model from step 1 to obtain the predicted std.
        z = (phip - pred_phip) / pred_std.  The per-sample working table is
        written to ``<dir_result>/<sample>/polynomial_median.csv``.
     3. NaN/inf z-scores are replaced by 0, values rounded to one decimal,
        exported via ``export_df`` and returned.
     """
     ##1: regressed std~mean of negative controls
     wNC, wNC_fit = self.NC_whole_std()
     #RLM of std~median of beads only of this dataset
     nc_counts = self.indf[self.par['NC_samples']]
     NC_df = pd.DataFrame({'wNC_median': wNC['median'], 'wNC_std': wNC['std'],
                           'mean': nc_counts.mean(axis=1),
                           'median': nc_counts.median(axis=1),
                           'std': nc_counts.std(axis=1)})
     # results unused; call kept because it runs before NC_df is copied below
     myRegression.linear(NC_df, ('median', 'std'), self.par['dir_QC']).RLM()

     ##2: regressed PHIP values against negative controls
     print('\tLinear regression of sample-specific medians of negative controls')
     zscores_df = self.indf.copy()
     zscores_df[:] = 0.0
     for sample in self.par['sample_names']:
         zdf = NC_df.copy()
         zdf['phip'] = self.indf[sample]
         #fit robust linear model
         sample_dir = '{}{}/'.format(self.par['dir_result'], sample)
         mNC, _ = myRegression.linear(zdf, ('mean', 'phip'), sample_dir).RLM()
         # Non-positive predictions cannot be log-transformed: blank them
         # explicitly instead of relying on chained assignment through a
         # column view, whose effect on zdf depends on the pandas version.
         pred_phip = mNC['pred_phip']
         zdf['pred_phip'] = pred_phip.where(pred_phip > 0)
         # x value of the std~median model from NC_whole_std()
         zdf['pred_logphip'] = np.log10(zdf['pred_phip'])
         pred_std = 10**wNC_fit.predict({'logmedian': zdf['pred_logphip']})
         zdf['pred_std'] = pred_std
         zscores_df[sample] = (zdf['phip'] - zdf['pred_phip']) / pred_std
         zdf['zscores'] = zscores_df[sample]
         #export per-sample details
         zdf.to_csv(sample_dir + 'polynomial_median.csv', header=True, index_label=self.index_label)

     ##3: export z scores into self.outfile
     zscores_df.replace([np.nan, np.inf, -np.inf], 0, inplace=True)
     zscores_df = np.round(zscores_df, 1)
     myDataframe.basic(zscores_df).export_df(self.outfile, self.par['zscore_threshold'], self.index_label)
     return zscores_df
Ejemplo n.º 8
0
    def taxon_blast2(self, file_aln, zscore_file):
        """Per-sample taxon hit counts with binomial p-values.

        For every sample column of the z-score table, peptides at or above
        ``self.par['specieZ_threshold']`` are taken as hits, reduced to
        independent hits with ``gen_ind_hits`` and passed, together with
        the alignment matrix rows for those peptides, to
        ``binom_unispecie``.  Four tab-separated files are written next to
        the z-score file (prefix ``<zscore prefix>_<taxon type>_``):
        ``counting.txt``, ``peptides.txt``, ``p-values.txt`` and
        ``padjusted.txt`` (Benjamini-Hochberg adjusted p-values).

        Args:
            file_aln: alignment file; its name prefix is used as taxon type.
            zscore_file: z-score table, peptides in rows, samples in columns.
        """
        taxon_type = myIO.file_os(file_aln).name_prefix()
        print("\n{}:{}\n".format(taxon_type, zscore_file))
        #read zscore_df
        zdf = myDataframe.basic().standard_df(zscore_file)

        #read alignment file; match row order of the z-scores, missing -> 0
        binary_b = myDataframe.basic().aln_df(file_aln,
                                              self.par['align_score'])
        binary_b = binary_b.reindex(zdf.index).fillna(0)

        #sample names in columns, and specie in rows
        taxa = list(binary_b)
        samples = list(zdf)
        sum_df = pd.DataFrame(0, index=taxa, columns=samples)
        pep_df = pd.DataFrame(np.nan, index=taxa, columns=samples)
        p_df = pd.DataFrame(index=taxa, columns=samples)
        # .items() replaces DataFrame.iteritems(), removed in pandas 2.0
        for sample_name, column in zdf.items():
            #1: select peptides: all hits, strongest first
            hits = column[column >= self.par['specieZ_threshold']].copy()
            hits.sort_values(axis=0, ascending=False, inplace=True)
            #remove overlapped hits
            nonoverlap_hits = myList.basic(hits).gen_ind_hits(
                self.par['dependent_pep'])
            input_num = len(nonoverlap_hits)
            print("{}:\thits={}, nonoverlapped={}".format(
                sample_name, len(hits), input_num))

            #2: remove overlap hits between species
            if input_num > 0:
                zb_df = binary_b.loc[nonoverlap_hits.index]
                collapse_zb, sim_tag, p_series = myDataframe.basic(
                    zb_df).binom_unispecie(self.par['dir_ref_seq'], input_num,
                                           self.par['p_threshold'],
                                           self.par['x_threshold'])
                #counts of hits
                sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
                #all peptide_id list
                pep_df[sample_name] = collapse_zb.apply(
                    lambda x: myList.basic(x).names_string(0.001), axis=0)
                p_df[sample_name] = p_series

        #export to file
        file_head = myIO.file_os(
            zscore_file).file_prefix() + '_' + taxon_type + '_'
        sum_df.to_csv(file_head + 'counting.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        pep_df.to_csv(file_head + 'peptides.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        p_df.to_csv(file_head + 'p-values.txt',
                    sep='\t',
                    header=True,
                    index_label='Specie')

        #Adjusted p-values using B-H, applied to the finite p-values only
        padjust_df = pd.DataFrame(index=taxa, columns=samples)
        for sample_name in p_df.columns:
            # Columns of samples without hits are still object dtype (all
            # NaN); cast to float so the numpy NaN/finite checks work.
            pvals = p_df[sample_name].astype(float).to_numpy()
            finite = np.isfinite(pvals)
            if not finite.any():
                continue
            pval_corrected = np.full(pvals.shape, np.nan)
            pval_corrected[finite] = multipletests(pvals[finite],
                                                   method='fdr_bh')[1]
            padjust_df[sample_name] = pval_corrected
        padjust_df.to_csv(file_head + 'padjusted.txt',
                          sep='\t',
                          header=True,
                          index_label='Specie')


#end
Ejemplo n.º 9
0
    def taxon_blast(self, file_aln, zscore_file):
        """Per-sample taxon hit counts after removing overlapping hits.

        For every sample column of the z-score table, peptides at or above
        ``self.par['specieZ_threshold']`` are taken as hits and reduced with
        ``remove_overlap``.  The alignment-matrix rows of the remaining
        peptides are collapsed with ``unispecie`` and summed per taxon.
        Two tab-separated files are written with the prefix
        ``<zscore prefix>_<taxon type>_``: ``counting.txt`` and
        ``peptides.txt``.  The overlap details of each sample are written
        to ``<dir_result>/<sample>/<taxon type>.csv``; if that path does not
        exist the sample is recorded in ``self.par['file_err']`` instead.

        Args:
            file_aln: alignment file; its name prefix is used as taxon type.
            zscore_file: z-score table, peptides in rows, samples in columns.
        """
        print(
            '###Signficant taxon by removing overlapped hits based on blast alignment.'
        )
        taxon_type = myIO.file_os(file_aln).name_prefix()
        print('{}: {}'.format(taxon_type, zscore_file))
        #read zscore_df
        zdf = myDataframe.basic().standard_df(zscore_file)

        #read alignment file; match row order of the z-scores, missing -> 0
        binary_b = myDataframe.basic().aln_df(file_aln,
                                              self.par['align_score'])
        binary_b = binary_b.reindex(zdf.index).fillna(0)

        #sample names in columns, and specie in rows
        taxa = list(binary_b)
        samples = list(zdf)
        sum_df = pd.DataFrame(0, index=taxa, columns=samples)
        pep_df = pd.DataFrame(np.nan, index=taxa, columns=samples)
        for sample_name, column in zdf.items():
            #1: select peptides: all hits, strongest first
            hits = column[column >= self.par['specieZ_threshold']].copy()
            hits.sort_values(axis=0, ascending=False, inplace=True)
            #remove overlapped hits
            nonoverlap_hits, overlap_debug = myList.basic(hits).remove_overlap(
                self.par['dependent_pep'])
            input_num = len(nonoverlap_hits)
            print('{}: hits={}, nonoverlapped={}'.format(
                sample_name, len(hits), input_num))

            #2: remove overlap hits between species
            if input_num == 0:
                continue
            ###2-1: export peptides
            try:
                outfile = '{}{}/{}.csv'.format(self.par['dir_result'],
                                               sample_name, taxon_type)
                overlap_debug.to_csv(outfile,
                                     header=True,
                                     index_label='peptides')
            except FileNotFoundError:
                # sample directory is missing: record it and carry on
                myIO.file_os(self.par['file_err'], "\t").line_replace(
                    {'taxon_blast': sample_name})
            ###2-2: specie-specific hits based on non-overlapped hits
            # label-based row selection (.ix was removed in pandas 1.0)
            zb_df = binary_b.loc[nonoverlap_hits.index]
            collapse_zb, sim_tag = myDataframe.basic(zb_df).unispecie(
                self.par['sim_threshold'])
            #counts of hits
            sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
            #all peptide_id list
            pep_df[sample_name] = collapse_zb.apply(
                lambda x: myList.basic(x).names_string(0.001), axis=0)

        #export to file
        file_head = '{}_{}_'.format(
            myIO.file_os(zscore_file).file_prefix(), taxon_type)
        sum_df.to_csv(file_head + 'counting.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
        pep_df.to_csv(file_head + 'peptides.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')
Ejemplo n.º 10
0
    def taxon_spec(self, count_file, taxon_rank, annot_index):
        """Count taxon-specific hits per sample at a given taxonomic rank.

        Peptides are grouped by the ``taxon_rank`` column of the combined
        annotation/count table.  For every sample and every taxon group the
        hits are filtered by ``taxon_inter_searching`` and then by
        ``taxon_intra_searching``; the surviving hits are counted.  Three
        tab-separated files are written with the prefix
        ``<count_file prefix>_<taxon_rank>_``: ``counting.txt``,
        ``peptides.txt`` and ``debugging.txt``.

        Args:
            count_file: table passed to ``combine_df``.
            taxon_rank: one of ``'taxon_specie'``, ``'taxon_genus'``,
                ``'taxon_family'``, ``'taxon_phip'``.
            annot_index: passed through to ``combine_df``.

        Raises:
            KeyError: if ``taxon_rank`` is not one of the four ranks above.
        """
        #combine two data frame
        combined_df, phip_df = myCommon.basic(self.par).combine_df(
            count_file, annot_index)
        sample_names = list(phip_df)

        #taxonomy groups: taxon name -> row labels of its peptides
        taxon_group = combined_df.groupby(taxon_rank).groups
        taxon_pairs = {'taxon_specie': 'InterSpecie', 'taxon_genus': 'InterGenus',
                       'taxon_family': 'InterFamily', 'taxon_phip': 'InterTaxon'}
        taxon_inter = taxon_pairs[taxon_rank]

        #inter-score table
        #taxon_inter should be pep_ids separated by comma
        pepid_taxoninter = pd.Series(combined_df[taxon_inter],
                                     index=list(phip_df.index))
        inter_df = myDataframe.basic(phip_df).interact_df(
            pepid_taxoninter, max, count_file + taxon_inter)

        #nested dicts keyed by sample, then by taxon
        taxon_dict = {s: {} for s in sample_names}  # number of hits
        taxon_dict['peptides'] = {
            taxon: len(pep_ids) for taxon, pep_ids in taxon_group.items()
        }
        taxon_pep_dict = {s: {} for s in sample_names}  #pepid and zscores of hits
        debugging_dict = {}  #intermediate results, for tracing problems
        for s in sample_names:
            for tag in (':all_hits', ':inter_hits', ':intra_hits', ':hits',
                        ':counts'):
                debugging_dict[s + tag] = {}

        #loop by sample_names (permutation-based p-values are not computed)
        for sample_name, col in phip_df.items():
            for taxon, indexs in taxon_group.items():
                #1: inter-taxon searching
                # label-based selection (.ix was removed in pandas 1.0)
                inter_list = inter_df.loc[indexs, sample_name]
                inter_dict = self.taxon_inter_searching(
                    col[indexs], inter_list)
                debugging_dict[sample_name +
                               ':all_hits'][taxon] = inter_dict['all_hits']
                debugging_dict[sample_name +
                               ':inter_hits'][taxon] = inter_dict['inter_hits']

                #2: intra-taxon searching
                intra_dict = self.taxon_intra_searching(
                    col[inter_dict['other_hits']])
                debugging_dict[sample_name +
                               ':intra_hits'][taxon] = intra_dict['intra_hits']
                debugging_dict[sample_name + ':hits'][taxon] = intra_dict['hits']
                all_hits = [
                    '{}:{}'.format('all', len(inter_dict['all_hits'])),
                    '{}:{}'.format('inter', len(inter_dict['inter_hits'])),
                    '{}:{}'.format('intra', len(intra_dict['intra_hits'])),
                    '{}:{}'.format('hits', len(intra_dict['hits']))
                ]
                debugging_dict[sample_name + ':counts'][taxon] = ','.join(all_hits)
                hit_list = [
                    '({},{})'.format(a, b)
                    for a, b in col[intra_dict['hits']].items()
                ]
                taxon_pep_dict[sample_name][taxon] = ','.join(hit_list)
                #counts matrix of taxonomy search
                taxon_dict[sample_name][taxon] = len(intra_dict['hits'])

        #export to file
        file_head = '{}_{}_'.format(
            myIO.file_os(count_file).file_prefix(), taxon_rank)
        taxon_dict = myDict.basic(taxon_dict).transform_dict2()
        myDict.basic(taxon_dict).dict2_to_file(file_head + 'counting.txt',
                                               "\t")
        taxon_pep_dict = myDict.basic(taxon_pep_dict).transform_dict2()
        myDict.basic(taxon_pep_dict).dict2_to_file(file_head + 'peptides.txt',
                                                   "\t")
        debugging_dict = myDict.basic(debugging_dict).transform_dict2()
        myDict.basic(debugging_dict).dict2_to_file(file_head + 'debugging.txt',
                                                   "\t")
Ejemplo n.º 11
0
    def enrich_pro(self, infile, annot_A, annot_B, sep1, sep2):
        """Permutation-based enrichment of ``annot_B`` terms among hits.

        For every sample column of ``infile`` the rows with a value at or
        above ``self.par['zscore_threshold']`` are the hits.  The frequency
        of each ``annot_B`` term among the hits is compared with the
        frequencies obtained from ``self.par['permutation_times']`` random
        draws of the same number of ids from the ``annot_A`` column of
        ``self.par['annot_df']``.  Three files are written with the prefix
        ``<dir_enrichment><infile prefix>_<annot_B>_``: ``counting.txt``
        (observed frequencies), ``zscores.txt`` and ``debugging.txt``.

        Args:
            infile: table of scores; ``.csv`` is read comma-separated,
                anything else tab-separated.  First column is the index.
            annot_A: annotation column linking ``infile`` rows to
                ``annot_df``; ``None`` means ``'transcript_id'``.
            annot_B: annotation column holding the terms to test;
                ``None`` means ``'pro_motifs'``.
            sep1, sep2: separators passed to ``df_list`` / ``list_dict``.
        """
        if annot_A is None:
            annot_A = 'transcript_id'
        if annot_B is None:
            annot_B = 'pro_motifs'
        print("Enrichment analysis of {} => {} : {}".format(
            annot_A, annot_B, infile))
        #read data frame
        file_sep = ',' if infile.endswith('.csv') else '\t'
        counts_df = pd.read_csv(infile,
                                index_col=0,
                                sep=file_sep,
                                low_memory=False)
        annot_df = self.par['annot_df']
        n_perm = self.par['permutation_times']
        #all ids that connect counts_df with annot_df
        A_ids = list(annot_df[annot_A])
        #all annot_B terms as a list
        B_ids = myDataframe.basic(annot_df).df_list(annot_B, sep1, sep2)
        #A id -> list of B terms
        AB_dict = myDataframe.basic(annot_df).list_dict(
            annot_A, annot_B, sep1, sep2)

        #frequency of observed enriched terms
        hits_observed = myDict.basic().init_dict2(B_ids, list(counts_df), 0)
        #zscores of observed frequencies against the permutation model
        hits_zscores = myDict.basic().init_dict2(B_ids, list(counts_df), 0)
        #intermediate values, for tracing problems
        debugging = myDict.basic().init_dict2(
            B_ids + ['hits_counts', 'interact_counts'], {}, 'NA')
        #loop of data frame by columns
        for sample_name, zscores in counts_df.items():
            zscores = pd.Series(zscores)
            zscores.index = list(counts_df.index)
            #1: get ids of significant hits
            sig_zscores = zscores[zscores >= self.par['zscore_threshold']]
            obs_ids = list(sig_zscores.index)
            sig_num = len(obs_ids)
            #2: count frequency of enriched annotations
            obs_freq, obs_details = myDict.basic(AB_dict).elements_frequency(
                obs_ids)
            debugging['hits_counts'][sample_name] = sig_num
            debugging['interact_counts'][sample_name] = sum(obs_freq.values())

            #3: permute samples: term -> frequencies seen in the permutations
            perm_dict = {}
            for _ in range(n_perm):
                perm_peps = random.sample(A_ids, sig_num)
                tmp_perm, _details = myDict.basic(
                    AB_dict).elements_frequency(perm_peps)
                for key, value in tmp_perm.items():
                    perm_dict.setdefault(key, []).append(value)

            #4: calculate z-scores of observed counts
            for enriched_id, obs_num in obs_freq.items():
                hits_observed[enriched_id][sample_name] = obs_num
                debugging[enriched_id][sample_name] = '{}:{}'.format(
                    obs_num, obs_details[enriched_id])
                if enriched_id not in perm_dict:
                    # term never drawn in any permutation: keep the raw count
                    hits_zscores[enriched_id][sample_name] = obs_num
                    continue
                perm_pools = perm_dict[enriched_id]
                # A term absent from a permutation contributed no entry;
                # pad with zeros up to the number of permutations so every
                # pool has the same length (was hard-coded to 5).
                perm_pools = perm_pools + [0] * (n_perm - len(perm_pools))
                perm_mean = np.mean(perm_pools)
                perm_sd = np.std(perm_pools)
                #zscore of the observed count against the null model
                if perm_sd > 0:
                    zscore = (obs_num - perm_mean) / perm_sd
                else:
                    zscore = obs_num - perm_mean
                hits_zscores[enriched_id][sample_name] = round(zscore, 2)

        #export
        file_head = '{}{}_{}_'.format(self.par['dir_enrichment'],
                                      myIO.file_os(infile).name_prefix(),
                                      annot_B)
        myDict.basic(hits_observed).dict2_to_file(out_file=file_head +
                                                  'counting.txt',
                                                  index_label=annot_B)
        myDict.basic(hits_zscores).dict2_to_file(out_file=file_head +
                                                 'zscores.txt',
                                                 index_label=annot_B)
        myDict.basic(debugging).dict2_to_file(out_file=file_head +
                                              'debugging.txt',
                                              index_label=annot_B,
                                              NA='NA')
Ejemplo n.º 12
0
        'sim_threshold': 0.8,
        'dir_bin': dir_bin + '/',
        'dir_home': dir_home + '/',
        'permutation_times': 100
    }
    par['dir_permutation'] = myIO.dir_os(par['dir_home'] +
                                         'permutation/').create_dir()

    print('###permutation procedure\n\n')
    pool = mpd.Pool(processes=par['threads_num'])

    #permuation of organism alignment
    if par['organism_permutation'] == 'yes':
        #read aln file
        file_aln = par['dir_home'] + 'ref_seq/organism_blast.txt'
        par['binary_aln_df'] = myDataframe.basic().aln_df(
            file_aln, par['align_score'])
        par['type'] = myIO.file_os(file_aln).name_prefix()
        par['dir_out'] = myIO.dir_os(par['dir_home'] + 'permutation/' +
                                     par['type']).create_dir()
        #
        for hits_num in range(par['start'], par['end']):
            pool.apply_async(myCommon.basic(par).permute_taxon_blast,
                             args=(hits_num, ))
            time.sleep(1)

    #permuation of specie alignment
    if par['specie_permutation'] == 'yes':
        #read aln file
        file_aln = par['dir_home'] + 'ref_seq/specie_blast.txt'
        par['binary_aln_df'] = myDataframe.basic().aln_df(
            file_aln, par['align_score'])