Example #1
    def init_analysis(self):
        #1: read annotation file
        if 'file_annotation' in self.par:
            self.par['annot_df'] = myDataframe.basic().annot_df(
                self.par['file_annotation'])
            #genome annotation: protein-to-peptide associations
            self.par['dict_pro_pep'] = myCommon.basic(
                self.par).protein_peptides()
            #virus only
            if 'VirScan' in self.par['file_annotation']:
                #extract aa stretches
                #find dependent peptides: pairs of peptides that share at least 7 aa
                self.par['dependent_pep'] = myCommon.basic(
                    self.par).taxon_dependent_peptides()

        #2: check bowtie or build bowtie index
        myAlign.alignment(self.par).build_bowtie_index()

        #3: sample info
        self.par = myParallel.samples(self.par).export_sample_info()
        #samples of negative controls
        group1 = self.par['group1']
        if 'NC' in group1:
            self.par['NC_samples'] = group1['NC'].split(',')
            self.par['phip_samples'] = list(
                set(self.par['sample_names']) - set(self.par['NC_samples']))
            print('\nNumber of negative controls (beads only): ',
                  len(self.par['NC_samples']))
            print('Number of PhIP samples: ',
                  len(self.par['phip_samples']))

        #read reference sequence file (*.fa)
        ref_dict, ref_ids = myGenome.genome(self.par['file_ref_fa']).read_fa()
        self.par['ref_dict'] = ref_dict
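
    # A minimal sketch of what myGenome.genome(file).read_fa() is assumed to
    # return: a dict mapping reference id -> sequence, plus the ids in file
    # order. read_fa_sketch is a hypothetical stand-in, not the pipeline's API.
    def read_fa_sketch(fa_path):
        ref_dict, ref_ids = {}, []
        with open(fa_path) as fh:
            seq_id, chunks = None, []
            for line in fh:
                line = line.rstrip()
                if line.startswith('>'):
                    if seq_id is not None:  #flush the previous record
                        ref_dict[seq_id] = ''.join(chunks)
                    seq_id = line[1:].split()[0]  #id is the first header token
                    ref_ids.append(seq_id)
                    chunks = []
                elif line:
                    chunks.append(line)
            if seq_id is not None:  #flush the last record
                ref_dict[seq_id] = ''.join(chunks)
        return ref_dict, ref_ids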
Example #2
    def sig_polyclonal(self, count_file):
        print("Polyclonal analysis of ", count_file)
        comb_df, pep_df = myCommon.basic(self.par).combine_df(count_file)

        #functions
        def hits_func(x, peps, threshold, pro_id):
            #significant hits
            hits = x[x >= threshold]
            #non-overlapping peptides
            peps = [str(p) for p in peps]
            hit_peps = [str(h) for h in hits.index]
            none_overlapped_hits_num = myList.basic(peps).un_neighbours(
                hit_peps, return_type='hits_num')
            return len(hits.index), none_overlapped_hits_num, ','.join(hit_peps)

        #collapse by protein
        hits1 = {}
        hits2 = {}
        for pro_id, row_index in comb_df.groupby('pro_id').groups.items():
            #get protein-to-peptide annotations
            peps_str = self.par['dict_pro_pep'][pro_id]
            peps = peps_str.split(',')
            #subset the peptide df by protein
            sub_df = pep_df.loc[row_index]
            #print("{}\t{}".format(pro_id, list(sub_df.index)) )
            #hits num beyond zscore threshold
            hits_num = sub_df.apply(hits_func,
                                    axis=0,
                                    args=(peps, self.par['zscore_threshold'],
                                          pro_id))
            #total number of significant hits
            num1 = [h[0] for h in hits_num]
            hits1[pro_id] = dict(zip(list(sub_df), list(num1)))
            #number of sig hits without overlapping
            num2 = [h[1] for h in hits_num]
            hits2[pro_id] = dict(zip(list(sub_df), list(num2)))

        #export
        file_head = myIO.file_os(count_file).file_prefix() + '_polyclonal'
        myDict.basic(hits1, self.par['pro_ids']).dict2_to_file(
            file_head + '.txt', "\t")
        myDict.basic(hits2, self.par['pro_ids']).dict2_to_file(
            file_head + '_nonoverlapped.txt', "\t")
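
    # A minimal sketch of the assumed semantics of
    # myList.basic(peps).un_neighbours(hit_peps, return_type='hits_num'):
    # adjacent tiles in the ordered peptide list overlap (>=7 aa), so a run of
    # consecutive hits may reflect a single epitope and is counted once.
    # The greedy scan is an assumption, not the pipeline's actual code.
    def non_overlapping_hits_sketch(peps, hit_peps):
        hit_set = set(hit_peps)
        count, prev_was_hit = 0, False
        for pep in peps:  #peps are in tiling order along the protein
            if pep in hit_set:
                if not prev_was_hit:
                    count += 1  #count only the first hit of each run
                prev_was_hit = True
            else:
                prev_was_hit = False
        return count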
Example #3
    def taxon_spec(self, count_file, taxon_rank, annot_index):
        #combine annotation and count data frames
        combined_df, phip_df = myCommon.basic(self.par).combine_df(
            count_file, annot_index)

        #taxonomy names
        taxon_group = combined_df.groupby(taxon_rank).groups
        taxon_names = [t for t in taxon_group.keys() if str(t) != 'nan']  #remove nan
        taxon_pairs = {'taxon_specie': 'InterSpecie',
                       'taxon_genus': 'InterGenus',
                       'taxon_family': 'InterFamily',
                       'taxon_phip': 'InterTaxon'}
        taxon_inter = taxon_pairs[taxon_rank]

        #inter-score dict
        #the taxon_inter column holds pep_ids separated by commas
        pepid_taxoninter = pd.Series(combined_df[taxon_inter],
                                     index=list(phip_df.index))
        inter_df = myDataframe.basic(phip_df).interact_df(
            pepid_taxoninter, max, count_file + taxon_inter)

        #make permutation of pep_ids
        #permute_dict = myList.basic(list(phip_df.index)).permute_Series(self.par['permutation_times'], slice_dict = taxon_group)

        #hits of significant taxon-specific peptides:
        #rows are peptides, columns are phip samples plus taxon names
        #initiate nested dicts
        taxon_dict = dict([(s, {}) for s in list(phip_df)])  # number of hits
        taxon_dict['peptides'] = dict([(a, len(b))
                                       for a, b in taxon_group.items()])
        #taxon_pval_dict = dict([(s,{}) for s in list(phip_df)]) #pvalues of the hits by permutations
        taxon_pep_dict = dict([(s, {})
                               for s in list(phip_df)])  #pep ids and z-scores of hits
        debugging_dict = {}  #for identifying bugs
        for s in list(phip_df):
            debugging_dict[s + ':all_hits'] = {}
            debugging_dict[s + ':inter_hits'] = {}
            debugging_dict[s + ':intra_hits'] = {}
            debugging_dict[s + ':hits'] = {}
            debugging_dict[s + ':counts'] = {}
            #debugging_dict[s+':pvals'] = {}
        #loop by sample_names
        for sample_name, col in phip_df.items():
            for s, indexs in taxon_group.items():
                #1: inter-taxon searching
                inter_list = inter_df.loc[indexs][sample_name]
                inter_dict = self.taxon_inter_searching(
                    col[indexs], inter_list)
                #export
                debugging_dict[sample_name +
                               ':all_hits'][s] = inter_dict['all_hits']
                debugging_dict[sample_name +
                               ':inter_hits'][s] = inter_dict['inter_hits']

                #2: intra-taxon searching
                intra_dict = self.taxon_intra_searching(
                    col[inter_dict['other_hits']])
                #export
                debugging_dict[sample_name +
                               ':intra_hits'][s] = intra_dict['intra_hits']
                debugging_dict[sample_name + ':hits'][s] = intra_dict['hits']
                all_hits = [
                    '{}:{}'.format('all', len(inter_dict['all_hits'])),
                    '{}:{}'.format('inter', len(inter_dict['inter_hits'])),
                    '{}:{}'.format('intra', len(intra_dict['intra_hits'])),
                    '{}:{}'.format('hits', len(intra_dict['hits']))
                ]
                debugging_dict[sample_name + ':counts'][s] = ','.join(all_hits)
                hit_list = [
                    '({},{})'.format(a, b)
                    for a, b in col[intra_dict['hits']].items()
                ]
                taxon_pep_dict[sample_name][s] = ','.join(hit_list)
                #counts matrix of taxonomy search
                taxon_dict[sample_name][s] = len(intra_dict['hits'])

                #3: permutation
                #hit_scores = col[intra_dict['hits']]
                #permuted_scores = permute_dict[s]#df, pepids in rows, permuted scores in columns
                #pval_dict = self.taxon_permutation(hit_scores, permuted_scores, col)
                #export
                #pval_list = [len(intra_dict['hits']), pval_dict['ttest_pval'], pval_dict['utest_pval']]
                #taxon_pval_dict[sample_name][s] = ','.join(map(str, pval_list))
                #pval_list = [ a+':'+str(b) for a,b in pval_dict.items()]
                #debugging_dict[sample_name+':pvals'][s] = ','.join(pval_list)
        #export to file
        file_head = '{}_{}_'.format(
            myIO.file_os(count_file).file_prefix(), taxon_rank)
        taxon_dict = myDict.basic(taxon_dict).transform_dict2()
        myDict.basic(taxon_dict).dict2_to_file(file_head + 'counting.txt',
                                               "\t")
        taxon_pep_dict = myDict.basic(taxon_pep_dict).transform_dict2()
        myDict.basic(taxon_pep_dict).dict2_to_file(file_head + 'peptides.txt',
                                                   "\t")
        debugging_dict = myDict.basic(debugging_dict).transform_dict2()
        myDict.basic(debugging_dict).dict2_to_file(file_head + 'debugging.txt',
                                                   "\t")
Example #4
    def main_loop(self):
        print("\n\n####Parameters of PHIP: \n")
        #parallel procesing
        if self.par['phip_alignment'] == 'yes' or self.par[
                'phip_counting'] == 'yes':
            sample_names = self.par['sample_names']
            print(sample_names.__len__(), ' samples will be analyzed.\n')
            #multi-threads
            #myCommon.basic(self.par).pp_map_threads(self.phipseq_alignment, sample_names)
            #multi-processes
            myCommon.basic(self.par).pp_map_process(mp_alignment,
                                                    [(self, s)
                                                     for s in sample_names])

        #combine RC and statistics file
        if self.par['phip_merge'] == 'yes':
            #1: combine RC files into RC matrix
            print('\n\n\n###Combine RC files (phip_merge)\n')
            #get arguments
            args_list = []
            RC_level = 'lowRC'
            #peptide level: lowRC
            out_file = self.par['files_dict']['pep_RC']
            arg_tuple = ('_RC.txt', RC_level, out_file, self.par['pep_ids'])
            args_list.append(arg_tuple)
            if 'file_annotation' in self.par:
                #promax level
                out_file = self.par['files_dict']['promax_RC']
                arg_tuple = ('_pro_maxRC.txt', RC_level, out_file,
                             self.par['pro_ids'])
                args_list.append(arg_tuple)
                #prosum level
                out_file = self.par['files_dict']['prosum_RC']
                arg_tuple = ('_pro_sumRC.txt', RC_level, out_file,
                             self.par['pro_ids'])
                args_list.append(arg_tuple)
            #multi-threads
            myCommon.basic(self.par).pp_map_threads(
                myAlign.alignment(self.par).combine_countfiles, args_list)
            #myCommon.basic(self.par).pp_apply_threads(args_list)
            #2: generate statistics.csv
            myCommon.basic(self.par).QC_statistics()

        #significance analysis using Z score
        if self.par['phip_zscores'] == 'yes':
            print('\n\n\n###normalization of RC (phip_zscores)\n')
            #peptides level
            RC_file = self.par['files_dict']['pep_RC']  #infile
            #1: scaling RCs
            sRC_file = self.par['files_dict']['pep_scalingRC']  # outfile
            myStat.normalization(self.par, RC_file, sRC_file,
                                 'pep_id').RC_scaling()
            #2: z-scores of scaling RCs against negative controls and phipseq samples
            zfile = self.par['files_dict']['pep_NCPHIPzscores']  #outfile
            if 'file_NC' in self.par:
                myStat.normalization(self.par, sRC_file, zfile,
                                     'pep_id').NCPHIPzscores_PN()
            else:
                myStat.normalization(self.par, sRC_file, zfile,
                                     'pep_id').NCPHIPzscores_RLM()

            #3: collapse matrix
            if 'file_annotation' in self.par:
                print("\t######collapse peptide matrix into protein matrix")
                pars = []
                for name in ['scalingRC', 'NCPHIPzscores']:
                    pep_file = self.par['files_dict']['pep_' + name]  #infile
                    sum_file = self.par['files_dict']['pep_' + name +
                                                      '_prosum']  #outfile
                    pars.append((pep_file, sum_file, sum))
                    max_file = self.par['files_dict']['pep_' + name +
                                                      '_promax']  #outfile
                    pars.append((pep_file, max_file, max))
                #multiple-threading
                myCommon.basic(self.par).pp_map_threads(
                    myCommon.basic(self.par).collapse_matrix, pars)

        #Functional analysis after normalization and correction
        #parallel processing
        print('\n\n\n###Functional Analysis (phip_GP and phip_enrichment)\n')
        pool = mpd.Pool(processes=self.par['threads_num'])
        #set the list of parameters
        pep_zfile = self.par['files_dict']['pep_NCPHIPzscores']  #infile
        promax_zfile = self.par['files_dict']['pep_NCPHIPzscores_promax']
        prosum_zfile = self.par['files_dict']['pep_NCPHIPzscores_prosum']
        if self.par['phip_GP'] == 'yes':
            #1: polyclonal analysis of significant peptides
            pool.apply_async(self.sig_polyclonal, args=(pep_zfile, ))
            #virus only
            if 'VirScan' in self.par['file_annotation']:
                #5: inter/intra-species searching, virus library only
                pool.apply_async(self.taxon_spec,
                                 args=(
                                     pep_zfile,
                                     'taxon_phip',
                                     'pep_id',
                                 ))
                #6: species alignment, virus library only
                file_aln = self.par['dir_ref_seq'] + 'specie_blast.txt'
                pool.apply_async(self.taxon_blast,
                                 args=(
                                     file_aln,
                                     pep_zfile,
                                 ))
                #7: organism alignment, virus library only
                file_aln = self.par['dir_ref_seq'] + 'organism_blast.txt'
                pool.apply_async(self.taxon_blast,
                                 args=(
                                     file_aln,
                                     pep_zfile,
                                 ))

            ##quality control
            #1: relationship between significant hits and raw read num
            pool.apply_async(myCommon.basic(self.par).QC_hits,
                             args=(pep_zfile, ))
            pool.apply_async(myCommon.basic(self.par).QC_hits,
                             args=(prosum_zfile, ))
            pool.apply_async(myCommon.basic(self.par).QC_hits,
                             args=(promax_zfile, ))
            #2: saturation analysis
            pool.apply_async(myCommon.basic(self.par).QC_saturation)

        if self.par['phip_enrichment'] == 'yes':
            #5: detection of enriched protein motifs
            E = myCommon.basic(self.par)
            if 'pro_motifs' in list(self.par['annot_df']):
                pool.apply_async(E.enrich_pro,
                                 args=(
                                     pep_zfile,
                                     'pep_id',
                                     'pro_motifs',
                                     ';',
                                     ',',
                                 ))
            #6: GO, loci, PPI, KEGG, InterPro, multifunctional scaffold protein enrichment analysis
            terms = set([
                'GO', 'map', 'PPI', 'KEGG', 'InterPro', 'MIM', 'autoantigen'
            ]) & set(list(self.par['annot_df']))
            for term in terms:
                pool.apply_async(E.enrich_pro,
                                 args=(
                                     prosum_zfile,
                                     'pro_id',
                                     term,
                                     ',',
                                     None,
                                 ))
                pool.apply_async(E.enrich_pro,
                                 args=(
                                     promax_zfile,
                                     'pro_id',
                                     term,
                                     ',',
                                     None,
                                 ))
        pool.close()
        pool.join()
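
    # Note: pool.apply_async() swallows worker exceptions unless the returned
    # AsyncResult is checked. A minimal sketch of the pattern using the
    # standard-library multiprocessing Pool (an assumption; the mpd module
    # used above may behave differently):
    def run_tasks_sketch(tasks, threads_num):
        import multiprocessing
        pool = multiprocessing.Pool(processes=threads_num)
        #tasks: iterable of (callable, args_tuple) pairs
        results = [pool.apply_async(func, args=args) for func, args in tasks]
        pool.close()
        pool.join()
        for r in results:
            r.get()  #re-raises any exception raised inside a worker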
Example #5
    print('###permutation procedure\n\n')
    pool = mpd.Pool(processes=par['threads_num'])

    #permutation of organism alignment
    if par['organism_permutation'] == 'yes':
        #read aln file
        file_aln = par['dir_home'] + 'ref_seq/organism_blast.txt'
        par['binary_aln_df'] = myDataframe.basic().aln_df(
            file_aln, par['align_score'])
        par['type'] = myIO.file_os(file_aln).name_prefix()
        par['dir_out'] = myIO.dir_os(par['dir_home'] + 'permutation/' +
                                     par['type']).create_dir()
        #
        for hits_num in range(par['start'], par['end']):
            pool.apply_async(myCommon.basic(par).permute_taxon_blast,
                             args=(hits_num, ))
            time.sleep(1)

    #permutation of species alignment
    if par['specie_permutation'] == 'yes':
        #read aln file
        file_aln = par['dir_home'] + 'ref_seq/specie_blast.txt'
        par['binary_aln_df'] = myDataframe.basic().aln_df(
            file_aln, par['align_score'])
        par['type'] = myIO.file_os(file_aln).name_prefix()
        par['dir_out'] = myIO.dir_os(par['dir_home'] + 'permutation/' +
                                     par['type']).create_dir()
        #
        for hits_num in range(par['start'], par['end']):
            pool.apply_async(myCommon.basic(par).permute_taxon_blast,
                             args=(hits_num, ))
            time.sleep(1)

    pool.close()
    pool.join()
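
    # A minimal sketch of the permutation idea behind permute_taxon_blast (the
    # real implementation is not shown here): draw random peptide sets of the
    # same size as a taxon's peptide set, recount hits above the z-score
    # threshold, and estimate how often chance matches or beats the observed
    # hit count. All names below are hypothetical.
    def permutation_pval_sketch(zscores, taxon_size, threshold, observed_hits,
                                n_perm=1000, seed=0):
        import random
        rng = random.Random(seed)
        scores = list(zscores)
        as_extreme = 0
        for _ in range(n_perm):
            sample = rng.sample(scores, taxon_size)
            if sum(1 for z in sample if z >= threshold) >= observed_hits:
                as_extreme += 1
        return (as_extreme + 1.0) / (n_perm + 1)  #add-one to avoid p = 0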