Example no. 1
0
    def init_analysis(self):
        """Prepare shared inputs before per-sample analysis.

        Steps:
          1. Read the annotation file (optional) and derive protein-peptide
             associations; for VirScan libraries also compute dependent
             peptides (pairs sharing at least a 7-aa stretch).
          2. Check or build the bowtie index.
          3. Export sample info and split negative-control (beads only)
             samples from PhIP samples.
          4. Load the reference sequences (*.fa) into ``self.par``.

        All results are stored in ``self.par``; nothing is returned.
        """
        #1: read annotation file (optional)
        if 'file_annotation' in self.par:
            self.par['annot_df'] = myDataframe.basic().annot_df(
                self.par['file_annotation'])
            #genome annotation: associations of protein-peptides
            self.par['dict_pro_pep'] = myCommon.basic(
                self.par).protein_peptides()
            #virus library only
            if 'VirScan' in self.par['file_annotation']:
                #extract aa stretch: dependent peptides share at least 7-aa
                self.par['dependent_pep'] = myCommon.basic(
                    self.par).taxon_dependent_peptides()

        #2: check bowtie or build bowtie index
        myAlign.alignment(self.par).build_bowtie_index()

        #3: sample info
        self.par = myParallel.samples(self.par).export_sample_info()
        #samples of negative controls (beads only)
        group1 = self.par['group1']
        if 'NC' in group1:
            self.par['NC_samples'] = group1['NC'].split(',')
            self.par['phip_samples'] = list(
                set(self.par['sample_names']) - set(self.par['NC_samples']))
            print('\nNumber of negative Controls (Beads only): ',
                  len(self.par['NC_samples']))
            print('Number of PhIP samples: ',
                  len(self.par['sample_names']))

        #4: read reference sequence file (*.fa); ids are not needed here
        ref_dict, _ = myGenome.genome(self.par['file_ref_fa']).read_fa()
        self.par['ref_dict'] = ref_dict
Example no. 2
0
    def phipseq_alignment(self, sample_name):
        """Run the per-sample PhIP-seq pipeline: align reads, count them,
        and record timing in the sample log.

        :param sample_name: key into ``self.par['sample_dirs']`` and
            ``self.par['sample_to_raw']`` identifying the sample.
        """
        # fixed typos in the user-facing message ("Anslysis"/"trigerred")
        print('\n######Analysis of {} will be triggered!#####'.format(
            sample_name))
        #initiate sample-specific parameters (shallow copy of shared par)
        sample_var = dict(self.par)
        sample_var['start_time'] = time.time()
        #sample name
        sample_var['sample_name'] = sample_name
        #sample directory (created if missing)
        sample_dir = self.par['sample_dirs'][sample_name]
        sample_var['sample_dir'] = myIO.dir_os(sample_dir).create_dir()
        print('\tSample directory: ', sample_var['sample_dir'])
        #raw data files, comma-joined for downstream tools
        sample_var['sample_raw_files'] = ','.join(
            sample_var['sample_to_raw'][sample_name])
        print('\tRaw files: ', sample_var['sample_raw_files'])
        #common prefix for all per-sample output files
        sample_var['file_head'] = sample_var['sample_dir'] + sample_name
        #default sam file
        sample_var['sample_sam_file'] = sample_var['file_head'] + '.sam'
        #file of read counts (peptide level)
        sample_var['sample_RC_file'] = sample_var['file_head'] + '_RC.txt'
        #protein-level read counts: sum and max across peptides
        sample_var['sample_pro_sumRC_file'] = sample_var[
            'file_head'] + '_pro_sumRC.txt'
        sample_var['sample_pro_maxRC_file'] = sample_var[
            'file_head'] + '_pro_maxRC.txt'
        #file for saturation analysis
        sample_var['sample_saturation_file'] = sample_var[
            'file_head'] + '_saturation.txt'
        #sample log
        sample_var['sample_log'] = sample_var['file_head'] + '.log'

        #sequence alignment (output is a sam file)
        if sample_var['phip_alignment'] == 'yes':
            print("\n###sequence alignment", sample_var['tool_aligner'])
            if sample_var['tool_aligner'] == 'bowtie1':
                myAlign.alignment(sample_var).bowtie1_alignment()

        #count reads
        if sample_var['phip_counting'] == 'yes':
            #RC matrix by peptides
            myAlign.alignment(sample_var).count_reads()
            #RC matrix by proteins (requires annotation)
            if 'file_annotation' in self.par:
                self.combine_peptides(sample_var)

        #update sample log with elapsed-time info
        sample_times = mySystem.system().get_time(sample_var['start_time'])
        sample_times['sample_name'] = sample_name
        myIO.file_os(sample_var['sample_log'], '=').line_replace(sample_times)
Example no. 3
0
    def main_loop(self):
        """Drive the whole PhIP-seq pipeline.

        Phases, each gated by a ``self.par['phip_*'] == 'yes'`` flag:
          1. per-sample alignment/counting (multi-process);
          2. merge per-sample RC files into matrices plus QC statistics;
          3. scaling and z-score normalization (NC-based when
             ``file_NC`` is configured, RLM otherwise), then collapse
             peptide matrices to protein level;
          4. functional analysis / QC, fanned out on a process pool.
        """
        print("\n\n####Parameters of PHIP: \n")
        #parallel processing of individual samples
        if (self.par['phip_alignment'] == 'yes'
                or self.par['phip_counting'] == 'yes'):
            sample_names = self.par['sample_names']
            print(len(sample_names), ' samples will be analyzed.\n')
            #multi-processes (thread variant kept in history: pp_map_threads)
            myCommon.basic(self.par).pp_map_process(mp_alignment,
                                                    [(self, s)
                                                     for s in sample_names])

        #combine RC and statistics files
        if self.par['phip_merge'] == 'yes':
            #1: combine RC files into RC matrix
            print('\n\n\n###Combine RC files (phip_merge)\n')
            #each tuple: (file suffix, RC level, out file, row ids)
            args_list = []
            RC_level = 'lowRC'
            #peptide level: lowRC
            out_file = self.par['files_dict']['pep_RC']
            args_list.append(
                ('_RC.txt', RC_level, out_file, self.par['pep_ids']))
            if 'file_annotation' in self.par:
                #promax level
                out_file = self.par['files_dict']['promax_RC']
                args_list.append(('_pro_maxRC.txt', RC_level, out_file,
                                  self.par['pro_ids']))
                #prosum level
                out_file = self.par['files_dict']['prosum_RC']
                args_list.append(('_pro_sumRC.txt', RC_level, out_file,
                                  self.par['pro_ids']))
            #multi-threads
            myCommon.basic(self.par).pp_map_threads(
                myAlign.alignment(self.par).combine_countfiles, args_list)
            #2: generate statistics.csv
            myCommon.basic(self.par).QC_statistics()

        #significance analysis using Z scores
        if self.par['phip_zscores'] == 'yes':
            print('\n\n\n###normalization of RC (phip_zscores)\n')
            #peptide level
            RC_file = self.par['files_dict']['pep_RC']  #infile
            #1: scaling RCs
            sRC_file = self.par['files_dict']['pep_scalingRC']  #outfile
            myStat.normalization(self.par, RC_file, sRC_file,
                                 'pep_id').RC_scaling()
            #2: z-scores of scaled RCs; method depends on whether explicit
            #negative controls were provided
            zfile = self.par['files_dict']['pep_NCPHIPzscores']  #outfile
            if 'file_NC' in self.par:
                myStat.normalization(self.par, sRC_file, zfile,
                                     'pep_id').NCPHIPzscores_PN()
            else:
                myStat.normalization(self.par, sRC_file, zfile,
                                     'pep_id').NCPHIPzscores_RLM()

            #3: collapse peptide matrix into protein matrices
            if 'file_annotation' in self.par:
                print("\t######collapse peptide matrix into protein matrix")
                pars = []
                for name in ['scalingRC', 'NCPHIPzscores']:
                    pep_file = self.par['files_dict']['pep_' + name]  #infile
                    sum_file = self.par['files_dict']['pep_' + name +
                                                      '_prosum']  #outfile
                    pars.append((pep_file, sum_file, sum))
                    max_file = self.par['files_dict']['pep_' + name +
                                                      '_promax']  #outfile
                    pars.append((pep_file, max_file, max))
                #multiple-threading
                myCommon.basic(self.par).pp_map_threads(
                    myCommon.basic(self.par).collapse_matrix, pars)

        #Functional analysis after normalization and correction
        print('\n\n\n###Functional Analysis (phip_GP and phip_enrichment)\n')
        pool = mpd.Pool(processes=self.par['threads_num'])
        #input files for the analyses below
        #NOTE(review): promax/prosum z-files are read even when no
        #'file_annotation' was configured — confirm files_dict always has
        #these keys, otherwise this raises KeyError.
        pep_zfile = self.par['files_dict']['pep_NCPHIPzscores']  #infile
        promax_zfile = self.par['files_dict']['pep_NCPHIPzscores_promax']
        prosum_zfile = self.par['files_dict']['pep_NCPHIPzscores_prosum']
        if self.par['phip_GP'] == 'yes':
            #1: polyclonal analysis of significant peptides
            pool.apply_async(self.sig_polyclonal, args=(pep_zfile, ))
            #virus library only
            if 'VirScan' in self.par['file_annotation']:
                #5: inter/intra-species search, virus library only
                pool.apply_async(self.taxon_spec,
                                 args=(pep_zfile, 'taxon_phip', 'pep_id'))
                #6: species alignment of virus only
                file_aln = self.par['dir_ref_seq'] + 'specie_blast.txt'
                pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))
                #7: organism alignment of virus only
                file_aln = self.par['dir_ref_seq'] + 'organism_blast.txt'
                pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))

            ##quality control
            #1: relationship between significant hits and raw read number
            for zfile_qc in (pep_zfile, prosum_zfile, promax_zfile):
                pool.apply_async(myCommon.basic(self.par).QC_hits,
                                 args=(zfile_qc, ))
            #2: saturation analysis
            pool.apply_async(myCommon.basic(self.par).QC_saturation)

        if self.par['phip_enrichment'] == 'yes':
            #5: detection of enriched protein motifs
            E = myCommon.basic(self.par)
            if 'pro_motifs' in list(self.par['annot_df']):
                pool.apply_async(E.enrich_pro,
                                 args=(pep_zfile, 'pep_id', 'pro_motifs',
                                       ';', ','))
            #6: GO, loci, PPI, KEGG, InterPro, MIM, autoantigen enrichment
            terms = {
                'GO', 'map', 'PPI', 'KEGG', 'InterPro', 'MIM', 'autoantigen'
            } & set(self.par['annot_df'])
            for term in terms:
                pool.apply_async(E.enrich_pro,
                                 args=(prosum_zfile, 'pro_id', term, ',',
                                       None))
                pool.apply_async(E.enrich_pro,
                                 args=(promax_zfile, 'pro_id', term, ',',
                                       None))
        pool.close()
        pool.join()