    def match_fasta(self):
        files = myIO.dir_os(self.par['dir_out']).incrusive_files()
        #select a fasta file
        fa_files = list(filter(lambda x: x.endswith(('.fa', '.fasta')), files))
        self.par['match_fa'] = mySystem.system().select_key(fa_files)
        #select a gtf or gff file
        gtf_files = list(filter(lambda x: x.endswith(('.gtf', '.gff3')), files))
        self.par['match_gtf'] = mySystem.system().select_key(gtf_files)

        #match
        if self.par['web_site'] == 'ENSEMBL':
            myGenome.genome(self.par['match_fa']).match_ensembl_fa(
                self.par['match_gtf'])
        elif self.par['web_site'] == 'NCBI':
            myGenome.genome(self.par['match_fa']).match_ncbi_fa(
                self.par['match_gtf'])
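        # Note: ENSEMBL and NCBI publish different FASTA header styles
        # (e.g. ">1 dna:chromosome ..." vs ">NC_000001.11 Homo sapiens
        # chromosome 1 ..."), which is presumably why separate
        # match_ensembl_fa()/match_ncbi_fa() parsers are needed.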
    def init_analysis(self):
        #1: read annotation file
        if 'file_annotation' in self.par:
            self.par['annot_df'] = myDataframe.basic().annot_df(
                self.par['file_annotation'])
            #genome annotation: associations of protein-peptides
            self.par['dict_pro_pep'] = myCommon.basic(
                self.par).protein_peptides()
            #virus only
            if 'VirScan' in self.par['file_annotation']:
                #extract aa stretch
                #get dependent peptides, i.e. peptide pairs sharing at least a 7-aa stretch
                self.par['dependent_pep'] = myCommon.basic(
                    self.par).taxon_dependent_peptides()

        #2: check bowtie or build bowtie index
        myAlign.alignment(self.par).build_bowtie_index()

        #3: sample info
        self.par = myParallel.samples(self.par).export_sample_info()
        #samples of negative controls
        group1 = self.par['group1']
        if 'NC' in group1:
            self.par['NC_samples'] = group1['NC'].split(',')
            self.par['phip_samples'] = list(
                set(self.par['sample_names']) - set(self.par['NC_samples']))
            print('\nNumber of negative controls (beads only): ',
                  len(self.par['NC_samples']))
            print('Number of PhIP samples: ',
                  len(self.par['phip_samples']))
            #myDict.basic(self.par['sample_dirs']).print_dict()

        #read reference sequence file (*.fa)
        ref_dict, ref_ids = myGenome.genome(self.par['file_ref_fa']).read_fa()
        self.par['ref_dict'] = ref_dict
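
# A minimal sketch of an interactive chooser in the spirit of
# mySystem.system().select_key() (hypothetical stand-in, not the pipeline's
# actual implementation): print the candidates and let the user pick one.
def select_key(candidates):
    candidates = list(candidates)
    for i, name in enumerate(candidates):
        print('{}: {}'.format(i, name))
    return candidates[int(input('Select a file by index: '))]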
Example #3
    def init_RCdict(self):
        RC_dict = {}
        #get all ref names from the refseq file
        ref_names = myGenome.genome(self.par['file_ref_fa']).fa_displayid()
        for ref in ref_names:
            RC_dict[ref] = {'lowRC': 0, 'midRC': 0, 'highRC': 0}
        return RC_dict
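
# Usage sketch (hypothetical; the cutoffs are illustrative, not the
# pipeline's): classify a reference's read count into one of the three
# bins initialized above.
def update_RC(RC_dict, ref, count, low_cut=10, high_cut=100):
    if count < low_cut:
        RC_dict[ref]['lowRC'] += 1
    elif count < high_cut:
        RC_dict[ref]['midRC'] += 1
    else:
        RC_dict[ref]['highRC'] += 1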
Example #4
    def download_dna(self):
        #get html
        lines = web(self.url['dna_fa']).get_html()
        chr_files = self.dna_files(lines)

        #download and decompress genome files
        local_chr_files = {}
        for key in chr_files:
            #release version
            self.ver = re.sub(r"_chr.*", '', chr_files[key])
            url = self.url['dna_fa'] + chr_files[key]
            gz_file = myIO.file_os(url).download(self.out_dir)
            #decompress file
            #ungz_file=myIO.file_os(gz_file).decompress_gz()
            local_chr_files[key] = gz_file
        #combine fa files
        out_file = ''.join([self.out_dir, self.ver, '_dna.fa'])
        #print(out_file)
        myGenome.genome(out_file).combine_fa(local_chr_files)
        return local_chr_files, out_file
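
# Sketch of what a combine step like combine_fa() could do (assumed
# behavior, not the pipeline's code: stream each downloaded .gz chromosome
# into a single plain-text FASTA):
import gzip
import shutil

def combine_fa(out_file, chr_files):
    with open(out_file, 'wb') as out:
        for key in sorted(chr_files):
            with gzip.open(chr_files[key], 'rb') as gz:
                shutil.copyfileobj(gz, out)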
Example #5
    def download_dna(self):
        url = self.url['dna_fa']
        #get genome files
        #get html
        lines = web(url).get_html()
        chr_files = self.dna_files(lines)

        #download and decompress genome files
        local_chr_files = {}
        for key in chr_files:
            #release version
            self.ver = re.sub(r"\.chromosome.*", '', chr_files[key])
            gz_file = myIO.file_os(url + chr_files[key]).download(self.out_dir)
            #decompress file
            #ungz_file=myIO.file_os(gz_file).decompress_gz()
            local_chr_files[key] = gz_file
        #combine fa files
        out_file = self.out_dir + self.ver + '.fa'
        #print(out_file)
        myGenome.genome(out_file).combine_fa(local_chr_files)
        return local_chr_files, out_file
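
# Sketch of what a helper like self.dna_files() might do (hypothetical:
# scrape "*.fa.gz" names from the FTP listing HTML and key them by
# chromosome; the real parser may differ):
import re

def dna_files(html):
    files = {}
    for m in re.finditer(r'href="([^"]*\.chromosome\.([^.]+)\.fa\.gz)"', html):
        files[m.group(2)] = m.group(1)
    return files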
    def main_loop(self):
        print("\n\n####Parameters of PHIP: \n")
        #parallel processing
        if self.par['phip_alignment'] == 'yes' or self.par[
                'phip_counting'] == 'yes':
            sample_names = self.par['sample_names']
            print(len(sample_names), ' samples will be analyzed.\n')
            #multi-threads
            #myCommon.basic(self.par).pp_map_threads(self.phipseq_alignment, sample_names)
            #multi-processes
            myCommon.basic(self.par).pp_map_process(mp_alignment,
                                                    [(self, s)
                                                     for s in sample_names])

        #combine RC and statistics file
        if self.par['phip_merge'] == 'yes':
            pep_names = myGenome.genome(self.par['file_ref_fa']).read_fa()[1]
            #1: combine RC files into RC matrix
            print('\n\n\n###Combine RC files (phip_merge)\n')
            #get arguments
            args_list = []
            RC_level = 'lowRC'
            #peptide level: lowRC
            out_file = self.par['files_dict']['pep_RC']
            arg_tuple = ('_RC.txt', RC_level, out_file, pep_names)
            args_list.append(arg_tuple)
            if 'file_annotation' in self.par:
                #promax level
                out_file = self.par['files_dict']['promax_RC']
                arg_tuple = ('_pro_maxRC.txt', RC_level, out_file, None)
                args_list.append(arg_tuple)
                #prosum level
                out_file = self.par['files_dict']['prosum_RC']
                arg_tuple = ('_pro_sumRC.txt', RC_level, out_file, None)
                args_list.append(arg_tuple)
            #multi-threads
            myCommon.basic(self.par).pp_map_threads(
                myAlign.alignment(self.par).combine_countfiles, args_list)
            #myCommon.basic(self.par).pp_apply_threads(args_list)
            #2: generate statistics.csv
            myCommon.basic(self.par).QC_statistics()

        #significance analysis using Z score
        if self.par['phip_zscores'] == 'yes':
            print('\n\n\n###normalization of RC (phip_zscores)\n')
            #peptides level
            RC_file = self.par['files_dict']['pep_RC']  #infile
            #1: scaling RCs
            sRC_file = self.par['files_dict']['pep_scalingRC']  # outfile
            myStat.normalization(self.par, RC_file, sRC_file,
                                 'pep_id').RC_scaling()
            #2: z-scores of scaling RCs against negative controls and phipseq samples
            zfile = self.par['files_dict']['pep_NCPHIPzscores']  #outfile
            if 'file_NC' in self.par:
                myStat.normalization(self.par, sRC_file, zfile,
                                     'pep_id').NCPHIPzscores_PN()
            else:
                myStat.normalization(self.par, sRC_file, zfile,
                                     'pep_id').NCPHIPzscores_RLM()

            #3: collapse matrix
            if 'file_annotation' in self.par:
                print("\t######collapse peptide matrix into protein matrix")
                pars = []
                for name in ['scalingRC', 'NCPHIPzscores']:
                    pep_file = self.par['files_dict']['pep_' + name]  #infile
                    sum_file = self.par['files_dict']['pep_' + name +
                                                      '_prosum']  #outfile
                    pars.append((pep_file, sum_file, sum))
                    max_file = self.par['files_dict']['pep_' + name +
                                                      '_promax']  #outfile
                    pars.append((pep_file, max_file, max))
                #multi-threading
                myCommon.basic(self.par).pp_map_threads(
                    myCommon.basic(self.par).collapse_matrix, pars)

        #Functional analysis after normalization and correction
        #parallel processing
        print('\n\n\n###Functional Analysis (phip_GP and phip_enrichment)\n')
        pool = mpd.Pool(processes=self.par['threads_num'])
        #set the list of parameters
        pep_zfile = self.par['files_dict']['pep_NCPHIPzscores']  #infile
        promax_zfile = self.par['files_dict']['pep_NCPHIPzscores_promax']
        prosum_zfile = self.par['files_dict']['pep_NCPHIPzscores_prosum']
        if self.par['phip_GP'] == 'yes':
            #1: polyclonal analysis of significant peptides
            pool.apply_async(self.sig_polyclonal, args=(pep_zfile, ))

            #virus only
            if 'VirScan' in self.par['file_annotation']:
                #5: inter-/intra-species searching, only for the virus library
                pool.apply_async(self.taxon_spec,
                                 args=(
                                     pep_zfile,
                                     'phip_taxon',
                                     'pep_id',
                                 ))
                #6: species alignment (virus only)
                file_aln = self.par['dir_ref_seq'] + 'specie_blast.txt'
                pool.apply_async(self.taxon_blast,
                                 args=(
                                     file_aln,
                                     pep_zfile,
                                 ))
                #7: organism alignment (virus only)
                file_aln = self.par['dir_ref_seq'] + 'organism_blast.txt'
                pool.apply_async(self.taxon_blast,
                                 args=(
                                     file_aln,
                                     pep_zfile,
                                 ))

            ##quality control
            #1: relationship between significant hits and raw read num
            pool.apply_async(myCommon.basic(self.par).QC_hits,
                             args=(pep_zfile, ))
            pool.apply_async(myCommon.basic(self.par).QC_hits,
                             args=(prosum_zfile, ))
            pool.apply_async(myCommon.basic(self.par).QC_hits,
                             args=(promax_zfile, ))
            #2:saturation analysis
            pool.apply_async(myCommon.basic(self.par).QC_saturation)

        if self.par['phip_enrichment'] == 'yes':
            #5:Detection of enriched protein motifs
            E = myCommon.basic(self.par)
            if 'pro_motifs' in list(self.par['annot_df']):
                pool.apply_async(E.enrich_pro,
                                 args=(
                                     pep_zfile,
                                     'pep_id',
                                     'pro_motifs',
                                     ';',
                                     ',',
                                 ))
            #6: GO, loci, PPI, KEGG, InterPro, MIM, and autoantigen enrichment analysis
            terms = set([
                'GO', 'map', 'PPI', 'KEGG', 'InterPro', 'MIM', 'autoantigen'
            ]) & set(list(self.par['annot_df']))
            for term in terms:
                pro = self.par['protein_assoc']
                pool.apply_async(E.enrich_pro,
                                 args=(
                                     prosum_zfile,
                                     pro,
                                     term,
                                     ',',
                                     None,
                                 ))
                pool.apply_async(E.enrich_pro,
                                 args=(
                                     promax_zfile,
                                     pro,
                                     term,
                                     ',',
                                     None,
                                 ))
        pool.close()
        pool.join()
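
# Note (general multiprocessing practice, not from this file): apply_async()
# swallows worker exceptions unless the AsyncResult objects are checked.
# A minimal, self-contained illustration of the safer pattern:
from multiprocessing.pool import ThreadPool

def _demo(x):
    return x * x

pool = ThreadPool(processes=2)
results = [pool.apply_async(_demo, args=(i,)) for i in range(4)]
pool.close()
pool.join()
for r in results:
    r.get()  # re-raises any exception from the worker instead of hiding it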
Example #7
        sys.exit(2)

    #no return


##############################
if __name__ == "__main__":
    #home_dir=os.path.expanduser("~")+'/'
    #get parameters from command line
    par = par_command(sys.argv)
    #judge parameters
    judge_par(par)

    #combine index files if both par['I1_file'] and par['I2_file'] exist
    #(see the cbind_fq sketch after this block)
    if os.path.isfile(par['I1_file']) and os.path.isfile(par['I2_file']):
        myGenome.genome(par['I1_file']).cbind_fq(par['I2_file'],
                                                 par['index_file'])

    #demultiplexing: split fastq files based on barcode
    if par['multiplexing_mode'] == 3:
        print('The split FASTQ files are stored in {}'.format(
            par['dir_raw_data']))
        myGenome.genome(par['fq_file']).demultiplex_fq(par)

    #trim fastq files
    if 'fq_files' in par:
        for fq in par['fq_files']:
            myGenome.genome(fq).trim_fq(par['dir_raw_data'], par['seq_start'],
                                        par['seq_end'])

    #generate sample_info file under result dir:
    if par['out'] != 'NA':
        #current dir
        par['dir_bin'] = os.path.dirname(os.path.realpath(__file__)) + '/'
        par['dir_home'] = os.path.abspath(os.path.join(par['dir_bin'], os.pardir)) + '/'
        print('Home directory of phip pipeline: ', par['dir_home'])
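
# Sketch of what an index-combining step like cbind_fq() might do (assumed
# semantics, not the pipeline's code: paste the I1 and I2 barcode reads of
# each cluster side by side into one index FASTQ):
def cbind_fq(i1_file, i2_file, out_file):
    with open(i1_file) as f1, open(i2_file) as f2, open(out_file, 'w') as out:
        for n, (a, b) in enumerate(zip(f1, f2)):
            if n % 4 in (1, 3):  # sequence and quality lines: concatenate
                out.write(a.rstrip('\n') + b)
            else:                # header and '+' lines: keep read 1's
                out.write(a)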
##############################
if __name__ == "__main__":
    #home_dir=os.path.expanduser("~")+'/'
    #get parameters from command line
    par = par_command(sys.argv)
    #judge parameters
    judge_par(par)
    
    #combine index files if par['index_file'], or both par['I1_file'] and par['I2_file'], exist
    #if os.path.isfile(par['I1_file']) and os.path.isfile(par['I2_file']):
    #    myGenome.genome(par['I1_file']).cbind_fq(par['I2_file'], par['index_file'])

    #demultiplexing: split fastq files based on barcode
    if os.path.isfile(par['index_file']):
        myGenome.genome(par['fq_file']).decompose_fq(par)
    elif os.path.isfile(par['I1_file']) and os.path.isfile(par['I2_file']):
        myGenome.genome(par['fq_file']).decompose_fq2(par)
        
    #trim fastq files
    if 'fq_files' in par:
        for fq in par['fq_files']:
            myGenome.genome(fq).trim_fq(par['dir_raw_data'], par['seq_start'], par['seq_end'])
            
    #generate sample_info file under result dir: 
    if par['out'] != 'NA':
        #current dir
        par['dir_bin'] = os.path.dirname(os.path.realpath(__file__)) + '/'
        par['dir_home'] = os.path.abspath(os.path.join(par['dir_bin'], os.pardir)) + '/'
    print('Home directory of phip pipeline: ', par['dir_home'])
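
# A minimal demultiplexing sketch (hypothetical; decompose_fq()/demultiplex_fq()
# in the pipeline likely add mismatch tolerance, gzip handling and per-sample
# file output):
def demultiplex(records, barcode_to_sample):
    """records: iterable of (barcode, fastq_record_text) pairs."""
    per_sample = {}
    for barcode, rec in records:
        sample = barcode_to_sample.get(barcode)
        if sample is not None:
            per_sample.setdefault(sample, []).append(rec)
    return per_sample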