def init_analysis(self):
    #1: read annotation file
    if 'file_annotation' in self.par.keys():
        self.par['annot_df'] = myDataframe.basic().annot_df(
            self.par['file_annotation'])
        #genome annotation: associations of proteins and peptides
        self.par['dict_pro_pep'] = myCommon.basic(self.par).protein_peptides()
        #virus only
        if 'VirScan' in self.par['file_annotation']:
            #extract aa stretches: dependent peptides are pairs of peptides
            #that share at least 7 aa
            self.par['dependent_pep'] = myCommon.basic(
                self.par).taxon_dependent_peptides()

    #2: check bowtie, or build the bowtie index
    myAlign.alignment(self.par).build_bowtie_index()

    #3: sample info
    self.par = myParallel.samples(self.par).export_sample_info()
    #samples of negative controls
    group1 = self.par['group1']
    if 'NC' in group1.keys():
        self.par['NC_samples'] = group1['NC'].split(',')
        self.par['phip_samples'] = list(
            set(self.par['sample_names']) - set(self.par['NC_samples']))
        print('\nNumber of negative controls (beads only): ',
              len(self.par['NC_samples']))
        print('Number of PhIP samples: ', len(self.par['sample_names']))
    #myDict.basic(self.par['sample_dirs']).print_dict()

    #read the reference sequence file (*.fa)
    ref_dict, ref_ids = myGenome.genome(self.par['file_ref_fa']).read_fa()
    self.par['ref_dict'] = ref_dict
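# A minimal sketch (not part of the pipeline) of the return shape assumed
# for read_fa() above: a {seq_id: sequence} dict plus the ordered id list
# from the reference *.fa file. The real myGenome reader may parse headers
# differently; treating the first header token as the id is an assumption.
def _read_fa_sketch(fa_path):
    """Parse a FASTA file into ({id: seq}, [ids]); illustrative only."""
    ref_dict, ref_ids = {}, []
    seq_id = None
    with open(fa_path) as fh:
        for line in fh:
            line = line.rstrip()
            if line.startswith('>'):
                seq_id = line[1:].split()[0]  #assumed id format
                ref_ids.append(seq_id)
                ref_dict[seq_id] = ''
            elif seq_id is not None:
                ref_dict[seq_id] += line
    return ref_dict, ref_ids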
def phipseq_alignment(self, sample_name):
    print('\n######Analysis of {} will be triggered!#####'.format(
        sample_name))
    #initiate sample parameters
    sample_var = dict(self.par)
    sample_var['start_time'] = time.time()
    #sample name
    sample_var['sample_name'] = sample_name
    #sample directory
    sample_dir = self.par['sample_dirs'][sample_name]
    sample_var['sample_dir'] = myIO.dir_os(sample_dir).create_dir()
    print('\tSample directory: ', sample_var['sample_dir'])
    #raw data
    sample_var['sample_raw_files'] = ','.join(
        sample_var['sample_to_raw'][sample_name])
    print('\tRaw files: ', sample_var['sample_raw_files'])
    #export
    sample_var['file_head'] = sample_var['sample_dir'] + sample_name
    #default sam file
    sample_var['sample_sam_file'] = sample_var['file_head'] + '.sam'
    #files of read counts
    sample_var['sample_RC_file'] = sample_var['file_head'] + '_RC.txt'
    sample_var['sample_pro_sumRC_file'] = sample_var[
        'file_head'] + '_pro_sumRC.txt'
    sample_var['sample_pro_maxRC_file'] = sample_var[
        'file_head'] + '_pro_maxRC.txt'
    #file for saturation analysis
    sample_var['sample_saturation_file'] = sample_var[
        'file_head'] + '_saturation.txt'
    #sample log
    sample_var['sample_log'] = sample_var['file_head'] + '.log'

    #sequence alignment
    if sample_var['phip_alignment'] == 'yes':
        print("\n###sequence alignment", sample_var['tool_aligner'])
        #output is a sam file
        if sample_var['tool_aligner'] == 'bowtie1':
            myAlign.alignment(sample_var).bowtie1_alignment()

    #count reads
    if sample_var['phip_counting'] == 'yes':
        #RC matrix by peptides
        myAlign.alignment(sample_var).count_reads()
        #RC matrix by proteins
        if 'file_annotation' in self.par.keys():
            self.combine_peptides(sample_var)

    #update sample log
    sample_times = mySystem.system().get_time(sample_var['start_time'])
    sample_times['sample_name'] = sample_name
    myIO.file_os(sample_var['sample_log'], '=').line_replace(sample_times)
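# main_loop() below dispatches samples through a helper named mp_alignment
# that is not defined in this section. A minimal sketch of what it plausibly
# does: multiprocessing needs a picklable top-level callable, so the bound
# method is wrapped and the (instance, sample_name) tuple is unpacked here.
def mp_alignment(args):
    """Unpack (instance, sample_name) and run the per-sample analysis."""
    obj, sample_name = args
    return obj.phipseq_alignment(sample_name)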
def main_loop(self):
    print("\n\n####Parameters of PHIP: \n")
    #parallel processing
    if self.par['phip_alignment'] == 'yes' or self.par[
            'phip_counting'] == 'yes':
        sample_names = self.par['sample_names']
        print(len(sample_names), ' samples will be analyzed.\n')
        #multi-threads
        #myCommon.basic(self.par).pp_map_threads(self.phipseq_alignment, sample_names)
        #multi-processes
        myCommon.basic(self.par).pp_map_process(
            mp_alignment, [(self, s) for s in sample_names])

    #combine RC and statistics files
    if self.par['phip_merge'] == 'yes':
        #1: combine RC files into an RC matrix
        print('\n\n\n###Combine RC files (phip_merge)\n')
        #get arguments
        args_list = []
        RC_level = 'lowRC'
        #peptide level: lowRC
        out_file = self.par['files_dict']['pep_RC']
        arg_tuple = ('_RC.txt', RC_level, out_file, self.par['pep_ids'])
        args_list.append(arg_tuple)
        if 'file_annotation' in self.par:
            #promax level
            out_file = self.par['files_dict']['promax_RC']
            arg_tuple = ('_pro_maxRC.txt', RC_level, out_file,
                         self.par['pro_ids'])
            args_list.append(arg_tuple)
            #prosum level
            out_file = self.par['files_dict']['prosum_RC']
            arg_tuple = ('_pro_sumRC.txt', RC_level, out_file,
                         self.par['pro_ids'])
            args_list.append(arg_tuple)
        #multi-threads
        myCommon.basic(self.par).pp_map_threads(
            myAlign.alignment(self.par).combine_countfiles, args_list)
        #myCommon.basic(self.par).pp_apply_threads(args_list)
        #2: generate statistics.csv
        myCommon.basic(self.par).QC_statistics()

    #significance analysis using Z scores
    if self.par['phip_zscores'] == 'yes':
        print('\n\n\n###normalization of RC (phip_zscores)\n')
        #peptide level
        RC_file = self.par['files_dict']['pep_RC'] #infile
        #1: scaling RCs
        sRC_file = self.par['files_dict']['pep_scalingRC'] #outfile
        myStat.normalization(self.par, RC_file, sRC_file,
                             'pep_id').RC_scaling()
        #2: z-scores of scaled RCs against negative controls and PhIP samples
        zfile = self.par['files_dict']['pep_NCPHIPzscores'] #outfile
        if 'file_NC' in self.par.keys():
            myStat.normalization(self.par, sRC_file, zfile,
                                 'pep_id').NCPHIPzscores_PN()
        else:
            myStat.normalization(self.par, sRC_file, zfile,
                                 'pep_id').NCPHIPzscores_RLM()
        #3: collapse matrix
        if 'file_annotation' in self.par:
            print("\t######collapse peptide matrix into protein matrix")
            pars = []
            for name in ['scalingRC', 'NCPHIPzscores']:
                pep_file = self.par['files_dict']['pep_' + name] #infile
                sum_file = self.par['files_dict'][
                    'pep_' + name + '_prosum'] #outfile
                pars.append((pep_file, sum_file, sum))
                max_file = self.par['files_dict'][
                    'pep_' + name + '_promax'] #outfile
                pars.append((pep_file, max_file, max))
            #multi-threading
            myCommon.basic(self.par).pp_map_threads(
                myCommon.basic(self.par).collapse_matrix, pars)

    #functional analysis after normalization and correction
    #parallel processing
    print('\n\n\n###Functional Analysis (phip_GP and phip_enrichment)\n')
    pool = mpd.Pool(processes=self.par['threads_num'])
    #set the list of parameters
    pep_zfile = self.par['files_dict']['pep_NCPHIPzscores'] #infile
    promax_zfile = self.par['files_dict']['pep_NCPHIPzscores_promax']
    prosum_zfile = self.par['files_dict']['pep_NCPHIPzscores_prosum']
    if self.par['phip_GP'] == 'yes':
        #1: polyclonal analysis of significant peptides
        pool.apply_async(self.sig_polyclonal, args=(pep_zfile,))
        #virus only
        if 'VirScan' in self.par['file_annotation']:
            #5: inter-/intra-species searching, virus library only
            pool.apply_async(self.taxon_spec,
                             args=(pep_zfile, 'taxon_phip', 'pep_id'))
            #6: species alignment, virus only
            file_aln = self.par['dir_ref_seq'] + 'specie_blast.txt'
            pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))
            #7: organism alignment, virus only
            file_aln = self.par['dir_ref_seq'] + 'organism_blast.txt'
            pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))
        ##quality control
        #1: relationship between significant hits and raw read numbers
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(pep_zfile,))
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(prosum_zfile,))
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(promax_zfile,))
        #2: saturation analysis
        pool.apply_async(myCommon.basic(self.par).QC_saturation)

    if self.par['phip_enrichment'] == 'yes':
        #5: detection of enriched protein motifs
        E = myCommon.basic(self.par)
        if 'pro_motifs' in list(self.par['annot_df']):
            pool.apply_async(E.enrich_pro,
                             args=(pep_zfile, 'pep_id', 'pro_motifs', ';', ','))
        #6: GO, loci, PPI, KEGG, InterPro, and multifunctional scaffold
        #protein enrichment analysis
        terms = set(['GO', 'map', 'PPI', 'KEGG', 'InterPro', 'MIM',
                     'autoantigen']) & set(list(self.par['annot_df']))
        for term in terms:
            pool.apply_async(E.enrich_pro,
                             args=(prosum_zfile, 'pro_id', term, ',', None))
            pool.apply_async(E.enrich_pro,
                             args=(promax_zfile, 'pro_id', term, ',', None))

    pool.close()
    pool.join()
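# Hedged sketches of the two core transforms used above, assuming the
# matrices are pandas DataFrames with pep_id rows and sample columns. The
# pipeline's own myStat.normalization and myCommon.basic.collapse_matrix
# are authoritative; these only illustrate the idea.
import pandas as pd

def _nc_zscores_sketch(scaled_rc, nc_samples):
    """Z-score each peptide against its beads-only (NC) distribution."""
    nc = scaled_rc[nc_samples]
    mu = nc.mean(axis=1)
    sd = nc.std(axis=1).replace(0, float('nan'))  #avoid division by zero
    return scaled_rc.sub(mu, axis=0).div(sd, axis=0)

def _collapse_matrix_sketch(pep_df, dict_pro_pep, func):
    """Collapse a peptide matrix to proteins with func (sum or max).

    dict_pro_pep is assumed to map pro_id -> iterable of pep_ids,
    mirroring self.par['dict_pro_pep'] built in init_analysis().
    """
    rows = {}
    for pro_id, pep_ids in dict_pro_pep.items():
        hits = [p for p in pep_ids if p in pep_df.index]
        if hits:
            rows[pro_id] = pep_df.loc[hits].apply(func)  #per sample column
    return pd.DataFrame(rows).T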