def init_analysis(self):
    # 1: read annotation file
    if 'file_annotation' in self.par.keys():
        self.par['annot_df'] = myDataframe.basic().annot_df(self.par['file_annotation'])
        # genome annotation: associations of proteins and peptides
        self.par['dict_pro_pep'] = myCommon.basic(self.par).protein_peptides()
        # virus only: extract aa stretch
        if 'VirScan' in self.par['file_annotation']:
            # get dependent peptides, namely pairs sharing at least a 7-aa stretch
            self.par['dependent_pep'] = myCommon.basic(self.par).taxon_dependent_peptides()

    # 2: check bowtie or build bowtie index
    myAlign.alignment(self.par).build_bowtie_index()

    # 3: sample info
    self.par = myParallel.samples(self.par).export_sample_info()
    # samples of negative controls
    group1 = self.par['group1']
    if 'NC' in group1.keys():
        self.par['NC_samples'] = group1['NC'].split(',')
        self.par['phip_samples'] = list(
            set(self.par['sample_names']) - set(self.par['NC_samples']))
        print('\nNumber of negative controls (beads only): ',
              len(self.par['NC_samples']))
        print('Number of PhIP samples: ', len(self.par['phip_samples']))
    # myDict.basic(self.par['sample_dirs']).print_dict()

    # read reference sequence file (*.fa)
    ref_dict, ref_ids = myGenome.genome(self.par['file_ref_fa']).read_fa()
    self.par['ref_dict'] = ref_dict
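# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (an assumption, not the project's myGenome
# implementation) of what read_fa() above is expected to return: a dict of
# {seq_id: sequence} plus the ordered list of ids, parsed from a FASTA file.
def _sketch_read_fa(fa_path):
    ref_dict, ref_ids = {}, []
    with open(fa_path) as fh:
        seq_id = None
        for line in fh:
            line = line.rstrip()
            if line.startswith('>'):
                # header line: take the first token as the sequence id
                seq_id = line[1:].split()[0]
                ref_ids.append(seq_id)
                ref_dict[seq_id] = ''
            elif seq_id is not None:
                # sequence line: append to the current record
                ref_dict[seq_id] += line
    return ref_dict, ref_ids
# ---------------------------------------------------------------------------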
def sig_polyclonal(self, count_file):
    print("Polyclonal analysis of ", count_file)
    comb_df, pep_df = myCommon.basic(self.par).combine_df(count_file)

    def hits_func(x, peps, threshold, pro_id):
        # significant hits: z-scores at or above the threshold
        hits = x[x >= threshold]
        # non-overlapping peptides
        peps = [str(p) for p in peps]
        hit_peps = [str(p) for p in hits.index]
        none_overlapped_hits_num = myList.basic(peps).un_neighbours(
            hit_peps, return_type='hits_num')
        return len(hits.index), none_overlapped_hits_num, ','.join(hit_peps)

    # collapse by protein
    hits1 = {}  # all significant hits per protein
    hits2 = {}  # non-overlapping significant hits per protein
    for pro_id, row_index in comb_df.groupby('pro_id').groups.items():
        # get protein-peptides annotations
        peps_str = self.par['dict_pro_pep'][pro_id]
        peps = peps_str.split(',')
        # sub-matrix of the peptides belonging to this protein
        # (.loc replaces the removed pandas .ix indexer)
        sub_df = pep_df.loc[row_index]
        # number of hits beyond the z-score threshold, per sample column
        hits_num = sub_df.apply(hits_func, axis=0,
                                args=(peps, self.par['zscore_threshold'], pro_id))
        # total number of significant hits
        num1 = [h[0] for h in hits_num]
        hits1[pro_id] = dict(zip(list(sub_df), list(num1)))
        # number of significant hits without overlapping
        num2 = [h[1] for h in hits_num]
        hits2[pro_id] = dict(zip(list(sub_df), list(num2)))

    # export
    file_head = myIO.file_os(count_file).file_prefix() + '_polyclonal'
    myDict.basic(hits1, self.par['pro_ids']).dict2_to_file(file_head + '.txt', "\t")
    myDict.basic(hits2, self.par['pro_ids']).dict2_to_file(
        file_head + '_nonoverlapped.txt', "\t")
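# ---------------------------------------------------------------------------
# A minimal sketch (an assumption) of the non-overlap counting delegated to
# myList.basic(peps).un_neighbours(hit_peps, return_type='hits_num') above:
# peptides are ordered along the protein, peptides at adjacent positions are
# assumed to overlap, so a hit is only counted when it is not adjacent to the
# previously counted hit.
def _sketch_non_overlapping_hits(ordered_peps, hit_peps):
    hit_set = set(hit_peps)
    count, last_pos = 0, None
    for pos, pep in enumerate(ordered_peps):
        if pep in hit_set and (last_pos is None or pos - last_pos > 1):
            count += 1
            last_pos = pos
    return count

# Example: p0 and p1 are adjacent, so only one of them counts:
# _sketch_non_overlapping_hits(['p0', 'p1', 'p2', 'p3'], ['p0', 'p1', 'p3']) -> 2
# ---------------------------------------------------------------------------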
def taxon_spec(self, count_file, taxon_rank, annot_index):
    # combine count matrix and annotation into one data frame
    combined_df, phip_df = myCommon.basic(self.par).combine_df(count_file, annot_index)

    # taxonomy names, with nan groups removed
    taxon_group = combined_df.groupby(taxon_rank).groups
    taxon_names = [t for t in taxon_group.keys() if str(t) != 'nan']

    taxon_pairs = {'taxon_specie': 'InterSpecie', 'taxon_genus': 'InterGenus',
                   'taxon_family': 'InterFamily', 'taxon_phip': 'InterTaxon'}
    taxon_inter = taxon_pairs[taxon_rank]
    # inter-score dict: taxon_inter should be pep_ids separated by commas
    pepid_taxoninter = pd.Series(combined_df[taxon_inter], index=list(phip_df.index))
    inter_df = myDataframe.basic(phip_df).interact_df(
        pepid_taxoninter, max, count_file + taxon_inter)
    # make permutation of pep_ids (disabled)
    # permute_dict = myList.basic(list(phip_df.index)).permute_Series(
    #     self.par['permutation_times'], slice_dict=taxon_group)

    # the hits of significant species-specific peptides:
    # rows are peptides, columns are phip samples plus species names.
    # initiate nested dicts
    taxon_dict = dict([(s, {}) for s in list(phip_df)])  # number of hits
    taxon_dict['peptides'] = dict([(a, len(b)) for a, b in taxon_group.items()])
    # taxon_pval_dict = dict([(s, {}) for s in list(phip_df)])  # p-values by permutation
    taxon_pep_dict = dict([(s, {}) for s in list(phip_df)])  # pep_id and z-scores of hits
    debugging_dict = {}  # for identifying bugs
    for s in list(phip_df):
        debugging_dict[s + ':all_hits'] = {}
        debugging_dict[s + ':inter_hits'] = {}
        debugging_dict[s + ':intra_hits'] = {}
        debugging_dict[s + ':hits'] = {}
        debugging_dict[s + ':counts'] = {}
        # debugging_dict[s + ':pvals'] = {}

    # loop over sample columns
    for sample_name, col in phip_df.items():
        for s, indexs in taxon_group.items():
            # 1: inter-taxon searching (.loc replaces the removed pandas .ix)
            inter_list = inter_df.loc[indexs][sample_name]
            inter_dict = self.taxon_inter_searching(col[indexs], inter_list)
            # export
            debugging_dict[sample_name + ':all_hits'][s] = inter_dict['all_hits']
            debugging_dict[sample_name + ':inter_hits'][s] = inter_dict['inter_hits']
            # 2: intra-taxon searching
            intra_dict = self.taxon_intra_searching(col[inter_dict['other_hits']])
            # export
            debugging_dict[sample_name + ':intra_hits'][s] = intra_dict['intra_hits']
            debugging_dict[sample_name + ':hits'][s] = intra_dict['hits']
            all_hits = ['{}:{}'.format('all', len(inter_dict['all_hits'])),
                        '{}:{}'.format('inter', len(inter_dict['inter_hits'])),
                        '{}:{}'.format('intra', len(intra_dict['intra_hits'])),
                        '{}:{}'.format('hits', len(intra_dict['hits']))]
            debugging_dict[sample_name + ':counts'][s] = ','.join(all_hits)
            hit_list = ['({},{})'.format(a, b)
                        for a, b in col[intra_dict['hits']].items()]
            taxon_pep_dict[sample_name][s] = ','.join(hit_list)
            # counts matrix of taxonomy search
            taxon_dict[sample_name][s] = len(intra_dict['hits'])
            # 3: permutation (disabled)
            # hit_scores = col[intra_dict['hits']]
            # permuted_scores = permute_dict[s]  # pep_ids in rows, permuted scores in columns
            # pval_dict = self.taxon_permutation(hit_scores, permuted_scores, col)
            # pval_list = [len(intra_dict['hits']), pval_dict['ttest_pval'], pval_dict['utest_pval']]
            # taxon_pval_dict[sample_name][s] = ','.join(map(str, pval_list))

    # export to file
    file_head = '{}_{}_'.format(myIO.file_os(count_file).file_prefix(), taxon_rank)
    taxon_dict = myDict.basic(taxon_dict).transform_dict2()
    myDict.basic(taxon_dict).dict2_to_file(file_head + 'counting.txt', "\t")
    taxon_pep_dict = myDict.basic(taxon_pep_dict).transform_dict2()
    myDict.basic(taxon_pep_dict).dict2_to_file(file_head + 'peptides.txt', "\t")
    debugging_dict = myDict.basic(debugging_dict).transform_dict2()
    myDict.basic(debugging_dict).dict2_to_file(file_head + 'debugging.txt', "\t")
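# ---------------------------------------------------------------------------
# A minimal sketch (an assumption) of the myDict transform_dict2() used above:
# transpose a two-level dict so the outer and inner keys swap, i.e. rows and
# columns of the exported matrix are exchanged before writing to file.
def _sketch_transform_dict2(d2):
    out = {}
    for outer, inner_map in d2.items():
        for inner, val in inner_map.items():
            out.setdefault(inner, {})[outer] = val
    return out

# Example: {'sampleA': {'taxon1': 3}} -> {'taxon1': {'sampleA': 3}}
# ---------------------------------------------------------------------------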
def main_loop(self):
    print("\n\n####Parameters of PHIP: \n")
    # parallel processing
    if self.par['phip_alignment'] == 'yes' or self.par['phip_counting'] == 'yes':
        sample_names = self.par['sample_names']
        print(len(sample_names), ' samples will be analyzed.\n')
        # multi-processes (the threaded alternative is kept for reference)
        # myCommon.basic(self.par).pp_map_threads(self.phipseq_alignment, sample_names)
        myCommon.basic(self.par).pp_map_process(
            mp_alignment, [(self, s) for s in sample_names])

    # combine RC and statistics file
    if self.par['phip_merge'] == 'yes':
        # 1: combine RC files into RC matrix
        print('\n\n\n###Combine RC files (phip_merge)\n')
        # get arguments
        args_list = []
        RC_level = 'lowRC'
        # peptide level: lowRC
        out_file = self.par['files_dict']['pep_RC']
        args_list.append(('_RC.txt', RC_level, out_file, self.par['pep_ids']))
        if 'file_annotation' in self.par:
            # promax level
            out_file = self.par['files_dict']['promax_RC']
            args_list.append(('_pro_maxRC.txt', RC_level, out_file, self.par['pro_ids']))
            # prosum level
            out_file = self.par['files_dict']['prosum_RC']
            args_list.append(('_pro_sumRC.txt', RC_level, out_file, self.par['pro_ids']))
        # multi-threads
        myCommon.basic(self.par).pp_map_threads(
            myAlign.alignment(self.par).combine_countfiles, args_list)
        # 2: generate statistics.csv
        myCommon.basic(self.par).QC_statistics()

    # significance analysis using z-scores
    if self.par['phip_zscores'] == 'yes':
        print('\n\n\n###normalization of RC (phip_zscores)\n')
        # peptide level
        RC_file = self.par['files_dict']['pep_RC']  # infile
        # 1: scaling RCs
        sRC_file = self.par['files_dict']['pep_scalingRC']  # outfile
        myStat.normalization(self.par, RC_file, sRC_file, 'pep_id').RC_scaling()
        # 2: z-scores of scaled RCs against negative controls and phipseq samples
        zfile = self.par['files_dict']['pep_NCPHIPzscores']  # outfile
        if 'file_NC' in self.par.keys():
            myStat.normalization(self.par, sRC_file, zfile, 'pep_id').NCPHIPzscores_PN()
        else:
            myStat.normalization(self.par, sRC_file, zfile, 'pep_id').NCPHIPzscores_RLM()
        # 3: collapse peptide matrix into protein matrix
        if 'file_annotation' in self.par:
            print("\t######collapse peptide matrix into protein matrix")
            pars = []
            for name in ['scalingRC', 'NCPHIPzscores']:
                pep_file = self.par['files_dict']['pep_' + name]  # infile
                sum_file = self.par['files_dict']['pep_' + name + '_prosum']  # outfile
                pars.append((pep_file, sum_file, sum))
                max_file = self.par['files_dict']['pep_' + name + '_promax']  # outfile
                pars.append((pep_file, max_file, max))
            # multi-threading
            myCommon.basic(self.par).pp_map_threads(
                myCommon.basic(self.par).collapse_matrix, pars)

    # functional analysis after normalization and correction, in parallel
    print('\n\n\n###Functional Analysis (phip_GP and phip_enrichment)\n')
    pool = mpd.Pool(processes=self.par['threads_num'])
    # set the list of parameters
    pep_zfile = self.par['files_dict']['pep_NCPHIPzscores']  # infile
    promax_zfile = self.par['files_dict']['pep_NCPHIPzscores_promax']
    prosum_zfile = self.par['files_dict']['pep_NCPHIPzscores_prosum']
    if self.par['phip_GP'] == 'yes':
        # 1: polyclonal analysis of significant peptides
        pool.apply_async(self.sig_polyclonal, args=(pep_zfile,))
        # virus only
        if 'VirScan' in self.par['file_annotation']:
            # 5: inter/intra-species searching, virus library only
            pool.apply_async(self.taxon_spec, args=(pep_zfile, 'taxon_phip', 'pep_id'))
            # 6: species alignment, virus only
            file_aln = self.par['dir_ref_seq'] + 'specie_blast.txt'
            pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))
            # 7: organism alignment, virus only
            file_aln = self.par['dir_ref_seq'] + 'organism_blast.txt'
            pool.apply_async(self.taxon_blast, args=(file_aln, pep_zfile))
        # quality control
        # 1: relationship between significant hits and raw read num
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(pep_zfile,))
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(prosum_zfile,))
        pool.apply_async(myCommon.basic(self.par).QC_hits, args=(promax_zfile,))
        # 2: saturation analysis
        pool.apply_async(myCommon.basic(self.par).QC_saturation)
    if self.par['phip_enrichment'] == 'yes':
        # 5: detection of enriched protein motifs
        E = myCommon.basic(self.par)
        if 'pro_motifs' in list(self.par['annot_df']):
            pool.apply_async(E.enrich_pro,
                             args=(pep_zfile, 'pep_id', 'pro_motifs', ';', ','))
        # 6: GO, loci, PPI, KEGG, InterPro, MIM, autoantigen enrichment analysis
        terms = set(['GO', 'map', 'PPI', 'KEGG', 'InterPro', 'MIM', 'autoantigen']) \
            & set(list(self.par['annot_df']))
        for term in terms:
            pool.apply_async(E.enrich_pro,
                             args=(prosum_zfile, 'pro_id', term, ',', None))
            pool.apply_async(E.enrich_pro,
                             args=(promax_zfile, 'pro_id', term, ',', None))
    pool.close()
    pool.join()
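# ---------------------------------------------------------------------------
# Sketch of the fan-out pattern used in main_loop(): tasks are queued with
# apply_async(), pool.close() stops new submissions, and pool.join() returns
# once all queued tasks finish. multiprocessing.dummy provides a thread-backed
# Pool with the same API, which is what mpd is assumed to alias in this module.
import multiprocessing.dummy as _mpd_sketch

def _sketch_fan_out(tasks, threads_num=4):
    # tasks: list of (callable, args_tuple) pairs
    pool = _mpd_sketch.Pool(processes=threads_num)
    results = [pool.apply_async(fn, args=args) for fn, args in tasks]
    pool.close()
    pool.join()
    # .get() re-raises any exception from the worker, so failures surface here
    return [r.get() for r in results]
# ---------------------------------------------------------------------------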
print('###permutation procedure\n\n')
pool = mpd.Pool(processes=par['threads_num'])

# permutation of organism alignment
if par['organism_permutation'] == 'yes':
    # read aln file
    file_aln = par['dir_home'] + 'ref_seq/organism_blast.txt'
    par['binary_aln_df'] = myDataframe.basic().aln_df(file_aln, par['align_score'])
    par['type'] = myIO.file_os(file_aln).name_prefix()
    par['dir_out'] = myIO.dir_os(par['dir_home'] + 'permutation/' + par['type']).create_dir()
    for hits_num in range(par['start'], par['end']):
        pool.apply_async(myCommon.basic(par).permute_taxon_blast, args=(hits_num,))
        time.sleep(1)

# permutation of specie alignment
if par['specie_permutation'] == 'yes':
    # read aln file
    file_aln = par['dir_home'] + 'ref_seq/specie_blast.txt'
    par['binary_aln_df'] = myDataframe.basic().aln_df(file_aln, par['align_score'])
    par['type'] = myIO.file_os(file_aln).name_prefix()
    par['dir_out'] = myIO.dir_os(par['dir_home'] + 'permutation/' + par['type']).create_dir()
    for hits_num in range(par['start'], par['end']):
        pool.apply_async(myCommon.basic(par).permute_taxon_blast, args=(hits_num,))
        time.sleep(1)

pool.close()
pool.join()
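# ---------------------------------------------------------------------------
# A minimal sketch (an assumption, not the project's permute_taxon_blast) of
# the permutation idea driven above: for a given number of observed hits, draw
# random peptide sets of that size and record their scores, building a null
# distribution against which the observed taxon match can be compared.
# score_fn is a hypothetical caller-supplied scoring function.
import random

def _sketch_permute(hits_num, pep_ids, score_fn, times=1000):
    null_scores = []
    for _ in range(times):
        random_set = random.sample(list(pep_ids), hits_num)
        null_scores.append(score_fn(random_set))
    return null_scores
# ---------------------------------------------------------------------------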