def export_sample_info(self):
    print('get sample_raw and raw_sample')
    if os.path.isfile(self.par['file_sample_info']):
        self.file_to_samples()
    else:
        self.raw_to_samples()
        #generate sample file
        self.sample_info()
    #print("\nSample and raw files:")
    #myDict.basic(self.sample_raw).print_dict()
    self.par['raw_to_sample'] = self.raw_sample  #one raw file vs one sample name
    self.par['sample_to_raw'] = self.sample_raw  #one sample name vs a list of raw files
    #get sample names
    self.sample_names = sorted(self.sample_raw.keys())
    self.par['sample_names'] = self.sample_names
    #get sample_dirs
    self.sample_storage()
    self.par['sample_dirs'] = self.sample_dirs

    print('get group names if they exist')
    flag = 1
    while flag > 0:
        group_samples, sample_groups = self.group_names(flag + 2)
        if group_samples == {}:
            flag = 0
        else:
            key = 'group' + str(flag)
            self.par[key] = group_samples
            flag += 1
            print('Groups of {}: {}'.format(key, self.par[key].keys()))
    return self.par
def protein_peptides(self):
    pro_pep = {}
    #read annotation file
    annot_dict = myIO.file_os(self.par['file_annotation'], "\t").to_dict2()
    if 'Rnl2_SPIKEIN' in annot_dict:
        annot_dict['Rnl2_SPIKEIN']['pep_rank'] = 0
    in_pro = [annot_dict[p]['pro_id'] for p in annot_dict.keys()]
    in_pro = list(set(in_pro))
    print('In proteins:{}, In peptides:{}'.format(len(in_pro), len(annot_dict)))
    #map protein id -> {pep_id: pep_rank}
    pro_rank_pep = {}
    for pep_id in self.par['pep_ids']:
        pro_id = annot_dict[pep_id]['pro_id']
        pep_rank = annot_dict[pep_id]['pep_rank']
        #pep_rank is read from the file as a string: coerce to int, default to 0
        try:
            pep_rank = int(pep_rank)
        except (TypeError, ValueError):
            pep_rank = 0
        if pro_id in pro_rank_pep:
            pro_rank_pep[pro_id][pep_id] = pep_rank
        else:
            pro_rank_pep[pro_id] = {pep_id: pep_rank}
    #order the peptides of each protein by pep_rank
    pep_num = 0
    for pro_id, pep_dict in pro_rank_pep.items():
        peps = sorted(pep_dict, key=pep_dict.get)
        pep_num += len(peps)
        pro_pep[pro_id] = ','.join(peps)
    #export
    print("Number of proteins:{}\tNumber of peptides:{}.".format(len(pro_pep), pep_num))
    myDict.basic(pro_pep, self.par['pro_ids']).dict_to_file(self.par['file_pro_pep'], "\t")
    #
    return pro_pep
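#The annotation file consumed above is assumed (not confirmed by this snippet)
#to be a tab-delimited table whose first column is pep_id and whose header
#includes pro_id and pep_rank; myIO.file_os(...).to_dict2() is assumed to
#return {pep_id: {column_name: value}}. A minimal standalone sketch of that
#parsing, for illustration only:
def _read_annot_sketch(path):
    #returns {row_id: {column_name: value}} from a tab-delimited file
    annot = {}
    with open(path, 'rt') as fh:
        header = fh.readline().rstrip('\n').split('\t')
        for line in fh:
            cells = line.rstrip('\n').split('\t')
            annot[cells[0]] = dict(zip(header[1:], cells[1:]))
    return annot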
def sig_polyclonal(self, count_file):
    print("Polyclonal analysis of ", count_file)
    comb_df, pep_df = myCommon.basic(self.par).combine_df(count_file)

    #per-sample scoring, applied column-wise
    def hits_func(x, peps, threshold, pro_id):
        #significant hits
        hits = x[x >= threshold]
        #non-overlapping peptides
        peps = [str(p) for p in peps]
        hit_peps = [str(h) for h in hits.index]
        none_overlapped_hits_num = myList.basic(peps).un_neighbours(hit_peps, return_type='hits_num')
        return len(list(hits.index)), none_overlapped_hits_num, ','.join(hit_peps)

    #collapse by protein
    hits1 = {}
    hits2 = {}
    for pro_id, row_index in comb_df.groupby('pro_id').groups.items():
        #get protein-peptides annotations
        peps_str = self.par['dict_pro_pep'][pro_id]
        peps = peps_str.split(',')
        #slice the peptide df of the given protein
        sub_df = pep_df.loc[row_index]
        #hits number beyond the zscore threshold
        hits_num = sub_df.apply(hits_func, axis=0, args=(peps, self.par['zscore_threshold'], pro_id))
        #total number of significant hits
        num1 = [h[0] for h in hits_num]
        hits1[pro_id] = dict(zip(list(sub_df), num1))
        #number of significant hits without overlapping
        num2 = [h[1] for h in hits_num]
        hits2[pro_id] = dict(zip(list(sub_df), num2))
    #export
    file_head = myIO.file_os(count_file).file_prefix() + '_polyclonal'
    myDict.basic(hits1, self.par['pro_ids']).dict2_to_file(file_head + '.txt', "\t")
    myDict.basic(hits2, self.par['pro_ids']).dict2_to_file(file_head + '_nonoverlapped.txt', "\t")
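#myList.basic(peps).un_neighbours(hit_peps, return_type='hits_num') is an
#external helper not shown here. A plausible minimal sketch of the idea,
#assuming it counts hit peptides such that no two occupy adjacent ranks in
#the protein's ordered peptide list (greedy scan; assumption, not the
#library's confirmed implementation):
def _non_overlapping_hits_sketch(ordered_peps, hit_peps):
    #ranks of the hits within the ordered peptide list of one protein
    hit_ranks = sorted(ordered_peps.index(p) for p in hit_peps if p in ordered_peps)
    count, last = 0, None
    for r in hit_ranks:
        if last is None or r > last + 1:  #skip a hit adjacent to the previous kept one
            count += 1
            last = r
    return count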
def line_replace(self, new_dict=None):
    #get old variables
    var_dict = self.to_dict()
    #override with new values (avoid a mutable default argument)
    if new_dict:
        var_dict.update(new_dict)
    #export to file
    myDict.basic(var_dict).dict_to_file(self.file, self.sep)
def sample_info(self):
    sample_pairs = {}
    for raw_file, sample_name in self.raw_sample.items():
        raw_file_name = myIO.file_os(raw_file).file_name()
        group = 'NC' if 'BEADS' in raw_file_name.upper() else 'PhIP'
        if 'unassigned' not in raw_file_name:
            sample_name = re.sub('_R1', "", sample_name)
            pair = '{},{}'.format(raw_file_name, sample_name)
            sample_pairs[pair] = group
    #export dict to file
    print('Generate sample file: ', self.par['file_sample_info'])
    #order per record: fastq file name, sample_name, phip_group
    myDict.basic(sample_pairs).dict_to_file(self.par['file_sample_info'], ',')
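#Each record written to sample_info.csv above therefore has three
#comma-separated fields: fastq file name, sample name, and phip group
#('NC' for beads-only negative controls, 'PhIP' otherwise). Hypothetical
#example rows:
#   sample_A_R1.fq,sample_A,PhIP
#   BEADS_ONLY_R1.fq,BEADS_ONLY,NC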
def line_add(self, new_dict=None):
    #get old variables
    var_dict = self.to_dict()
    #merge: add counts for existing keys, insert new keys
    if new_dict:
        for name, value in new_dict.items():
            if name in var_dict:
                var_dict[name] = int(var_dict[name]) + int(value)
            else:
                var_dict[name] = value
    #export to file
    myDict.basic(var_dict).dict_to_file(self.file, self.sep)
#end
def permute_col(self, times=2, slice_dict=None):
    #add shuffled dicts into embed_dict
    embed_dict = {}
    for i in range(times):
        #1: shuffle the data frame by rows
        shuffled_df = self.df.iloc[np.random.permutation(len(self.df))].copy()
        shuffled_df.index = self.df.index
        #2: merge the permuted df into a nested dict keyed by permutation index
        shuffled_dict = shuffled_df.to_dict()
        embed_dict = myDict.basic(embed_dict).combine_dupdict2(shuffled_dict, i)
    #convert to dataframe
    if slice_dict is None:
        permute = embed_dict  #col-name is key1, row-name is key2
    else:
        permute = {}
        embed_df = pd.DataFrame(embed_dict)
        for slice_name, row_indexs in slice_dict.items():
            permute[slice_name] = {}
            sub_df = embed_df.loc[row_indexs]
            for col_name, col in sub_df.items():
                permute[slice_name][col_name] = col  #col is a pd.Series
    return permute
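#Usage sketch for permute_col (hypothetical data; the wrapper class exposing
#permute_col is assumed to be myDataframe.basic, since the method reads
#self.df as elsewhere in this codebase):
#   df = pd.DataFrame({'s1': [1, 2, 3], 's2': [4, 5, 6]}, index=['p1', 'p2', 'p3'])
#   permuted = myDataframe.basic(df).permute_col(times=2)
#   #permuted['s1'] maps each row name to one shuffled value per round,
#   #e.g. {'p1': {0: ..., 1: ...}, ...}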
def combine_countfiles(self, args_tuple):
    #row_names should be None or list type
    infile_tail, RC_level, out_file, row_names = args_tuple
    #
    counting_dict2 = {}
    for sample_name in self.par['sample_names']:
        #get read counts of a given sample
        counting_file = '{}{}/{}{}'.format(self.par['dir_result'], sample_name, sample_name, infile_tail)
        sample_dict2 = myIO.file_os(counting_file, '\t').to_dict2()
        for ref in sample_dict2.keys():
            counts = sample_dict2[ref][RC_level]
            if ref in counting_dict2:
                counting_dict2[ref].update({sample_name: counts})
            else:
                counting_dict2[ref] = {sample_name: counts}
    #export counting_dict
    myDict.basic(counting_dict2).dict2_to_file(out_file=out_file, row_names=row_names)
def hits_permutation1(self, in_dict, sample_size=10):
    #get the pool for sampling (random.sample needs a sequence in Python 3)
    pool = list(in_dict.keys())
    #
    permute_dict = {}
    for i in range(self.par['permutation_times']):
        #randomly select some keys from the pool
        random_keys = random.sample(pool, sample_size)
        random_values = {}
        for k in random_keys:
            values_list = in_dict[k].split(',')
            for v in values_list:
                if v in random_values:
                    random_values[v] += 1
                else:
                    random_values[v] = 1
        permute_dict[i] = random_values
    #transpose dict: permutation index in columns, values of in_dict in rows
    permute_dict = myDict.basic(permute_dict).transform_dict2()
    return permute_dict
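#A minimal sketch (an assumption for illustration, not part of the pipeline)
#of how the permutation table returned above could be turned into an
#empirical p-value for an observed count of one value key:
import numpy as np

def _empirical_pval_sketch(permute_dict, value_key, observed, times):
    #permute_dict: {value: {permutation_index: count}} after transform_dict2()
    null = [permute_dict.get(value_key, {}).get(i, 0) for i in range(times)]
    #fraction of permutations with a count at least as large as observed,
    #with the +1 pseudocount so the p-value is never exactly zero
    return (np.sum(np.array(null) >= observed) + 1) / (times + 1)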
def enrich_pro(self, infile, annot_A, annot_B, sep1, sep2):
    if annot_A is None:
        annot_A = 'transcript_id'
    if annot_B is None:
        annot_B = 'pro_motifs'
    print("Enrichment analysis of {} => {} : {}".format(annot_A, annot_B, infile))
    #read data frame
    file_sep = ',' if infile.endswith('.csv') else '\t'
    counts_df = pd.read_csv(infile, index_col=0, sep=file_sep, low_memory=False)
    #get all ids connecting counts_df with annot_df
    A_ids = list(self.par['annot_df'][annot_A])
    #get all ids based on annot_type in list format
    B_ids = myDataframe.basic(self.par['annot_df']).df_list(annot_B, sep1, sep2)
    #get A_ids vs list of B_ids in dict format
    AB_dict = myDataframe.basic(self.par['annot_df']).list_dict(annot_A, annot_B, sep1, sep2)
    #initiate: frequency of observed enriched motifs
    hits_observed = myDict.basic().init_dict2(B_ids, list(counts_df), 0)
    #initiate: zscores of observations based on permutation models
    hits_zscores = myDict.basic().init_dict2(B_ids, list(counts_df), 0)
    #initiate: debugging info
    debugging = myDict.basic().init_dict2(B_ids + ['hits_counts', 'interact_counts'], {}, 'NA')

    #loop over the data frame by columns
    for sample_name, zscores in counts_df.items():
        zscores = pd.Series(zscores)
        zscores.index = list(counts_df.index)
        #1: get ids of significant hits
        sig_zscores = zscores[zscores >= self.par['zscore_threshold']]
        obs_ids = list(sig_zscores.index)
        sig_num = len(obs_ids)
        #2: count frequency of enriched annotations, namely motifs
        obs_freq, obs_details = myDict.basic(AB_dict).elements_frequency(obs_ids)
        #debugging
        debugging['hits_counts'][sample_name] = sig_num
        debugging['interact_counts'][sample_name] = sum(obs_freq.values())
        #3: permute samples
        perm_dict = {}
        for i in range(self.par['permutation_times']):
            perm_peps = random.sample(A_ids, sig_num)
            tmp_perm, tmp_details = myDict.basic(AB_dict).elements_frequency(perm_peps)
            #frequency dict
            for key, value in tmp_perm.items():
                if key in perm_dict:
                    perm_dict[key].append(value)
                else:
                    perm_dict[key] = [value]
        #4: calculate z-scores of observed counts
        for enriched_id, obs_num in obs_freq.items():
            #update hits_observed: frequency of the observed enriched annot
            hits_observed[enriched_id][sample_name] = obs_num
            #update debugging
            debugging[enriched_id][sample_name] = '{}:{}'.format(obs_num, obs_details[enriched_id])
            #update zscores_dict
            if enriched_id in perm_dict:
                perm_pools = perm_dict[enriched_id]
                #pad with zeros so all pools have the same length, one entry
                #per permutation (the original padded to length 5, which left
                #longer pools unpadded and contradicted the stated intent)
                perm_pools = perm_pools + [0] * (self.par['permutation_times'] - len(perm_pools))
                perm_mean = np.mean(perm_pools)
                perm_sd = np.std(perm_pools)
                #zscore of observed hits against the null model
                zscore = (obs_num - perm_mean) / perm_sd if perm_sd > 0 else (obs_num - perm_mean)
                hits_zscores[enriched_id][sample_name] = round(zscore, 2)
            else:
                hits_zscores[enriched_id][sample_name] = obs_num
    #export
    file_head = '{}{}_{}_'.format(self.par['dir_enrichment'], myIO.file_os(infile).name_prefix(), annot_B)
    myDict.basic(hits_observed).dict2_to_file(out_file=file_head + 'counting.txt', index_label=annot_B)
    myDict.basic(hits_zscores).dict2_to_file(out_file=file_head + 'zscores.txt', index_label=annot_B)
    myDict.basic(debugging).dict2_to_file(out_file=file_head + 'debugging.txt', index_label=annot_B, NA='NA')
def init_dir_file(self):
    self.par['dir_home'] = myIO.dir_os(self.par['dir_home']).create_dir()
    print('home directory of phip tool:', self.par['dir_home'])
    #e.g. dir_home = /home/yuan/phip/
    #alignment related
    self.par['dir_aligner'] = self.par['dir_home'] + 'bowtie1/'
    self.par['aligner_options'] = '{}bowtie {}'.format(self.par['dir_aligner'], self.par['aligner_options'])
    self.par['genome_index'] = self.par['dir_aligner'] + self.par['genome_index_name']
    self.par['dir_ref_seq'] = self.par['dir_home'] + 'ref_seq/'
    self.par['file_ref_fa'] = '{}{}.fa'.format(self.par['dir_ref_seq'], self.par['genome_index_name'])
    if 'file_annotation' in self.par.keys():
        self.par['file_annotation'] = self.par['dir_ref_seq'] + self.par['file_annotation']
    #judge whether the reference library is human or virus
    if 'VirScan' in self.par['genome_index_name']:
        self.par['lib'] = 'virus'
        self.par['file_NC'] = self.par['dir_ref_seq'] + 'virus_BeadsOnly.txt'
    elif 'human' in self.par['genome_index_name']:
        self.par['lib'] = 'human'
        self.par['file_NC'] = self.par['dir_ref_seq'] + 'human_BeadsOnly.txt'
    elif 'PublicEpitope' in self.par['genome_index_name']:
        self.par['lib'] = 'PE'
    elif 'LISH' in self.par['genome_index_name']:
        self.par['lib'] = 'LISH'
    #dir of raw data
    if 'dir_raw_data' not in self.par.keys():
        self.par['dir_raw_data'] = myIO.dir_os(self.par['dir_home'] + 'raw_data').create_dir()
    #results related
    if 'dir_result' not in self.par.keys():
        self.par['dir_result'] = myIO.dir_os(self.par['dir_home'] + 'result').create_dir()
    if 'dir_result_array' not in self.par.keys():
        self.par['dir_result_array'] = self.par['dir_result']
    #dir of statistics
    self.par['dir_stat'] = myIO.dir_os(self.par['dir_result'] + 'statistics').create_dir()
    self.par['dir_QC'] = myIO.dir_os(self.par['dir_stat'] + 'QC').create_dir()
    self.par['dir_enrichment'] = myIO.dir_os(self.par['dir_stat'] + 'enrichment').create_dir()
    #sample info
    self.par['file_sample_info'] = self.par['dir_result'] + 'sample_info.csv'
    self.par['dir_log'] = self.par['dir_result'] + 'sample_log/'
    self.par['file_log'] = self.par['dir_result'] + 'output.log'
    self.par['file_total_log'] = self.par['dir_result'] + 'Total.log'
    self.par['file_stat'] = self.par['dir_QC'] + 'statistics.csv'
    self.par['file_ref_txt'] = self.par['dir_result'] + 'references.txt'
    self.par['file_pro_pep'] = self.par['dir_result'] + 'protein_peptides.txt'
    #raw data related
    self.par['RC_levels'] = ['lowRC']  #lowRC, midRC, highRC
    self.par['phip_levels'] = ['pep', 'promax', 'prosum']
    files_dict = {}
    for pl in self.par['phip_levels']:
        file_head = '{}{}_'.format(self.par['dir_stat'], pl)
        #raw read counts
        files_dict[pl + '_RC'] = file_head + 'RC.txt'
        #normalized by total raw counts
        files_dict[pl + '_scalingRC'] = file_head + 'scalingRC.txt'
        files_dict[pl + '_scalingRC_prosum'] = file_head + 'scalingRC_prosum.txt'
        files_dict[pl + '_scalingRC_promax'] = file_head + 'scalingRC_promax.txt'
        #scalingRC against regressed median of phip samples and regressed sd of negative controls
        files_dict[pl + '_NCPHIPzscores'] = file_head + 'NCPHIPzscores.txt'
        files_dict[pl + '_NCPHIPzscores_prosum'] = file_head + 'NCPHIPzscores_prosum.txt'
        files_dict[pl + '_NCPHIPzscores_promax'] = file_head + 'NCPHIPzscores_promax.txt'
    self.par['files_dict'] = files_dict
    #default parameters
    self.par['specieZ_threshold'] = int(self.par['specieZ_threshold']) if 'specieZ_threshold' in self.par.keys() else 10
    self.par['align_score'] = float(self.par['align_score']) if 'align_score' in self.par.keys() else 80
    #p-value cutoff for binomial testing
    self.par['p_threshold'] = float(self.par['p_threshold']) if 'p_threshold' in self.par.keys() else .001
    #x value is the observed-successes cutoff for the binomial test
    self.par['x_threshold'] = float(self.par['x_threshold']) if 'x_threshold' in self.par.keys() else 1
    self.par['sim_threshold'] = float(self.par['sim_threshold']) if 'sim_threshold' in self.par.keys() else 0.8
    self.par['zscore_threshold'] = int(self.par['zscore_threshold']) if 'zscore_threshold' in self.par.keys() else 10
    self.par['permutation_times'] = int(self.par['permutation_times']) if 'permutation_times' in self.par.keys() else 100
    self.par['threads_num'] = int(self.par['threads_num'])
    self.par['scaling_factor'] = int(self.par['scaling_factor']) if 'scaling_factor' in self.par.keys() else 1e6
    myDict.basic(self.par).print_dict()
    #
    return self.par
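#Directory layout produced above (paths relative to dir_home, illustrative):
#   bowtie1/                     aligner binary and genome index
#   ref_seq/                     reference fasta and annotation file
#   raw_data/                    demultiplexed fastq files
#   result/                      per-sample results, logs, sample_info.csv
#   result/statistics/           combined RC, scalingRC, and z-score matrices
#   result/statistics/QC/        statistics.csv
#   result/statistics/enrichment/  enrichment analysis output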
def par_command(argv):
    phip_libs = ['human', 'virus', 'PE', 'allergome', 'LISH']
    #initiate parameters
    par = {'fq_file': 'NA', 'barcode_file': 'NA', 'index_file': 'NA', 'I1_file': 'NA', 'I2_file': 'NA',
           'dir_raw_data': 'NA', 'dir_raw': 'NA', 'dir_in': 'NA', 'out': 'NA',
           'dir_result': 'NA', 'multiplexing_mode': 0, 'ref_libs': phip_libs[:2],
           'seq_start': 0, 'seq_end': 0, 'seq_min': 10, 'seq_max': 0}
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
                '-f <fastq file> -i <index file> -b <barcode file>\n'
    #long option names aligned with the handlers below; arg-taking long
    #options need a trailing '=' for getopt to consume their values
    try:
        opts, args = getopt.getopt(argv[1:], "hf:i:b:o:t:l:x:y:m:n:z:c:", ["help",
            "fastq_file=", "index_file=", "barcode_file=", "dir_raw_data=", "trim_len=",
            "fixed_len=", "dir_in=", "out=", "I1_file=", "I2_file=", "all_raw_data=", "ref_library="])
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)
    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage: python Process_FASTQ.py -f * -i * -b * -o * -y *
            print("-h --help\tUsage information of this script.")
            print("-t --trim_len\tTrim sequences from the 5'-end or 3'-end of reads (Optional).")
            print("-f --fastq_file\tFastq file generated by a sequencing analyzer.")
            print("-i --index_file\tIndex file matched with the fastq file.")
            print("-b --barcode_file\tBarcode file matched with the index file.")
            print("-o --dir_raw_data\tDirectory storing demultiplexed *.fastq files.")
            print("-y --out\tDirectory storing sample_info.csv and variables.txt.")
            print("-c --ref_library\tReference libraries can be one of {}; default is {}.".format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
            par['multiplexing_mode'] += 1
        elif opt in ("-o", "--dir_raw_data"):
            par['dir_raw_data'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ("-z", "--all_raw_data"):  #only for one or more sets of fastq splits
            par['dir_raw'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--fixed_len"):
            len_min, len_max = arg.split(':')
            par['seq_min'] = abs(int(len_min))
            par['seq_max'] = abs(int(len_max))
        elif opt in ("-t", "--trim_len"):
            trim_end5, trim_end3 = arg.split(':')
            par['seq_start'] = abs(int(trim_end5))
            par['seq_end'] = -abs(int(trim_end3))
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #
    if par['seq_max'] > 0:
        par['seq_end'] = par['seq_max']
    #
    myDict.basic(par).print_dict()
    return par
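#Example invocation (hypothetical file names), following the usage string
#above; -t takes trim5:trim3 lengths and -c a comma-separated library list:
#   python Process_FASTQ.py -f run1.fastq -i run1_index.fastq \
#       -b barcodes.txt -o ./raw_data -y ./result -t 0:25 -c human,virus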
def taxon_spec(self, count_file, taxon_rank, annot_index):
    #combine the two data frames
    combined_df, phip_df = myCommon.basic(self.par).combine_df(count_file, annot_index)
    #taxonomy names
    taxon_group = combined_df.groupby(taxon_rank).groups
    taxon_names = taxon_group.keys()
    taxon_names = [t for t in taxon_names if str(t) != 'nan']  #remove nan
    taxon_pairs = {'taxon_specie': 'InterSpecie', 'taxon_genus': 'InterGenus',
                   'taxon_family': 'InterFamily', 'taxon_phip': 'InterTaxon'}
    taxon_inter = taxon_pairs[taxon_rank]
    #inter-score dict: taxon_inter should be pep_ids separated by commas
    pepid_taxoninter = pd.Series(combined_df[taxon_inter], index=list(phip_df.index))
    inter_df = myDataframe.basic(phip_df).interact_df(pepid_taxoninter, max, count_file + taxon_inter)

    #the hits of significant species-specific peptides:
    #rows are peptides, columns are phip samples plus species names
    #initiate nested dicts
    taxon_dict = dict([(s, {}) for s in list(phip_df)])  #number of hits
    taxon_dict['peptides'] = dict([(a, len(b)) for a, b in taxon_group.items()])
    taxon_pep_dict = dict([(s, {}) for s in list(phip_df)])  #pep_id and zscores of hits
    debugging_dict = {}  #for identifying bugs
    for s in list(phip_df):
        debugging_dict[s + ':all_hits'] = {}
        debugging_dict[s + ':inter_hits'] = {}
        debugging_dict[s + ':intra_hits'] = {}
        debugging_dict[s + ':hits'] = {}
        debugging_dict[s + ':counts'] = {}

    #loop by sample names
    for sample_name, col in phip_df.items():
        for s, indexs in taxon_group.items():
            #1: inter-taxon searching
            inter_list = inter_df.loc[indexs][sample_name]
            inter_dict = self.taxon_inter_searching(col[indexs], inter_list)
            #export
            debugging_dict[sample_name + ':all_hits'][s] = inter_dict['all_hits']
            debugging_dict[sample_name + ':inter_hits'][s] = inter_dict['inter_hits']
            #2: intra-taxon searching
            intra_dict = self.taxon_intra_searching(col[inter_dict['other_hits']])
            #export
            debugging_dict[sample_name + ':intra_hits'][s] = intra_dict['intra_hits']
            debugging_dict[sample_name + ':hits'][s] = intra_dict['hits']
            all_hits = ['{}:{}'.format('all', len(inter_dict['all_hits'])),
                        '{}:{}'.format('inter', len(inter_dict['inter_hits'])),
                        '{}:{}'.format('intra', len(intra_dict['intra_hits'])),
                        '{}:{}'.format('hits', len(intra_dict['hits']))]
            debugging_dict[sample_name + ':counts'][s] = ','.join(all_hits)
            hit_list = ['({},{})'.format(a, b) for a, b in col[intra_dict['hits']].items()]
            taxon_pep_dict[sample_name][s] = ','.join(hit_list)
            #counts matrix of the taxonomy search
            taxon_dict[sample_name][s] = len(intra_dict['hits'])
    #export to file
    file_head = '{}_{}_'.format(myIO.file_os(count_file).file_prefix(), taxon_rank)
    taxon_dict = myDict.basic(taxon_dict).transform_dict2()
    myDict.basic(taxon_dict).dict2_to_file(file_head + 'counting.txt', "\t")
    taxon_pep_dict = myDict.basic(taxon_pep_dict).transform_dict2()
    myDict.basic(taxon_pep_dict).dict2_to_file(file_head + 'peptides.txt', "\t")
    debugging_dict = myDict.basic(debugging_dict).transform_dict2()
    myDict.basic(debugging_dict).dict2_to_file(file_head + 'debugging.txt', "\t")
def count_reads(self):
    #key is ref name, value is the list of aligned read sequences
    unique_seq = dict((a, []) for a in self.par['ref_dict'].keys())
    #unique and multiple counts in dicts
    unique = {}  #key is ref name, value is counts
    multiple = {}  #key is query name, value is the list of refs
    num = {}  #counting statistics
    saturation = {0: {1: 0, 5: 0, 10: 0, 'max': 0}}  #counts for saturation analysis
    last_index = 0
    print('\tread sam file: {}.gz'.format(self.par['sample_sam_file']))
    IN = gzip.open(self.par['sample_sam_file'] + '.gz', 'rt')
    UN = gzip.open(self.par['sample_dir'] + self.par['sample_name'] + '_unknown.fa.gz', 'wt')
    maxRC = 0
    for line in IN:
        #counts
        num['raw_reads_num'] = num.setdefault('raw_reads_num', 0) + 1
        #analyze a sam line
        info = self.analyze_SAM(line)
        qname, ref = info['qname'], info['ref']
        if info['aligned'] == '1':
            #unique alignment
            unique[ref] = unique.setdefault(ref, 0) + 1
            if unique[ref] > maxRC:
                maxRC = unique[ref]
            #counting for saturation
            if unique[ref] in [1, 5, 10]:
                last_counts = saturation[last_index].copy()  #copy() is essential!
                last_counts[unique[ref]] += 1
                last_counts['max'] = maxRC  #the maximum RC at this point of the raw reads
                saturation[num['raw_reads_num']] = last_counts
                last_index = num['raw_reads_num']
            #export aligned sequences of reads
            unique_seq[ref].append(info['seq'])
            num['unique_aligned_reads_num'] = num.setdefault('unique_aligned_reads_num', 0) + 1
        elif info['aligned'] == '3':
            #multiple alignment
            multiple[qname] = multiple[qname] + [ref] if qname in multiple else [ref]
            num['multialigned_reads_num'] = num.setdefault('multialigned_reads_num', 0) + 1
        else:
            #unaligned
            UN.write('>' + qname + '\n' + info['seq'] + '\n')
            num['unaligned_reads_num'] = num.setdefault('unaligned_reads_num', 0) + 1
    IN.close()
    UN.close()
    #counting of saturation
    if num['raw_reads_num'] > last_index:
        saturation[num['raw_reads_num']] = saturation[last_index].copy()
    #update num statistics
    myIO.file_os(self.par['sample_log'], '=').line_add(num)

    print('\tcombine RCs from unique and multiple alignments of ', self.par['sample_name'])
    #reverse the multiple-alignment dict
    rev_multiple = myDict.basic(multiple).counting_reversed_dict()
    RC_dict = self.multiple_counts(unique, rev_multiple)
    #export
    print('\tSave read counts into ', self.par['sample_RC_file'])
    myDict.basic(RC_dict).dict2_to_file(self.par['sample_RC_file'], pattern='\t')
    myDict.basic(saturation).dict2_to_file(self.par['sample_saturation_file'], pattern='\t')
    #frequency of distinct read sequences per reference
    seq_counts = {}
    for ref, reads_list in unique_seq.items():
        key = ref + '\t' + self.par['ref_dict'][ref] + '\t' + str(len(reads_list))
        if len(reads_list) > 0:
            freq_dict = myList.basic(reads_list).elements_frequency0()
            seq_counts[key] = ';'.join(str(a) + ':' + str(b) for a, b in freq_dict.items())
        else:
            seq_counts[key] = 'NA'
    myDict.basic(seq_counts).dict_to_file(self.par['sample_dir'] + 'unique_aligned_reads.txt', pattern='\t')
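#analyze_SAM is defined elsewhere in this codebase. For orientation, a
#minimal sketch of the fields count_reads relies on, assuming standard SAM
#columns; the '1'/'3' encoding for unique/multiple alignment is taken from
#the calling code above, and this sketch only distinguishes mapped ('1')
#from unmapped ('0') via the SAM FLAG, not multi-mapping:
def _analyze_sam_sketch(line):
    cells = line.rstrip('\n').split('\t')
    #SAM columns: 1 QNAME, 2 FLAG, 3 RNAME, 10 SEQ
    qname, flag, ref, seq = cells[0], int(cells[1]), cells[2], cells[9]
    #bit 0x4 of the SAM FLAG marks an unmapped read
    aligned = '0' if flag & 0x4 else '1'
    return {'qname': qname, 'ref': ref, 'seq': seq, 'aligned': aligned}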
def decompose_fq2(self, par):
    print('The split FASTQ files are stored in {}'.format(par['dir_raw_data']))
    #output directory
    out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
    #sequencing direction: R1 or R2
    direction = self.R1R2()
    #read the barcode-to-sample relationship from the sample file
    barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
    barcode_sample['unassigned'] = 'unassigned'
    #open file handles based on barcode_sample
    file_handle = {}
    barcode_file = {}
    known_dict = {}
    un_dict = {}
    for barcode, sample_name in barcode_sample.items():
        fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
        file_handle[barcode] = open(fq_file, 'wt')
        barcode_file[barcode] = fq_file
        known_dict[barcode] = {'sample_name': sample_name, 'read_counts': 0}
    ###
    stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
    dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15, '-' * 15)
    print(dash_line)
    print(stdout_format.format('Raw reads', 'Assigned reads', 'Percentage', 'Trim reads'))
    print(stdout_format.format('millions', 'millions', '%', 'nt->nt'))
    print(dash_line)
    n = 0  #total number of reads
    m = 0  #total number of assigned reads
    #file handles
    F1 = self.readonly_handle(self.biofile)  #fastq file
    F2 = self.readonly_handle(par['I1_file'])  #I1 file
    F3 = self.readonly_handle(par['I2_file'])  #I2 file
    with F1, F2, F3:
        #read 4 lines at a time per file
        for L1, La, Le, L2, Lb, Lf, L3, Lc, Lg, L4, Ld, Lh in itertools.zip_longest(*[F1, F2, F3] * 4):
            barcode = Lb.rstrip() + Lf.rstrip()
            rlen = len(L2) - 1
            tag = False
            #assign the record based on its barcode
            if barcode in file_handle and rlen >= par['seq_min']:
                L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                #trim reads from the 5'-end and/or 3'-end
                L2 = L2.rstrip()
                L4 = L4.rstrip()
                L2 = L2[par['seq_start']:par['seq_end']] + "\n"
                L4 = L4[par['seq_start']:par['seq_end']] + "\n"
                #write to the per-sample file handle
                file_handle[barcode].writelines([L_name, L2, L3, L4])
                #counting
                known_dict[barcode]['read_counts'] += 1
                m += 1
                tag = True
            else:
                #write to the unassigned file handle
                file_handle['unassigned'].writelines([L1, L2, L3, L4])
                un_dict[barcode] = un_dict[barcode] + 1 if barcode in un_dict else 1
                known_dict['unassigned']['read_counts'] += 1
            n += 1
            #progress output
            if n >= 1e5 and n % 5e5 == 0:
                perc = round(m * 100 / n, 2)
                flen = len(L2) - 1
                read_info = "{}-->{}".format(rlen, flen) if tag is True else "{}-->X".format(rlen)
                print(stdout_format.format(n / 1e6, m / 1e6, perc, read_info))
        else:
            print(dash_line)
            print(stdout_format.format(n / 1e6, m / 1e6, round(m * 100 / n, 2), '---'))
            print(dash_line)
    #calculate percentages
    for bc in known_dict.keys():
        RC = float(known_dict[bc]['read_counts'])
        known_dict[bc]['percentage_%'] = round(RC * 100 / n, 2)
    #close file handles
    for b, F in file_handle.items():
        F.close()
        #delete empty files
        if os.stat(barcode_file[b]).st_size == 0:
            os.remove(barcode_file[b])
    #export statistics
    myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
    myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
def demultiplex_fq(self, par):
    #output directory
    out_dir = myIO.dir_os(par['dir_raw_data']).create_dir()
    #sequencing direction: R1 or R2
    direction = self.R1R2()
    #read the barcode-to-sample relationship from the sample file
    barcode_sample = myIO.file_os(par['barcode_file'], '\t').to_dict()
    barcode_sample['unassigned'] = 'unassigned'
    #open file handles based on barcode_sample
    file_handle = {}
    barcode_file = {}
    known_dict = {}
    un_dict = {}
    for barcode, sample_name in barcode_sample.items():
        fq_file = '{}{}_{}.fq'.format(out_dir, sample_name, direction)
        file_handle[barcode] = open(fq_file, 'wt')
        barcode_file[barcode] = fq_file
        known_dict[barcode] = {'sample_name': sample_name, 'read_counts': 0}
    ###
    #file handles
    F1 = self.readonly_handle(self.biofile)
    F2 = self.readonly_handle(par['index_file'])
    n = 0  #total number of reads
    m = 0  #total number of assigned reads
    stdout_format = '|{:^15}|{:^15}|{:^15}|{:^15}|'
    dash_line = stdout_format.format('-' * 15, '-' * 15, '-' * 15, '-' * 15)
    print(dash_line)
    print(stdout_format.format('Raw reads', 'Assigned reads', 'Percentage', 'Read Length'))
    print(stdout_format.format('millions', 'millions', '%', 'nt'))
    print(dash_line)
    with F1, F2:
        #read 4 lines at a time per file
        for L1, La, L2, Lb, L3, Lc, L4, Ld in itertools.zip_longest(*[F1, F2] * 4):
            barcode = Lb.rstrip()
            #assign the record based on its barcode
            if barcode in file_handle and len(L2) >= par['seq_min']:
                L_name = re.sub(r'\/', '#' + barcode + '/', L1)
                #trim reads from the 5'-end
                if par['seq_start'] > 0:
                    L2 = L2[par['seq_start']:]
                    L4 = L4[par['seq_start']:]
                #trim the longer reads from the 3'-end
                if par['seq_end'] != 0:
                    L2 = L2.rstrip()
                    L4 = L4.rstrip()
                    L2 = L2[:par['seq_end']] + "\n"
                    L4 = L4[:par['seq_end']] + "\n"
                #write to the per-sample file handle
                file_handle[barcode].writelines([L_name, L2, L3, L4])
                #counting
                known_dict[barcode]['read_counts'] += 1
                m += 1
            else:
                #write to the unassigned file handle
                file_handle['unassigned'].writelines([L1, L2, L3, L4])
                un_dict[barcode] = un_dict[barcode] + 1 if barcode in un_dict else 1
                known_dict['unassigned']['read_counts'] += 1
            n += 1
            #progress output
            if m >= 1e6 and m % 1e6 == 0:
                print(stdout_format.format(n / 1e6, m / 1e6, round(m * 100 / n, 2), len(L2) - 1))
        else:
            print(dash_line)
            print(stdout_format.format(n / 1e6, m / 1e6, m * 100 / n, '---'))
            print(dash_line)
    #calculate percentages
    for bc in known_dict.keys():
        RC = float(known_dict[bc]['read_counts'])
        known_dict[bc]['percentage_%'] = round(RC * 100 / n, 2)
    #close file handles
    for b, F in file_handle.items():
        F.close()
        #delete empty files
        if os.stat(barcode_file[b]).st_size == 0:
            os.remove(barcode_file[b])
    #export statistics
    myDict.basic(known_dict).dict2_to_file(out_dir + 'known.log', '\t')
    myDict.basic(un_dict).dict_to_file(out_dir + 'unknown.log', '\t')
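#The barcode file read by decompose_fq2 and demultiplex_fq above is assumed
#to be two tab-delimited columns, barcode sequence then sample name, with
#myIO.file_os(..., '\t').to_dict() mapping column 1 to column 2. Hypothetical
#example rows:
#   ACGTACGT	sample_A
#   TTGACCTG	sample_B
#In demultiplex_fq the barcode is the I1 index read alone; in decompose_fq2
#it is the concatenation of the I1 and I2 index reads.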
def par_command(argv):
    phip_libs = ['human', 'virus', 'allergome', 'provirome', 'toxome', 'mouse', 'PE', 'zika', 'arbo', 'LISH']
    #initiate parameters
    na_str = 'fq_file,barcode_file,index_file,I1_file,I2_file,dir_raw_data,dir_in,out,dir_result'
    par = dict([(key, 'NA') for key in na_str.split(',')])
    par.update({'ref_libs': phip_libs[:2], 'seq_start': 0, 'seq_end': None, 'seq_min': 10})
    usage_out = 'Usage:\n' + argv[0] + ' [options] -o <raw data directory> ' + \
                '-f <fastq file> -i <index file> -b <barcode file>\n'
    #long option names aligned with the handlers below; arg-taking long
    #options need a trailing '=' for getopt to consume their values
    try:
        opts, args = getopt.getopt(argv[1:], "hf:i:b:o:t:r:l:x:y:m:n:c:", ["help",
            "fastq_file=", "index_file=", "barcode_file=", "dir_raw_data=", "trim_5end=",
            "min_len=", "fixed_len=", "dir_in=", "out=", "I1_file=", "I2_file=", "ref_library="])
    except getopt.GetoptError:
        print(usage_out)
        sys.exit(2)
    #get parameters
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage_out)
            #common usage: python Process_FASTQ.py -f * -i * -b * -o * -y *
            print("-h --help\tUsage information of this script.")
            print("-t --trim_5end\tTrim sequences from the 5'-end of reads (Optional).")
            print("-f --fastq_file\tFastq file generated by a sequencing analyzer.")
            print("-i --index_file\tIndex file matched with the fastq file.")
            print("-b --barcode_file\tBarcode file matched with the index file.")
            print("-o --dir_raw_data\tDirectory storing demultiplexed *.fastq files.")
            print("-y --out\tDirectory storing sample_info.csv and variables.txt.")
            print("-c --ref_library\tReference libraries can be any of {}; default is {}.".format(phip_libs, phip_libs[:2]))
            sys.exit(2)
        elif opt in ("-f", "--fastq_file"):
            par['fq_file'] = os.path.abspath(arg)
        elif opt in ("-i", "--index_file"):
            par['index_file'] = os.path.abspath(arg)
        elif opt in ("-b", "--barcode_file"):
            par['barcode_file'] = os.path.abspath(arg)
        elif opt in ("-o", "--dir_raw_data"):
            par['dir_raw_data'] = myIO.dir_os(os.path.abspath(arg)).create_dir()
        elif opt in ('-x', "--dir_in"):
            par['dir_in'] = os.path.abspath(arg)
            par['fq_files'] = myParallel.samples({}).seek_fq(par['dir_in'])
        elif opt in ('-y', "--out"):
            par['out'] = arg
        elif opt in ("-l", "--min_len"):
            #discard shorter reads due to poor sequencing
            par['seq_min'] = abs(int(arg))
        elif opt in ("-t", "--trim_5end"):
            #length of nt trimmed from the 5'-end
            par['seq_start'] = abs(int(arg))
        elif opt in ("-r", "--fixed_len"):
            #read length kept after trimming the 5'-end and 3'-end
            par['seq_len'] = abs(int(arg))
            par['seq_end'] = par['seq_start'] + par['seq_len']
        elif opt in ("-m", "--I1_file"):
            par['I1_file'] = os.path.abspath(arg)
        elif opt in ("-n", "--I2_file"):
            par['I2_file'] = os.path.abspath(arg)
        elif opt in ("-c", "--ref_library"):
            libs = arg.split(',')
            par['ref_libs'] = [x for x in libs if x in phip_libs]
    #
    myDict.basic(par).print_dict()
    return par