def vcf_to_tmp_bed(self, vcf_file, tmp_dir, debug=False):
    """Write the legal variants of a VCF-like table to a temporary BED file.

    The file is read tab-separated with no header row. The leading columns
    selected by ``.ix[:, 0:5]`` are labelled ``chr, pos, name, ref, alt``.
    Rows whose ``alt`` is not a plain run of A/T/G/C (either case) are
    dropped, and the number dropped is logged. Each kept row is written as
    ``chr, pos - 1, pos, name`` where ``name`` is ``chr_pos_ref_alt``.

    Args:
        vcf_file: Path of the input table; read as gzip when it ends in
            ``gz``.
        tmp_dir: Directory the BED file is written to (joined with ``/``).
        debug: Forwarded to ``f_judge_debug``; a true result opens ipdb.

    Returns:
        Path of the BED file that was written.
    """
    if f_judge_debug(debug):
        import ipdb
        ipdb.set_trace()

    compression = 'gzip' if vcf_file.endswith('gz') else None
    raw_table = pd.io.parsers.read_csv(
        vcf_file, sep="\t", header=None, compression=compression)
    # NOTE(review): with integer column labels ``.ix`` slices by label;
    # confirm the slice yields exactly the five columns named below.
    raw_variants = raw_table.ix[:, 0:5]
    raw_variants.columns = ['chr', 'pos', 'name', 'ref', 'alt']

    has_legal_alt = raw_variants.alt.str.contains('^[ATGCatgc]+$')
    variants = raw_variants.ix[has_legal_alt, :]

    n_raw = raw_variants.shape[0]
    n_kept = variants.shape[0]
    if n_kept != n_raw:
        logging.info('Filter %s illegal alt variants' % (n_raw - n_kept))

    # BED start is the position minus one; ``pos`` itself becomes the end.
    variants['start'] = variants.ix[:, 'pos'] - 1
    variants['name'] = (variants['chr'] + '_' + variants['pos'].map(str) +
                        '_' + variants['ref'] + '_' + variants['alt'])

    bed_name = my.f_generate_tmp_file_name('bed')
    tmp_bed_file = tmp_dir + '/' + bed_name
    bed_columns = ['chr', 'start', 'pos', 'name']
    variants.ix[:, bed_columns].to_csv(
        tmp_bed_file, header=False, index=False, sep='\t')
    return tmp_bed_file
def write_records_fastq(self, output_dir, fastq_recoreds, prefix = ''):
    """Write sequence records to a file inside ``output_dir``.

    Despite the method name, the records are serialised with
    ``SeqIO.write(..., "fasta")``, i.e. in FASTA format.

    Args:
        output_dir: Directory that receives the file.
        fastq_recoreds: Records handed unchanged to ``SeqIO.write``.
        prefix: File name to use inside ``output_dir``. When it is the empty
            string, a name is obtained from
            ``my.f_generate_tmp_file_name('seq')`` instead.

    Returns:
        Path of the file that was written.
    """
    if prefix == '':
        file_name = my.f_generate_tmp_file_name('seq')
    else:
        file_name = prefix
    sequence_file = os.path.join(output_dir, file_name)

    # ``with`` guarantees the handle is closed even if SeqIO.write raises.
    with open(sequence_file, "w") as output_handle:
        SeqIO.write(fastq_recoreds, output_handle, "fasta")
    return sequence_file
def extract_allele(self, data_dir, header=False, extra_cols=[]):
    """Dump the unique alleles of ``self.data`` to a tab-separated temp file.

    The output columns are ``chr, start, end, ref, alt`` followed by
    ``extra_cols``; ``end`` is ``start + 1``. Rows are de-duplicated on
    ``(chr, start, alt)``.

    Args:
        data_dir: Directory that receives the file. It is joined with
            ``os.path.join``, so a trailing separator is optional.
        header: Passed to ``DataFrame.to_csv`` as ``header``.
        extra_cols: Names of additional ``self.data`` columns to append.
            The default list is never mutated here.

    Returns:
        Path of the file that was written.
    """
    import os  # local import so the block does not rely on file-level scope

    allele_file = os.path.join(data_dir,
                               my.f_generate_tmp_file_name("allele"))

    # Work on an explicit copy so the column assignments below never write
    # through to (or warn about) ``self.data``.
    allele_data = self.data[["chr", "start"]].copy()
    allele_data[["end"]] = self.data[["start"]] + 1
    allele_data[["ref", 'alt']] = self.data[["ref", "alt"]]
    if len(extra_cols) > 0:
        allele_data[extra_cols] = self.data[extra_cols]

    # NOTE(review): ``cols=`` is the keyword the surrounding code uses; newer
    # pandas spells it ``subset=`` - confirm against the pinned version.
    allele_data.drop_duplicates(cols=["chr", "start", "alt"], inplace=True)

    # Rows unique on (chr, start, alt) are unique as whole rows too, so no
    # second de-duplication pass is needed before writing.
    allele_data.to_csv(allele_file, header=header, index=False, sep="\t")
    return allele_file
def test_basic(self):
    """Smoke-test ``f_prepare_deepsea_fastq_based_on_vcf`` on chr22 data.

    Uses the YY1 peak file and the chr22 VCF under ``deepsea/tests/data``
    and the shared ``deepsea/tmp/`` directory. The test makes no assertion
    on the result; it passes when the call completes without raising.
    """
    chr_str = 'chr22'
    peak_file = '%s/deepsea/tests/data/yy1.sorted.bed' % project_dir
    vcf_file = '%s/deepsea/tests/data/%s.merge.head.vcf.gz' % (project_dir, chr_str)

    # NOTE(review): a per-test sub-directory used to be computed here and was
    # immediately overwritten by the shared directory; only the effective
    # (shared) value is kept.
    tmp_dir = '%s/deepsea/tmp/' % (project_dir)
    my.f_ensure_make_dir(tmp_dir)

    f_prepare_deepsea_fastq_based_on_vcf(peak_file, vcf_file, tmp_dir)
def extract_snp(self, data_dir):
    """Write the heterozygous sites of ``self.data`` to a space-separated file.

    Only rows with ``het_type == 'het'`` are used. The columns written are
    ``chr, start, strand, ref, alt`` with ``strand`` fixed to ``'+'``; rows
    are de-duplicated on ``(chr, start, alt)`` and re-indexed ``0..n-1``.
    The file has a header row and the index as its first column.

    Original author's notes: intended for read simulation; format
    ``snp_name chr1 933790 + G A`` (described there as zero-based).

    Args:
        data_dir: Prefix the generated file name is appended to directly
            (no separator is inserted).

    Returns:
        Path of the file that was written.
    """
    snp_path = data_dir + my.f_generate_tmp_file_name("snp")

    is_het = self.data.het_type == 'het'
    snp_table = self.data.ix[is_het, ["chr", "start"]]
    # Self-assignment kept from the original; it leaves the values as-is.
    snp_table[["start"]] = snp_table[["start"]]
    snp_table["strand"] = '+'
    snp_table[["ref", 'alt']] = self.data.ix[is_het, ["ref", "alt"]]

    snp_table.drop_duplicates(cols=["chr", "start", "alt"], inplace=True)
    snp_table.index = range(0, snp_table.shape[0])

    unique_rows = snp_table.drop_duplicates()
    unique_rows.to_csv(snp_path, header=True, index=True, sep=" ")
    return snp_path
def extract_bed(self):
    """Return a BED file of the table's locations, creating it on first use.

    When ``self.loc_file`` is already set it is returned unchanged.
    Otherwise a file with the unique ``chr, start, end, name`` rows of
    ``self.data`` (``name`` is ``chr-start``) is written next to
    ``self.file_path``, remembered in ``self.loc_file`` and returned.

    Returns:
        Path of the BED file.
    """
    if self.loc_file is not None:
        return self.loc_file

    logging.info('Generate new bed')
    data_dir = os.path.dirname(self.file_path)
    # os.path.join keeps the file relative when ``file_path`` has no
    # directory part; plain ``data_dir + '/'`` would point at the root.
    bed_file = os.path.join(data_dir,
                            my.f_generate_tmp_file_name("loc.bed"))

    # Copy first so adding ``name`` never touches ``self.data``.
    bed_data = self.data[["chr", "start", 'end']].copy()
    bed_data["name"] = bed_data["chr"] + "-" + bed_data["start"].map(str)
    bed_data.drop_duplicates().to_csv(bed_file, header=False, index=False,
                                      sep="\t")

    self.loc_file = bed_file
    return bed_file
def extract_bed(self, data_dir, filter_col=None, filter_val=None, add_info = None):
    """Write the locations of ``self.data`` as a BED file in ``data_dir``.

    Each row becomes ``chr, start - 1, start, name`` (``start`` cast to
    int), where ``name`` is ``chr-<new start>`` optionally followed by
    ``-<add_info value>``. Duplicate rows are dropped before writing.

    Args:
        data_dir: Directory for the output (joined with ``/``).
        filter_col: When given, only rows whose value in this column equals
            ``filter_val`` are used.
        filter_val: Value compared against ``filter_col``.
        add_info: Optional ``self.data`` column appended to ``name``.

    Returns:
        Path of the file that was written.
    """
    bed_file = data_dir + '/' + my.f_generate_tmp_file_name("loc.bed")

    if filter_col is None:
        bed_data = self.data[["chr", "start"]]
    else:
        keep_rows = self.data[filter_col] == filter_val
        bed_data = self.data[keep_rows][['chr', 'start']]

    original_start = bed_data["start"].astype(int)
    bed_data["end"] = original_start
    bed_data["start"] = original_start - 1

    # The name is built from the already shifted start.
    bed_data["name"] = bed_data["chr"] + "-" + bed_data["start"].map(str)
    if add_info is not None:
        bed_data["name"] = bed_data["name"] + "-" + self.data[add_info]

    bed_data.set_index(keys=["chr", "start"], inplace=True, drop=False)
    unique_rows = bed_data.drop_duplicates()
    unique_rows.to_csv(bed_file, header=False, index=False, sep="\t")
    return bed_file
def f_extract_features_on_location_dp(database_file, loc_tf, feature_list, cell, data_dir, rmdup=False, labs='', guest_extract_flag=False, mapQ=0, debug=False):
    """Merge ref/alt read depths from BAM files into a central database table.

    The table is loaded with ``loc.data_table(database_file)`` and its
    locations are exported with ``extract_loc``. Columns matching
    ``.*simulate<loc_tf>_dp_encode`` and ``.*_<loc_tf>_dp_encode`` are
    dropped first. Then, for every feature in ``feature_list``, the matching
    BAM files in ``data_dir`` are processed with
    ``f_extract_depth_from_bam`` and the resulting depth columns are merged
    into the table as ``ref_<tf>_dp<suffix>`` / ``alt_<tf>_dp<suffix>``.

    BAM lookup order per feature: ``rmdup.bam`` (only when ``rmdup``), then
    ``sorted.bam``, then any ``bam``. Files without a ``Rep<digit>`` tag, or
    whose name does not match the lab pattern, are reported and skipped.

    Args:
        database_file: Path handed to ``loc.data_table``.
        loc_tf: Name used to select stale ``*_dp_encode`` columns to drop.
        feature_list: Features (TF names) to process.
        cell: Cell name used in the BAM file patterns.
        data_dir: Directory searched for BAM files; temp file names are
            appended to it directly (no separator is inserted).
        rmdup: Prefer duplicate-removed BAM files when true.
        labs: Lab names, joined with ``|`` into the file patterns. When left
            as ``''`` the column suffix is ``_broad<rep>``.
        guest_extract_flag: When true, replicate 1 uses suffix ``_guest``.
        mapQ: Forwarded to ``f_extract_depth_from_bam``.
        debug: Opens ipdb when equal to ``True``.

    Returns:
        None. NOTE(review): the merged table is neither saved nor returned
        in this function - confirm where it is persisted.
    """
    if debug == True:
        import ipdb
        ipdb.set_trace()

    tf_database = loc.data_table(database_file)
    loc_file = tf_database.extract_loc(data_dir)

    # Drop depth columns left over from earlier runs for this location TF.
    error_cols = my.grep_list('.*simulate%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    my.f_print_list(error_cols)
    tf_database.drop_feautre(error_cols)
    error_cols = my.grep_list('.*_%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    tf_database.drop_feautre(error_cols)

    # These patterns do not depend on the feature or file: build them once.
    lab_alternation = "|".join(labs)
    rep_regex = re.compile(r".*(Rep[1-9]).*", flags=re.IGNORECASE)
    lab_regex = re.compile(r".*(%s).*%s.*" % (lab_alternation, cell),
                           flags=re.IGNORECASE)

    for tf in feature_list:
        print(tf)
        bam_files = []
        bam_pattern = r'(%s).*%s-%s[\.-].*' % (lab_alternation, cell, tf)

        if rmdup == True:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + "rmdup.bam$")
        if rmdup == False or bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + "sorted.bam$")
        if bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + "bam$")

        logging.info('bam files:' + data_dir + bam_pattern)
        logging.info(bam_files)

        for bam_file in bam_files:
            rep_object = rep_regex.match(bam_file)
            lab_match = lab_regex.match(bam_file)

            if lab_match is None:
                # Without a lab the column names cannot be built; skip the
                # file instead of failing on ``None.group``.
                print("Lab is missing %s in labs(%s)" % (bam_file, ' '.join(labs)))
                continue
            lab = lab_match.group(1)

            if rep_object is None:
                print("Escape the %s" % bam_file)
                continue

            loc_dir = data_dir
            vcf_file = loc_dir + my.f_generate_tmp_file_name("vcf")
            dp_file = loc_dir + my.f_generate_tmp_file_name("dp")
            f_extract_depth_from_bam(loc_file, bam_file, vcf_file, dp_file,
                                     mapQ)

            # Map the replicate tag to the column-name suffix.
            if labs != '':
                suffix_stem = '_%s' % lab
            else:
                suffix_stem = '_broad'
            rep_names = dict(('Rep%s' % i, '%s%s' % (suffix_stem, i))
                             for i in range(1, 8))
            if guest_extract_flag == True:
                rep_names['Rep1'] = '_guest'

            # NOTE(review): the tag is matched case-insensitively and for
            # digits 1-9, but only 'Rep1'..'Rep7' are keys here; other tags
            # raise KeyError as before.
            rep_suffix = rep_names[rep_object.group(1)]
            feature_data = tf_database.read_feature_replace_name(
                dp_file, ["chip_ref_dp", "chip_alt_dp"],
                ["ref_%s_dp%s" % (tf, rep_suffix),
                 "alt_%s_dp%s" % (tf, rep_suffix)])
            tf_database.merge_feature(feature_data)
def extract_loc(self, data_dir, header=False):
    """Write the unique ``(chr, start)`` pairs of ``self.data`` to a file.

    Args:
        data_dir: Prefix the generated file name is appended to directly
            (no separator is inserted).
        header: Passed to ``DataFrame.to_csv`` as ``header``.

    Returns:
        Path of the tab-separated file that was written.
    """
    loc_file = data_dir + my.f_generate_tmp_file_name("loc")
    unique_positions = self.data[["chr", "start"]].drop_duplicates()
    unique_positions.to_csv(loc_file, header=header, index=False, sep="\t")
    return loc_file
def get_ref_and_alt_peak_fastq_files_from_database(self, output_dir, hg19_file, file_prefix = None, target_lab = '', debug = False):
    """Write ref- and alt-allele sequences for 100 bp peak windows.

    For rows whose ``lab_<target_lab>`` is not ``'.'``, a window starting at
    ``peak_<target_lab>_dis - 50`` and ending 100 later is cut from the
    genome with pybedtools. For every window two records are produced: one
    with the reference allele and one with the alternative allele at the
    variant position (the allele upper-case, the rest lower-case). Both
    sets are written through ``write_records_fastq``.

    Side effect: ``self.data`` gains the column
    ``peak_<target_lab>_bed_start``.

    Args:
        output_dir: Directory for all generated files.
        hg19_file: Genome FASTA, resolved via ``pybedtools.example_filename``.
        file_prefix: Base name of the outputs (``.alt.fastq`` /
            ``.ref.fastq`` are appended). When ``None`` a temporary base
            name is generated.
        target_lab: Lab name used to select the peak/lab columns.
        debug: Opens ipdb when equal to ``True``.

    Returns:
        ``[ref_sequence_file, alt_sequence_file, allele_data]`` where
        ``allele_data`` carries the ref sequence in its ``fastq`` column.

    Raises:
        AssertionError: The reference allele differs from the genome base.
    """
    import shutil
    from Bio import SeqIO

    if debug == True:
        import ipdb; ipdb.set_trace()

    peak_start = 'peak_%s_bed_start' % target_lab
    self.data[peak_start] = self.data['peak_%s_dis' % target_lab] - 50
    print(my.grep_list('lab', self.data.columns.tolist()))

    lab_data = self.data[self.data['lab_%s' % target_lab] != '.']
    peak_data = lab_data.ix[:, ['chr', peak_start]]
    peak_data[peak_start] = peak_data.ix[:, peak_start].astype('float').astype('int')
    peak_data['end'] = peak_data[peak_start] + 100

    # extract_allele reads ``self.data``: swap in the filtered table and
    # always swap the full table back, even if the export fails.
    full_data = self.data
    self.data = lab_data
    try:
        allele_file = self.extract_allele(output_dir, header=True)
    finally:
        self.data = full_data

    # The temp file is only needed for this read; remove it right away so it
    # is not left behind when a later step raises.
    try:
        allele_data = pd.io.parsers.read_csv(allele_file, sep="\t",
                                             index_col=None)
    finally:
        os.remove(allele_file)

    if allele_data.shape[0] == 0:
        print("empty input")

    bed_str = peak_data.to_string(header=False, index=False)
    bed_file = pybedtools.BedTool(bed_str, from_string=True)
    fasta = pybedtools.example_filename(hg19_file)
    a = bed_file.sequence(fi=fasta)

    fasta_file = os.path.join(output_dir,
                              my.f_generate_tmp_file_name('fasta'))
    shutil.copyfile(a.seqfn, fasta_file)

    allele_data['fastq'] = ''
    # NOTE(review): assumes peak_data and allele_data have the same number
    # of rows in the same order - confirm, since alleles are de-duplicated.
    peak_data.index = allele_data.index
    mutation_pos_col = allele_data.ix[:, 'start'] - peak_data.ix[:, peak_start] - 1

    alt_fastq_records = []
    ref_fastq_records = []
    with open(fasta_file) as fasta_handle:
        for i, record in enumerate(SeqIO.parse(fasta_handle, "fasta")):
            mutation_pos = mutation_pos_col[i]
            line = allele_data.ix[i,]
            ref_allele = line[3]
            alt_allele = line[4]

            assert ref_allele.upper() == record[mutation_pos].upper(), "Ref Allele doesn't Match'"
            record.seq.alphabet = IUPAC.unambiguous_dna

            # Build a separate record for the alt allele; its sequence is
            # replaced below.
            mutation_record = SeqRecord(record, id = record.id,
                                        name = record.name,
                                        description = record.description)
            mutation_seq = record.seq.lower().tomutable()
            mutation_seq[mutation_pos] = alt_allele.upper()
            mutation_record.seq = mutation_seq.toseq()
            mutation_record.seq.alphabet = IUPAC.unambiguous_dna

            ref_record = record
            ref_seq = record.seq.lower().tomutable()
            ref_seq[mutation_pos] = ref_allele.upper()
            ref_record.seq = ref_seq.toseq()

            alt_fastq_records.append(mutation_record)
            ref_fastq_records.append(ref_record)
            allele_data.ix[i, 'fastq'] = str(ref_record.seq)

    if file_prefix is None:
        # The default used to fail on ``None + str``; use a generated base
        # name so the two outputs still get distinct suffixes.
        file_prefix = my.f_generate_tmp_file_name('seq')

    alt_sequence_file = self.write_records_fastq(
        output_dir, alt_fastq_records, prefix = file_prefix + '.alt.fastq')
    ref_sequence_file = self.write_records_fastq(
        output_dir, ref_fastq_records, prefix = file_prefix + '.ref.fastq')
    return [ref_sequence_file, alt_sequence_file, allele_data]
def get_ref_and_alt_fastq_files_from_database(self, output_dir, hg19_file, flanking_length =30, file_prefix = None, debug = False):
    """Write ref- and alt-allele sequences for windows around each variant.

    For every unique allele a window from ``start - flanking_length`` to
    ``start + flanking_length - 1`` is cut from the genome with pybedtools.
    Two records are produced per window: one with the reference allele and
    one with the alternative allele at index ``flanking_length - 1`` (the
    allele upper-case, the rest lower-case). Both sets are written through
    ``write_records_fastq``.

    Side effect: ``self.data`` gains a ``pwm_ref_strand`` column filled with
    ``'+'`` when it has none.

    Args:
        output_dir: Directory for all generated files.
        hg19_file: Genome FASTA, resolved via ``pybedtools.example_filename``.
        flanking_length: Half-width of the window around the variant.
        file_prefix: Base name of the outputs (``.alt.fastq`` /
            ``.ref.fastq`` are appended). When ``None`` a temporary base
            name is generated.
        debug: Opens ipdb when equal to ``True``.

    Returns:
        ``[ref_sequence_file, alt_sequence_file, allele_data]`` where
        ``allele_data`` carries the ref sequence in its ``fastq`` column.

    Raises:
        AssertionError: The reference allele differs from the genome base.
    """
    import shutil
    from Bio import SeqIO

    if debug == True:
        import ipdb; ipdb.set_trace()

    if 'pwm_ref_strand' not in self.data.columns:
        self.data['pwm_ref_strand'] = '+'

    allele_file = self.extract_allele(output_dir, header=True,
                                      extra_cols = ['pwm_ref_strand'])
    pssm_len = flanking_length

    # The temp file is only needed for this read; remove it right away so it
    # is not left behind when a later step raises.
    try:
        allele_data = pd.io.parsers.read_csv(allele_file, sep="\t",
                                             index_col=None)
    finally:
        os.remove(allele_file)

    if allele_data.shape[0] == 0:
        print("empty input")

    # Copy so the BED columns are built on an independent frame.
    bed_data = allele_data[["chr", "start"]].copy()
    bed_data[["start"]] = allele_data[["start"]] - pssm_len
    bed_data[["end"]] = allele_data[["start"]] + pssm_len - 1
    bed_data["name"] = '.'
    bed_data['score'] = '.'
    bed_data[['strand']] = allele_data[['pwm_ref_strand']]

    bed_str = bed_data.to_string(header=False, index=False)
    bed_file = pybedtools.BedTool(bed_str, from_string=True)
    fasta = pybedtools.example_filename(hg19_file)
    a = bed_file.sequence(fi=fasta)

    fasta_file = os.path.join(output_dir,
                              my.f_generate_tmp_file_name('fasta'))
    shutil.copyfile(a.seqfn, fasta_file)

    allele_data['fastq'] = ''
    # The variant sits at the same offset in every window.
    mutation_pos = pssm_len - 1

    alt_fastq_records = []
    ref_fastq_records = []
    with open(fasta_file) as fasta_handle:
        for i, record in enumerate(SeqIO.parse(fasta_handle, "fasta")):
            line = allele_data.ix[i,]
            ref_allele = line[3]
            alt_allele = line[4]

            assert ref_allele.upper() == record[mutation_pos].upper(), "Ref Allele doesn't Match'"
            record.seq.alphabet = IUPAC.unambiguous_dna

            # Build a separate record for the alt allele; its sequence is
            # replaced below.
            mutation_record = SeqRecord(record, id = record.id,
                                        name = record.name,
                                        description = record.description)
            mutation_seq = record.seq.lower().tomutable()
            mutation_seq[mutation_pos] = alt_allele.upper()
            mutation_record.seq = mutation_seq.toseq()
            mutation_record.seq.alphabet = IUPAC.unambiguous_dna

            ref_record = record
            ref_seq = record.seq.lower().tomutable()
            ref_seq[mutation_pos] = ref_allele.upper()
            ref_record.seq = ref_seq.toseq()

            alt_fastq_records.append(mutation_record)
            ref_fastq_records.append(ref_record)
            allele_data.ix[i, 'fastq'] = str(ref_record.seq)

    if file_prefix is None:
        # The default used to fail on ``None + str``; use a generated base
        # name so the two outputs still get distinct suffixes.
        file_prefix = my.f_generate_tmp_file_name('seq')

    alt_sequence_file = self.write_records_fastq(
        output_dir, alt_fastq_records, prefix = file_prefix + '.alt.fastq')
    ref_sequence_file = self.write_records_fastq(
        output_dir, ref_fastq_records, prefix = file_prefix + '.ref.fastq')
    return [ref_sequence_file, alt_sequence_file, allele_data]