def test_add_tf_database(self):
    db_file = my.f_create_file_name(self.test_dir, 'gmtest', 'ctcf', "database")
    tf_db = data_table(db_file)
    update_cols = my.grep_list('^(ref|alt)$', tf_db.data.columns)
    new_cols = my.grep_list('genotype', tf_db.data.columns)
    expected_cols = ['chr', 'start']
    small_db = tf_db
    small_db.data = tf_db.data.ix[0:5, expected_cols + update_cols + new_cols]

    db_file2 = my.f_create_file_name(self.test_dir, 'gmtest', 'znf143', "database")
    tf_db2 = data_table(db_file2)
    tf_db2.data = tf_db2.data.ix[0:5, expected_cols + update_cols + new_cols]

    suffix = ['_tf1', '_tf2']
    merged_file = self.test_dir + '/gmtest_merged_db.database'
    merged_db = data_table(merged_file, data=small_db.data[expected_cols + update_cols])
    merged_db.add_another_database(tf_db, expected_cols, update_cols, new_cols, '_tf1', debug=False)
    merged_db.add_another_database(tf_db2, expected_cols, update_cols, new_cols, '_tf2', debug=False)
    merged_db.save_data()

    # The merged database should cover all the coordinates of the second database,
    # and the newly added genotype column should carry the '_tf2' suffix.
    self.assertTrue(set(tf_db2.get_cord_name()) <= set(merged_db.get_cord_name()))
    new_add_rows = tf_db2.data.index[0]
    self.assertEqual(merged_db.data.ix[new_add_rows, 'genotype_tf2'],
                     tf_db2.data.ix[new_add_rows, 'genotype'])
def check_lokcer(self, pattern, timeout=10000000):
    # Poll the locker directory until every locker file matching the pattern
    # appears, or until the timeout (in seconds) is reached.
    import os.path
    import time
    enquiry_lockers = my.grep_list(pattern, self.locker_list)
    return_status = True
    if len(enquiry_lockers) == 0:
        logging.warning("The enquiry locker doesn't exist: " + pattern)
        return_status = False
    else:
        for locker_file in enquiry_lockers:
            print "Check the locker: " + locker_file
            total_time = 0
            while True:
                if total_time > timeout:
                    return_status = False
                    logging.warning(locker_file + " reached timeout")
                    break
                if os.path.isfile("/homed/home/shi/locker_dir/%s" % locker_file):
                    print "Got the locker"
                    self.locker_list.remove(locker_file)
                    break
                time.sleep(10)
                total_time = total_time + 10
    return return_status
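# Usage sketch (illustrative only; assumes a pipeline object whose
# self.locker_list holds locker file names that finished jobs drop into
# /homed/home/shi/locker_dir/):
#
#   pipeline.locker_list = ['map-gm12878-ctcf.locker']
#   if pipeline.check_lokcer('ctcf', timeout=3600):
#       print "ctcf mapping finished, continue the pipeline"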
def f_check_loc_cols(input_data):
    # Make sure the genomic location columns (start/end/pos) are integer-typed,
    # coercing and logging any column that is not.
    matched_cols = my.grep_list('start|end|pos', input_data.columns.tolist())
    for loc_col in matched_cols:
        if input_data[loc_col].dtype != 'int64':
            logging.error('Error data type in %s', loc_col)
            input_data[loc_col] = input_data[loc_col].astype(int)
    return input_data
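# Usage sketch (illustrative only; the column names are hypothetical):
#
#   import pandas as pd
#   df = pd.DataFrame({'chr': ['chr1'], 'start': ['100'], 'end': ['200']})
#   df = f_check_loc_cols(df)  # 'start' and 'end' are coerced to int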
def f_get_tf_peak_list(project_dir, version='processed'):
    # Collect the GM12878 narrowPeak files under the project peak directory,
    # then remove blacklisted and duplicated datasets.
    tf_dir = '%s/data/raw_data/tf/encode_peaks/%s/' % (project_dir, version)
    peak_list_raw = my.f_shell_cmd("find %s -name '*gm12878-*.narrowPeak'" % tf_dir,
                                   quiet=True).split('\n')
    black_list = my.grep_list(".*(--|Rep[1-9]|-myc|xyy1|test|pax5n19|embl|encode-)",
                              peak_list_raw)
    duplicate_list = [
        'uta-gm12878-ctcf.narrowPeak', 'uw-gm12878-ctcf.narrowPeak',
        'sydh-gm12878-yy1.narrowPeak', 'sydh-gm12878-rad21.narrowPeak',
        'haib-gm12878-p300.narrowPeak', 'ut-gm12878-cmyc.narrowPeak',
        'haib-gm12878-pol24h8.narrowPeak', 'sydh-gm12878-pol2.narrowPeak',
        'uta-gm12878-pol2.narrowPeak'
    ]
    peak_list = list(
        set(peak_list_raw) - set(['']) - set(black_list) -
        set(my.grep_list('.*(%s)' % '|'.join(duplicate_list), peak_list_raw)))
    logging.info('Length of peaks: %s' % len(peak_list))
    return peak_list
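# Usage sketch (illustrative only; the project path is hypothetical):
#
#   peak_list = f_get_tf_peak_list('/homed/home/shi/projects/wgs')
#   for peak_file in peak_list[:3]:
#       print peak_file  # e.g. .../sydh-gm12878-znf143.narrowPeak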
def head_file(self, pattern=".*", wc_flag=True, n=10):
    # Print the first n lines (and optionally the line count) of each new file
    # whose name matches the given pattern.
    import os
    print "\n============================="
    print "=========Sample Files========"
    print "============================="
    print self.new_files()
    file_list = my.grep_list(pattern, self.new_files())
    os.chdir(self.test_dir)
    for single_file in file_list:
        print "\n=========%s============" % single_file
        if wc_flag == True:
            cmd = "wc -l %s" % single_file
            print "[File lines:]", "\t".join(
                my.f_shell_cmd(cmd, quiet=True).replace("\n", "").split(" "))
            print ""
        cmd = "head -n %s %s" % (n, single_file)
        my.f_shell_cmd(cmd)
def f_extract_features_on_location_dp(database_file, loc_tf, feature_list, cell,
                                      data_dir, rmdup=False, labs='',
                                      guest_extract_flag=False, mapQ=0, debug=False):
    # Extract the read-depth (dp) information, according to the locations stored in
    # database_file (loc_tf in the given cell), from the bam files of each feature
    # in feature_list. All extracted columns are merged into the central database file.
    if debug == True:
        import ipdb
        ipdb.set_trace()
    tf_database = loc.data_table(database_file)
    loc_file = tf_database.extract_loc(data_dir)

    # Drop stale depth columns left over from earlier runs before re-extracting.
    error_cols = my.grep_list('.*simulate%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    my.f_print_list(error_cols)
    tf_database.drop_feautre(error_cols)
    error_cols = my.grep_list('.*_%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    tf_database.drop_feautre(error_cols)

    for tf in feature_list:
        print tf
        # Prefer duplicate-removed bams, then sorted bams, then any bam.
        bam_files = []
        bam_pattern = '(%s).*%s-%s[\.-].*' % ("|".join(labs), cell, tf)
        if rmdup == True:
            bam_files = my.f_grep_files_from_dir(data_dir, bam_pattern + "rmdup.bam$")
            rmdup_str = ".rmdup"
        if rmdup == False or bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir, bam_pattern + "sorted.bam$")
            rmdup_str = ""
        if bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir, bam_pattern + "bam$")
        logging.info('bam files: ' + data_dir + bam_pattern)
        logging.info(bam_files)

        for bam_file in bam_files:
            # Parse the replicate number and the lab from the bam file name.
            rep_object = re.match(r".*(Rep[1-9]).*", bam_file, flags=re.IGNORECASE)
            lab = re.match(r".*(%s).*%s.*" % ('|'.join(labs), cell), bam_file,
                           flags=re.IGNORECASE)
            if lab is None:
                print "Lab is missing %s in labs (%s)" % (bam_file, ' '.join(labs))
                continue
            lab = lab.group(1)
            if rep_object is None:
                print "Skip %s" % bam_file
                continue
            loc_dir = data_dir
            vcf_file = loc_dir + my.f_generate_tmp_file_name("vcf")
            dp_file = loc_dir + my.f_generate_tmp_file_name("dp")
            f_extract_depth_from_bam(loc_file, bam_file, vcf_file, dp_file, mapQ)

            # Map each replicate to a lab-specific column suffix.
            rep_names = {}
            if labs != '':
                for i in range(1, 8):
                    rep_names['Rep%s' % i] = '_%s%s' % (lab, i)
            else:
                for i in range(1, 8):
                    rep_names['Rep%s' % i] = '_broad%s' % i
            if guest_extract_flag == True:
                rep_names['Rep1'] = '_guest'

            feature_data = tf_database.read_feature_replace_name(
                dp_file, ["chip_ref_dp", "chip_alt_dp"],
                ["ref_%s_dp%s" % (tf, rep_names[rep_object.group(1)]),
                 "alt_%s_dp%s" % (tf, rep_names[rep_object.group(1)])])
            tf_database.merge_feature(feature_data)
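# Usage sketch (illustrative only; paths, labs and feature names are
# hypothetical, and data_dir is expected to contain the sorted bam files):
#
#   f_extract_features_on_location_dp(database_file='.../gm12878-ctcf.database',
#                                     loc_tf='ctcf',
#                                     feature_list=['ctcf', 'yy1'],
#                                     cell='gm12878',
#                                     data_dir='.../tf/',
#                                     rmdup=True, labs=['haib', 'sydh'])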
deepsea_out = '%s/data/%s/deep_result/all/chrMergeTF/' % (project_dir, batch_name)
outdir = '%s/%s/' % (deepsea_out, chr_str)
logging.info('Out dir: %s', outdir)
print vcf_df.head()

peak_file_df_rmdup = f_get_peak_file_df_rmdup(project_dir, version='processed')
predictors = pd.read_csv('%s/deepsea/resources/predictor.names' % project_dir,
                         header=None)
print predictors.head()

# Keep only the GM12878 predictors; duplicated names get a numeric suffix.
gm12878_predictors = f_add_suffix_on_duplicates(
    my.grep_list('gm12878', predictors.ix[:, 0]))
logging.info('GM12878 features: %s' % len(gm12878_predictors))
for deepsea_col in gm12878_predictors:
    vcf_df[deepsea_col] = 0

target_tf = 'CTCF'
print filter(lambda x: re.search(r'GM12878[|]%s[|]' % target_tf, x, re.IGNORECASE),
             gm12878_predictors)

if f_judge_debug(DEBUG):
    import ipdb
    ipdb.set_trace()

for loc_tf in peak_file_df_rmdup.tf:
    pred_file = '%s/%s.out.%s' % (outdir, loc_tf, value_type)
    file_list = my.f_shell_cmd(
        "ssh [email protected] find /home/wenqiang/encode/helas3/ -name '*.fastq.gz'",
        quiet=True).split('\n')
    tf_list_file = '/homed/home/shi/projects/wgs/tf_list.txt'
    tf_list = my.f_parse_tf_list_file(tf_list_file)
    compiled_list = []
    rest_list = list(set(tf_list) - set(compiled_list))
    # Keep only the HeLa-S3 fastq files for the TFs that are not compiled yet.
    map_fastq_list = [
        os.path.basename(fastq_file) for fastq_file in
        my.grep_list('.*-helas3.*(%s)' % '|'.join(rest_list), file_list)
    ]
    map_fastq_list.sort()
    my.f_print_list(map_fastq_list)
    print len(map_fastq_list)
    fastq_gz_list = map_fastq_list
elif 'simulate_mask' in cell_name:
    file_list = my.f_shell_cmd(
        "ssh [email protected] find /home/shi/encode/ -name 'encode-*mask*simulate*.fastq.gz'",
        quiet=True).split('\n')
    fastq_gz_list = [os.path.basename(fastq_file) for fastq_file in file_list]
    my.f_print_list(fastq_gz_list)
elif 'simulate' in cell_name:
def get_ref_and_alt_peak_fastq_files_from_database(self, output_dir, hg19_file,
                                                   file_prefix=None, target_lab='',
                                                   debug=False):
    if debug == True:
        import ipdb
        ipdb.set_trace()
    tf_database = self
    # Build 100bp windows centred on the peak position for the target lab.
    peak_start = 'peak_%s_bed_start' % target_lab
    tf_database.data[peak_start] = tf_database.data['peak_%s_dis' % target_lab] - 50
    print my.grep_list('lab', tf_database.data.columns.tolist())
    lab_data = tf_database.data[tf_database.data['lab_%s' % target_lab] != '.']
    peak_data = lab_data.ix[:, ['chr', peak_start]]
    peak_data[peak_start] = peak_data.ix[:, peak_start].astype('float').astype('int')
    peak_data['end'] = peak_data[peak_start] + 100

    # Extract the allele table for the rows that have a peak in the target lab.
    full_data = tf_database.data
    tf_database.data = lab_data
    allele_file = tf_database.extract_allele(output_dir, header=True)
    allele_data = pd.io.parsers.read_csv(allele_file, sep="\t", index_col=None)
    tf_database.data = full_data
    if allele_data.shape[0] == 0:
        print "empty input"

    # Fetch the reference sequence of each peak window from the hg19 fasta.
    bed_str = peak_data.to_string(header=False, index=False)
    bed_file = pybedtools.BedTool(bed_str, from_string=True)
    fasta = pybedtools.example_filename(hg19_file)
    a = bed_file.sequence(fi=fasta)
    from Bio import SeqIO
    fasta_file = os.path.join(output_dir, my.f_generate_tmp_file_name('fasta'))
    import shutil
    shutil.copyfile(a.seqfn, fasta_file)

    allele_data['fastq'] = ''
    i = -1
    peak_data.index = allele_data.index
    # 0-based position of the variant inside each extracted sequence.
    mutation_pos_col = allele_data.ix[:, 'start'] - peak_data.ix[:, peak_start] - 1
    alt_fastq_records = []
    ref_fastq_records = []
    for record in SeqIO.parse(open(fasta_file), "fasta"):
        i = i + 1
        mutation_pos = mutation_pos_col[i]
        line = allele_data.ix[i, ]
        ref_allele = line[3]
        alt_allele = line[4]
        pwm_ref_strand = line[5]
        assert ref_allele.upper() == record[mutation_pos].upper(), \
            "Ref allele doesn't match"
        record.seq.alphabet = IUPAC.unambiguous_dna

        # Alternative-allele record: same sequence with the variant base substituted.
        mutation_record = SeqRecord(record.seq, id=record.id, name=record.name,
                                    description=record.description)
        mutation_seq = record.seq.lower().tomutable()
        mutation_seq[mutation_pos] = alt_allele.upper()
        mutation_record.seq = mutation_seq.toseq()
        mutation_record.seq.alphabet = IUPAC.unambiguous_dna

        # Reference-allele record: uppercase the reference base at the variant site.
        ref_record = record
        ref_seq = record.seq.lower().tomutable()
        ref_seq[mutation_pos] = ref_allele.upper()
        ref_record.seq = ref_seq.toseq()

        alt_fastq_records.append(mutation_record)
        ref_fastq_records.append(ref_record)
        allele_data.ix[i, 'fastq'] = str(ref_record.seq)

    alt_sequence_file = self.write_records_fastq(output_dir, alt_fastq_records,
                                                 prefix=file_prefix + '.alt.fastq')
    ref_sequence_file = self.write_records_fastq(output_dir, ref_fastq_records,
                                                 prefix=file_prefix + '.ref.fastq')
    os.remove(allele_file)
    return [ref_sequence_file, alt_sequence_file, allele_data]
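# Usage sketch (illustrative only; assumes self is a data_table whose data
# frame carries the peak_<lab>_dis and lab_<lab> columns for the target lab):
#
#   ref_fq, alt_fq, alleles = tf_db.get_ref_and_alt_peak_fastq_files_from_database(
#       output_dir='/tmp/', hg19_file='.../hg19.fa',
#       file_prefix='gm12878-ctcf', target_lab='haib')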
def head(self):
    # Show the head of the non-sample columns; the per-sample columns
    # (NA*/HG* IDs) are dropped from the data.
    sample_cols = my.grep_list('(NA|HG)[0-9]+', self.data.columns)
    show_cols = list(set(self.data.columns) - set(sample_cols))
    self.data = self.data.drop(sample_cols, axis=1)
    print self.data.ix[:, show_cols].head()
def get_sample_cols(self):
    # Return the 1000 Genomes-style sample columns (e.g. NA12878, HG00096).
    return my.grep_list('(NA|HG)[0-9]+', self.data.columns)
def f_grep_wget_from_given_embl_file(index_file, pattern, output_dir, prefix,
                                     download_pattern, test_flag=False, quiet=False,
                                     debug=False):
    if debug == True:
        import ipdb
        ipdb.set_trace()
    import urllib
    matched_lines = my.grep_file(pattern, index_file)
    if matched_lines == None:
        if quiet == False:
            print ("-----------------------Warning--------------------------\n"
                   "No matching for the pattern %s in %s\n" % (pattern, index_file))
        return "failed"
    file_names = [
        my.grep_list(download_pattern, re.split('\t', line))[0]
        for line in matched_lines
    ]
    i = 1
    for file_name in file_names:
        data_url = os.path.dirname(file_name)
        file_name = os.path.basename(file_name)
        tmp, file_suffix = os.path.splitext(file_name)
        # Name the output after the replicate number when present; otherwise
        # number the files sequentially when there are several.
        match_object = re.match(r".*(Rep[1-9]).*", file_name, flags=re.IGNORECASE)
        if match_object or len(file_names) > 1:
            if match_object:
                output_name = prefix + "-" + match_object.group(1) + file_suffix
            else:
                output_name = prefix + "-Rep%s" % i + file_suffix
                i = i + 1
        else:
            output_name = prefix + file_suffix
        output_file = output_dir + "/" + output_name
        if test_flag == False:
            urllib.urlretrieve(url=data_url + "/" + file_name, filename=output_file)
        print "Download " + file_name + " " + data_url + ' ' + output_file
        match_object = re.match(r".*Peak\.gz$", file_name)
        if match_object:
            if test_flag == False:
                f_unzip_targz(output_file)
            print "Unzip " + output_name
    return "success"
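# Usage sketch (illustrative only; the index file and patterns are
# hypothetical examples of an EMBL/ENCODE files.txt index):
#
#   f_grep_wget_from_given_embl_file(index_file='.../files.txt',
#                                    pattern='Gm12878.*Ctcf',
#                                    output_dir='.../encode_peaks/',
#                                    prefix='haib-gm12878-ctcf',
#                                    download_pattern='.*narrowPeak.gz',
#                                    test_flag=True)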
sys.path.insert(0, lib_dir)
import pandas as pd
import p_mymodule as my

project_dir = '%s/expression_var/' % home_dir

# Parse the FANTOM5 enhancer expression file and extract the GM12878 columns.
loc_cell = 'gm12878'
enhancer_path = ('%s/data/fantom5/hg19_permissive_enhancers_expression_rle_tpm.csv'
                 % project_dir)
enhancer_pd = pd.read_csv(enhancer_path, header=0, sep=',')
print enhancer_pd.shape

target_cell_columns = my.grep_list('.*gm12878', enhancer_pd.columns.tolist())
print ['Unnamed: 0'] + target_cell_columns
extract_data = enhancer_pd.loc[:, ['Unnamed: 0'] + target_cell_columns]
print extract_data.head()

# Split the "chr:start-end" coordinate string into separate columns.
coord_data = pd.DataFrame(list(extract_data.loc[:, 'Unnamed: 0'].str.split(':|-')))
extract_data['chr'] = coord_data[0]
extract_data['start'] = coord_data[1]
extract_data['end'] = coord_data[2]
extract_data.columns = ['name', 'rep1', 'rep2', 'rep3', 'chr', 'start', 'end']
bed_data = extract_data.loc[:, ['chr', 'start', 'end', 'rep1', 'rep2', 'rep3']]
print bed_data.head()