def f_process_one_CTCF(loc_bam, head_dir, node_base_dir):
    """Re-header one BAM file and build its tag directory on node storage.

    Three shell commands are issued through ``my.f_shell_cmd``:

    1. ``samtools view -H | sed | samtools reheader`` rewrites the header of
       ``head_dir/loc_bam`` so that a sequence name starting with a digit,
       ``X`` or ``Y`` gains a ``chr`` prefix and ``MT`` becomes ``chrM``;
       the re-headered BAM is written to the per-individual node directory.
    2. ``makeTagDirectory`` is run on the re-headered BAM.
    3. The resulting tag directory is copied back into ``head_dir`` and the
       node directory is removed.

    Args:
        loc_bam: BAM file name (without directory) inside ``head_dir``. The
            individual id is built from its 2nd '-'-separated field and the
            4th '-'-separated field of the part before the first '.'.
        head_dir: Directory holding the input BAM; also receives the final
            tag directory.
        node_base_dir: Parent directory under which the per-individual
            working directory is created.

    Raises:
        IndexError: If ``loc_bam`` does not contain enough '-' fields.
    """
    # Derive the id once; it names both the working dir and the tag dir.
    second_field = loc_bam.split('-')[1]
    fourth_stem_field = loc_bam.split('.')[0].split('-')[3]
    individual_id = second_field + '_' + fourth_stem_field

    node_dir = node_base_dir + '/' + individual_id
    my.f_ensure_make_dir(node_dir)

    # Raw string: the sed back-reference and escaped parentheses reach the
    # shell exactly as written, without relying on invalid Python escapes.
    add_chr_cmd = (
        r"samtools view -H %s/%s | sed -e 's/SN:\([0-9XY]\)/SN:chr\1/' "
        r"-e 's/SN:MT/SN:chrM/' | samtools reheader - %s/%s > %s/%s"
    ) % (head_dir, loc_bam, head_dir, loc_bam, node_dir, loc_bam)
    my.f_shell_cmd(add_chr_cmd)

    mkdir_cmd = 'makeTagDirectory %s/%s %s/%s' % (
        node_dir, individual_id, node_dir, loc_bam)
    my.f_shell_cmd(mkdir_cmd)

    # The node directory is removed even if the copy fails (';' not '&&').
    copy_cmd = 'cp -r %s/%s %s; rm -r %s' % (
        node_dir, individual_id, head_dir, node_dir)
    my.f_shell_cmd(copy_cmd)
def test_basic(self):
    """Run FASTQ preparation on the bundled YY1 peaks and the chr22 VCF.

    Nothing is asserted: the test passes as long as
    ``f_prepare_deepsea_fastq_based_on_vcf`` returns without raising.
    """
    other_col = 9  # not used by this test
    chr_str = 'chr22'

    test_data_vcf = '%s/deepsea/tests/data/%s.merge.head.vcf.gz' % (
        project_dir, chr_str)
    yy1_peaks = '%s/deepsea/tests/data/yy1.sorted.bed' % project_dir

    # A uniquely named scratch path is generated first, but it is replaced
    # straight away by the shared tmp directory, which is the one used.
    scratch_dir = '%s/deepsea/tmp/%s/' % (
        project_dir, my.f_generate_tmp_file_name('t'))
    scratch_dir = '%s/deepsea/tmp/' % (project_dir)
    my.f_ensure_make_dir(scratch_dir)

    fastq_file = f_prepare_deepsea_fastq_based_on_vcf(
        yy1_peaks, test_data_vcf, scratch_dir)
def test_empty_vcf_overlap_with_bed(self):
    """Check that FASTQ preparation raises with an 'Empty overlap' message.

    The example VCF and the DNase peak file are passed to
    ``f_prepare_deepsea_fastq_based_on_vcf``; the call must raise, and the
    text of the exception must mention the empty overlap.
    """
    peak_file = '/homed/home/shi/expression_var//data/raw_data/tf/encode_peaks/uw-gm12878-dnase.narrowPeak'
    vcf_file = '%s/deepsea/examples/deepsea/example.vcf' % project_dir
    tmp_dir = '/tmp/tmpegec75'
    my.f_ensure_make_dir(tmp_dir)

    with self.assertRaises(Exception) as context:
        f_prepare_deepsea_fastq_based_on_vcf(
            peak_file, vcf_file, tmp_dir, debug=False)

    # Checked after the ``with`` block so the assertion actually executes,
    # and against str(exception): ``in`` on the exception object itself does
    # not perform a substring search of the message.
    self.assertIn('Empty overlap between features and vcf file',
                  str(context.exception))
def test_dnase(self):
    """Run FASTQ preparation on the processed GM12878 DNase peak file.

    Uses the chr22 test VCF and the shared deepsea tmp directory. No
    assertion is made; only an exception from the call fails the test.
    """
    other_col = 9  # not used by this test
    chr_str = 'chr22'

    # Alternative inputs that were tried here live in the same "processed"
    # directory: haib-gm12878-runx3.narrowPeak, sydh-gm12878-ctcf.narrowPeak
    dnase_peaks = '/homed/home/shi/expression_var/data/raw_data/tf/encode_peaks/processed/uw-gm12878-dnase.narrowPeak'
    chr_vcf = '%s/deepsea/tests/data/%s.merge.head.vcf.gz' % (project_dir,
                                                               chr_str)

    work_dir = '%s/deepsea/tmp/' % (project_dir)
    my.f_ensure_make_dir(work_dir)

    fastq_file = f_prepare_deepsea_fastq_based_on_vcf(
        dnase_peaks, chr_vcf, work_dir)
    a = 0  # presumably a leftover anchor line for a debugger breakpoint
if f_judge_debug(debug): peak_file_df_rmdup = peak_file_df_rmdup.ix[1:3, :] f_add_break_point() if 'vcf' in vcf_file: for loc_tf in peak_file_df_rmdup.index: final_file = '%s/%s.out.evalue' % (outdir, loc_tf) if os.path.isfile(final_file): logging.info('Skip %s: %s' % (loc_tf, final_file)) continue try: if f_judge_debug(debug): tempdir = './tmp/aaa/' my.f_ensure_make_dir(tempdir) else: tempdir = mkdtemp() tmp_dir = tempdir peak_file = peak_file_df_rmdup.ix[loc_tf, 'file_path'] #vcf_file = '%s/deepsea/tests/data/chr22.merge.head.vcf.gz'%(project_dir) deepsea_tf = peak_file_df_rmdup.ix[loc_tf, 'deepsea_tf'] print "Successfully copied input to working directory " + tempdir try: #logging.info("python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s" % (vcf_file, peak_file, tmp_dir, hg19_file)) my.f_shell_cmd( "python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s" % (vcf_file, peak_file, tmp_dir, hg19_file))
def main():
    """Write sample metadata and build one Sailfish job script per sample.

    Samples are the people listed both in ``metaData/our_sample.list`` and
    in ``metaData/E-GEUV-1.sdrf.txt``. For each of them a line is written to
    the ``metadata`` file, and unless the final gene-level result already
    exists in ``out_dir`` a bash command list is assembled (copy FASTQs to
    the node directory, run ``sailfish quant``, summarise transcripts to
    genes, clean up, move results to ``out_dir``). The commands are printed
    and, when the test flag is off, handed to ``qsub_a_command``.

    When the module has no docstring (``__doc__ is None``) the output
    directory and test flag come from the module-level ``parser``; otherwise
    defaults are used and the test flag is on.

    Relies on module-level names defined elsewhere in the file:
    ``parser``, ``project_dir``, ``script_dir``, ``my`` and
    ``qsub_a_command``.
    """
    if __doc__ is None:
        parser.add_argument('--out_dir',
                            help='Out',
                            default='%s/qsub_445samples/' % project_dir)
        parser.add_argument('--test_flag', help='Test flag', default='T')
        opts = parser.parse_args()
        out_dir = opts.out_dir
        test_flag = (opts.test_flag == 'T')
        job_id = my.f_shell_cmd('echo $JOB_ID', quiet=True).replace('\n', '')
        node_dir = "/state/partition1/shi/tmp_depth/%s/" % job_id
    else:
        out_dir = '%s/qsub_445samples/' % project_dir
        node_dir = out_dir + '/node/'
        test_flag = True
    my.f_ensure_make_dir(out_dir)

    FQ_dir = '%s/fastq/' % project_dir
    geuvadis_meta = '%s/metaData/E-GEUV-1.sdrf.txt' % project_dir
    our_study = '%s/metaData/our_sample.list' % project_dir
    metadata = '%s/metadata' % project_dir

    # Sample list: column 0 is the person id; columns 3 and 1 are stored as
    # gender and population. The first occurrence of a person wins.
    our_people = set()
    gender = {}
    pop = {}
    with open(our_study, 'r') as study_handle:
        for line in study_handle:
            items = line.strip().split('\t')
            person = items[0]
            our_people.add(person)
            gender.setdefault(person, items[3])
            pop.setdefault(person, items[1])

    # Read the GEUVADIS table once and reuse the parsed rows for both the
    # id intersection and the FASTQ lookup.
    with open(geuvadis_meta, 'r') as meta_handle:
        meta_rows = [line.strip().split('\t') for line in meta_handle]

    geu1 = set(row[0] for row in meta_rows)
    of_interest = geu1.intersection(our_people)
    print(of_interest)
    print(len(of_interest))

    # Column 28 holds the FASTQ location; only its base name is kept and
    # re-rooted under FQ_dir.
    person_to_fq = {}
    for items in meta_rows:
        person = items[0]
        if person not in of_interest:
            continue
        person_to_fq.setdefault(person, set()).add(
            FQ_dir + os.path.basename(items[28]))
    print(person_to_fq)

    with open(metadata, 'w') as metadata_file:
        for person in person_to_fq:
            out_curr = node_dir + person + '.sailfish/'

            # Sorted so that the files given to -1 / -2 are chosen
            # deterministically; iterating the set directly gives an
            # arbitrary order.
            fastqs = sorted(person_to_fq[person])
            metadata_file.write(person + '\t' + ','.join(fastqs) + '\t' +
                                out_curr + '\n')

            cur_gender = gender[person]
            cur_pop = pop[person]
            index_dir = '~/expression_var/data/raw_data/pop/%s_dir' % cur_pop
            sailfish_idx = (
                '%s/gencode.v19.annotation.PC.lincRNA.gtf.splicedExon.N' %
                index_dir + cur_gender + '.fa.dedup.fa_IDX_sailfish')
            library_type = '"T=PE:O=><"'

            # If the output is already there, do not launch the job again.
            final_out_file = '%s/%s.sailfish/%squant.gene_level.sf' % (
                out_dir, person, person)
            if os.path.isfile(final_out_file):
                print('Got the results of %s' % person)
                continue

            print('Sailfish %s' % person)
            my.f_remove_dir('%s/%s.sailfish' % (out_dir, person))

            # The quant command needs two mate files. Skip the sample when
            # there are fewer than two or when either one is absent.
            # NOTE(review): the collapsed original appears to skip only when
            # the first file is missing; confirm skipping on either is wanted.
            if len(fastqs) < 2:
                print('Expected 2 fastq files for %s, found %d' %
                      (person, len(fastqs)))
                continue
            missing_fastqs = [
                fastq_file for fastq_file in fastqs[:2]
                if not os.path.isfile(fastq_file)
            ]
            if missing_fastqs:
                for fastq_file in missing_fastqs:
                    print('Missing person %s: %s' % (person, fastq_file))
                continue

            loc_fastqs = [
                os.path.join(out_curr, os.path.basename(fastq_file))
                for fastq_file in fastqs
            ]
            sailfish_exe = '~/packages/Sailfish-0.6.3-Linux_x86-64/bin/sailfish'
            sailfish_cmd = (sailfish_exe + ' quant -i ' + sailfish_idx +
                            ' -l ' + library_type + ' -1 <(gunzip -c ' +
                            loc_fastqs[0] + ') -2 <(gunzip -c ' +
                            loc_fastqs[1] + ') -o ' + out_curr + ' -f')
            gtf = '%s/GENCODE_v19_2014-06-03/gencode.v19.annotation.PC.lincRNA.gtf' % project_dir

            cmds = [
                '#!/usr/bin/env bash',
                'mkdir -p %s' % out_curr,
                'cp -u %s %s' % (' '.join(fastqs), out_curr),
                sailfish_cmd,
                'cd ' + out_curr,
                '%s/TranscriptsToGenes.sh --gtf-file ' % script_dir + gtf +
                ' --exp-file ' + out_curr + '/quant.sf' + ' --res-file ' +
                person + 'quant.gene_level.sf',
                'mv ' + out_curr + '/quant.sf' + ' ' + out_curr + '/' +
                person + 'quant.sf',
                'rm %s/*.fastq.gz' % (out_curr),
                'rm %s/reads.*' % (out_curr),
                'mv %s %s/' % (out_curr, out_dir),
                'rm -r %s' % (out_curr),
            ]
            print('\n'.join(cmds))

            if not test_flag:
                # Commands are joined with the 'qqqq' marker, which is also
                # passed to qsub_a_command as its third argument.
                qsub_a_command('qqqq'.join(cmds),
                               out_dir + person + '_script.sh', 'qqqq', '10G')
def f_novo_mapping(fastq_dir,
                   fastq_gz_file,
                   data_dir,
                   wgs_dir,
                   cell,
                   local_bin,
                   desination_dir,
                   mode=None,
                   short_read=False):
    """Map one gzipped FASTQ with novoalign and deliver a sorted BAM.

    The FASTQ is copied into ``data_dir`` and unpacked, aligned against
    ``wgs_dir/<cell>.nix``, converted and sorted with samtools, and the
    ``.stats.txt`` and ``.sorted.bam`` files are copied to the destination
    directory. Intermediate files are then deleted and ``data_dir`` is
    removed if it is left empty.

    Args:
        fastq_dir: Directory containing ``fastq_gz_file``.
        fastq_gz_file: Name of the gzipped FASTQ. A name matching
            ``.*methy.*`` selects a command without ``-F ILMFQ``.
        data_dir: Working directory (created if needed).
        wgs_dir: Directory holding the ``<cell>.nix`` index.
        cell: Cell name used to pick the index file.
        local_bin: Directory containing the ``novoalign`` executable.
        desination_dir: Where results are copied; ``/test/`` is appended
            when 'test' is in ``mode``.
        mode: Optional string or container checked with ``in`` for 'test'
            (adds `` -#1k ``) and 'unique' (adds `` -r None``). ``None``
            means no extra options.
        short_read: When true (and the file is not a "methy" file), adds
            ``-l 20`` to the first mapping command.

    Returns:
        'ILMFQ' if the first mapping attempt reported exit status 0,
        'default' if the fallback attempt did, otherwise ''.
    """
    my.f_ensure_make_dir(data_dir)
    my.f_copy_to_dir(fastq_dir, fastq_gz_file, data_dir)
    my.f_unzip_targz(data_dir + "/" + fastq_gz_file)

    fastq_file = data_dir + "/" + fastq_gz_file.replace(".gz", "")
    cell_nix_file = wgs_dir + "/" + cell + ".nix"
    file_prefix = fastq_file.replace(".fastq", "")
    map_stats_file = file_prefix + ".stats.txt"
    map_bam_file = file_prefix + ".sam.map"

    # mode defaults to None; membership tests on None raise TypeError, so
    # treat "no mode" as an empty collection.
    requested_modes = mode if mode is not None else ()
    mode_string = ''
    if 'test' in requested_modes:
        mode_string = " -#1k "
        desination_dir = desination_dir + '/test/'
    # Ensure the (possibly adjusted) destination exists before copying to it.
    my.f_ensure_make_dir(desination_dir)
    if 'unique' in requested_modes:
        mode_string = mode_string + " -r None"
    if mode_string == '' and mode is not None:
        logging.error('Unkonwn mode: %s ', mode)

    cmd_args = (local_bin, cell_nix_file, fastq_file, mode_string,
                map_stats_file, map_bam_file)
    if re.match(".*methy.*", fastq_gz_file) is not None:
        map_cmd = "%s/novoalign -d %s -f %s -o SAM %s 2> %s > %s" % cmd_args
    elif short_read:
        map_cmd = "%s/novoalign -d %s -f %s -l 20 -o SAM %s 2> %s > %s" % cmd_args
    else:
        map_cmd = "%s/novoalign -F ILMFQ -d %s -f %s -o SAM %s 2> %s > %s; echo $?" % cmd_args

    # NOTE(review): only the ILMFQ command echoes its exit status. Presumably
    # f_shell_pipe returns the command's stdout; if so, the "methy" and
    # short-read commands can never yield '0\n' and always fall through to
    # the second attempt below. Confirm whether that is intended.
    logging.info("Map cmd: " + map_cmd)
    first_try = my.f_shell_pipe(map_cmd)

    map_stats = ''
    if first_try == '0\n':
        map_stats = 'ILMFQ'
    else:
        logging.warning('Novo output: first try faild %s', first_try)
        map_cmd = "%s/novoalign -d %s -f %s -o SAM %s 2> %s > %s;echo $?" % cmd_args
        logging.info("2nd Map cmd: " + map_cmd)
        second_try = my.f_shell_pipe(map_cmd)
        logging.info('Sencond try: %s', second_try)
        if second_try == '0\n':
            map_stats = 'default'

    sort_cmd = "samtools view -bS %s | samtools sort - %s" % (
        map_bam_file, file_prefix + ".sorted")
    my.f_shell_pipe(sort_cmd)

    prefix_name = os.path.basename(file_prefix)
    my.f_copy_to_dir(data_dir, "%s.stats.txt" % prefix_name, desination_dir)
    my.f_copy_to_dir(data_dir, "%s.sorted.bam" % prefix_name, desination_dir)

    # Remove every intermediate this function created in data_dir.
    for leftover in (map_stats_file, "%s.sorted.bam" % file_prefix,
                     fastq_file, map_bam_file):
        os.remove(leftover)

    remaining = os.listdir(data_dir)
    if not remaining:
        logging.info('Empty dir: %s ' % data_dir)
        os.rmdir(data_dir)
    else:
        logging.warning("Not empty dir")
        logging.warning(remaining)
    return map_stats
# Script: clean every de-duplicated ENCODE peak file and save the result
# under data/raw_data/tf/encode_peaks/processed/ with the same base name.

# Project libraries live under the home directory; put them first on the
# import path before importing the project modules below.
home_dir = os.path.expanduser('~')
lib_dir = '%s/python/' % home_dir
import sys
sys.path.insert(0, lib_dir)
sys.path.insert(0, '%s/expression_var/python/' % home_dir)
import pandas as pd
import p_mymodule as my
from p_project_metadata import *
from p_generate_peak_fastq import chipseq_region

peak_file_df_rmdup = f_get_peak_file_df_rmdup(project_dir)
print(peak_file_df_rmdup.head())

processed_dir = '%s/data/raw_data/tf/encode_peaks/processed/' % project_dir
my.f_ensure_make_dir(processed_dir)

for loc_tf in peak_file_df_rmdup.tf:
    print('Process %s' % loc_tf)

    # Look up this TF's peak file; rows are addressed by the TF name.
    peak_file = peak_file_df_rmdup.ix[loc_tf, 'file_path']
    tf_region = chipseq_region(file_path=peak_file)

    # Merge overlapping peaks, then split peaks with multiple peakMax.
    tf_region.merge_overlapped_peaks()
    tf_region.split_peaks_with_multiple_peakMax(debug=False)
    print(tf_region.binding_df.head())

    # Trim the binding regions and write them out as BED.
    tf_region.bed_trim_binding_regions()
    processed_bed = '%s/%s' % (processed_dir, os.path.basename(peak_file))
    tf_region.save_bed(processed_bed)
reload(fun) reload(loc) cofactor_list_raw = [] feature_list = [] #feature_list=["methy"] #tf_list=["inputigg","inputstd","ctcf"] #tf_list=["ctcf","znf143","bhlhe40","ebf1"] #tf_list=["znf143","ctcf","ebf1"] #tf_list=['brca1', 'chd2', 'elk1', 'max', 'maz', 'mxi1', 'nfya', 'nfyb', 'rad21', 'rfx5', 'smc3', 'stat3', 'tbp', 'usf2'] reload(fun) print my.f_shell_cmd('echo $HOME', quiet=True).replace('\n', '') my.f_unique_element_in_list(guest_cells) guest_cell = guest_cells[0] guest_extract_flag = guest_cell != cell #When the host and guest cell are different, means that try to predict variation impact. my.f_ensure_make_dir(dnase_dir) logging.info('Node dir name: ' + dnase_dir) cofactor_list = [tf_name.lower() for tf_name in cofactor_list_raw] logging.info(cofactor_list) reload(my) reload(fun) #if in clustdell, copy the tf's bam file to the local node #import ipdb; ipdb.set_trace() for tf in tf_list + feature_list + cofactor_list: tf_bam_pattern = "*%s*bam" % tf print tf_bam_pattern f_copy_to_dir(target_dir, tf_bam_pattern, dnase_dir, "-u") if guest_extract_flag == True:
tf_list = ['PU1', 'RPB2'] elif embl_number == '3656': tf_list = ['RNA'] else: print 'Wrong embl number' if embl_number != '': #TF binding data cell_list = list( set(index_data['Characteristics[coriell id]'].str.replace( 'NA', 'NA').tolist())) my.f_print_list(cell_list) for tf in tf_list: data_dir = os.path.join(head_dir, tf) my.f_ensure_make_dir(data_dir) for cell in cell_list: data_pattern = '%s_%s' % (cell.replace('NA', ''), tf) #fastq_prefix=my.f_create_file_prefix(cell, tf, lab, 'Rep1') #fastq_field = 30 #download_state=f_grep_wget_from_given_embl_file(data_index_file, data_pattern, data_dir, fastq_prefix, download_col = fastq_field, test_flag = test_flag, quiet = True, debug = False) bam_field = 36 bam_prefix = my.f_create_file_prefix(cell, tf, lab) #import ipdb; ipdb.set_trace() download_state = f_grep_wget_from_given_embl_file( data_index_file, data_pattern, data_dir, bam_prefix, download_pattern='ftp.*bam', test_flag=test_flag,