def f_bam_remove_dup(bam_file, data_dir, head_dir, picard_java_lib_path):
    """Run Picard MarkDuplicates on one BAM and hand the results to head_dir.

    Args:
        bam_file: BAM file name, relative to ``data_dir``.
        data_dir: Directory holding the input BAM; outputs are written here
            first.
        head_dir: Destination passed to ``my.f_grep_and_copy`` for the
            de-duplicated BAM and the duplicate report.
        picard_java_lib_path: Directory containing ``MarkDuplicates.jar``.

    Side effects:
        Prints and executes the java command, then deletes the
        ``.rmdup.bam`` and ``.duplicate_report.txt`` files from ``data_dir``
        once ``my.f_grep_and_copy`` has been called.
    """
    import p_mymodule as my

    prefix = my.f_get_prefix(bam_file)
    # Both outputs share the "<data_dir>/<prefix>" stem.
    path_stem = data_dir + "/" + prefix
    output_file = path_stem + ".rmdup.bam"
    picard_args = (picard_java_lib_path, data_dir + "/" + bam_file,
                   output_file, path_stem)
    cmd = ("java -jar %s/MarkDuplicates.jar I=%s O=%s "
           "M=%s.duplicate_report.txt ASSUME_SORTED=true "
           "VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true") % picard_args
    print(cmd)
    my.f_shell_cmd(cmd)

    # Hand both products over to head_dir, then clear the local copies.
    my.f_grep_and_copy(data_dir,
                       '%s.(rmdup.bam|duplicate_report.txt)' % prefix,
                       head_dir)
    leftovers = (output_file,
                 os.path.join(data_dir, prefix + '.duplicate_report.txt'))
    for leftover in leftovers:
        os.remove(leftover)
def head_file(self, pattern=".*", wc_flag=True, n=10):
    """Show the first ``n`` lines of every new file selected by ``pattern``.

    Args:
        pattern: Pattern given to ``my.grep_list`` to filter
            ``self.new_files()``.
        wc_flag: When equal to ``True``, also print the ``wc -l`` output
            for each file before its head.
        n: Number of lines requested from ``head``.

    Side effects:
        Changes the working directory to ``self.test_dir`` and runs
        ``wc``/``head`` through ``my.f_shell_cmd``.
    """
    import os

    banner = ("\n===========================",
              "============Sample Files=========",
              "=============================")
    for banner_line in banner:
        print(banner_line)
    print(self.new_files())

    matched = my.grep_list(pattern, self.new_files())
    os.chdir(self.test_dir)
    for name in matched:
        print("\n=========%s============" % name)
        if wc_flag == True:
            wc_out = my.f_shell_cmd("wc -l %s" % name, quiet=True)
            fields = wc_out.replace("\n", "").split(" ")
            # Label, one space, then the wc fields joined by tabs.
            print("[File lines:] " + "\t".join(fields))
            print("")
        my.f_shell_cmd("head -n %s %s" % (n, name))
def wc_file(self, pattern=".*"):
    """Print the ``wc -l`` output for each new file selected by ``pattern``.

    Args:
        pattern: Pattern given to ``my.grep_list`` to filter
            ``self.new_files()``. The default is meant to keep every file.

    Side effects:
        Changes the working directory to ``self.test_dir`` and prints one
        tab-separated line per selected file.
    """
    # Imported locally so the method does not depend on a module-level
    # ``import os`` being present.
    import os

    print("\n===========================")
    print("============WC Files=========")
    print("=============================")

    # ``pattern`` used to be accepted but ignored; apply it the same way
    # head_file does so both methods select files consistently.
    selected = my.grep_list(pattern, self.new_files())
    os.chdir(self.test_dir)
    for single_file in selected:
        wc_output = my.f_shell_cmd("wc -l %s" % single_file, quiet=True)
        print("\t".join(wc_output.replace("\n", "").split(" ")))
def f_process_one_CTCF(loc_bam, head_dir, node_base_dir):
    """Reheader one BAM with chr-prefixed names and build its tag directory.

    Steps, all executed through ``my.f_shell_cmd``:
      1. Rewrite the BAM header so ``SN:<digit|X|Y>...`` becomes
         ``SN:chr...`` and ``SN:MT`` becomes ``SN:chrM``, writing the
         reheadered BAM into a per-individual scratch directory.
      2. Run ``makeTagDirectory`` on the reheadered BAM.
      3. Copy the tag directory to ``head_dir`` and delete the scratch
         directory.

    Args:
        loc_bam: BAM file name inside ``head_dir``. The individual id is
            built from the 2nd and 4th ``-``-separated fields (the 4th taken
            from the part before the first ``.``), so the name must contain
            at least four such fields; otherwise ``IndexError`` is raised.
        head_dir: Directory holding the input BAM; receives the tag
            directory.
        node_base_dir: Parent directory for the scratch directory.
    """
    individual_id = (loc_bam.split('-')[1] + '_' +
                     loc_bam.split('.')[0].split('-')[3])
    node_dir = node_base_dir + '/' + individual_id
    my.f_ensure_make_dir(node_dir)

    # Raw strings keep the sed backslashes literal; the command text sent to
    # the shell is unchanged.
    add_chr_cmd = (
        r"samtools view -H %s/%s"
        r" | sed -e 's/SN:\([0-9XY]\)/SN:chr\1/' -e 's/SN:MT/SN:chrM/'"
        r" | samtools reheader - %s/%s > %s/%s"
    ) % (head_dir, loc_bam, head_dir, loc_bam, node_dir, loc_bam)
    my.f_shell_cmd(add_chr_cmd)

    mkdir_cmd = 'makeTagDirectory %s/%s %s/%s' % (node_dir, individual_id,
                                                  node_dir, loc_bam)
    my.f_shell_cmd(mkdir_cmd)

    # Use '&&' rather than ';' so the scratch directory is only removed
    # after the copy succeeded; a failed copy no longer discards the result.
    copy_cmd = 'cp -r %s/%s %s && rm -r %s' % (node_dir, individual_id,
                                               head_dir, node_dir)
    my.f_shell_cmd(copy_cmd)
def sample_file(self, pattern=".*", wc_flag=True, n=10):
    """Print a random sample of lines from each matching file in test_dir.

    Args:
        pattern: Pattern given to ``my.f_grep_files_from_dir`` to select
            files inside ``self.test_dir``.
        wc_flag: When equal to ``True``, also print the ``wc -l`` output
            for each file before the sample.
        n: Maximum number of lines to sample per file. Files with fewer
            lines are printed in full (in random order).

    Side effects:
        Changes the working directory to ``self.test_dir``.
    """
    import os
    import random

    print("\n===========================")
    print("============Sample Files=========")
    print("=============================")

    file_list = my.f_grep_files_from_dir(self.test_dir, pattern, path=False)
    os.chdir(self.test_dir)
    for single_file in file_list:
        print("\n=========%s============" % single_file)
        if wc_flag == True:
            wc_output = my.f_shell_cmd("wc -l %s" % single_file, quiet=True)
            fields = wc_output.replace("\n", "").split(" ")
            print("[File lines:] " + "\t".join(fields))
            print("")

        # ``with`` closes the handle even if reading fails.
        with open(single_file, "r") as file_handle:
            lines = file_handle.readlines()
        # random.sample raises ValueError when asked for more items than
        # exist, so clamp the sample size for short (or empty) files.
        sample_size = min(n, len(lines))
        print("".join(random.sample(lines, sample_size)))
def f_get_tf_peak_list(project_dir, version='processed'):
    """Return the gm12878 narrowPeak paths kept after filtering.

    The ``find`` output under
    ``<project_dir>/data/raw_data/tf/encode_peaks/<version>/`` is reduced by
    dropping the empty string, every entry matched by the black-list
    pattern, and every entry matched by one of the hard-coded duplicate
    file names.

    Args:
        project_dir: Project root directory.
        version: Sub-directory of ``encode_peaks`` to search.

    Returns:
        list: Remaining paths, in set-iteration order.
    """
    tf_dir = '%s/data/raw_data/tf/encode_peaks/%s/' % (project_dir, version)
    find_cmd = "find %s -name '*gm12878-*.narrowPeak'" % (tf_dir)
    peak_list_raw = my.f_shell_cmd(find_cmd, quiet=True).split('\n')

    black_list = my.grep_list(
        ".*(--|Rep[1-9]|-myc|xyy1|test|pax5n19|embl|encode-)", peak_list_raw)

    duplicate_names = (
        'uta-gm12878-ctcf.narrowPeak',
        'uw-gm12878-ctcf.narrowPeak',
        'sydh-gm12878-yy1.narrowPeak',
        'sydh-gm12878-rad21.narrowPeak',
        'haib-gm12878-p300.narrowPeak',
        'ut-gm12878-cmyc.narrowPeak',
        'haib-gm12878-pol24h8.narrowPeak',
        'sydh-gm12878-pol2.narrowPeak',
        'uta-gm12878-pol2.narrowPeak',
    )
    duplicate_hits = my.grep_list('.*(%s)' % '|'.join(duplicate_names),
                                  peak_list_raw)

    # Successive set differences: blanks, black-listed, then duplicates.
    remaining = set(peak_list_raw) - set([''])
    remaining = remaining - set(black_list)
    remaining = remaining - set(duplicate_hits)
    peak_list = list(remaining)

    logging.info('Length of peaks: %s' % len(peak_list))
    return peak_list
my.f_call_shell_fun(cmd) file_names = my.f_parse_file_name(os.path.basename(narrowPeak_file)) print os.path.basename(narrowPeak_file), file_names file_prefix = my.f_get_prefix(narrowPeak_file) if "gm" in file_names[1]: print("Interpret as gm cell") overlab_cmd = "sed '/^#/d' %s/%s.vcf | sed 's/^/chr/g' | intersectBed -u -a stdin -b %s > %s" % ( wgs_dir, file_names[1], output_file, file_prefix + ".wgs.vcf") else: overlab_cmd = "sed '/^#/d' %s/%s.vcf | intersectBed -u -a stdin -b %s > %s" % ( wgs_dir, file_names[1], output_file, file_prefix + ".wgs.vcf") my.f_shell_cmd(overlab_cmd) grep_het_cmd = "f_complete_genome_read_depth %s | f_grep_legal_snp | sed '/^#/d' | grep -v '1/1' > %s " % ( file_prefix + ".wgs.vcf", file_prefix + ".het.loc") grep_alt_cmd = "f_complete_genome_read_depth %s | f_grep_legal_snp | sed '/^#/d' | grep '1[/\|]1' > %s " % ( file_prefix + ".wgs.vcf", file_prefix + ".alt.loc") #os.remove(file_prefix + ".wgs.vcf") print grep_het_cmd print grep_alt_cmd my.f_shell_fun_pipe(grep_het_cmd) my.f_shell_fun_pipe(grep_alt_cmd) if (server_name != "loire"): het_pattern = my.f_create_pattern(cell_list, tf_list, ".het.loc") alt_pattern = my.f_create_pattern(cell_list, tf_list, ".alt.loc") my.f_grep_and_scp_to_loire(bed_dir, het_pattern, syn_dir) my.f_grep_and_scp_to_loire(bed_dir, alt_pattern, syn_dir)
tempdir = './tmp/aaa/' my.f_ensure_make_dir(tempdir) else: tempdir = mkdtemp() tmp_dir = tempdir peak_file = peak_file_df_rmdup.ix[loc_tf, 'file_path'] #vcf_file = '%s/deepsea/tests/data/chr22.merge.head.vcf.gz'%(project_dir) deepsea_tf = peak_file_df_rmdup.ix[loc_tf, 'deepsea_tf'] print "Successfully copied input to working directory " + tempdir try: #logging.info("python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s" % (vcf_file, peak_file, tmp_dir, hg19_file)) my.f_shell_cmd( "python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s" % (vcf_file, peak_file, tmp_dir, hg19_file)) except: raise Exception('Vcf format error.') #retrieve 1100bp instead of 1000bp for supporting deletion variants (<100bp) check_call([ "python2.7 p_fasta2input.py --fasta_file %s/infile.vcf.wt1100.fasta" % tmp_dir ], shell=True) print "Successfully converted to input format" check_call([ "luajit 2_DeepSEA.lua -test_file_h5 " + tempdir + "/infile.vcf.wt1100.fasta.ref.h5" ],
for deepsea_col in my.grep_list('.*%s[|]' % target_tf, gm12878_predictors): vcf_df.ix[pred_data.index, deepsea_col] = pred_data.ix[:, deepsea_col] else: logging.info('Missing %s deepsea output' % loc_tf) assert all(vcf_df.chr == chr_str), 'Error in chr' print vcf_df.ix[:, 1:10].head() zero_variants = (vcf_df.ix[:, gm12878_predictors].sum(axis=1) == 0).sum() logging.info('%s out of %s are zero' % (zero_variants, vcf_df.shape[0])) vcf_df.columns = [ re.sub('None.[0-9]*', 'None', col_name) for col_name in vcf_df.columns ] print pd.isnull(vcf_df.pos).sum() assert pd.isnull(vcf_df.pos).sum() == 0, 'No null positions' print vcf_df.shape print vcf_df.chr vcf_df.index = range(vcf_df.shape[0]) outfile = '%s/data/%s/deep_result/all/chrMergeTF/%s.%s' % ( project_dir, batch_name, chr_str, value_type) vcf_df.to_csv(outfile, sep=',', float_format='%.4e') my.f_shell_cmd('gzip -f %s' % (outfile))
while True: if os.path.exists(full_result_dir): gene_output = my.f_grep_files_from_dir(full_result_dir, '%s.*enet$' % last_gene) else: gene_output = [] if len(gene_output) == 0: time.sleep(time_interval) else: logging.info( 'Check the output %s' % gene_output) interval = start_time - os.path.getmtime(gene_output[0]) if interval < 0: time.sleep(10*time_interval) my.f_shell_cmd('Rscript3 %s/R/r_summary_features_in_one_mode.R --batch_name %s --target_mode %s --chr_str %s ' %(project_dir, batch_name, loc_dir, chr_str )) break else: time.sleep(time_interval) if time.time() - start_time > 10*time_interval: my.f_shell_cmd('Rscript3 %s/R/r_summary_features_in_one_mode.R --batch_name %s --target_mode %s --chr_str %s ' %(project_dir, batch_name, loc_dir, chr_str )) break
def main():
    """Write per-person Sailfish job scripts and optionally submit them.

    Reads the sample list and the E-GEUV-1 SDRF table under ``project_dir``,
    keeps the people present in both, collects their FASTQ paths, writes one
    line per person to ``<project_dir>/metadata`` and assembles a bash command
    list per person. The commands are printed, and handed to
    ``qsub_a_command`` only when ``test_flag`` is False.

    Relies on names that are not defined in this function (``parser``,
    ``project_dir``, ``script_dir``, ``my``, ``qsub_a_command``) —
    presumably module-level; TODO confirm.
    """
    # Options are parsed only when the module has no docstring; otherwise
    # fixed defaults are used and test mode is forced on.
    if __doc__ is None:
        parser.add_argument('--out_dir', help='Out', default='%s/qsub_445samples/' % project_dir)
        parser.add_argument('--test_flag', help='Test flag', default='T')
        opts = parser.parse_args()
        out_dir = opts.out_dir
        test_flag = (opts.test_flag == 'T')
        # Scratch directory is named after the $JOB_ID environment variable.
        node_dir = "/state/partition1/shi/tmp_depth/%s/" % my.f_shell_cmd(
            'echo $JOB_ID', quiet=True).replace('\n', '')
    else:
        out_dir = '%s/qsub_445samples/' % project_dir
        node_dir = out_dir + '/node/'
        test_flag = True
    my.f_ensure_make_dir(out_dir)
    FQ_dir = '%s/fastq/' % project_dir
    geuvadis_meta = '%s/metaData/E-GEUV-1.sdrf.txt' % project_dir
    our_study = '%s/metaData/our_sample.list' % project_dir
    metadata = '%s/metadata' % project_dir
    #import ipdb; ipdb.set_trace()
    our_people = set()
    gender = {}
    pop = {}
    # Sample list is tab-separated: column 0 is the person id, column 1 is
    # stored in ``pop`` and column 3 in ``gender`` (first occurrence wins).
    # NOTE(review): the file objects opened in these loops are never
    # explicitly closed.
    for line in open(our_study, 'r').readlines():
        our_people.add(line.strip().split('\t')[0])
        items = line.strip().split('\t')
        person = items[0]
        person_gender = items[3]
        if person not in gender.keys():
            gender[person] = person_gender
        if person not in pop.keys():
            pop[person] = items[1]
    geu1 = set()
    # First pass over the SDRF table: collect every id in column 0.
    for line in open(geuvadis_meta, 'r').readlines():
        items = line.strip().split('\t')
        geu1.add(items[0])
    of_interest = geu1.intersection(our_people)
    print of_interest
    print len(of_interest)
    person_to_fq = {}
    # Second pass: gather the FASTQ file names for the people of interest.
    for line in open(geuvadis_meta, 'r').readlines():
        items = line.strip().split('\t')
        person = items[0]
        if person not in of_interest:
            continue
        if person not in person_to_fq.keys():
            person_to_fq[person] = set()
        # Column 28 is used as the FASTQ location — presumably per the SDRF
        # layout; TODO confirm. Only its basename is kept, under FQ_dir.
        curr_fq = items[28]
        person_to_fq[person].add(FQ_dir + os.path.basename(curr_fq))
        #print items
    print person_to_fq
    # NOTE(review): metadata_file is not closed anywhere in this function.
    metadata_file = open(metadata, 'w')
    for person in person_to_fq.keys():
        out_curr = node_dir + person + '.sailfish/'
        metadata_file.write(person + '\t' + ','.join(person_to_fq[person]) + '\t' + out_curr + '\n')
        #And run sailfish
        cur_gender = gender[person]
        cur_pop = pop[person]
        #sailfish_idx='%s/Transcriptome/gencode.v19.annotation.PC.lincRNA.gtf.splicedExon.N'% project_dir +cur_gender+'.fa.dedup.fa_IDX_sailfish'
        # The index path is selected by both cur_pop and cur_gender.
        index_dir = '~/expression_var/data/raw_data/pop/%s_dir' % cur_pop
        sailfish_idx = '%s/gencode.v19.annotation.PC.lincRNA.gtf.splicedExon.N' % index_dir + cur_gender + '.fa.dedup.fa_IDX_sailfish'
        #cmd_module='module load sailfish/0.6.3'
        library_type = '"T=PE:O=><"'  #T=PE:O=><:S=SA
        # NOTE(review): fastqs comes from a set, so which file ends up as
        # fastqs[0] (-1) versus fastqs[1] (-2) is not fixed by this code —
        # confirm the order is irrelevant, or sort.
        fastqs = list(person_to_fq[person])
        #If the output is there, don't launch the jobs again.
        final_out_file = '%s/%s.sailfish/%squant.gene_level.sf' % (
            out_dir, person, person)
        if os.path.isfile(final_out_file):
            print 'Got the results of %s' % person
            continue
        else:
            print 'Sailfish %s' % person
            # Clear any partial output before regenerating it.
            my.f_remove_dir('%s/%s.sailfish' % (out_dir, person))
        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; as written, fastqs[1] is only checked when fastqs[0] is
        # missing, and the person is skipped in either case — confirm.
        if not os.path.isfile(fastqs[0]):
            print 'Missing person %s: %s' % (person, fastqs[0])
            if not os.path.isfile(fastqs[1]):
                print 'Missing person %s: %s' % (person, fastqs[1])
                continue
            continue
        # Assemble the bash script, one command per list entry.
        cmds = []
        cmds.append('#!/usr/bin/env bash')
        cmds.append('mkdir -p %s' % out_curr)
        cmds.append('cp -u %s %s' % (' '.join(fastqs), out_curr))
        # Paths of the FASTQ copies inside the scratch directory.
        loc_fastqs = [
            os.path.join(out_curr, os.path.basename(fastq_file))
            for fastq_file in fastqs
        ]
        #cmds.append(cmd_module)
        sailfish_exe = '~/packages/Sailfish-0.6.3-Linux_x86-64/bin/sailfish'
        # Reads are decompressed on the fly via bash process substitution.
        sailfish_cmd = sailfish_exe + ' quant -i ' + sailfish_idx + ' -l ' + library_type + ' -1 <(gunzip -c ' + loc_fastqs[
            0] + ') -2 <(gunzip -c ' + loc_fastqs[
                1] + ') -o ' + out_curr + ' -f'
        cmds.append(sailfish_cmd)
        cmds.append('cd ' + out_curr)
        #cmds.append('module load java/latest')
        gtf = '%s/GENCODE_v19_2014-06-03/gencode.v19.annotation.PC.lincRNA.gtf' % project_dir
        cmds.append('%s/TranscriptsToGenes.sh --gtf-file ' % script_dir + gtf +
                    ' --exp-file ' + out_curr + '/quant.sf' + ' --res-file ' +
                    person + 'quant.gene_level.sf')
        cmds.append('mv ' + out_curr + '/quant.sf' + ' ' + out_curr + '/' +
                    person + 'quant.sf')
        # Drop the copied reads and intermediate files before moving results.
        cmds.append('rm %s/*.fastq.gz' % (out_curr))
        cmds.append('rm %s/reads.*' % (out_curr))
        cmds.append('mv %s %s/' % (out_curr, out_dir))
        # NOTE(review): after a successful mv the directory no longer exists
        # at out_curr, so this rm presumably only matters if the mv failed.
        cmds.append('rm -r %s' % (out_curr))
        print '\n'.join(cmds)
        if test_flag == False:
            # Commands are joined with 'qqqq' and the same token is passed as
            # the third argument — presumably the separator qsub_a_command
            # splits on; '10G' looks like a memory request. TODO confirm.
            qsub_a_command('qqqq'.join(cmds), out_dir + person + '_script.sh',
                           'qqqq', '10G')
bam_list = my.f_grep_files_from_dir(data_dir, 'embl.*.bam', path=False) my.f_print_list(bam_list) loc_bam = bam_list[0] if para_flag == True: for loc_bam in bam_list: f_process_one_CTCF(loc_bam, data_dir, node_dir ) else: Parallel(n_jobs=num_cores)(delayed(f_process_one_CTCF)(loc_bam, data_dir, node_dir) for loc_bam in bam_list) dir_list = my.f_grep_files_from_dir(data_dir, 'NA.*', path=True) my.f_print_list(dir_list) data_dirs=' '.join([loc_dir + '/' for loc_dir in dir_list]) tf_dir = '%s/data/raw_data/tf/encode_peaks/processed/' % project_dir annotate_cmd = 'annotatePeaks.pl %s/%s hg19 -size given -d %s -noann > %s/output.file' %(tf_dir, tf_peak[loc_tf], data_dirs, data_dir) my.f_shell_cmd(annotate_cmd) #annotatePeaks.pl /homed/home/shi/expression_var/data/raw_data/tf/encode_peaks/processed/haib-gm12878-pu1.narrowPeak hg19 -size given -d NA10851-PU1-Rep1/ NA10852-PU1-Rep1/ -noann > output.file #RNA-seq #[Fri Feb 24 22:59:31 2017] p p_run_cluster_sep.py preprocess-24g p_extract_chipseq_signal_from_bam.py 1 1 1 1 1 1 1 1 1 1 1 #[Fri Feb 24 23:03:18 2017] p p_run_cluster_sep.py preprocess-shi-24g p_extract_chipseq_signal_from_bam.py 1 1 1 1 1 1 1 1 1 1 1 #[Fri Feb 24 23:06:24 2017] p p_run_cluster_sep.py preprocess-shi-24g p_extract_chipseq_signal_from_bam.py 1 1 1 1 1 1 1 1 1 1 1 #[Fri Feb 24 23:08:34 2017] p p_run_cluster_sep.py preprocess-shi-24g p_extract_chipseq_signal_from_bam.py 1 1 1 1 1 1 1 1 1 1 1
dnase_dir = target_dir + "/tmp_dir/" #dnase_dir="/state/partition1/shi/tmp_depth/%s/" % my.f_shell_cmd('echo $JOB_ID', quiet = True).replace('\n', '') #my.f_scp_python_script_to_clustdell("p_extract_depth_from_bam_fun.py") #my.f_scp_python_script_to_clustdell("p_pd.py") reload(fun) reload(loc) cofactor_list_raw = [] feature_list = [] #feature_list=["methy"] #tf_list=["inputigg","inputstd","ctcf"] #tf_list=["ctcf","znf143","bhlhe40","ebf1"] #tf_list=["znf143","ctcf","ebf1"] #tf_list=['brca1', 'chd2', 'elk1', 'max', 'maz', 'mxi1', 'nfya', 'nfyb', 'rad21', 'rfx5', 'smc3', 'stat3', 'tbp', 'usf2'] reload(fun) print my.f_shell_cmd('echo $HOME', quiet=True).replace('\n', '') my.f_unique_element_in_list(guest_cells) guest_cell = guest_cells[0] guest_extract_flag = guest_cell != cell #When the host and guest cell are different, means that try to predict variation impact. my.f_ensure_make_dir(dnase_dir) logging.info('Node dir name: ' + dnase_dir) cofactor_list = [tf_name.lower() for tf_name in cofactor_list_raw] logging.info(cofactor_list) reload(my) reload(fun) #if in clustdell, copy the tf's bam file to the local node #import ipdb; ipdb.set_trace() for tf in tf_list + feature_list + cofactor_list: tf_bam_pattern = "*%s*bam" % tf
target_server = sys.argv[1] cell_name = sys.argv[2] test_flag = sys.argv[3] print cell_name if test_flag == 'test': fastq_gz_list = ['sydh-testcell-test-Rep1.fastq.gz'] elif "gm12878" in cell_name: fastq_gz_list = gm12878_gz_list elif "gm12xxx" in cell_name: fastq_gz_list = gm12xxx_list elif "helas3" in cell_name: #fastq_gz_list = helas3_gz_list file_list = my.f_shell_cmd( "ssh [email protected] find /home/wenqiang/encode/helas3/ -name '*.fastq.gz'", quiet=True).split('\n') #ctcf_list_raw = [ os.path.basename(fastq_file) for fastq_file in my.grep_list('^(?!.*gm12xxx|.*/ut-|.*open-).*%s'%feature, file_list)] #ctcf_list = my.grep_list('uw-(gm12864|gm12873).*', ctcf_list_raw) tf_list_file = '/homed/home/shi/projects/wgs/tf_list.txt' tf_list = my.f_parse_tf_list_file(tf_list_file) compiled_list = [] rest_list = list(set(tf_list) - set(compiled_list)) #tf_list = ['egr1'] map_fastq_list = [ os.path.basename(fastq_file) for fastq_file in my.grep_list('.*-helas3.*(%s)' % '|'.join(rest_list), file_list)
import sys sys.path.insert(0, lib_dir) sys.path.insert(0, '%s/expression_var/python/' % home_dir) import pandas as pd import p_mymodule as my from p_project_metadata import * #batch_name = '800samples' batch_name = '462samples' #chr_num_list =[22, 10, 15] chr_num_list = ['X'] for chr_num in chr_num_list: cmd = 'python2.7 p_merge_tf_results.py --batch_name %s --chr_str chr%s --value_type diff' % ( batch_name, chr_num) my.f_shell_cmd(cmd) cmd = 'python2.7 p_merge_tf_results.py --batch_name %s --chr_str chr%s --value_type ref' % ( batch_name, chr_num) my.f_shell_cmd(cmd) if my.f_get_server_name() == 'wqshi': if batch_name == '800samples': my.f_shell_cmd( 'scp $HOME/expression_var/data/%s/deep_result/all/chrMergeTF/*.gz [email protected]:/homed/home/shi/expression_var/data/800samples/deep_result/all/chr800/diff/' % (batch_name)) else: my.f_shell_cmd( 'scp $HOME/expression_var/data/%s/deep_result/all/chrMergeTF/*.gz [email protected]:/homed/home/shi/expression_var/data/445samples_region/deep_result/all/chrMerge2/diff/' % (batch_name))
new_batch = '%ssamples_peer' % sample_num #This one removes the population/gender, 27 hidden factors. elif 'GTex' in other_info: new_batch = '%ssamples_gtex_norm' % sample_num #This one removes the population/gender, 27 hidden factors. else: new_batch = '%ssamples_snyder_norm' % sample_num #This one removes the population/gender else: new_batch = '%ssamples_snyder_original' % sample_num #chr_list=[10, 2, 22] #chr_list=[22] if norm_mode == 'norm': population = 'None' for chr_num in chr_list: chr_str = 'chr%s' % chr_num print chr_str #mode_list=('All' 'SNP' 'SNPinTF' 'TF' 'AlltfShuffle' 'noInteract') #mode_list=('randomSNPinTF') #mode_list=('AlltfShuffle' 'AllsnpShuffle') #mode_list=['All', 'SNPinTF', 'random', 'AlltfShuffle'] #mode_list=['AlltfShuffle'] for new_batch_random in [mode_list[i - 1] for i in modes_index]: run_cmd = 'sh s_start_cluster_gene_job.sh %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s; echo done' % ( batch_name, test_flag, model, chr_str, gene, add_miRNA, add_TF_exp, add_penalty, add_TF_exp_only, add_predict_tf, add_YRI, population, TF_exp_type, add_gm12878, new_batch, new_batch_random, other_info) my.f_shell_cmd(run_cmd)
# Driver script: runs p_rundeepsea.py once per (batch, chromosome) pair.
from subprocess import *
from tempfile import mkdtemp
import sys
import os
home_dir = os.path.expanduser('~')
lib_dir = '%s/python/' % home_dir
import sys
# Put the user's library directories at the front of the import path so the
# project modules imported below can be found.
sys.path.insert(0, lib_dir)
sys.path.insert(0, '%s/expression_var/python/' % home_dir)
import pandas as pd
import p_mymodule as my
# project_dir and chr_list are not defined in this script — presumably
# provided by this star import; TODO confirm.
from p_project_metadata import *
#chr_list = [10, 15, 22]
#chr_list = [22]
#batch_name = '462samples'
#batch_name = '800samples'
for batch_name in ['462samples', '800samples']:
    output_dir = '%s/data/%s/deep_result/all/chrMergeTF' % (project_dir,
                                                            batch_name)
    for chr_num in chr_list:
        chr_str = 'chr' + str(chr_num)
        # One output directory and one input VCF per chromosome.
        loc_output_dir = '%s/%s' % (output_dir, chr_str)
        loc_vcf_file = '%s/data/%s/chr_vcf_files/chrMerge2/%s.vcf.gz' % (
            project_dir, batch_name, chr_str)
        cmd = 'python2.7 p_rundeepsea.py --vcf_file %s --out_dir %s' % (
            loc_vcf_file, loc_output_dir)
        my.f_shell_cmd(cmd)
reload(my) cur_time = time.strftime("%Y%m%d_%H%M%S") + my.f_id_generator(5) server_name = socket.gethostname() print cur_time tf_list = [tf] guest_cells = [guest_cell] if "het_loc" in steps: #loc_pattern = "*.(%s).narrowPeak"%"|".join(tf_list) het_loc_cmd = "python2.7 p_het_sites_in_narrow_peak_dp.py %s %s %s %s %s %s %s" % ( cell, my.f_send_list_para(tf_list), my.f_send_list_para(guest_cells), "locker.het_loc", my.f_send_list_para(labs), bed_dir, wgs_dir) my.f_shell_cmd(het_loc_cmd) else: print "===Skip Het Loc!===" if "extract_depth" in steps: #Tried 7G, now it's 4G extract_cmd="python2.7 p_extract_depth_from_bam_dp.py %s %s %s %s %s %s "%\ (cell, my.f_send_list_para(tf_list), 'extract_depth', mapQ, my.f_send_list_para(labs), bam_dir) print extract_cmd my.f_shell_cmd(extract_cmd) else: print "===Skip Extract Depth!===" if "add_feature_light" in steps: #small is fine, check 2G add_cmd = "python2.7 p_add_feature_on_loc_dp_light.py %s %s %s %s %s" % (