Example no. 1
def f_bam_remove_dup(bam_file, data_dir, head_dir, picard_java_lib_path):
    import os
    import p_mymodule as my
    cell_tf_name = my.f_get_prefix(bam_file)
    output_file = data_dir + "/" + cell_tf_name + ".rmdup.bam"
    input_bam = data_dir + "/" + bam_file
    cmd = "java -jar %s/MarkDuplicates.jar I=%s O=%s M=%s.duplicate_report.txt ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true" % (
        picard_java_lib_path, input_bam, output_file,
        data_dir + "/" + cell_tf_name)
    print cmd
    my.f_shell_cmd(cmd)
    output_pattern = '%s.(rmdup.bam|duplicate_report.txt)' % (cell_tf_name)
    my.f_grep_and_copy(data_dir, output_pattern, head_dir)
    os.remove(output_file)
    os.remove(os.path.join(data_dir, cell_tf_name + '.duplicate_report.txt'))
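A minimal usage sketch for the function above; the BAM name and directories are hypothetical, and `p_mymodule` must be importable:

# Hypothetical call: deduplicate one ChIP-seq BAM with Picard MarkDuplicates.
# 'haib-gm12878-ctcf.bam' must exist under data_dir, and picard_java_lib_path
# must contain MarkDuplicates.jar.
f_bam_remove_dup('haib-gm12878-ctcf.bam',
                 '/tmp/chipseq_data',      # data_dir (assumed)
                 '/tmp/chipseq_results',   # head_dir (assumed)
                 '/opt/picard')            # picard_java_lib_path (assumed)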
Example no. 2
 def head_file(self, pattern=".*", wc_flag=True, n=10):
     import random, sys, os
     print "\n==========================="
     print "============Sample Files========="
     print "============================="
     print self.new_files()
     file_list = my.grep_list(pattern, self.new_files())
     os.chdir(self.test_dir)
     for single_file in file_list:
         print "\n=========%s============" % single_file
         if wc_flag == True:
             cmd = "wc -l %s" % single_file
             print "[File lines:]", "\t".join(
                 (my.f_shell_cmd(cmd, quiet=True).replace("\n",
                                                          "").split(" ")))
         print ""
         cmd = "head -n %s %s" % (n, single_file)
         my.f_shell_cmd(cmd)
Example no. 3
    def wc_file(self, pattern=".*"):
        print "\n==========================="
        print "============WC Files========="
        print "============================="
        file_list = self.new_files()

        os.chdir(self.test_dir)
        for single_file in file_list:
            cmd = "wc -l %s" % single_file
            print "\t".join(
                (my.f_shell_cmd(cmd, quiet=True).replace("\n", "").split(" ")))
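Both methods above rely on a surrounding test-helper class that exposes `test_dir` and `new_files()`, neither of which is shown in these excerpts. A minimal stand-in, with hypothetical names, could look like:

import os

class DirSnapshot(object):
    # Hypothetical reconstruction of the host class used by head_file/wc_file.
    def __init__(self, test_dir):
        self.test_dir = test_dir
        self._before = set(os.listdir(test_dir))

    def new_files(self):
        # Files that appeared in test_dir after the snapshot was taken.
        return sorted(set(os.listdir(self.test_dir)) - self._before)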
Example no. 4
def f_process_one_CTCF(loc_bam, head_dir, node_base_dir):
    #import ipdb; ipdb.set_trace()
    individual_id = loc_bam.split('-')[1]+'_'+loc_bam.split('.')[0].split('-')[3]
    node_dir = node_base_dir + '/' + individual_id
    my.f_ensure_make_dir(node_dir)
    add_chr_cmd = "samtools view -H %s/%s | sed -e 's/SN:\([0-9XY]\)/SN:chr\\1/' -e 's/SN:MT/SN:chrM/' | samtools reheader - %s/%s > %s/%s" % (head_dir, loc_bam, head_dir, loc_bam, node_dir, loc_bam)
    my.f_shell_cmd(add_chr_cmd)
    mkdir_cmd = 'makeTagDirectory %s/%s %s/%s' % (node_dir, individual_id, node_dir, loc_bam)
    my.f_shell_cmd(mkdir_cmd)
    copy_cmd = 'cp -r %s/%s %s; rm -r %s' % (node_dir, individual_id, head_dir, node_dir)
    my.f_shell_cmd(copy_cmd)
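For reference, how the function above derives `individual_id` from the BAM file name; the naming pattern is inferred from the `split` calls and is an assumption:

loc_bam = 'embl-NA12878-CTCF-Rep1.bam'  # hypothetical name matching the splits
individual_id = loc_bam.split('-')[1] + '_' + loc_bam.split('.')[0].split('-')[3]
print individual_id  # -> NA12878_Rep1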
Example no. 5
 def sample_file(self, pattern=".*", wc_flag=True, n=10):
     import random, sys, os
     print "\n==========================="
     print "============Sample Files========="
     print "============================="
     file_list = my.f_grep_files_from_dir(self.test_dir,
                                          pattern,
                                          path=False)
     os.chdir(self.test_dir)
     for single_file in file_list:
         print "\n=========%s============" % single_file
         if wc_flag == True:
             cmd = "wc -l %s" % single_file
             print "[File lines:]", "\t".join(
                 (my.f_shell_cmd(cmd, quiet=True).replace("\n",
                                                          "").split(" ")))
         print ""
         file_handle = open(single_file, "r")
         print("".join(random.sample(file_handle.readlines(), n)))
         file_handle.close()
Example no. 6
def f_get_tf_peak_list(project_dir, version='processed'):

    tf_dir = '%s/data/raw_data/tf/encode_peaks/%s/' % (project_dir, version)
    peak_list_raw = my.f_shell_cmd("find %s -name '*gm12878-*.narrowPeak'" %
                                   (tf_dir),
                                   quiet=True).split('\n')
    black_list = my.grep_list(
        ".*(--|Rep[1-9]|-myc|xyy1|test|pax5n19|embl|encode-)", peak_list_raw)
    duplicate_list = [
        'uta-gm12878-ctcf.narrowPeak', 'uw-gm12878-ctcf.narrowPeak',
        'sydh-gm12878-yy1.narrowPeak', 'sydh-gm12878-rad21.narrowPeak',
        'haib-gm12878-p300.narrowPeak', 'ut-gm12878-cmyc.narrowPeak',
        'haib-gm12878-pol24h8.narrowPeak', 'sydh-gm12878-pol2.narrowPeak',
        'uta-gm12878-pol2.narrowPeak'
    ]
    peak_list = list(
        set(peak_list_raw) - set(['']) - set(black_list) -
        set(my.grep_list('.*(%s)' % '|'.join(duplicate_list), peak_list_raw)))
    logging.info('Number of peak files: %s' % len(peak_list))
    return peak_list
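A hedged usage sketch for the function above; the project root is hypothetical, and `p_mymodule` plus `logging` must be available at module scope:

peak_list = f_get_tf_peak_list('/home/user/expression_var')  # assumed project root
for peak_file in sorted(peak_list)[:3]:
    print peak_file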
Example no. 7
my.f_call_shell_fun(cmd)

    file_names = my.f_parse_file_name(os.path.basename(narrowPeak_file))
    print os.path.basename(narrowPeak_file), file_names

    file_prefix = my.f_get_prefix(narrowPeak_file)
    if "gm" in file_names[1]:

        print("Interpret as gm cell")
        overlab_cmd = "sed '/^#/d' %s/%s.vcf | sed 's/^/chr/g' | intersectBed -u -a stdin -b %s > %s" % (
            wgs_dir, file_names[1], output_file, file_prefix + ".wgs.vcf")
    else:
        overlab_cmd = "sed '/^#/d' %s/%s.vcf | intersectBed -u -a stdin -b %s > %s" % (
            wgs_dir, file_names[1], output_file, file_prefix + ".wgs.vcf")

    my.f_shell_cmd(overlab_cmd)
    grep_het_cmd = "f_complete_genome_read_depth %s |  f_grep_legal_snp | sed '/^#/d' | grep -v '1/1' > %s " % (
        file_prefix + ".wgs.vcf", file_prefix + ".het.loc")
    grep_alt_cmd = "f_complete_genome_read_depth %s |  f_grep_legal_snp | sed '/^#/d' | grep '1[/\|]1' > %s " % (
        file_prefix + ".wgs.vcf", file_prefix + ".alt.loc")
    #os.remove(file_prefix + ".wgs.vcf")
    print grep_het_cmd
    print grep_alt_cmd
    my.f_shell_fun_pipe(grep_het_cmd)
    my.f_shell_fun_pipe(grep_alt_cmd)

if (server_name != "loire"):
    het_pattern = my.f_create_pattern(cell_list, tf_list, ".het.loc")
    alt_pattern = my.f_create_pattern(cell_list, tf_list, ".alt.loc")
    my.f_grep_and_scp_to_loire(bed_dir, het_pattern, syn_dir)
    my.f_grep_and_scp_to_loire(bed_dir, alt_pattern, syn_dir)
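The two grep pipelines above split variants by genotype string: `grep -v '1/1'` keeps putative heterozygous sites, while `grep '1[/|]1'` keeps homozygous-alternate ones. A pure-Python sketch of the same split, assuming standard VCF lines with the genotype leading the tenth column:

def split_by_genotype(vcf_lines):
    # Assumes GT is the first subfield of column 10 (e.g. '0/1:...', '1|1:...').
    het, hom_alt = [], []
    for line in vcf_lines:
        if line.startswith('#'):
            continue
        gt = line.split('\t')[9].split(':')[0]
        if gt in ('1/1', '1|1'):
            hom_alt.append(line)
        else:
            het.append(line)
    return het, hom_alt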
Example no. 8
                tempdir = './tmp/aaa/'
                my.f_ensure_make_dir(tempdir)
            else:
                tempdir = mkdtemp()
            tmp_dir = tempdir
            peak_file = peak_file_df_rmdup.ix[loc_tf, 'file_path']
            #vcf_file = '%s/deepsea/tests/data/chr22.merge.head.vcf.gz'%(project_dir)
            deepsea_tf = peak_file_df_rmdup.ix[loc_tf, 'deepsea_tf']

            print "Successfully copied input to working directory " + tempdir

            try:

                #logging.info("python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s" % (vcf_file, peak_file, tmp_dir, hg19_file))
                my.f_shell_cmd(
                    "python2.7 p_generate_peak_fastq.py --vcf_file %s --peak_file %s --tmp_dir %s --hg19_file %s"
                    % (vcf_file, peak_file, tmp_dir, hg19_file))
            except Exception:
                raise Exception('VCF format error.')
            # Retrieve 1100bp instead of 1000bp to support deletion variants (<100bp).
            check_call(
                "python2.7 p_fasta2input.py --fasta_file %s/infile.vcf.wt1100.fasta"
                % tmp_dir,
                shell=True)
            print "Successfully converted to input format"

            check_call(
                "luajit 2_DeepSEA.lua -test_file_h5 " + tempdir +
                "/infile.vcf.wt1100.fasta.ref.h5",
                shell=True)
Example no. 9
        for deepsea_col in my.grep_list('.*%s[|]' % target_tf, gm12878_predictors):
            vcf_df.ix[pred_data.index, deepsea_col] = pred_data.ix[:, deepsea_col]
    else:
        logging.info('Missing %s deepsea output' % loc_tf)

assert all(vcf_df.chr == chr_str), 'Error in chr'

print vcf_df.ix[:, 1:10].head()
zero_variants = (vcf_df.ix[:, gm12878_predictors].sum(axis=1) == 0).sum()
logging.info('%s out of %s are zero' % (zero_variants, vcf_df.shape[0]))

vcf_df.columns = [
    re.sub('None.[0-9]*', 'None', col_name) for col_name in vcf_df.columns
]

print pd.isnull(vcf_df.pos).sum()

assert pd.isnull(vcf_df.pos).sum() == 0, 'Found null positions'
print vcf_df.shape

print vcf_df.chr
vcf_df.index = range(vcf_df.shape[0])

outfile = '%s/data/%s/deep_result/all/chrMergeTF/%s.%s' % (
    project_dir, batch_name, chr_str, value_type)
vcf_df.to_csv(outfile, sep=',', float_format='%.4e')
my.f_shell_cmd('gzip -f %s' % (outfile))
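A quick check of the `None.[0-9]*` column rename a few lines up; the column names here are hypothetical. Note the unescaped dot in the pattern also matches characters other than '.':

import re

cols = ['None.1', 'None.23', 'CTCF|GM12878']
print [re.sub('None.[0-9]*', 'None', c) for c in cols]
# -> ['None', 'None', 'CTCF|GM12878']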

Example no. 10
# Poll until the last gene's .enet output appears and is newer than start_time,
# then run the R feature-summary step.
while True:

    if os.path.exists(full_result_dir):
        gene_output = my.f_grep_files_from_dir(full_result_dir, '%s.*enet$' % last_gene)
    else:
        gene_output = []

    
    if len(gene_output) == 0:
        time.sleep(time_interval)
    else:
        logging.info( 'Check the output %s' % gene_output)
    
        interval = start_time - os.path.getmtime(gene_output[0])
    
        if interval < 0:
            time.sleep(10*time_interval)
            my.f_shell_cmd('Rscript3 %s/R/r_summary_features_in_one_mode.R --batch_name %s --target_mode %s --chr_str %s ' %(project_dir, batch_name, loc_dir, chr_str ))
            break
        else:
            time.sleep(time_interval)


    if time.time() - start_time > 10*time_interval:
        my.f_shell_cmd('Rscript3 %s/R/r_summary_features_in_one_mode.R --batch_name %s --target_mode %s --chr_str %s ' %(project_dir, batch_name, loc_dir, chr_str ))
        break
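The loop above polls for the last gene's `.enet` output and fires the R summary once a file shows up with a modification time later than `start_time`. The same pattern, condensed into a reusable sketch (names hypothetical):

import glob
import os
import time

def wait_for_output(pattern, start_time, poll_seconds=30, timeout=3600):
    # Poll until a file matching `pattern` is modified after start_time.
    deadline = time.time() + timeout
    while time.time() < deadline:
        hits = [f for f in glob.glob(pattern)
                if os.path.getmtime(f) > start_time]
        if hits:
            return hits
        time.sleep(poll_seconds)
    return []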


Example no. 11
def main():

    if __doc__ is None:
        parser.add_argument('--out_dir',
                            help='Out',
                            default='%s/qsub_445samples/' % project_dir)
        parser.add_argument('--test_flag', help='Test flag', default='T')
        opts = parser.parse_args()
        out_dir = opts.out_dir
        test_flag = (opts.test_flag == 'T')
        node_dir = "/state/partition1/shi/tmp_depth/%s/" % my.f_shell_cmd(
            'echo $JOB_ID', quiet=True).replace('\n', '')
    else:
        out_dir = '%s/qsub_445samples/' % project_dir
        node_dir = out_dir + '/node/'
        test_flag = True
    my.f_ensure_make_dir(out_dir)
    FQ_dir = '%s/fastq/' % project_dir
    geuvadis_meta = '%s/metaData/E-GEUV-1.sdrf.txt' % project_dir
    our_study = '%s/metaData/our_sample.list' % project_dir
    metadata = '%s/metadata' % project_dir

    #import ipdb; ipdb.set_trace()
    our_people = set()
    gender = {}
    pop = {}
    for line in open(our_study, 'r').readlines():
        our_people.add(line.strip().split('\t')[0])
        items = line.strip().split('\t')
        person = items[0]
        person_gender = items[3]
        if person not in gender:
            gender[person] = person_gender
        if person not in pop:
            pop[person] = items[1]

    geu1 = set()
    for line in open(geuvadis_meta, 'r').readlines():
        items = line.strip().split('\t')
        geu1.add(items[0])

    of_interest = geu1.intersection(our_people)
    print of_interest
    print len(of_interest)

    person_to_fq = {}
    for line in open(geuvadis_meta, 'r').readlines():
        items = line.strip().split('\t')
        person = items[0]
        if person not in of_interest:
            continue
        if person not in person_to_fq:
            person_to_fq[person] = set()
        curr_fq = items[28]
        person_to_fq[person].add(FQ_dir + os.path.basename(curr_fq))
        #print items

    print person_to_fq
    metadata_file = open(metadata, 'w')
    for person in person_to_fq.keys():

        out_curr = node_dir + person + '.sailfish/'
        metadata_file.write(person + '\t' + ','.join(person_to_fq[person]) +
                            '\t' + out_curr + '\n')
        #And run sailfish
        cur_gender = gender[person]
        cur_pop = pop[person]
        #sailfish_idx='%s/Transcriptome/gencode.v19.annotation.PC.lincRNA.gtf.splicedExon.N'% project_dir +cur_gender+'.fa.dedup.fa_IDX_sailfish'
        index_dir = '~/expression_var/data/raw_data/pop/%s_dir' % cur_pop
        sailfish_idx = '%s/gencode.v19.annotation.PC.lincRNA.gtf.splicedExon.N' % index_dir + cur_gender + '.fa.dedup.fa_IDX_sailfish'
        #cmd_module='module load sailfish/0.6.3'
        library_type = '"T=PE:O=><"'  #T=PE:O=><:S=SA
        fastqs = list(person_to_fq[person])

        # If the output is already there, don't launch the jobs again.
        final_out_file = '%s/%s.sailfish/%squant.gene_level.sf' % (
            out_dir, person, person)
        if os.path.isfile(final_out_file):
            print 'Got the results of %s' % person
            continue
        else:
            print 'Sailfish %s' % person
            my.f_remove_dir('%s/%s.sailfish' % (out_dir, person))

        # Skip this person if any FASTQ file is missing.
        missing_fq = [fq for fq in fastqs if not os.path.isfile(fq)]
        if missing_fq:
            for fq in missing_fq:
                print 'Missing person %s: %s' % (person, fq)
            continue
        cmds = []
        cmds.append('#!/usr/bin/env bash')
        cmds.append('mkdir -p %s' % out_curr)
        cmds.append('cp -u %s %s' % (' '.join(fastqs), out_curr))
        loc_fastqs = [
            os.path.join(out_curr, os.path.basename(fastq_file))
            for fastq_file in fastqs
        ]

        #cmds.append(cmd_module)
        sailfish_exe = '~/packages/Sailfish-0.6.3-Linux_x86-64/bin/sailfish'
        sailfish_cmd = (sailfish_exe + ' quant -i ' + sailfish_idx +
                        ' -l ' + library_type +
                        ' -1 <(gunzip -c ' + loc_fastqs[0] + ')' +
                        ' -2 <(gunzip -c ' + loc_fastqs[1] + ')' +
                        ' -o ' + out_curr + ' -f')
        cmds.append(sailfish_cmd)
        cmds.append('cd ' + out_curr)
        #cmds.append('module load java/latest')
        gtf = '%s/GENCODE_v19_2014-06-03/gencode.v19.annotation.PC.lincRNA.gtf' % project_dir
        cmds.append('%s/TranscriptsToGenes.sh --gtf-file ' % script_dir + gtf +
                    ' --exp-file ' + out_curr + '/quant.sf' + ' --res-file ' +
                    person + 'quant.gene_level.sf')
        cmds.append('mv ' + out_curr + '/quant.sf' + ' ' + out_curr + '/' +
                    person + 'quant.sf')
        cmds.append('rm %s/*.fastq.gz' % (out_curr))
        cmds.append('rm %s/reads.*' % (out_curr))
        cmds.append('mv %s %s/' % (out_curr, out_dir))
        cmds.append('rm -r %s' % (out_curr))
        print '\n'.join(cmds)
        if not test_flag:
            qsub_a_command('qqqq'.join(cmds), out_dir + person + '_script.sh',
                           'qqqq', '10G')
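`qsub_a_command` is defined elsewhere; judging from the call site, it takes a 'qqqq'-joined command string, a script path, the separator, and a memory request. A hedged reconstruction of such a helper (not the original implementation):

from subprocess import check_call

def qsub_a_command(cmd_string, script_path, sep='qqqq', mem='10G'):
    # Hypothetical sketch: split the joined commands back into lines,
    # write them to a shell script, and submit it with a memory request.
    with open(script_path, 'w') as fh:
        fh.write('\n'.join(cmd_string.split(sep)) + '\n')
    check_call(['qsub', '-l', 'h_vmem=%s' % mem, script_path])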
Example no. 12
    bam_list = my.f_grep_files_from_dir(data_dir, 'embl.*.bam', path=False)
    my.f_print_list(bam_list)
    loc_bam = bam_list[0]
    
    
    # Run in parallel when para_flag is set, otherwise process the BAMs serially.
    if para_flag:
        Parallel(n_jobs=num_cores)(delayed(f_process_one_CTCF)(loc_bam, data_dir, node_dir)
                                   for loc_bam in bam_list)
    else:
        for loc_bam in bam_list:
            f_process_one_CTCF(loc_bam, data_dir, node_dir)
    
    dir_list = my.f_grep_files_from_dir(data_dir, 'NA.*', path=True)
    my.f_print_list(dir_list)

    data_dirs=' '.join([loc_dir + '/' for loc_dir in dir_list])
    tf_dir = '%s/data/raw_data/tf/encode_peaks/processed/' % project_dir
    annotate_cmd = 'annotatePeaks.pl %s/%s hg19 -size given  -d %s -noann > %s/output.file' %(tf_dir, tf_peak[loc_tf], data_dirs, data_dir)
    my.f_shell_cmd(annotate_cmd)


#annotatePeaks.pl /homed/home/shi/expression_var/data/raw_data/tf/encode_peaks/processed/haib-gm12878-pu1.narrowPeak hg19 -size given  -d NA10851-PU1-Rep1/ NA10852-PU1-Rep1/ -noann > output.file

#RNA-seq


#[Fri Feb 24 22:59:31 2017] p p_run_cluster_sep.py preprocess-24g p_extract_chipseq_signal_from_bam.py 1 1 1 1 1 1 1 1 1 1 1
Example no. 13
    dnase_dir = target_dir + "/tmp_dir/"
    #dnase_dir="/state/partition1/shi/tmp_depth/%s/" % my.f_shell_cmd('echo $JOB_ID', quiet = True).replace('\n', '')
    #my.f_scp_python_script_to_clustdell("p_extract_depth_from_bam_fun.py")
    #my.f_scp_python_script_to_clustdell("p_pd.py")
    reload(fun)
    reload(loc)
    cofactor_list_raw = []
    feature_list = []
    #feature_list=["methy"]
    #tf_list=["inputigg","inputstd","ctcf"]
    #tf_list=["ctcf","znf143","bhlhe40","ebf1"]
    #tf_list=["znf143","ctcf","ebf1"]
    #tf_list=['brca1', 'chd2', 'elk1', 'max', 'maz', 'mxi1', 'nfya', 'nfyb', 'rad21', 'rfx5', 'smc3', 'stat3',  'tbp', 'usf2']

reload(fun)
print my.f_shell_cmd('echo $HOME', quiet=True).replace('\n', '')
my.f_unique_element_in_list(guest_cells)
guest_cell = guest_cells[0]
guest_extract_flag = guest_cell != cell  # When the host and guest cells differ, we are predicting variant impact across cell types.
my.f_ensure_make_dir(dnase_dir)
logging.info('Node dir name: ' + dnase_dir)
cofactor_list = [tf_name.lower() for tf_name in cofactor_list_raw]
logging.info(cofactor_list)

reload(my)
reload(fun)
#if in clustdell, copy the tf's bam file to the local node
#import ipdb; ipdb.set_trace()

for tf in tf_list + feature_list + cofactor_list:
    tf_bam_pattern = "*%s*bam" % tf
Example no. 14
target_server = sys.argv[1]
cell_name = sys.argv[2]
test_flag = sys.argv[3]

print cell_name
if test_flag == 'test':
    fastq_gz_list = ['sydh-testcell-test-Rep1.fastq.gz']
elif "gm12878" in cell_name:
    fastq_gz_list = gm12878_gz_list
elif "gm12xxx" in cell_name:
    fastq_gz_list = gm12xxx_list
elif "helas3" in cell_name:
    #fastq_gz_list = helas3_gz_list
    file_list = my.f_shell_cmd(
        "ssh [email protected] find /home/wenqiang/encode/helas3/ -name '*.fastq.gz'",
        quiet=True).split('\n')
    #ctcf_list_raw = [ os.path.basename(fastq_file) for fastq_file in my.grep_list('^(?!.*gm12xxx|.*/ut-|.*open-).*%s'%feature, file_list)]
    #ctcf_list = my.grep_list('uw-(gm12864|gm12873).*', ctcf_list_raw)

    tf_list_file = '/homed/home/shi/projects/wgs/tf_list.txt'
    tf_list = my.f_parse_tf_list_file(tf_list_file)

    compiled_list = []

    rest_list = list(set(tf_list) - set(compiled_list))
    #tf_list = ['egr1']
    map_fastq_list = [
        os.path.basename(fastq_file)
        for fastq_file in my.grep_list('.*-helas3.*(%s)' %
                                       '|'.join(rest_list), file_list)
Example no. 15
import os
import sys

home_dir = os.path.expanduser('~')
lib_dir = '%s/python/' % home_dir
sys.path.insert(0, lib_dir)
sys.path.insert(0, '%s/expression_var/python/' % home_dir)
import pandas as pd
import p_mymodule as my
from p_project_metadata import *

#batch_name = '800samples'
batch_name = '462samples'
#chr_num_list =[22, 10, 15]
chr_num_list = ['X']

for chr_num in chr_num_list:
    cmd = 'python2.7 p_merge_tf_results.py --batch_name %s --chr_str chr%s --value_type diff' % (
        batch_name, chr_num)
    my.f_shell_cmd(cmd)
    cmd = 'python2.7 p_merge_tf_results.py --batch_name %s --chr_str chr%s --value_type ref' % (
        batch_name, chr_num)
    my.f_shell_cmd(cmd)

if my.f_get_server_name() == 'wqshi':
    if batch_name == '800samples':
        my.f_shell_cmd(
            'scp $HOME/expression_var/data/%s/deep_result/all/chrMergeTF/*.gz [email protected]:/homed/home/shi/expression_var/data/800samples/deep_result/all/chr800/diff/'
            % (batch_name))
    else:
        my.f_shell_cmd(
            'scp $HOME/expression_var/data/%s/deep_result/all/chrMergeTF/*.gz [email protected]:/homed/home/shi/expression_var/data/445samples_region/deep_result/all/chrMerge2/diff/'
            % (batch_name))
Example no. 16
        new_batch = '%ssamples_peer' % sample_num  #This one removes the population/gender, 27 hidden factors.
    elif 'GTex' in other_info:
        new_batch = '%ssamples_gtex_norm' % sample_num  #This one removes the population/gender, 27 hidden factors.
    else:
        new_batch = '%ssamples_snyder_norm' % sample_num  #This one removes the population/gender
else:
    new_batch = '%ssamples_snyder_original' % sample_num

#chr_list=[10, 2, 22]
#chr_list=[22]

if norm_mode == 'norm':
    population = 'None'

for chr_num in chr_list:
    chr_str = 'chr%s' % chr_num
    print chr_str

    #mode_list=('All' 'SNP' 'SNPinTF' 'TF' 'AlltfShuffle' 'noInteract')
    #mode_list=('randomSNPinTF')
    #mode_list=('AlltfShuffle' 'AllsnpShuffle')
    #mode_list=['All', 'SNPinTF', 'random', 'AlltfShuffle']
    #mode_list=['AlltfShuffle']
    for new_batch_random in [mode_list[i - 1] for i in modes_index]:

        run_cmd = 'sh s_start_cluster_gene_job.sh %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s; echo done' % (
            batch_name, test_flag, model, chr_str, gene, add_miRNA, add_TF_exp,
            add_penalty, add_TF_exp_only, add_predict_tf, add_YRI, population,
            TF_exp_type, add_gm12878, new_batch, new_batch_random, other_info)
        my.f_shell_cmd(run_cmd)
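With seventeen positional fields, the format string above is easy to desynchronize when arguments are added or reordered. One defensive alternative (a sketch, not the original code) builds the argument list explicitly:

args = [batch_name, test_flag, model, chr_str, gene, add_miRNA, add_TF_exp,
        add_penalty, add_TF_exp_only, add_predict_tf, add_YRI, population,
        TF_exp_type, add_gm12878, new_batch, new_batch_random, other_info]
run_cmd = 'sh s_start_cluster_gene_job.sh %s; echo done' % ' '.join(str(a) for a in args)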
Example no. 17
import os
import socket
import sys
import time
from subprocess import *
from tempfile import mkdtemp

home_dir = os.path.expanduser('~')
lib_dir = '%s/python/' % home_dir
sys.path.insert(0, lib_dir)
sys.path.insert(0, '%s/expression_var/python/' % home_dir)
import pandas as pd
import p_mymodule as my
from p_project_metadata import *

#chr_list = [10, 15, 22]
#chr_list = [22]

#batch_name = '462samples'
#batch_name = '800samples'

for batch_name in ['462samples', '800samples']:
    output_dir = '%s/data/%s/deep_result/all/chrMergeTF' % (project_dir,
                                                            batch_name)
    for chr_num in chr_list:
        chr_str = 'chr' + str(chr_num)
        loc_output_dir = '%s/%s' % (output_dir, chr_str)
        loc_vcf_file = '%s/data/%s/chr_vcf_files/chrMerge2/%s.vcf.gz' % (
            project_dir, batch_name, chr_str)
        cmd = 'python2.7 p_rundeepsea.py --vcf_file %s --out_dir %s' % (
            loc_vcf_file, loc_output_dir)
        my.f_shell_cmd(cmd)
reload(my)

cur_time = time.strftime("%Y%m%d_%H%M%S") + my.f_id_generator(5)
server_name = socket.gethostname()

print cur_time
tf_list = [tf]
guest_cells = [guest_cell]

if "het_loc" in steps:
    #loc_pattern = "*.(%s).narrowPeak"%"|".join(tf_list)

    het_loc_cmd = "python2.7 p_het_sites_in_narrow_peak_dp.py %s %s %s %s %s %s %s" % (
        cell, my.f_send_list_para(tf_list), my.f_send_list_para(guest_cells),
        "locker.het_loc", my.f_send_list_para(labs), bed_dir, wgs_dir)
    my.f_shell_cmd(het_loc_cmd)
else:
    print "===Skip Het Loc!==="

if "extract_depth" in steps:
    #Tried 7G, now it's 4G
    extract_cmd = "python2.7 p_extract_depth_from_bam_dp.py %s %s %s %s %s %s " % (
        cell, my.f_send_list_para(tf_list), 'extract_depth', mapQ,
        my.f_send_list_para(labs), bam_dir)
    print extract_cmd
    my.f_shell_cmd(extract_cmd)
else:
    print "===Skip Extract Depth!==="

if "add_feature_light" in steps:
    #small is fine, check 2G
    add_cmd = "python2.7 p_add_feature_on_loc_dp_light.py  %s %s %s %s %s" % (