Example #1
    def test_add_tf_database(self):
        #import ipdb; ipdb.set_trace()
        
        db_file = my.f_create_file_name(self.test_dir, 'gmtest', 'ctcf', "database")
        tf_db = data_table(db_file)

        update_cols = my.grep_list('^(ref|alt)$', tf_db.data.columns)
        new_cols = my.grep_list('genotype', tf_db.data.columns)
        expected_cols = ['chr', 'start']

        small_db = tf_db
        small_db.data = tf_db.data.ix[0:5, expected_cols + update_cols + new_cols]

        db_file2 = my.f_create_file_name(self.test_dir, 'gmtest', 'znf143', "database")
        tf_db2 = data_table(db_file2)
        tf_db2.data = tf_db2.data.ix[0:5, expected_cols + update_cols + new_cols]

        suffix = ['_tf1', '_tf2']
        merged_file = self.test_dir + '/gmtest_merged_db.database'
        merged_db = data_table(merged_file, data = small_db.data[expected_cols + update_cols])

        merged_db.add_another_database(tf_db, expected_cols, update_cols, new_cols, '_tf1', debug = False)
        merged_db.add_another_database(tf_db2, expected_cols, update_cols, new_cols, '_tf2', debug = False)

        merged_db.save_data()
        self.assertTrue( set( tf_db2.get_cord_name() ) <= set(merged_db.get_cord_name()) )

        new_add_rows = tf_db2.data.index[0]
        self.assertEqual( merged_db.data.ix[new_add_rows, 'genotype_tf2'], tf_db2.data.ix[new_add_rows, 'genotype'] )
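
A note on the test above: add_another_database itself is not shown in this listing, but the suffixed, coordinate-keyed merge it appears to perform can be sketched with plain pandas. The data below is hypothetical and the snippet is only an illustrative sketch, not the project's implementation.

import pandas as pd

# Hypothetical stand-ins for two TF databases keyed on genomic coordinates.
tf1 = pd.DataFrame({'chr': ['chr1', 'chr1'], 'start': [100, 200],
                    'ref': ['A', 'C'], 'alt': ['G', 'T'], 'genotype': ['0|1', '1|1']})
tf2 = pd.DataFrame({'chr': ['chr1', 'chr1'], 'start': [200, 300],
                    'ref': ['C', 'G'], 'alt': ['T', 'A'], 'genotype': ['0|0', '0|1']})

# Outer merge on the shared coordinate columns; per-TF columns get a suffix,
# which is roughly what the '_tf1'/'_tf2' arguments above suggest.
merged = tf1.merge(tf2, on=['chr', 'start'], how='outer', suffixes=('_tf1', '_tf2'))
print(merged[['chr', 'start', 'genotype_tf1', 'genotype_tf2']])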
Example #2
    def check_lokcer(self, pattern, timeout=10000000):
        # Wait for every lock file matching pattern to appear on disk; return False
        # if nothing matches the pattern or a file is still missing after timeout seconds.

        import os.path
        import time

        #import ipdb; ipdb.set_trace()
        enquiry_lockers = my.grep_list(pattern, self.locker_list)
        return_status = True

        if len(enquiry_lockers) == 0:
            logging.warning("The enquiry locker doesn't exist: " + pattern)
            return_status = False
        else:
            for locker_file in enquiry_lockers:
                print "Check the locker:" + locker_file
                total_time = 0
                while True:
                    if total_time > timeout:
                        return_status = False
                        logging.warning(locker_file + " reached timeout")
                        break

                    if os.path.isfile("/homed/home/shi/locker_dir/%s" %
                                      (locker_file)):
                        print "Got the locker"
                        self.locker_list.remove(locker_file)
                        break

                    time.sleep(10)
                    total_time = total_time + 10

        return return_status
Example #3
def f_check_loc_cols(input_data):

    matched_cols = my.grep_list('start|end|pos', input_data.columns.tolist())
    for loc_col in matched_cols:
        if input_data[loc_col].dtype != 'int64':
            logging.error('Wrong data type in %s, casting to int', loc_col)
            input_data[loc_col] = input_data[loc_col].astype(int)
    return input_data
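
For readers without the surrounding module, the same check-and-cast idea can be reproduced with plain pandas; the DataFrame below is hypothetical and stands in for input_data.

import pandas as pd

# Hypothetical location table: 'start' came in as float (e.g. after a merge introduced NaNs).
df = pd.DataFrame({'chr': ['chr1', 'chr2'], 'start': [100.0, 250.0], 'end': [200, 350]})

for col in [c for c in df.columns if c in ('start', 'end', 'pos')]:
    if df[col].dtype != 'int64':
        df[col] = df[col].astype(int)   # coerce float coordinates back to integers

print(df.dtypes)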
Example #4
def f_get_tf_peak_list(project_dir, version='processed'):

    tf_dir = '%s/data/raw_data/tf/encode_peaks/%s/' % (project_dir, version)
    peak_list_raw = my.f_shell_cmd("find %s -name '*gm12878-*.narrowPeak'" %
                                   (tf_dir),
                                   quiet=True).split('\n')
    black_list = my.grep_list(
        ".*(--|Rep[1-9]|-myc|xyy1|test|pax5n19|embl|encode-)", peak_list_raw)
    duplicate_list = [
        'uta-gm12878-ctcf.narrowPeak', 'uw-gm12878-ctcf.narrowPeak',
        'sydh-gm12878-yy1.narrowPeak', 'sydh-gm12878-rad21.narrowPeak',
        'haib-gm12878-p300.narrowPeak', 'ut-gm12878-cmyc.narrowPeak',
        'haib-gm12878-pol24h8.narrowPeak', 'sydh-gm12878-pol2.narrowPeak',
        'uta-gm12878-pol2.narrowPeak'
    ]
    peak_list = list(
        set(peak_list_raw) - set(['']) - set(black_list) -
        set(my.grep_list('.*(%s)' % '|'.join(duplicate_list), peak_list_raw)))
    logging.info('Number of peak files: %s' % len(peak_list))
    return peak_list
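
my.grep_list is a project helper not shown here; below is a small stand-alone sketch of the same blacklist-plus-set-difference filtering using only the standard re module. The file names are made up for illustration.

import re

def grep_list(pattern, items):
    # assumed behaviour of my.grep_list: keep the items that match the regex
    return [x for x in items if re.search(pattern, x)]

peaks = ['haib-gm12878-ctcf.narrowPeak', 'uta-gm12878-ctcf.narrowPeak',
         'sydh-gm12878-yy1Rep1.narrowPeak', '']
black_list = grep_list(r'.*(--|Rep[1-9]|-myc)', peaks)
duplicate_list = ['uta-gm12878-ctcf.narrowPeak']
peak_list = list(set(peaks) - set(['']) - set(black_list) - set(duplicate_list))
print(peak_list)   # only haib-gm12878-ctcf.narrowPeak is left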
Example #5
    def head_file(self, pattern=".*", wc_flag=True, n=10):
        import os
        print "\n================================="
        print "===========Sample Files=========="
        print "================================="
        print self.new_files()
        file_list = my.grep_list(pattern, self.new_files())
        os.chdir(self.test_dir)
        for single_file in file_list:
            print "\n=========%s=========" % single_file
            if wc_flag == True:
                cmd = "wc -l %s" % single_file
                print "[File lines:]", "\t".join(
                    my.f_shell_cmd(cmd, quiet=True).replace("\n", "").split(" "))
            print ""
            cmd = "head -n %s %s" % (n, single_file)
            my.f_shell_cmd(cmd)
def f_extract_features_on_location_dp(database_file,
                                      loc_tf,
                                      feature_list,
                                      cell,
                                      data_dir,
                                      rmdup=False,
                                      labs='',
                                      guest_extract_flag=False,
                                      mapQ=0,
                                      debug=False):
    #Extract the dp (read depth) information according to the loc_file (loc_cell and loc_tf) from the bam files of feature_list, and loc_tf in the cell data
    #loc_file: the locations where to extract the binding signal
    #loc_tf / loc_cell: the source of loc_file

    #feature_list, cell, data_dir: for the bam files
    #this version collects all the data into a central database file

    if debug == True:
        import ipdb
        ipdb.set_trace()

    tf_database = loc.data_table(database_file)
    loc_file = tf_database.extract_loc(data_dir)

    error_cols = my.grep_list('.*simulate%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    my.f_print_list(error_cols)
    #my.f_print_list(tf_database.data.columns.tolist())
    tf_database.drop_feautre(error_cols)

    error_cols = my.grep_list('.*_%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    tf_database.drop_feautre(error_cols)

    #import ipdb; ipdb.set_trace()

    for tf in feature_list:

        print tf
        bam_files = []
        bam_pattern = '(%s).*%s-%s[\.-].*' % ("|".join(labs), cell, tf)
        if rmdup == True:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + "rmdup.bam$")
            rmdup_str = ".rmdup"

        if rmdup == False or bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + "sorted.bam$")
            rmdup_str = ""

        if bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + "bam$")

        #import ipdb; ipdb.set_trace()
        logging.info('bam files:' + data_dir + bam_pattern)
        logging.info(bam_files)
        for bam_file in bam_files:
            #print bam_file
            #Get the replicate number
            rep_object = re.match(r".*(Rep[1-9]).*",
                                  bam_file,
                                  flags=re.IGNORECASE)
            lab = re.match(r".*(%s).*%s.*" % ('|'.join(labs), cell),
                           bam_file,
                           flags=re.IGNORECASE)

            if lab is None:
                print "Lab is missing for %s in labs (%s)" % (bam_file,
                                                              ' '.join(labs))
                continue
            lab = lab.group(1)
            if rep_object is None:
                print "Skip %s: no replicate number in the file name" % bam_file
                continue

            loc_dir = data_dir
            #my.f_get_dir_name_from_file(loc_file)
            #print loc_dir
            #vcf_file=f_create_file_name(data_dir=loc_dir,cell=cell,tf=tf,suffix= loc_cell + "." + loc_tf + "." + extract_type + rmdup_str + ".dp.vcf",rep=rep_object.group(1))
            vcf_file = loc_dir + my.f_generate_tmp_file_name("vcf")

            #dp_file=f_create_file_name(data_dir=loc_dir,cell=cell,tf=tf,suffix= loc_cell + "." + loc_tf + "." + extract_type + rmdup_str + ".bam.vcf.dp",rep=rep_object.group(1))
            dp_file = loc_dir + my.f_generate_tmp_file_name("dp")

            #print vcf_file, dp_file
            f_extract_depth_from_bam(loc_file, bam_file, vcf_file, dp_file,
                                     mapQ)

            #import ipdb; ipdb.set_trace()
            rep_names = {}
            if labs != '':
                for i in range(1, 8):
                    rep_names['Rep%s' % i] = '_%s%s' % (lab, i)
            else:
                for i in range(1, 8):
                    rep_names['Rep%s' % i] = '_broad%s' % i

            if guest_extract_flag == True:
                rep_names['Rep1'] = '_guest'

            feature_data = tf_database.read_feature_replace_name(
                dp_file, ["chip_ref_dp", "chip_alt_dp"], [
                    "ref_%s_dp%s" % (tf, rep_names[rep_object.group(1)]),
                    "alt_%s_dp%s" % (tf, rep_names[rep_object.group(1)])
                ])
            #feature_data=tf_database.read_feature(dp_file, tf)

            tf_database.merge_feature(feature_data)
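
The lab and replicate parsing inside the loop is the part that is easiest to get wrong; here is a small, self-contained sketch of the same two regular expressions on a made-up BAM file name (the path and lab list are hypothetical).

import re

labs = ['haib', 'sydh', 'uw']
cell = 'gm12878'
bam_file = '/data/bam/sydh-gm12878-ctcf-Rep2.sorted.bam'   # hypothetical path

rep_object = re.match(r".*(Rep[1-9]).*", bam_file, flags=re.IGNORECASE)
lab = re.match(r".*(%s).*%s.*" % ('|'.join(labs), cell), bam_file, flags=re.IGNORECASE)

if rep_object and lab:
    # these two groups drive the derived column names, e.g. ref_ctcf_dp_sydh2
    print(lab.group(1) + ' ' + rep_object.group(1))   # sydh Rep2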
Example #7
deepsea_out = '%s/data/%s/deep_result/all/chrMergeTF/' % (project_dir,
                                                          batch_name)

outdir = '%s/%s/' % (deepsea_out, chr_str)

logging.info('Out dir: %s', outdir)

print vcf_df.head()

peak_file_df_rmdup = f_get_peak_file_df_rmdup(project_dir, version='processed')

predictors = pd.read_csv('%s/deepsea/resources/predictor.names' % project_dir,
                         header=None)
print predictors.head()
gm12878_predictors = f_add_suffix_on_duplicates(
    my.grep_list('gm12878', predictors.ix[:, 0]))
logging.info('GM12878 features: %s' % len(gm12878_predictors))
for deepsea_col in gm12878_predictors:
    vcf_df[deepsea_col] = 0

target_tf = 'CTCF'
print filter(
    lambda x: re.search(r'GM12878[|]%s[|]' % target_tf, x, re.IGNORECASE),
    gm12878_predictors)

if f_judge_debug(DEBUG):
    import ipdb
    ipdb.set_trace()

for loc_tf in peak_file_df_rmdup.tf:
    pred_file = '%s/%s.out.%s' % (outdir, loc_tf, value_type)
    file_list = my.f_shell_cmd(
        "ssh [email protected] find /home/wenqiang/encode/helas3/ -name '*.fastq.gz'",
        quiet=True).split('\n')
    #ctcf_list_raw = [ os.path.basename(fastq_file) for fastq_file in my.grep_list('^(?!.*gm12xxx|.*/ut-|.*open-).*%s'%feature, file_list)]
    #ctcf_list = my.grep_list('uw-(gm12864|gm12873).*', ctcf_list_raw)

    tf_list_file = '/homed/home/shi/projects/wgs/tf_list.txt'
    tf_list = my.f_parse_tf_list_file(tf_list_file)

    compiled_list = []

    rest_list = list(set(tf_list) - set(compiled_list))
    #tf_list = ['egr1']
    map_fastq_list = [
        os.path.basename(fastq_file)
        for fastq_file in my.grep_list('.*-helas3.*(%s)' %
                                       '|'.join(rest_list), file_list)
    ]

    map_fastq_list.sort()
    my.f_print_list(map_fastq_list)
    print len(map_fastq_list)
    fastq_gz_list = map_fastq_list

elif 'simulate_mask' in cell_name:
    file_list = my.f_shell_cmd(
        "ssh [email protected] find /home/shi/encode/ -name 'encode-*mask*simulate*.fastq.gz'",
        quiet=True).split('\n')

    fastq_gz_list = [os.path.basename(fastq_file) for fastq_file in file_list]
    my.f_print_list(fastq_gz_list)
elif 'simulate' in cell_name:
Example #9
    def get_ref_and_alt_peak_fastq_files_from_database(self, output_dir, hg19_file, file_prefix = None, target_lab = '', debug = False):
        if debug == True:
            import ipdb; ipdb.set_trace()

        tf_database = self
        
        peak_start = 'peak_%s_bed_start' % target_lab
        tf_database.data[peak_start] = tf_database.data['peak_%s_dis' % target_lab] - 50
        
        print my.grep_list('lab', tf_database.data.columns.tolist())
        
        lab_data = tf_database.data[tf_database.data['lab_%s' % target_lab] != '.']
        
        peak_data = lab_data.ix[:, ['chr', peak_start]]
        peak_data[peak_start] = peak_data.ix[:,peak_start].astype('float').astype('int')
        peak_data['end'] = peak_data[peak_start] + 100
        
        #old
        full_data = tf_database.data
        tf_database.data = lab_data
        allele_file = tf_database.extract_allele(output_dir, header=True)
        allele_data = pd.io.parsers.read_csv(allele_file, sep="\t", index_col=None)

        tf_database.data = full_data
        if allele_data.shape[0] == 0:
            print "empty input"

        bed_str = peak_data.to_string(header=False, index=False)
        bed_file = pybedtools.BedTool(bed_str, from_string=True)

        fasta = pybedtools.example_filename(hg19_file)
        a = bed_file.sequence(fi=fasta)
        from Bio import SeqIO
        fasta_file = os.path.join(output_dir, my.f_generate_tmp_file_name('fasta'))

        import shutil
        shutil.copyfile(a.seqfn, fasta_file)

        allele_data['fastq'] = ''
        mutation_seq = []
        i = -1

        #mutation_pos_col = [int(record[5]) - int(record[1])  for record in peak_regions]
        #print allele_data.ix[:,'start']
        #print peak_data.ix[:, peak_start]
        peak_data.index = allele_data.index
        mutation_pos_col = allele_data.ix[:,'start'] - peak_data.ix[:,peak_start] - 1 
        #print mutation_pos_col
        
        alt_fastq_records = []
        ref_fastq_records = []
        for record in SeqIO.parse(open(fasta_file), "fasta"):
            i = i + 1
            mutation_pos = mutation_pos_col[i]
            line = allele_data.ix[i, :]

            ref_allele=line[3]
            alt_allele=line[4]
            pwm_ref_strand = line[5]
            #print  'index %s' % i
            #if i == 112:
            #    import ipdb; ipdb.set_trace()
            assert ref_allele.upper() == record[mutation_pos].upper(), "Ref allele doesn't match"
            record.seq.alphabet = IUPAC.unambiguous_dna
            mutation_record = SeqRecord(record, id = record.id, name = record.name, description = record.description)
            mutation_seq = record.seq.lower().tomutable()
            mutation_seq[mutation_pos]=alt_allele.upper()
            mutation_record.seq = mutation_seq.toseq()
            mutation_record.seq.alphabet = IUPAC.unambiguous_dna

            ref_record = record
            ref_seq = record.seq.lower().tomutable()
            ref_seq[mutation_pos] = ref_allele.upper()
            ref_record.seq = ref_seq.toseq()
                        
            alt_fastq_records.append(mutation_record)
            ref_fastq_records.append(ref_record)
            allele_data.ix[i, 'fastq' ] = str(ref_record.seq)

        
        alt_sequence_file= self.write_records_fastq(output_dir, alt_fastq_records, prefix = file_prefix + '.alt.fastq')
        ref_sequence_file= self.write_records_fastq(output_dir, ref_fastq_records, prefix = file_prefix + '.ref.fastq')

        os.remove(allele_file)
        return [ref_sequence_file, alt_sequence_file, allele_data]
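
The heart of the loop above is writing one allele into a window of reference sequence. As a rough, self-contained sketch with current Biopython classes (the original uses the older tomutable()/IUPAC alphabet API; the sequence and position below are made up):

from Bio.Seq import Seq, MutableSeq
from Bio.SeqRecord import SeqRecord

# Hypothetical window around a variant at 0-based offset 4.
record = SeqRecord(Seq("acgtAcgtacgt"), id="chr1:100-112")
mutation_pos, ref_allele, alt_allele = 4, "A", "G"

assert str(record.seq[mutation_pos]).upper() == ref_allele.upper()  # check against the reference

mut_seq = MutableSeq(str(record.seq).lower())
mut_seq[mutation_pos] = alt_allele.upper()
alt_record = SeqRecord(Seq(str(mut_seq)), id=record.id, description="alt allele")

print(record.seq)       # acgtAcgtacgt
print(alt_record.seq)   # acgtGcgtacgt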
Example #10
    def head(self):
        sample_cols = my.grep_list('(NA|HG)[0-9]+', self.data.columns)
        show_cols = list(set(self.data.columns) - set(sample_cols))
        self.data = self.data.drop(sample_cols, axis=1)
        print self.data.ix[:, show_cols].head()
Example #11
    def get_sample_cols(self):
        return my.grep_list('(NA|HG)[0-9]+', self.data.columns)
Example #12
def f_grep_wget_from_given_embl_file(index_file,
                                     pattern,
                                     output_dir,
                                     prefix,
                                     download_pattern,
                                     test_flag=False,
                                     quiet=False,
                                     debug=False):
    if debug == True:
        import ipdb
        ipdb.set_trace()

    import urllib
    matched_lines = my.grep_file(pattern, index_file)
    if matched_lines is None:
        if quiet == False:
            print "-----------------------Warning--------------------------\nNo matching for the pattern %s in %s\n" % (
                pattern, index_file)
        return "failed"

    file_names = [
        my.grep_list(download_pattern, re.split('\t', line))[0]
        for line in matched_lines
    ]
    #print file_names

    i = 1
    for file_name in file_names:

        data_url = os.path.dirname(file_name)
        file_name = os.path.basename(file_name)
        #file_suffix=re.match(r"[a-zA-Z0-9_]*\.(.*)",file_name).group(1)
        #print file_suffix

        tmp, file_suffix = os.path.splitext(file_name)

        match_object = re.match(r".*(Rep[1-9]).*",
                                file_name,
                                flags=re.IGNORECASE)
        # use the replicate tag from the file name when present; otherwise number the files when several match
        if match_object or len(file_names) > 1:
            if match_object:
                output_name = prefix + "-" + match_object.group(
                    1) + file_suffix
            else:
                output_name = prefix + "-Rep%s" % i + file_suffix
                i = i + 1
        else:
            output_name = prefix + file_suffix

        output_file = output_dir + "/" + output_name

        if test_flag == False:
            urllib.urlretrieve(url=data_url + "/" + file_name,
                               filename=output_file)

        print "Download " + file_name + " " + data_url + ' ' + output_file

        match_object = re.match(r".*Peak\.gz$", file_name)
        if match_object:
            if test_flag == False:
                f_unzip_targz(output_file)
            print "Unzip " + output_name

    return "success"
sys.path.insert(0, lib_dir)
import pandas as pd
import p_mymodule as my

project_dir = '%s/expression_var/' % home_dir
#Parse the enhancer file

loc_cell = 'gm12878'
enhancer_path = '%s/data/fantom5/hg19_permissive_enhancers_expression_rle_tpm.csv' % project_dir
enhancer_pd = pd.read_csv(
    enhancer_path,
    header=0,
    sep=',',
)
print enhancer_pd.shape
target_cel_columns = my.grep_list('.*gm12878', enhancer_pd.columns.tolist())
print ['Unnamed: 0'] + target_cel_columns
extract_data = enhancer_pd.loc[:, ['Unnamed: 0'] + target_cel_columns]
print extract_data.head()

coord_data = pd.DataFrame(
    list(extract_data.loc[:, 'Unnamed: 0'].str.split(':|-')))

extract_data['chr'] = coord_data[0]
extract_data['start'] = coord_data[1]
extract_data['end'] = coord_data[2]

extract_data.columns = ['name', 'rep1', 'rep2', 'rep3', 'chr', 'start', 'end']
bed_data = extract_data.loc[:, ['chr', 'start', 'end', 'rep1', 'rep2', 'rep3']]
print bed_data.head()
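
As a design note, the split-then-assign steps above can also be written directly with expand=True; a minimal sketch on made-up enhancer names:

import pandas as pd

ids = pd.Series(['chr1:1000-1600', 'chr2:5000-5400'])   # hypothetical 'chr:start-end' names
coords = ids.str.split(':|-', expand=True)
coords.columns = ['chr', 'start', 'end']
coords[['start', 'end']] = coords[['start', 'end']].astype(int)
print(coords)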