Code example #1
File: p_generate_peak_fastq.py  Project: wqshi/TF2Exp
    def vcf_to_tmp_bed(self, vcf_file, tmp_dir, debug=False):
        if f_judge_debug(debug):
            import ipdb
            ipdb.set_trace()
        compress_format = 'gzip' if vcf_file.endswith('gz') else None
        vcf_df_raw = pd.read_csv(vcf_file, sep='\t', header=None,
                                 compression=compress_format).iloc[:, 0:5]
        vcf_df_raw.columns = ['chr', 'pos', 'name', 'ref', 'alt']

        # Keep only variants whose alt allele is a plain A/T/G/C string.
        vcf_df = vcf_df_raw.loc[vcf_df_raw.alt.str.contains('^[ATGCatgc]+$')].copy()
        if vcf_df.shape[0] != vcf_df_raw.shape[0]:
            logging.info('Filtered %s illegal alt variants' %
                         (vcf_df_raw.shape[0] - vcf_df.shape[0]))

        # VCF positions are 1-based; BED starts are 0-based.
        vcf_df['start'] = vcf_df['pos'] - 1
        vcf_df['name'] = (vcf_df['chr'] + '_' + vcf_df['pos'].map(str) +
                          '_' + vcf_df['ref'] + '_' + vcf_df['alt'])
        tmp_bed_file = tmp_dir + '/' + my.f_generate_tmp_file_name('bed')
        vcf_df[['chr', 'start', 'pos', 'name']].to_csv(tmp_bed_file,
                                                       header=False,
                                                       index=False,
                                                       sep='\t')
        return tmp_bed_file
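
A usage sketch (the instance and paths below are hypothetical); the only subtle step is the coordinate shift, since VCF positions are 1-based while BED starts are 0-based:

# Hypothetical usage: a VCF row "chr22  16050075  .  A  G" becomes the BED
# line "chr22  16050074  16050075  chr22_16050075_A_G".
bed_file = preparer.vcf_to_tmp_bed('/data/chr22.vcf.gz', '/tmp')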
Code example #2
File: p_pd.py  Project: wqshi/asb_pipeline
def write_records_fastq(self, output_dir, fastq_records, prefix=''):
    if prefix == '':
        sequence_file = os.path.join(output_dir, my.f_generate_tmp_file_name('seq'))
    else:
        sequence_file = os.path.join(output_dir, prefix)

    # Despite the method name, the records are serialized in FASTA format.
    with open(sequence_file, 'w') as output_handle:
        SeqIO.write(fastq_records, output_handle, 'fasta')
    return sequence_file
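
Every example in this listing calls my.f_generate_tmp_file_name, whose body is not shown here. A minimal stand-in, assuming the helper only has to mint a unique name ending in the given suffix (the uuid-based body is an assumption, not the project's actual code):

import uuid

def f_generate_tmp_file_name(suffix):
    # Hypothetical stand-in: any scheme producing a unique,
    # suffix-tagged file name would satisfy the callers above.
    return 'tmp_%s.%s' % (uuid.uuid4().hex, suffix)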
Code example #3
File: p_pd.py  Project: wqshi/asb_pipeline
def extract_allele(self, data_dir, header=False, extra_cols=[]):
    allele_file = data_dir + my.f_generate_tmp_file_name('allele')
    allele_data = self.data[['chr', 'start']].copy()
    # Each allele occupies a 1-bp interval: end = start + 1.
    allele_data['end'] = self.data['start'] + 1
    allele_data[['ref', 'alt']] = self.data[['ref', 'alt']]
    if extra_cols:
        allele_data[extra_cols] = self.data[extra_cols]
    allele_data.drop_duplicates(subset=['chr', 'start', 'alt'], inplace=True)
    allele_data.to_csv(allele_file, header=header, index=False, sep='\t')
    return allele_file
Code example #4
    def test_basic(self):
        other_col = 9
        peak_file = '%s/deepsea/tests/data/yy1.sorted.bed' % project_dir

        chr_str = 'chr22'
        vcf_file = '%s/deepsea/tests/data/%s.merge.head.vcf.gz' % (project_dir,
                                                                   chr_str)
        tmp_dir = '%s/deepsea/tmp/' % project_dir
        my.f_ensure_make_dir(tmp_dir)

        fastq_file = f_prepare_deepsea_fastq_based_on_vcf(
            peak_file, vcf_file, tmp_dir)
Code example #5
File: p_pd.py  Project: wqshi/asb_pipeline
def extract_snp(self, data_dir):
    # Only for the het sites; mainly for read simulation.
    # Zero-based output in the format: snp_name chr1 933790 + G A
    allele_file = data_dir + my.f_generate_tmp_file_name('snp')
    het_rows = self.data.het_type == 'het'
    allele_data = self.data.loc[het_rows, ['chr', 'start']].copy()
    allele_data['strand'] = '+'
    allele_data[['ref', 'alt']] = self.data.loc[het_rows, ['ref', 'alt']]
    allele_data.drop_duplicates(subset=['chr', 'start', 'alt'], inplace=True)
    # The fresh integer index supplies the snp_name column in the output.
    allele_data.index = range(0, allele_data.shape[0])
    allele_data.to_csv(allele_file, header=True, index=True, sep=' ')
    return allele_file
Code example #6
def extract_bed(self):
    # Reuse the cached location file when one has already been generated.
    if self.loc_file is not None:
        return self.loc_file
    logging.info('Generate new bed')
    data_dir = os.path.dirname(self.file_path)
    bed_file = data_dir + '/' + my.f_generate_tmp_file_name('loc.bed')
    bed_data = self.data[['chr', 'start', 'end']].copy()
    bed_data['name'] = bed_data['chr'] + '-' + bed_data['start'].map(str)
    bed_data.drop_duplicates().to_csv(bed_file,
                                      header=False,
                                      index=False,
                                      sep='\t')
    self.loc_file = bed_file
    return bed_file
Code example #7
File: p_pd.py  Project: wqshi/asb_pipeline
def extract_bed(self, data_dir, filter_col=None, filter_val=None, add_info=None):
    bed_file = data_dir + '/' + my.f_generate_tmp_file_name('loc.bed')
    if filter_col is not None:
        bed_data = self.data[self.data[filter_col] == filter_val][['chr', 'start']].copy()
    else:
        bed_data = self.data[['chr', 'start']].copy()
    # Convert the 1-based position into a 0-based, half-open BED interval.
    bed_data['end'] = bed_data['start'].astype(int)
    bed_data['start'] = bed_data['start'].astype(int) - 1
    bed_data['name'] = bed_data['chr'] + '-' + bed_data['start'].map(str)

    if add_info is not None:
        bed_data['name'] = bed_data['name'] + '-' + self.data[add_info]

    bed_data.set_index(['chr', 'start'], inplace=True, drop=False)
    bed_data.drop_duplicates().to_csv(bed_file, header=False, index=False, sep='\t')
    return bed_file
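
The astype/minus-one pair is the standard 1-based to 0-based conversion; a self-contained check with a made-up one-row table:

import pandas as pd

# Hypothetical one-variant table: 1-based position 100 becomes the
# half-open BED interval [99, 100).
bed_data = pd.DataFrame({'chr': ['chr22'], 'start': [100]})
bed_data['end'] = bed_data['start'].astype(int)
bed_data['start'] = bed_data['start'].astype(int) - 1
print(bed_data)  # chr: chr22, start: 99, end: 100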
Code example #8
def f_extract_features_on_location_dp(database_file,
                                      loc_tf,
                                      feature_list,
                                      cell,
                                      data_dir,
                                      rmdup=False,
                                      labs='',
                                      guest_extract_flag=False,
                                      mapQ=0,
                                      debug=False):
    # Extract the depth (dp) information according to the loc_file (loc_cell
    # and loc_tf) from the bam files of feature_list, and loc_tf in the cell data.
    # loc_file: the locations at which to extract the binding signal.
    # loc_tf / loc_cell: the source of loc_file.
    # feature_list, cell, data_dir: identify the bam files.
    # This version collects all the data into a central database file.
    if debug:
        import ipdb
        ipdb.set_trace()

    tf_database = loc.data_table(database_file)
    loc_file = tf_database.extract_loc(data_dir)

    # Drop stale depth columns before re-extracting.
    error_cols = my.grep_list('.*simulate%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    my.f_print_list(error_cols)
    tf_database.drop_feautre(error_cols)

    error_cols = my.grep_list('.*_%s_dp_encode' % loc_tf,
                              tf_database.data.columns.tolist())
    tf_database.drop_feautre(error_cols)

    for tf in feature_list:

        print(tf)
        bam_files = []
        bam_pattern = r'(%s).*%s-%s[\.-].*' % ('|'.join(labs), cell, tf)
        # Prefer de-duplicated bams, then sorted bams, then any bam.
        if rmdup:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + 'rmdup.bam$')
            rmdup_str = '.rmdup'

        if not rmdup or bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + 'sorted.bam$')
            rmdup_str = ''

        if bam_files == []:
            bam_files = my.f_grep_files_from_dir(data_dir,
                                                 bam_pattern + 'bam$')

        logging.info('bam files:' + data_dir + bam_pattern)
        logging.info(bam_files)
        for bam_file in bam_files:
            # Parse the replicate number and the lab from the file name.
            rep_object = re.match(r'.*(Rep[1-9]).*',
                                  bam_file,
                                  flags=re.IGNORECASE)
            lab = re.match(r'.*(%s).*%s.*' % ('|'.join(labs), cell),
                           bam_file,
                           flags=re.IGNORECASE)

            if lab is None:
                print('Lab is missing %s in labs(%s)' % (bam_file,
                                                         ' '.join(labs)))
                continue
            lab = lab.group(1)
            if rep_object is None:
                print('Skipping %s' % bam_file)
                continue

            loc_dir = data_dir
            vcf_file = loc_dir + my.f_generate_tmp_file_name('vcf')
            dp_file = loc_dir + my.f_generate_tmp_file_name('dp')

            f_extract_depth_from_bam(loc_file, bam_file, vcf_file, dp_file,
                                     mapQ)

            # Map Rep1..Rep7 onto lab-specific replicate suffixes.
            rep_names = {}
            if labs != '':
                for i in range(1, 8):
                    rep_names['Rep%s' % i] = '_%s%s' % (lab, i)
            else:
                for i in range(1, 8):
                    rep_names['Rep%s' % i] = '_broad%s' % i

            if guest_extract_flag:
                rep_names['Rep1'] = '_guest'

            feature_data = tf_database.read_feature_replace_name(
                dp_file, ['chip_ref_dp', 'chip_alt_dp'], [
                    'ref_%s_dp%s' % (tf, rep_names[rep_object.group(1)]),
                    'alt_%s_dp%s' % (tf, rep_names[rep_object.group(1)])
                ])

            tf_database.merge_feature(feature_data)
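
The signature is easiest to read from a call. A hypothetical invocation (every argument value below is made up for illustration):

# Hypothetical invocation: collect CTCF read depths at the database's
# locations from GM12878 bam files under /data/bams/.
f_extract_features_on_location_dp(database_file='/data/yy1_database.txt',
                                  loc_tf='yy1',
                                  feature_list=['ctcf'],
                                  cell='gm12878',
                                  data_dir='/data/bams/',
                                  rmdup=True,
                                  labs=['haib', 'sydh'])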
Code example #9
File: p_pd.py  Project: wqshi/asb_pipeline
def extract_loc(self, data_dir, header=False):
    loc_file = data_dir + my.f_generate_tmp_file_name('loc')
    self.data[['chr', 'start']].drop_duplicates().to_csv(
        loc_file, header=header, index=False, sep='\t')
    return loc_file
Code example #10
File: p_pd.py  Project: wqshi/asb_pipeline
    def get_ref_and_alt_peak_fastq_files_from_database(self, output_dir, hg19_file,
                                                       file_prefix=None, target_lab='',
                                                       debug=False):
        if debug:
            import ipdb
            ipdb.set_trace()

        tf_database = self

        # Each peak window is 100 bp, starting 50 bp upstream of the peak position.
        peak_start = 'peak_%s_bed_start' % target_lab
        tf_database.data[peak_start] = tf_database.data['peak_%s_dis' % target_lab] - 50

        print(my.grep_list('lab', tf_database.data.columns.tolist()))

        lab_data = tf_database.data[tf_database.data['lab_%s' % target_lab] != '.']

        peak_data = lab_data.loc[:, ['chr', peak_start]].copy()
        peak_data[peak_start] = peak_data[peak_start].astype('float').astype('int')
        peak_data['end'] = peak_data[peak_start] + 100

        # Temporarily restrict the table to this lab's rows for allele extraction.
        full_data = tf_database.data
        tf_database.data = lab_data
        allele_file = tf_database.extract_allele(output_dir, header=True)
        allele_data = pd.read_csv(allele_file, sep='\t', index_col=None)
        tf_database.data = full_data

        if allele_data.shape[0] == 0:
            print('empty input')

        bed_str = peak_data.to_string(header=False, index=False)
        bed_file = pybedtools.BedTool(bed_str, from_string=True)

        fasta = pybedtools.example_filename(hg19_file)
        a = bed_file.sequence(fi=fasta)
        from Bio import SeqIO
        from Bio.Seq import Seq, MutableSeq
        import shutil
        fasta_file = os.path.join(output_dir, my.f_generate_tmp_file_name('fasta'))
        shutil.copyfile(a.seqfn, fasta_file)

        allele_data['fastq'] = ''

        # Offset of each variant inside its peak window.
        peak_data.index = allele_data.index
        mutation_pos_col = allele_data['start'] - peak_data[peak_start] - 1

        alt_fastq_records = []
        ref_fastq_records = []
        for i, record in enumerate(SeqIO.parse(fasta_file, 'fasta')):
            mutation_pos = int(mutation_pos_col[i])
            line = allele_data.iloc[i]

            ref_allele = line['ref']
            alt_allele = line['alt']
            assert ref_allele.upper() == record[mutation_pos].upper(), \
                "Ref allele doesn't match"

            # Copy the record before mutating its sequence.
            mutation_record = SeqRecord(record.seq, id=record.id, name=record.name,
                                        description=record.description)
            mutation_seq = MutableSeq(str(record.seq).lower())
            mutation_seq[mutation_pos] = alt_allele.upper()
            mutation_record.seq = Seq(str(mutation_seq))

            ref_record = record
            ref_seq = MutableSeq(str(record.seq).lower())
            ref_seq[mutation_pos] = ref_allele.upper()
            ref_record.seq = Seq(str(ref_seq))

            alt_fastq_records.append(mutation_record)
            ref_fastq_records.append(ref_record)
            allele_data.loc[i, 'fastq'] = str(ref_record.seq)

        alt_sequence_file = self.write_records_fastq(output_dir, alt_fastq_records,
                                                     prefix=file_prefix + '.alt.fastq')
        ref_sequence_file = self.write_records_fastq(output_dir, ref_fastq_records,
                                                     prefix=file_prefix + '.ref.fastq')

        os.remove(allele_file)
        return [ref_sequence_file, alt_sequence_file, allele_data]
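
Both this method and the next build each alternative record by swapping one base in a mutable copy of the fetched window. The swap in isolation, under Biopython 1.78+ (the sequence and offset are made up):

from Bio.Seq import MutableSeq, Seq

seq = MutableSeq('acgtacgta')   # hypothetical 9-bp window, variant at offset 4
seq[4] = 'G'                    # swap in the alternative allele, upper-cased
alt_seq = Seq(str(seq))
print(alt_seq)                  # -> acgtGcgta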
Code example #11
File: p_pd.py  Project: wqshi/asb_pipeline
    def get_ref_and_alt_fastq_files_from_database(self, output_dir, hg19_file,
                                                  flanking_length=30,
                                                  file_prefix=None, debug=False):
        if debug:
            import ipdb
            ipdb.set_trace()

        tf_database = self

        if 'pwm_ref_strand' not in tf_database.data.columns:
            tf_database.data['pwm_ref_strand'] = '+'

        allele_file = tf_database.extract_allele(output_dir, header=True,
                                                 extra_cols=['pwm_ref_strand'])
        pssm_len = flanking_length
        allele_data = pd.read_csv(allele_file, sep='\t', index_col=None)

        if allele_data.shape[0] == 0:
            print('empty input')

        # Build a window of pssm_len bases around each variant.
        bed_data = allele_data[['chr', 'start']].copy()
        bed_data['start'] = allele_data['start'] - pssm_len
        bed_data['end'] = allele_data['start'] + pssm_len - 1
        bed_data['name'] = '.'
        bed_data['score'] = '.'
        bed_data['strand'] = allele_data['pwm_ref_strand']

        bed_str = bed_data.to_string(header=False, index=False)
        bed_file = pybedtools.BedTool(bed_str, from_string=True)

        fasta = pybedtools.example_filename(hg19_file)
        a = bed_file.sequence(fi=fasta)
        from Bio import SeqIO
        from Bio.Seq import Seq, MutableSeq
        import shutil
        fasta_file = os.path.join(output_dir, my.f_generate_tmp_file_name('fasta'))
        shutil.copyfile(a.seqfn, fasta_file)

        allele_data['fastq'] = ''

        # The variant sits at the same offset in every window.
        mutation_pos = pssm_len - 1

        alt_fastq_records = []
        ref_fastq_records = []
        for i, record in enumerate(SeqIO.parse(fasta_file, 'fasta')):
            line = allele_data.iloc[i]

            ref_allele = line['ref']
            alt_allele = line['alt']
            pwm_ref_strand = line['pwm_ref_strand']

            assert ref_allele.upper() == record[mutation_pos].upper(), \
                "Ref allele doesn't match"
            # The record must be copied before its sequence is mutated.
            mutation_record = SeqRecord(record.seq, id=record.id, name=record.name,
                                        description=record.description)
            mutation_seq = MutableSeq(str(record.seq).lower())
            mutation_seq[mutation_pos] = alt_allele.upper()
            mutation_record.seq = Seq(str(mutation_seq))

            ref_record = record
            ref_seq = MutableSeq(str(record.seq).lower())
            ref_seq[mutation_pos] = ref_allele.upper()
            ref_record.seq = Seq(str(ref_seq))

            # if pwm_ref_strand == '-':
            #     ref_record.seq = record.seq.complement()
            #     mutation_record.seq = mutation_record.seq.complement()

            alt_fastq_records.append(mutation_record)
            ref_fastq_records.append(ref_record)
            allele_data.loc[i, 'fastq'] = str(ref_record.seq)

        alt_sequence_file = self.write_records_fastq(output_dir, alt_fastq_records,
                                                     prefix=file_prefix + '.alt.fastq')
        ref_sequence_file = self.write_records_fastq(output_dir, ref_fastq_records,
                                                     prefix=file_prefix + '.ref.fastq')

        os.remove(allele_file)
        return [ref_sequence_file, alt_sequence_file, allele_data]
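
As a closing usage sketch (instance, paths, and prefix are hypothetical): with flanking_length=30, each emitted record is a 59-bp window (start = pos - 30, end = pos + 29) with the variant at offset 29.

# Hypothetical call; tf_database is assumed to be a populated data_table.
ref_fa, alt_fa, allele_data = tf_database.get_ref_and_alt_fastq_files_from_database(
    output_dir='/tmp/fastq',
    hg19_file='/data/hg19.fa',
    flanking_length=30,
    file_prefix='yy1')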