Code example #1
0
File: GRO_Analysis.py  Project: jdrubin91/Read_p53
def run(Nutlin1,GRODMSO,GRONutlin,figuredir,filedir):
    """Compare GRO-seq signal between DMSO and Nutlin over a set of intervals.

    Plots a histogram of log2 fold changes (Nutlin/DMSO) and writes two BED
    files: intervals whose signal decreased under Nutlin, and all intervals
    sorted by descending fold change.

    Parameters
    ----------
    Nutlin1 : str
        BED file of intervals to evaluate.
    GRODMSO, GRONutlin : str
        BED files whose column 4 carries the per-read signal for the DMSO
        and Nutlin conditions respectively.
    figuredir, filedir : str
        Output path prefixes (expected to end with a path separator, as at
        the original call sites).
    """
    x = BedTool(Nutlin1)
    y = BedTool(GRODMSO)
    z = BedTool(GRONutlin)

    # bedtools map: sum of signal column 4 over each interval of x
    a = x.map(y, c='4', o='sum')
    b = x.map(z, c='4', o='sum')

    def _paired():
        """Yield (interval, dmso_sum, nutlin_sum) for usable intervals.

        bedtools emits '.' when an interval has no overlaps, so those are
        skipped.  The original code compared the *string* field against the
        int 0 (`m[3] != 0`), which is always true, so intervals whose sum was
        the string '0' slipped through and crashed on division / math.log;
        parsing to float first makes the non-zero check real.
        """
        for m, n in zip(a, b):
            if m[3] == '.' or n[3] == '.':
                continue
            dmso, nutlin = float(m[3]), float(n[3])
            if dmso != 0 and nutlin != 0:
                yield m, dmso, nutlin

    F = plt.figure()
    ax = F.add_subplot(111)
    ax.set_title('Nutlin1hr vs. DMSO')
    ax.set_ylabel('Count')
    ax.set_xlabel('Log2 Fold Change (Nutlin/DMSO)')
    ax.hist([math.log(nutlin / dmso, 2) for _, dmso, nutlin in _paired()],
            bins=100)
    ax.set_xlim([-20, 20])
    plt.axvline(0, color='red', linestyle='dashed')
    plt.savefig(figuredir + 'GRO_Analysis_Fold_Change_hist.png', dpi=1200)

    # Intervals whose transcription went down under Nutlin (fold change < 1).
    with open(filedir + 'false_positives_GRO-Seq_fold_change.bed', 'w') as outfile:
        for m, dmso, nutlin in _paired():
            if nutlin / dmso < 1:
                outfile.write('\t'.join(m[:3]) + '\n')

    # All intervals with their fold change, sorted high-to-low.
    with open(filedir + 'p53_txn_fold_change.bed', 'w') as outfile:
        rows = sorted(((m[0], m[1], m[2], nutlin / dmso)
                       for m, dmso, nutlin in _paired()),
                      key=lambda r: r[-1], reverse=True)
        for row in rows:
            outfile.write('\t'.join(row[:3]) + '\t' + str(row[-1]) + '\n')
Code example #2
0
def calc_signals(bam_filename, region_bed_filename, signal_colnum, region_type,
                 normalize, verbose):
    ''' generator to calculate signals from BED regions mapped onto positive and
    negative strand data.

    Yields one tuple per (strand, operation, region):
    (name, score, 'region-<strand>', region_type, 'signal-<strand>',
     operation, signal, signal_type).
    '''

    regions = BedTool(region_bed_filename)

    # aggregations passed to bedtools map
    map_opers = ('sum', 'count')

    # normalized signals are labeled 'norm', untouched ones 'raw'
    signal_type = 'norm' if normalize else 'raw'

    # translate BED strand symbols to the labels used in the output tuples
    strand_labels = {'+': 'pos', '-': 'neg'}

    for signal_strand in STRANDS:

        coverage = load_coverage(bam_filename,
                                 strand=signal_strand,
                                 verbose=verbose)

        for oper in map_opers:

            mapped = regions.map(coverage,
                                 o=oper,
                                 c=signal_colnum,
                                 null=0)

            for region_row, signal_row in izip(regions, mapped):

                try:
                    name = region_row[3]
                    score = region_row[4]
                    strand = region_row[5]

                except IndexError:
                    # region lacks name/score/strand fields; synthesize them
                    name = '%s-%s-%d-%d' % (
                        region_type, region_row.chrom, region_row.start,
                        region_row.end)
                    score = 0
                    # default
                    strand = 'none'

                strand = strand_labels.get(strand, strand)

                # last field is the calculated signal
                signal = float(signal_row[-1])

                if normalize and signal != 0:
                    # per-base normalization by region length
                    signal /= float(region_row.end - region_row.start)

                yield (name, score, 'region-' + strand, region_type,
                       'signal-' + signal_strand, oper, signal, signal_type)
Code example #3
0
File: make_annot.py  Project: cuiran/makeAnnot
def make_annot_files(args, df, binary):
    """Build a per-chromosome annotation file by mapping annotation intervals
    onto the SNP positions of a PLINK .bim file.

    Parameters
    ----------
    args : namespace
        Must provide ``chrom``, ``bfile_chr`` (path prefix of the per-chrom
        .bim files) and ``prefix`` (output path prefix).
    df : pandas.DataFrame
        Annotation intervals with CHR/START/END columns (plus ANNOT when the
        annotation is continuous).
    binary : bool
        True for a 0/1 membership annotation; falsy for a continuous
        annotation taken from the ANNOT column.

    Side effects: writes ``<prefix>.<chrom>.annot.gz`` and, for continuous
    annotations, ``<prefix>.<chrom>.cont_bin.gz``.
    """
    df = df.sort_values(by=['CHR', 'START'])
    # Idiomatic truthiness test instead of `== True` / `elif == False`; the
    # original left `genesetbed` unbound for non-bool truthy inputs.
    if binary:
        # Normalize chromosome labels to 'chrN' and merge overlapping regions.
        iter_df = [['chr' + str(x1).lstrip('chr'), x2, x3]
                   for (x1, x2, x3) in np.array(df[['CHR', 'START', 'END']])]
        genesetbed = BedTool(iter_df).sort().merge()
    else:
        iter_df = [['chr' + str(x1).lstrip('chr'), int(x2), int(x3),
                    'annot', str(x4)]
                   for (x1, x2, x3, x4)
                   in np.array(df[['CHR', 'START', 'END', 'ANNOT']])]
        genesetbed = BedTool(iter_df).sort()

    print('making annot file for chromosome {}'.format(args.chrom))
    df_bim = pd.read_csv(args.bfile_chr + str(args.chrom) + '.bim',
                         delim_whitespace=True,
                         usecols=[0, 1, 2, 3],
                         names=['CHR', 'SNP', 'CM', 'BP'])
    # Represent each SNP as a zero-length interval at its base position.
    iter_bim = [['chr' + str(x1), int(x2), int(x2)]
                for (x1, x2) in np.array(df_bim[['CHR', 'BP']])]
    bimbed = BedTool(iter_bim).sort()
    if binary:
        # SNPs intersecting the merged region set get ANNOT=1, the rest 0.
        annotbed = bimbed.intersect(genesetbed)
        bp = [x.start for x in annotbed]
        df_int = pd.DataFrame({'BP': bp, 'ANNOT': 1})
        df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
        df_annot.fillna(0, inplace=True)
        df_annot.drop_duplicates(inplace=True)
    else:
        # Mean annotation value (column 5) over each SNP position; SNPs with
        # no overlapping interval receive 0 via null=0.
        annotbed = bimbed.map(genesetbed, c=5, o='mean', null=0).to_dataframe()
        df_int = pd.DataFrame({'BP': annotbed.start, 'ANNOT': annotbed.name})
        df_annot = pd.merge(df_bim, df_int, how='left', on='BP')
        df_annot.drop_duplicates(inplace=True)
        if df_annot.shape[0] != df_bim.shape[0]:
            # The mapping must stay 1:1 with the .bim SNPs; bail out loudly.
            print('{} SNPs in annotation df, whereas {} SNPs in bim file'
                  .format(df_annot.shape[0], df_bim.shape[0]))
            sys.exit(1)
        df_annot.fillna(0, inplace=True)
        df_annot = df_annot[['ANNOT']].astype(float)
        df_bim['ANNOT'] = df_annot[['ANNOT']]
        cont_annot = df_bim[['SNP', 'ANNOT']]
        cont_annot_file = args.prefix + '.' + str(args.chrom) + '.cont_bin.gz'
        cont_annot.to_csv(cont_annot_file,
                          sep="\t",
                          index=False,
                          header=None,
                          compression='gzip')

    annot_file = args.prefix + '.' + str(args.chrom) + '.annot.gz'
    df_annot.to_csv(annot_file, sep="\t", index=False, compression='gzip')
Code example #4
0
def calc_signals(bam_filename, region_bed_filename, signal_colnum,
                 region_type, normalize, verbose):

    ''' Generator calculating signals from BED regions mapped onto positive-
    and negative-strand coverage data.

    Args:
        bam_filename: BAM file handed to load_coverage() once per strand.
        region_bed_filename: BED file of regions to map signal onto.
        signal_colnum: column of the coverage BedTool to aggregate.
        region_type: label embedded in each result tuple (also used when
            synthesizing names for regions lacking a name field).
        normalize: if True, divide non-zero signals by region length and
            tag results 'norm' instead of 'raw'.
        verbose: forwarded to load_coverage().

    Yields:
        (name, score, 'region-<strand>', region_type, 'signal-<strand>',
         operation, signal, signal_type) tuples.
    '''

    region_bedtool = BedTool(region_bed_filename)

    # bedtools.map operations applied per region
    operations = ('sum','count')

    signal_type = 'raw'
    if normalize:
        signal_type = 'norm'

    for signal_strand in STRANDS:

        signal_bedtool = load_coverage(bam_filename, strand=signal_strand,
                                       verbose=verbose)
        for oper in operations:

            # null=0 makes regions without overlap report 0 instead of '.'
            map_bedtool = region_bedtool.map(signal_bedtool, o=oper,
                                             c=signal_colnum, null=0)

            # map preserves region order, so rows can be zipped pairwise
            for region_row, signal_row in izip(region_bedtool, map_bedtool):

                try:
                    region_name = region_row[3]
                    region_score = region_row[4]
                    region_strand = region_row[5]

                except IndexError:
                    # region lacks name/score/strand fields; synthesize them
                    region_name = '%s-%s-%d-%d' % (region_type,
                                                   region_row.chrom,
                                                   region_row.start,
                                                   region_row.end)
                    region_score = 0
                    # default
                    region_strand = 'none'

                if region_strand == '+':
                    region_strand = 'pos'
                elif region_strand == '-':
                    region_strand = 'neg'

                # last field is the calculated signal
                signal = float(signal_row[-1])

                if normalize and signal != 0:
                    # per-base normalization by region length
                    region_size = float(region_row.end - region_row.start)
                    signal = signal / region_size

                result = (region_name, region_score, 'region-'+region_strand,
                          region_type, 'signal-'+signal_strand,
                          oper, signal, signal_type)

                yield result
Code example #5
0
File: DataImporterWGS.py  Project: Jasenme/crispy
def matching_svs(brass_bedpes, ascat_beds, offsets=(1e3, 1e4, 1e5)):
    """Align BRASS structural variants with ASCAT copy-number segments and
    annotate each aligned SV with CRISPR sgRNA fold-change / ratio summaries.

    Parameters
    ----------
    brass_bedpes : dict
        Sample name -> BRASS bedpe dataframe.
    ascat_beds : dict
        Sample name -> ASCAT dataframe; must contain a 'sample' column,
        which is dropped before alignment.
    offsets : iterable of float
        Maximum SV/CN breakpoint distances to try.  The default is a tuple
        rather than a list to avoid the shared mutable-default-argument
        pitfall.

    Returns
    -------
    pandas.DataFrame
        Aligned SVs with fc_mean/fc_count and ratio_mean/ratio_count
        columns, one row block per (sample, offset) combination.

    NOTE(review): relies on the module-level `crispr_beds` dict of
    per-sample BedTools -- confirm it is populated before calling.
    """
    sv_df = []
    for dist_offset in offsets:
        for c in brass_bedpes:
            svs = sv_cn_aligned(brass_bedpes[c], ascat_beds[c].drop(['sample'], axis=1), offset=dist_offset)

            if svs.shape[0] != 0:
                names = ['chr', 'start_sv', 'end_sv', 'start_cn', 'end_cn', 'cn', 'sv']

                # bedtools requires start < end
                svs = svs[svs['start_sv'] < svs['end_sv']]

                svs_bed = BedTool(svs[names].to_string(index=False, header=False), from_string=True).sort()

                # sgRNA fold-change (column 4): mean and count per SV interval
                svs_sgrnas = svs_bed.map(crispr_beds[c], c='4', o='mean,count') \
                    .to_dataframe(names=names + ['fc_mean', 'fc_count'])

                # sgRNA copy-ratio (column 5): mean and count per SV interval
                ratios_sgrnas = svs_bed.map(crispr_beds[c], c='5', o='mean,count') \
                    .to_dataframe(names=names + ['ratio_mean', 'ratio_count'])

                sv_align = pd.concat([
                    svs_sgrnas.set_index(names),
                    ratios_sgrnas.set_index(names)
                ], axis=1)

                sv_df.append(
                    sv_align
                        .reset_index()
                        .assign(sample=c)
                        .assign(offset=dist_offset)
                )

    # drop intervals with no overlapping sgRNAs ('.' from bedtools map)
    sv_df = pd.concat(sv_df).query("fc_mean != '.'").reset_index(drop=True)

    sv_df['fc_mean'] = sv_df['fc_mean'].astype(float).values
    sv_df['fc_count'] = sv_df['fc_count'].astype(int).values

    sv_df['ratio_mean'] = sv_df['ratio_mean'].astype(float).values
    sv_df['ratio_count'] = sv_df['ratio_count'].astype(int).values

    return sv_df
Code example #6
0
File: random_dist.py  Project: hjanime/modmap
def interval_counts(bedtool, interval_size, chrom_size_filename, only_chroms,
                    ignore_chroms, verbose):
    """Count per-position hit multiplicities within fixed-size genome windows.

    Args:
        bedtool: BedTool whose column 4 holds per-position counts.
        interval_size: window width in bp; truncated trailing windows are
            skipped.
        chrom_size_filename: chrom.sizes file for bedtools makewindows.
        only_chroms / ignore_chroms: optional chromosome include / exclude
            collections.
        verbose: when true, report the number of windows seen on stderr.

    Returns:
        (result, total_intervals) where result maps window index ->
        Counter of {count_value: occurrences}, with counts[0] set to the
        number of positions in the window that had no data.
    """
    # The original used defaultdict() with no factory, which behaves exactly
    # like a plain dict here (only ever assigned by key).
    result = {}

    # make windows for analysis
    windows = BedTool().window_maker(w=interval_size,
                                     g=chrom_size_filename).sort()

    # collapse per interval (comma-delimited counts, or '0')
    mapresult = windows.map(bedtool, o='collapse', c=4, null=0)

    total_intervals = 0

    for idx, row in enumerate(mapresult):

        # honor the chromosome include / exclude lists
        if (only_chroms and row.chrom not in only_chroms) or \
           (ignore_chroms and row.chrom in ignore_chroms):
            continue

        # skip the truncated window at the end of each chromosome
        if row.end - row.start < interval_size: continue

        nums = [int(i) for i in row.name.split(',')]
        counts = Counter(nums)

        # find number of non-zero counts
        total_counts = sum([i for i in counts.values() if i > 0])

        total_size = int(row.end - row.start)
        num_zeros = total_size - total_counts

        # change the 0 counts to the calculated number
        counts[0] = num_zeros

        result[idx] = counts

        total_intervals += 1

    if verbose:
        # Python-2-only `print >>sys.stderr` statement replaced with a write
        # that parses (and behaves identically) on both Python 2 and 3.
        sys.stderr.write(">> seen %d intervals of obs data\n"
                         % total_intervals)

    return (result, total_intervals)
Code example #7
0
File: random_dist.py  Project: hesselberthlab/modmap
def interval_counts(bedtool, interval_size, chrom_size_filename,
                    only_chroms, ignore_chroms, verbose):
    """Count per-position hit multiplicities within fixed-size genome windows.

    Returns (result, total_intervals) where result maps window index ->
    Counter of {count_value: occurrences}, with counts[0] set to the number
    of positions in the window that had no data.

    NOTE(review): uses the Python 2 `print >>` statement below, so this
    snippet is Python-2-only as written.
    """

    # defaultdict() with no factory behaves like a plain dict here
    result = defaultdict()

    # make windows for analysis
    windows = BedTool().window_maker(w=interval_size,
                                     g=chrom_size_filename).sort()

    # collapse per inteval (comma delim counts, or '0')
    mapresult = windows.map(bedtool, o='collapse', c=4, null=0)

    total_intervals = 0

    for idx, row in enumerate(mapresult):

        # honor the chromosome include / exclude lists
        if (only_chroms and row.chrom not in only_chroms) or \
           (ignore_chroms and row.chrom in ignore_chroms):
            continue

        # skip the truncated window at the end of each chromosome
        if row.end - row.start < interval_size: continue

        nums = [int(i) for i in row.name.split(',')]
        counts = Counter(nums)

        # find number of non-zero counts
        total_counts = sum([i for i in counts.values() if i > 0])

        total_size = int(row.end - row.start)
        num_zeros = total_size - total_counts

        # change the 0 counts to the calculated number 
        counts[0] = num_zeros

        result[idx] = counts

        total_intervals += 1

    if verbose:
        print >>sys.stderr, ">> seen %d intervals of obs data" \
                             % total_intervals

    return (result, total_intervals)
Code example #8
0
File: signal_analysis.py  Project: speach/modmap
def calc_signals(bam_filename, region_bed_filename, signal_colnum, verbose):

    ''' generator to calculate signals from BED regions mapped onto positive and
    negative strand data.

    Yields (name, score, region_strand, signal_strand, operation, signal)
    tuples; regions are assumed to carry name/score/strand fields.
    '''

    regions = BedTool(region_bed_filename)

    # aggregations passed to bedtools map
    opers = ('sum', 'count')

    for signal_strand in STRANDS:

        coverage = load_coverage(bam_filename, strand=signal_strand,
                                 verbose=verbose)

        for oper in opers:

            mapped = regions.map(coverage, o=oper,
                                 c=signal_colnum, null=0)

            # map preserves region order, so rows can be zipped pairwise
            for region_row, signal_row in izip(regions, mapped):

                # BED fields: name, score, strand
                name, score = region_row[3], region_row[4]

                # any non-'+' strand (including '-') is labeled 'neg'
                strand = 'pos' if region_row[5] == '+' else 'neg'

                # column 7 of the map output holds the computed value
                yield (name, score, strand, signal_strand, oper,
                       signal_row[6])
Code example #9
0
def overlay_resources_score_motifs(motif_sites_input_file, 
                                   motifs_overlapping_tracks_output_dir,
                                   chromatin_tracks_dir_path,  
                                   chromatin_tracks_files): 
    
    

    """Intersect motifs with chromatin tracks, sort and group the tracks per motif.

    Input: motif instances file (motif pos, name_id, scorePval, strand);
           chromatin data collection file in bed4 format: track pos, then
           cell#assaytype#value, or cell#TFname in the case of chip-seq.
    Returns the path of a bed7-format file (motif info (6 cols) plus a
    collapsed overlapping-tracks column).  For ChromHMM / cCRE / IndexDHS /
    RegElem assays the most common state per cell is kept; for DNase-seq the
    maximum value per cell is kept; all other track labels pass through.

    NOTE(review): if the motif's chromosome has no matching file in
    chromatin_tracks_files, `motifs_overlapping_tracks_file` is never bound
    and the final return raises NameError -- confirm callers guarantee a
    match.
    """
    
    #for motif_sites_input_file in motif_sites_input_files:
    with open(motif_sites_input_file) as f:
        # chromosome of the first motif determines which track file to use
        chr_n_file = f.readline().strip().split('\t')[0].strip()+'.bed'
        if (chr_n_file in chromatin_tracks_files):#it is assumed for every motif file name there exists a matching file name in the chromatin_tracks_input_dir
            motifs_overlapping_tracks_file = motifs_overlapping_tracks_output_dir+'/' + '.'.join(motif_sites_input_file.split('/')[-1].split('.')[0:-1])+'_overlapping_tracks' + '.bed7'
            motifs_overlapping_tracks_file_tmp = motifs_overlapping_tracks_file + '_tmp'
            print("in overlay_resources_score_motifs: " + motifs_overlapping_tracks_file)
            # skip work if the output already exists (resumable pipeline)
            if not os.path.exists(motifs_overlapping_tracks_file):
                motif_sites_input_file_sorted = motif_sites_input_file + '_sorted'
                chromatin_tracks_input_file = chromatin_tracks_dir_path +'/'+ chr_n_file
                chromatin_tracks_input_file_sorted = chromatin_tracks_input_file + '_sorted'
                
                print("intersecting: " + motif_sites_input_file + ' and ' + chromatin_tracks_input_file)
                
                # both inputs must be position-sorted for bedtools map
                os.system("""sort -k1,1 -k2,2n -k3,3n {} > {}""".format(motif_sites_input_file, motif_sites_input_file_sorted))
                os.system("""sort -k1,1 -k2,2n -k3,3n {} > {}""".format(chromatin_tracks_input_file, chromatin_tracks_input_file_sorted))
                

                # collapse overlapping track names (column 4) per motif
                motif_sites_file_obj = BedTool(motif_sites_input_file_sorted)
                motif_sites_file_obj.map(BedTool(chromatin_tracks_input_file_sorted), c=4, o=['collapse']).saveas(motifs_overlapping_tracks_file_tmp)
                
                # post-process the collapsed column: keep one summary entry
                # per (cell, assay) pair instead of the raw value list
                with open(motifs_overlapping_tracks_file_tmp, 'r') as infile, open(motifs_overlapping_tracks_file, 'w') as outfile:
                        line = infile.readline()
                        while line:
                            
                            sline = line.split('\t')
                            #print(sline)
                            # NOTE(review): guard checks len>6 but sline[7] is
                            # read below, so rows are assumed to have >=8
                            # columns (7 motif cols + collapse) -- confirm the
                            # motif files are bed7.
                            if(len(sline)>6):
                                if(sline[7]!='.'):
                                    my_list=sline[7].split(',')
                                    # per-assay accumulators keyed by cell
                                    cell_assay_values_dict_ChromHMM = {}
                                    cell_assay_values_dict_cCRE = {}
                                    cell_assay_values_dict_IndexDHS = {}
                                    cell_assay_values_dict_RegElem = {}
                                    cell_assay_values_dict_DNaseq = {}
                                    elem_list =[]
                                    #elem_list_EpiMap =[]
                                    for elem in my_list:
                                        #print(elem)
                    
                                        # entries look like cell#assay[#state]
                                        cell_value=elem.split('#')[0]
                                        assay_value = elem.split('#')[1]
                                        if(len(elem.split('#'))>2):
                                            state_value = elem.split('#')[2].rstrip("\n")
                    
                                        if assay_value== "ChromHMM":
                                            if cell_value not in cell_assay_values_dict_ChromHMM.keys():
                                                cell_assay_values_dict_ChromHMM[cell_value] = []
    
                                            cell_assay_values_dict_ChromHMM[cell_value].append(state_value)
                                        elif assay_value== "cCRE": 
                                            if cell_value not in cell_assay_values_dict_cCRE.keys():
                                                cell_assay_values_dict_cCRE[cell_value] = []
                                            cell_assay_values_dict_cCRE[cell_value].append(state_value)
                                        elif assay_value== "IndexDHS":
                                            if cell_value not in cell_assay_values_dict_IndexDHS.keys():
                                                cell_assay_values_dict_IndexDHS[cell_value] = []
                                            cell_assay_values_dict_IndexDHS[cell_value].append(state_value)
                                        elif assay_value== "RegElem":
                                            if cell_value not in cell_assay_values_dict_RegElem.keys():
                                                cell_assay_values_dict_RegElem[cell_value] = []
                                            cell_assay_values_dict_RegElem[cell_value].append(state_value)
                                        elif assay_value== "DNase-seq":
                                            if cell_value not in cell_assay_values_dict_DNaseq.keys():
                                                cell_assay_values_dict_DNaseq[cell_value] = []
                                            # DNase-seq states are numeric signal values
                                            cell_assay_values_dict_DNaseq[cell_value].append(float(state_value))
                                        else:
    
                                            # unrecognized assay: pass the raw entry through
                                            elem_list.append(elem.rstrip("\n"))                               
                                    # majority state per cell for categorical assays
                                    for cell in cell_assay_values_dict_ChromHMM:
                                        #print(cell)
                                        #print(cell+"#ChromHMM#"+Counter(cell_assay_values_dict_ChromHMM[cell]).most_common(1)[0][0])
                                        elem_list.append(cell+"#ChromHMM#"+Counter(cell_assay_values_dict_ChromHMM[cell]).most_common(1)[0][0])
                                    for cell in cell_assay_values_dict_cCRE.keys():
                                            #print(cell+"#cCRE#"+Counter(cell_assay_values_dict_cCRE[cell_value]).most_common(1)[0][0])
                                        elem_list.append(cell+"#cCRE#"+Counter(cell_assay_values_dict_cCRE[cell]).most_common(1)[0][0])
    
                                    for cell in cell_assay_values_dict_IndexDHS.keys():
                                            #print(cell_assay_values_dict_IndexDHS[cell])
                                        elem_list.append(cell+"#IndexDHS#"+Counter(cell_assay_values_dict_IndexDHS[cell]).most_common(1)[0][0])
                                    for cell in cell_assay_values_dict_RegElem.keys():
                                            #print(cell_assay_values_dict_IndexDHS[cell])
                                        elem_list.append(cell+"#RegElem#"+Counter(cell_assay_values_dict_RegElem[cell]).most_common(1)[0][0])
                                    # maximum signal per cell for DNase-seq
                                    for cell in cell_assay_values_dict_DNaseq.keys():
                                            #print(cell_assay_values_dict_IndexDHS[cell])
                                        elem_list.append(cell+"#DNase-seq#"+str(max(cell_assay_values_dict_DNaseq[cell])))
                    
                                    # motif columns + the summarized track list
                                    outfile.write('\t'.join(sline[0:7])+'\t'+','.join(elem_list)+'\n')
                    
                            line = infile.readline()
                # drop the intermediate sorted/tmp files
                os.remove(motif_sites_input_file_sorted)
                os.remove(chromatin_tracks_input_file_sorted)
                os.remove(motifs_overlapping_tracks_file_tmp)
            
            
        cleanup()   
    return motifs_overlapping_tracks_file