def exclusion_regions(blacklist_file, chip_seq_data):
    """
    Takes as input a bound BED file (from multiGPS).
    The assumption is that the BED file reports the peak center,
    for example: chr2   45  46
    It converts these peak centers into 501 base pair windows and adds them
    to the exclusion list that will be used when constructing negative sets.
    It also adds the mm10 blacklisted windows to the exclusion list.

    Parameters:
        blacklist_file (str): Path to the blacklist file.
        chip_seq_data (DataFrame): The pandas ChIP-seq data loaded by load_chipseq_data.
    Returns:
        bound_exclusion_windows (BedTool): A BedTool object containing only
            the exclusion windows derived from binding sites.
        exclusion_windows (BedTool): A BedTool object containing all
            exclusion windows (bound and blacklisted).

    """
    temp_chip_file = chip_seq_data.copy()  # Copy so the original DataFrame is not modified.
    temp_chip_file['start'] = temp_chip_file['start'] - 250
    temp_chip_file['end'] = temp_chip_file['end'] + 250

    bound_exclusion_windows = BedTool.from_dataframe(
        temp_chip_file[['chr', 'start', 'end']])
    blacklist_exclusion_windows = BedTool(blacklist_file)
    exclusion_windows = blacklist_exclusion_windows.cat(bound_exclusion_windows)
    return bound_exclusion_windows, exclusion_windows
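
A minimal usage sketch (toy coordinates; the blacklist path is hypothetical, and load_chipseq_data is assumed to yield a DataFrame with 'chr', 'start' and 'end' columns):

import pandas as pd
from pybedtools import BedTool

# Hypothetical peak centers in the multiGPS format described in the docstring.
chip_seq_data = pd.DataFrame({'chr': ['chr2', 'chr3'],
                              'start': [45, 1000],
                              'end': [46, 1001]})
# 'mm10.blacklist.bed' is a placeholder path to an ENCODE-style blacklist.
bound_windows, all_exclusion_windows = exclusion_regions('mm10.blacklist.bed',
                                                         chip_seq_data)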
def generate_remainder(whole_bed, bed_prefix, bed_list):
    """
    Calculate the remaining regions that are not included in the truth set
    :param whole_bed: path to the truth regions for the whole panel
    :param bed_prefix: prefix used for all the bed files
    :param bed_list: list of all the bed files for that panel
    :return: BedTool containing any regions that are completely missing from the truth regions
    """

    whole_truth = BedTool(whole_bed)
    whole_truth.saveas()
    whole = BedTool()

    for bed in bed_list:
        print(bed)
        tool = BedTool(bed)
        tool.saveas()
        if bed == bed_list[0]:
            whole = tool
        else:
            whole = whole.cat(tool)
            whole.saveas()

    whole_sorted = whole.sort()
    whole_merged = whole_sorted.merge()
    whole_merged.saveas()

    remainder = whole_merged.subtract(whole_truth)
    remainder.moveto('/results/Analysis/MiSeq/MasterBED/GIAB/' + bed_prefix + '.remainder.bed')
    missing_regions = whole_merged.subtract(whole_truth, A=True)
    return missing_regions
Example #3
def bed_merge(list_of_beds, merged_bed):
    """
    Merge any number of BED files (merges overlapping regions).
    """
    bed = BedTool(list_of_beds[0])
    if list_of_beds[1:]:
        bed = bed.cat(*list_of_beds[1:])
    bed.saveas(merged_bed)
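
A hedged usage sketch (file names hypothetical). Note that cat() defaults to postmerge=True, so overlapping intervals are merged as part of the concatenation:

bed_merge(['rep1_peaks.bed', 'rep2_peaks.bed'], 'all_peaks.merged.bed')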
Example #4
def negative_shuffle_wrapper(args, include_bed, num_copies, noOverlapping):
    positive_windows, nonnegative_regions_bed, bigwig_files, randomseed = args
    if num_copies > 1:
        positive_windows = BedTool.cat(*(num_copies * [positive_windows]), postmerge=False)
    negative_windows = positive_windows.shuffle(g=genome_sizes_file,
                                                incl=include_bed.fn,
                                                excl=nonnegative_regions_bed.fn,
                                                noOverlapping=noOverlapping,
                                                seed=randomseed)
    return negative_windows
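
A sketch of how this wrapper might be invoked; it packs its first four inputs into one tuple so it can be mapped over a worker pool, and it reads genome_sizes_file from module scope (all names below are hypothetical):

from pybedtools import BedTool

genome_sizes_file = 'hg19.chrom.sizes'              # assumed module-level global
positive_windows = BedTool('positive_windows.bed')  # hypothetical
include_bed = BedTool('train_regions.bed')          # hypothetical
nonnegative_regions = BedTool('nonnegative.bed')    # hypothetical

args = (positive_windows, nonnegative_regions, ['dnase.bw'], 42)
negatives = negative_shuffle_wrapper(args, include_bed, num_copies=2,
                                     noOverlapping=False)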
Example #5
def make_blacklist():
    blacklist = BedTool(blacklist_file)
    blacklist = blacklist.slop(g=genome_sizes_file, b=L)
    # Add ends of the chromosomes to the blacklist
    genome_sizes_info = np.loadtxt(genome_sizes_file, dtype=str)
    chroms = list(genome_sizes_info[:,0])
    chroms_sizes = list(genome_sizes_info[:,1].astype(int))
    blacklist2 = []
    for chrom, size in zip(chroms, chroms_sizes):
        blacklist2.append(Interval(chrom, 0, L))
        blacklist2.append(Interval(chrom, size - L, size))
    blacklist2 = BedTool(blacklist2)
    blacklist = blacklist.cat(blacklist2)
    return blacklist
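
make_blacklist() relies on three module-level globals; a minimal sketch of the assumed setup (the paths and the padding constant L are hypothetical):

import numpy as np
from pybedtools import BedTool, Interval

blacklist_file = 'mm10.blacklist.bed'   # hypothetical blacklist path
genome_sizes_file = 'mm10.chrom.sizes'  # hypothetical chrom.sizes file
L = 1000                                # assumed padding/window size in bp

padded_blacklist = make_blacklist()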
Example #6
def get_chip_beds(input_dir):
    chip_info_file = input_dir + '/chip.txt'
    chip_info = np.loadtxt(chip_info_file, dtype=str)
    if len(chip_info.shape) == 1:
        chip_info = np.reshape(chip_info, (-1, len(chip_info)))
    tfs = list(chip_info[:, 1])
    chip_bed_files = [input_dir + '/' + i for i in chip_info[:, 0]]
    chip_beds = [BedTool(chip_bed_file) for chip_bed_file in chip_bed_files]
    print('Sorting BED files')
    chip_beds = [chip_bed.sort() for chip_bed in chip_beds]
    if len(chip_beds) > 1:
        merged_chip_bed = BedTool.cat(*chip_beds)
    else:
        merged_chip_bed = chip_beds[0]
    return tfs, chip_beds, merged_chip_bed
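
get_chip_beds expects input_dir to contain a two-column chip.txt listing a BED file name and a TF name per line; a hypothetical layout and call:

# data/chipseq_run/chip.txt (whitespace-separated, hypothetical):
#   ctcf_peaks.bed    CTCF
#   rad21_peaks.bed   RAD21
tfs, chip_beds, merged_chip_bed = get_chip_beds('data/chipseq_run')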
Example #7
class GenomicSubset(object):
    def __init__(self, name, path=paths.genome_subsets, assembly='hg19'):
        self.assembly = assembly
        self.name = name
        self.bedtool = BedTool(path + name + '.bed').sort()

        # Intersect the pathway with the appropriate genome build
        # TODO: this step should be unnecessary if the pathways are correct
        if name != self.assembly:
            self.bedtool = GenomicSubset.reference_genome(
                    self.assembly).bedtool.intersect(self.bedtool).sort().saveas()

    def expand_by(self, expansion_in_each_direction_Mb):
        window_size_str = str(expansion_in_each_direction_Mb) + 'Mb'
        print('total size before window addition:', self.bedtool.total_coverage(), 'bp')

        # compute the flanks
        # TODO: use 1cM instead of 1Mb
        print('computing flanks')
        flanks = self.bedtool.flank(
            genome=self.assembly,
            b=expansion_in_each_direction_Mb*1000000).sort().merge().saveas()

        # compute the union of the flanks and the pathway
        print('computing union')
        union = self.bedtool.cat(flanks, postmerge=False).sort()
        merged = union.merge().saveas()
        print('total size after window addition:', merged.total_coverage(), 'bp')
        self.bedtool = merged

    def restricted_to_chrom_bedtool(self, chrnum):
        return self.bedtool.filter(
                lambda x: x[0] == 'chr' + str(int(chrnum))).saveas()

    @classmethod
    def reference_genome(cls, assembly='hg19'):
        return GenomicSubset(assembly, path=paths.reference, assembly=assembly)

    @classmethod
    def reference_chrom_bedtool(cls, chrnum, assembly='hg19'):
        return cls.reference_genome(assembly=assembly).restricted_to_chrom_bedtool(chrnum)

    @classmethod
    def whole_genome(cls, assembly='hg19'):
        return cls(assembly, path=paths.reference)
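
A hedged sketch of how the class might be used; paths.genome_subsets, paths.reference and the 'HDL' subset name are hypothetical:

subset = GenomicSubset('HDL')   # loads paths.genome_subsets + 'HDL.bed'
subset.expand_by(1)             # add 1 Mb flanks on each side, then re-merge
chr1_only = subset.restricted_to_chrom_bedtool(1)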
Example #8
def get_chip_beds_multiple(input_dir):
    chip_info_file = input_dir + '/chip.txt'
    chip_info = np.loadtxt(chip_info_file, dtype=str)
    if len(chip_info.shape) == 1:
        chip_info = np.reshape(chip_info, (-1, len(chip_info)))
    tfs = list(chip_info[:, 1])
    chip_bed_files = [input_dir + '/' + i for i in chip_info[:, 0]]
    chip_beds = [BedTool(chip_bed_file) for chip_bed_file in chip_bed_files]
    print('Sorting BED files')
    chip_beds = [chip_bed.sort() for chip_bed in chip_beds]
    merged_chip_bed_list = []
    for item in chip_beds:
        # Each item is already a single sorted BedTool, so there is nothing to merge.
        merged_chip_bed_list.append(item)
    return tfs, chip_beds, merged_chip_bed_list
Example #9
def get_chip_beds_multiple(input_dir,process_batch):
    chip_info_file = input_dir + '/chip.txt'
    chip_info = np.loadtxt(chip_info_file, dtype=str)
    if len(chip_info.shape) == 1:
        chip_info = np.reshape(chip_info, (-1,len(chip_info)))
    tfs = list(chip_info[:, 1])
    chip_bed_files = [input_dir + '/' + i for i in chip_info[:,0]]
    chip_beds = [BedTool(chip_bed_file) for chip_bed_file in chip_bed_files]
    if process_batch:
        batch_name_list = list(np.unique(chip_info[:,1]))
        batch_list_all_dict = {}
        exchange_dict = {}
        for index,item in enumerate(batch_name_list):
            batch_tmp = [chip_beds[i] for i in list(np.where(chip_info[:,1]==item)[0])]
            batch_0 = batch_tmp[0]
            batch_tmp = batch_tmp[1:]
            print('Concatenating batch BED files for batch %d...' % index)
            batch_list = batch_0.cat(*batch_tmp,postmerge=False)
            if item not in batch_list_all_dict:
                batch_list_all_dict[item] = batch_list
                batch_name_list_tmp = copy.deepcopy(batch_name_list)
                batch_name_list_tmp.remove(item)
                if len(batch_name_list_tmp) > 1:
                    exchange_dict[item] = batch_name_list_tmp
                else:
                    exchange_dict[item] = batch_name_list_tmp[0]
            else:
                print "Error!!!"
    else:
        print('No batch processing needed, continuing...')
    print('Sorting BED files')
    chip_beds = [chip_bed.sort() for chip_bed in chip_beds]
    merged_chip_bed_list = []
    for item in chip_beds:
        # Each item is already a single sorted BedTool, so there is nothing to merge.
        merged_chip_bed_list.append(item)
    if process_batch:
        return tfs, chip_beds, merged_chip_bed_list, batch_list_all_dict, chip_info[:, 1], exchange_dict
    else:
        return tfs, chip_beds, merged_chip_bed_list
Example #10
def merge_bed(beds=None):
    '''Concatenates, sorts, and merges (bedtools) a list of bed files. Outputs 
        into the tempdir directory created by TFEA

    Parameters
    ----------
    beds : list or array
        full paths to bed files (python Path objects from pathlib)

    Returns
    -------
    merged_bed : BedTool object 
        resulting merged bed object 
    '''
    parent_bed = BedTool(str(beds[0]))
    for bed in beds[1:]:
        parent_bed = parent_bed.cat(str(bed))
    merged_bed = parent_bed.sort().merge().sort()

    return merged_bed
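
A hedged usage sketch (the input paths are hypothetical; TFEA's tempdir handling is not shown here):

from pathlib import Path

merged = merge_bed(beds=[Path('cond1_peaks.bed'), Path('cond2_peaks.bed')])
print(merged.count(), 'merged intervals')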
def generate_remainder(whole_bed, out_dir, bed_list):
    """
    Calculate the remaining regions that are not included in the truth set

    :param whole_bed: Path to the truth regions for the whole panel
    :type whole_bed: String
    :param out_dir: Output directory for the remainder BED file
    :type out_dir: String
    :param bed_list: List of all the bed files for that panel
    :type bed_list: List of String
    :return: BedTool containing any regions that are completely missing from the truth regions
    :rtype: BedTool
    """
    try:
        whole_truth = BedTool(whole_bed)
        whole_truth.saveas()
        whole = BedTool()

        for bed in bed_list:
            print(bed)
            tool = BedTool(bed)
            tool.saveas()
            if bed == bed_list[0]:
                whole = tool
            else:
                whole = whole.cat(tool)
                whole.saveas()

        whole_sorted = whole.sort()
        whole_merged = whole_sorted.merge()
        whole_merged.saveas()

        remainder = whole_merged.subtract(whole_truth)
        remainder.moveto(out_dir + '/remainder.bed')
        missing_regions = whole_merged.subtract(whole_truth, A=True)
    except UnicodeDecodeError:
        missing_regions = None
    return missing_regions
Example #12
def read_bed_regions(bed_files, chroms):
    """ Creates a merge region overall provided bed-files.

    Args:
        bed_files (list): Of bed-file paths (str)
           chroms (dict): With parents/chromosome id as key
                          and size (bp) as value.

    Returns:
        dict: A dictionary with chromosome/parent id as key
              and list of Region(start, end) (named tuple)
              object as value.
    """
    bed_file = BedTool(bed_files[0])
    if len(bed_files) > 1:
        bed_file = bed_file.cat(*bed_files[1:], postmerge=False)
    bed_file = bed_file.sort().merge()

    regions = defaultdict(list)
    for region in bed_file:
        if region.chrom in chroms:
            regions[region.chrom].append(Region(region.start, region.end))
    return regions
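
Region is not defined in this snippet; a minimal sketch assuming it is a simple (start, end) named tuple, with hypothetical inputs:

from collections import defaultdict, namedtuple
from pybedtools import BedTool

Region = namedtuple('Region', ['start', 'end'])  # assumed definition

chroms = {'chr1': 248956422}  # hypothetical: chromosome id -> size in bp
regions = read_bed_regions(['sample_a.bed', 'sample_b.bed'], chroms)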
df['sum'] = df['sum'].astype(float).fillna(0.0)

# filter by sum of E-B in the interval
df2 = df.loc[df['sum'] > 0, :]

# ---- Second: filter by the number of non-positive scores (keep rows with at most 2)
tmp = df['score'].str.split(',').tolist()
c = [0] * len(tmp)
for i, ilist in enumerate(tmp):
    try:
        for j in ilist:
            if float(j) <= 0:
                c[i] += 1
    except (TypeError, ValueError):
        c[i] = 10  # unparsable score lists get a sentinel count that fails the filter
df['lt0_count'] = c
df3 = df.loc[df['lt0_count'] <= 2, :]

# ---- Third: union of the two selections
x = list(set(df2.index) | set(df3.index))
df_final = df.loc[x, :]

bw_final = BedTool.from_dataframe(df_final.iloc[:, :3])

# merge the original seeds and the intervals with at most 2 non-positive scores
peaks = BedTool.cat(seed, bw_final).sort(g=hg19).merge(d=0)
peaks.saveas(out)
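
This fragment depends on names defined earlier in its script; a hedged sketch of the assumed context (all values hypothetical):

import pandas as pd
from pybedtools import BedTool

# df: first three columns are chrom/start/end, plus 'sum' and 'score' columns.
df = pd.read_csv('intervals_with_scores.tsv', sep='\t')
seed = BedTool('seeds.bed')     # original seed intervals
hg19 = 'hg19.chrom.sizes'       # genome file passed to sort(g=...)
out = 'final_peaks.bed'         # output path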



Example #14
def get_tf_predictive_setup(true_feature_bedtools, region_bedtool=None,
                            ambiguous_feature_bedtools=None,
                            bin_size=200, flank_size=400, stride=50,
                            n_jobs=1, genome='hg19',
                            min_bin_distance_to_chrom_edge=5000,
                            filter_flank_overlaps=False):
    """
    Implements the tf (and general) imputation data setup for a single sample.
    TODOs
        support chrom.sizes file for personal genomes

    Parameters
    ----------
    true_feature_bedtools : list of filenames, BedTools or None items
        None items are treated as missing data.
    region_bedtool : filename or BedTool, optional
        If not set, the union of true_feature_bedtools is used.
    filter_flank_overlaps : bool, default: False
        Labels negative bins whose flanks overlap target regions as ambiguous.
    ambiguous_feature_bedtools : list of filenames, BedTools or None items, optional
    genome : str, default: 'hg19'
        Can be any genome name supported by pybedtools.
    """
    # initialize feature bedtools
    true_feature_bedtools = [BedTool(bedtool) if bedtool is not None else None
                             for bedtool in true_feature_bedtools]
    # sanity checks
    if ambiguous_feature_bedtools is not None:
        assert len(ambiguous_feature_bedtools) == len(true_feature_bedtools)
        ambiguous_feature_bedtools = [BedTool(bedtool) if bedtool is not None else None
                                      for bedtool in ambiguous_feature_bedtools]
    # merge and bin region_bedtools
    if region_bedtool is not None:
        region_bedtool = BedTool(region_bedtool).sort().merge()
        bins = bin_bed(region_bedtool, bin_size=bin_size, stride=stride)
    else:  # use union of true peak bedtools
        bedtools_to_merge = [
            bedtool for bedtool in true_feature_bedtools if bedtool is not None]
        region_bedtool = BedTool.cat(
            *bedtools_to_merge, postmerge=True, force_truncate=True)
        bins = bin_bed(region_bedtool, bin_size=bin_size, stride=stride)
    # throw out bins within 5kb of chromosome edge
    genome_chrom_sizes = getattr(genome_registry, genome)
    bins = bins.each(filter_by_chrom_sizes, genome_chrom_sizes,
                     min_bin_distance_to_chrom_edge)
    # filter bins to chr1-22,X,Y
    chrom_list = ["chr%i" % (i) for i in range(1, 23)]
    chrom_list += ["chrX", "chrY"]
    bins = BedTool(bins).each(filter_interval_by_chrom, chrom_list)
    bins = bins.saveas()  # save to temp file to enable counting
    num_bins = bins.count()
    # set chromosome sizes from the requested genome assembly
    bins = bins.set_chromsizes(genome)
    # intersect bins and tf_true_peaks for true labels
    if n_jobs == 1:
        true_labels_list = []
        for true_feature_bedtool in true_feature_bedtools:
            true_labels = bed_intersection_labels(bins, true_feature_bedtool)
            true_labels_list.append(true_labels)
    elif n_jobs > 1:  # multiprocess bed intersections
        # save feature bedtools in temp files. Note: not necessary when inputs
        # are filenames
        true_feature_fnames = [
            bedtool.fn if bedtool is not None else None for bedtool in true_feature_bedtools]
        true_labels_list = Parallel(n_jobs=n_jobs)(delayed(bed_intersection_labels)(bins.fn, fname)
                                                   for fname in true_feature_fnames)
    true_labels = np.concatenate(true_labels_list, axis=1)
    bins_and_flanks = bins.slop(b=flank_size)
    if filter_flank_overlaps:
        # intersect bins and flanks for any overlap with true features
        if n_jobs == 1:
            flank_labels_list = []
            for true_feature_bedtool in true_feature_bedtools:
                flank_labels = bed_intersection_labels(
                    bins, true_feature_bedtool, f=10**-9, F=10**-9)
                flank_labels_list.append(flank_labels)
        elif n_jobs > 1:
            flank_labels_list = Parallel(n_jobs=n_jobs)(delayed(bed_intersection_labels)(bins.fn, bedtool.fn, f=10**-9, F=10**-9)
                                                        for bedtool in true_feature_bedtools)
        flank_labels = np.concatenate(flank_labels_list, axis=1)
        # we label negative bins with any flank overlap with true features as
        # ambiguous
        true_labels[(true_labels == 0) * (flank_labels == 1)] = AMBIG_LABEL
    if ambiguous_feature_bedtools is not None:
        # intersect bins and ambiguous tfs for ambiguous labels
        if n_jobs == 1:
            ambiguous_labels_list = []
            for ambiguous_feature_bedtool in ambiguous_feature_bedtools:
                ambiguous_labels = bed_intersection_labels(
                    bins, ambiguous_feature_bedtool)
                ambiguous_labels_list.append(ambiguous_labels)
        elif n_jobs > 1:
            ambiguous_feature_fnames = [
                bedtool.fn if bedtool is not None else None for bedtool in ambiguous_feature_bedtools]
            ambiguous_labels_list = Parallel(n_jobs=n_jobs)(delayed(bed_intersection_labels)(bins.fn, fname)
                                                            for fname in ambiguous_feature_fnames)
        ambiguous_labels = np.concatenate(ambiguous_labels_list, axis=1)
        # we label negative bins that overlap ambiguous feature as ambiguous
        true_labels[(true_labels == 0) * (ambiguous_labels == 1)] = AMBIG_LABEL
        # TODO: do we want to also filter based on any flank overlap with
        # ambiguous features??

    return bins_and_flanks, true_labels
Example #15
def merge(regulators):
    """Merge a list of regulators using BedTool.cat"""
    if len(regulators) > 1:
        return BedTool.cat(*regulators, postmerge=False)
    else:
        return regulators[0]
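
For example (hypothetical files); because postmerge=False, the result is a plain concatenation with every interval kept, not a merged set:

from pybedtools import BedTool

regulators = [BedTool('ctcf_peaks.bed'), BedTool('rad21_peaks.bed')]
combined = merge(regulators)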
###
#   name      | mary lauren benton
#   conda_env | enh_gain-loss
#   created   | 2019.04.04
#
#   this script will process the files downloaded in accessions.txt and merge
#   chip-seq from the same tissues
#   sorted and merged files saved to ./ctcf/encode_tissues/
###

from collections import defaultdict
from pybedtools import BedTool

name_dict = defaultdict(list)

with open('accessions_mapped_filenames.txt') as infile:
    next(infile)
    for line in infile:
        data = line.strip().split('\t')
        tissue_type = data[1].replace(' ', '_').replace('\'', '').lower()
        name_dict[tissue_type].append(data[0])

for tissue in name_dict:
    for idx, bed in enumerate(name_dict[tissue]):
        if idx == 0:
            a = BedTool(bed + '.bed')
        else:
            a = a.cat(BedTool(bed + '.bed'), postmerge=True)
    a.sort().merge().saveas(tissue + '.bed')
Example #17
def make_features_multiTask(positive_windows, y_positive, nonnegative_regions_bed, 
                            bigwig_files, bigwig_names, genome, epochs, valid_chroms, test_chroms):
    chroms, chroms_sizes, genome_bed = get_genome_bed()
    train_chroms = chroms
    for chrom in valid_chroms + test_chroms:
        train_chroms.remove(chrom)
    genome_bed_train, genome_bed_valid, genome_bed_test = \
        [subset_chroms(chroms_set, genome_bed) for chroms_set in
         (train_chroms, valid_chroms, test_chroms)]

    positive_windows_train = []
    positive_windows_valid = []
    positive_windows_test = []
    positive_data_train = []
    positive_data_valid = []
    positive_data_test = []
    
    print('Splitting positive windows into training, validation, and testing sets')
    for positive_window, target_array in zip(positive_windows, y_positive):
        chrom = positive_window.chrom
        start = int(positive_window.start)
        stop = int(positive_window.stop)
        if chrom in test_chroms:
            positive_windows_test.append(positive_window)
            positive_data_test.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
        elif chrom in valid_chroms:
            positive_windows_valid.append(positive_window)
            positive_data_valid.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
        else:
            positive_windows_train.append(positive_window)
            positive_data_train.append((chrom, start, stop, shift_size, bigwig_files, [], target_array))
    
    positive_windows_train = BedTool(positive_windows_train)
    positive_windows_valid = BedTool(positive_windows_valid)
    positive_windows_test = BedTool(positive_windows_test)

    print('Getting negative training examples')
    negative_windows_train = BedTool.cat(*(epochs * [positive_windows]), postmerge=False)
    negative_windows_train = negative_windows_train.shuffle(g=genome_sizes_file,
                                                            incl=genome_bed_train.fn,
                                                            excl=nonnegative_regions_bed.fn,
                                                            noOverlapping=False,
                                                            seed=np.random.randint(-2147483648, 2147483647))
    print('Getting negative validation examples')
    negative_windows_valid = positive_windows_valid.shuffle(g=genome_sizes_file,
                                                            incl=genome_bed_valid.fn,
                                                            excl=nonnegative_regions_bed.fn,
                                                            noOverlapping=False,
                                                            seed=np.random.randint(-2147483648, 2147483647))
    print('Getting negative testing examples')
    negative_windows_test = positive_windows_test.shuffle(g=genome_sizes_file,
                                                          incl=genome_bed_test.fn,
                                                          excl=nonnegative_regions_bed.fn,
                                                          noOverlapping=False,
                                                          seed=np.random.randint(-2147483648, 2147483647))

    # Train
    print('Extracting data from negative training BEDs')
    negative_targets = np.zeros(y_positive.shape[1])
    negative_data_train = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_train]

    # Validation
    print('Extracting data from negative validation BEDs')
    negative_data_valid = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_valid]
    
    # Test
    print('Extracting data from negative testing BEDs')
    negative_data_test = [(window.chrom, window.start, window.stop, shift_size, bigwig_files, [], negative_targets)
                           for window in negative_windows_test]

    num_positive_train_windows = len(positive_data_train)
    
    data_valid = negative_data_valid + positive_data_valid
    data_test = negative_data_test + positive_data_test

    print('Shuffling training data')
    data_train = []
    for i in range(epochs):
        epoch_data = []
        epoch_data.extend(positive_data_train)
        epoch_data.extend(negative_data_train[i*num_positive_train_windows:(i+1)*num_positive_train_windows])
        np.random.shuffle(epoch_data)
        data_train.extend(epoch_data)

    print('Generating data iterators')
    bigwig_rc_order = get_bigwig_rc_order(bigwig_names)
    datagen_train = DataIterator(data_train, genome, batch_size, L, bigwig_rc_order)
    datagen_valid = DataIterator(data_valid, genome, batch_size, L, bigwig_rc_order)
    datagen_test = DataIterator(data_test, genome, batch_size, L, bigwig_rc_order)

    print(len(datagen_train), 'training samples')
    print(len(datagen_valid), 'validation samples')
    print(len(datagen_test), 'test samples')
    return datagen_train, datagen_valid, datagen_test, data_test
Example #18
def nonnegative_wrapper(a, bl_file):
    bl = BedTool(bl_file)
    a_slop = a.slop(g=genome_sizes_file, b=genome_window_size)
    return bl.cat(a_slop).fn
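
The wrapper returns the .fn temp-file path of the concatenated BedTool, which is convenient to pass as excl= to shuffle(). A sketch with hypothetical globals and paths:

from pybedtools import BedTool

genome_sizes_file = 'hg19.chrom.sizes'  # assumed module-level global
genome_window_size = 200                # assumed module-level global

peaks = BedTool('peaks.bed')            # hypothetical
excl_fn = nonnegative_wrapper(peaks, 'hg19.blacklist.bed')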
Example #19
def main(argv):

    # record start time
    # =================
    start = time.time()

    # parse arguments
    # ===============
    usage = "compare_bed.py -f <bed file 1> -F <bed file 2> -c <index chr1> -C <index chr2> -s <index start 1> -S <index start 2> -e <index end 1> -E <index end 2> -v <index value1> -V <index value2> -o <output path> -n <name of first> -N <name of second>"
    parser = OptionParser(usage=usage)
    parser.add_option("-f",
                      "--file1",
                      type="string",
                      metavar="<file>",
                      dest="f1",
                      help="one bed file")
    parser.add_option("-F",
                      "--File2",
                      type="string",
                      dest="f2",
                      help="one bed file")
    parser.add_option("-c",
                      "--chr",
                      type="int",
                      dest="chr1",
                      help="index of chromosome for file1",
                      default=0)
    parser.add_option("-C",
                      "--CHR",
                      type="int",
                      dest="chr2",
                      help="index of chromosome",
                      default=0)
    parser.add_option("-s",
                      "--start",
                      type="int",
                      dest="st1",
                      help="index of start for file1",
                      default=1)
    parser.add_option("-S",
                      "--START",
                      type="int",
                      dest="st2",
                      help="index of start for file2",
                      default=1)
    parser.add_option("-e",
                      "--end",
                      type="int",
                      dest="sp1",
                      help="index of stop for file1",
                      default=2)
    parser.add_option("-E",
                      "--END",
                      type="int",
                      dest="sp2",
                      help="index of stop for file2",
                      default=2)
    parser.add_option("-v",
                      "--value",
                      type="int",
                      dest="sc1",
                      help="index of score for file1",
                      default=4)
    parser.add_option("-V",
                      "--VALUE",
                      type="int",
                      dest="sc2",
                      help="index of score for file2",
                      default=4)
    parser.add_option("-o",
                      "--output",
                      type="string",
                      dest="output",
                      help="prefixe of output")
    parser.add_option("-n",
                      "--name1",
                      type="string",
                      dest="name_d1",
                      help="name of d1")
    parser.add_option("-N",
                      "--NAME2",
                      type="string",
                      dest="name_d2",
                      help="name of d2")
    (opt, args) = parser.parse_args(argv)
    # check
    if len(argv) < 2:
        print(HELP)
        parser.print_help()
        sys.exit(1)

    # create log file
    saveout = sys.stdout
    fsock = open(
        '%s_%s.vs.%s_compare_bed.log' % (opt.output, opt.name_d1, opt.name_d2),
        'w')
    sys.stdout = sys.stderr = fsock

    # tracability
    print "[LOG] command: ", " ".join(argv)

    # load data
    d1 = BedTool(opt.f1)
    d2 = BedTool(opt.f2)

    # format
    d1 = d1.sort()
    d2 = d2.sort()

    # record sorted bed
    f1_sorted = (opt.f1).replace(".bed", ".sorted.bed")
    d1.saveas(f1_sorted)
    f2_sorted = (opt.f2).replace(".bed", ".sorted.bed")
    d2.saveas(f2_sorted)

    # intersect d1 and d2, i.e. if a d1 interval intersects two different regions in d2 the line is duplicated
    d_intersect = d1.intersect(d2, wo=True, sorted=True)
    d_intersect.saveas(opt.output + "_intersectbed.tsv")
    # which d1 are not overlapping with d2
    d1_specifique = d1.intersect(d2, v=True, sorted=True)
    d1_specifique.saveas(opt.output + "_specific_%s.tsv" % (opt.name_d1))
    # which d2 are not overlapping with d1
    d2_specifique = d2.intersect(d1, v=True, sorted=True)
    d2_specifique.saveas(opt.output + "_specific_%s.tsv" % (opt.name_d2))
    # which d1 intervals intersect d2 and how many times (last column)
    d1_intersect = (d1.intersect(
        d2, c=True,
        sorted=True)).filter(lambda x: int(x.fields[-1]) > 0).sort()
    d1_intersect.saveas(opt.output + "_intersect_%s.tsv" % (opt.name_d1))
    # which d2 intervals intersect d1 and how many times (last column)
    d2_intersect = (d2.intersect(
        d1, c=True,
        sorted=True)).filter(lambda x: int(x.fields[-1]) > 0).sort()
    d2_intersect.saveas(opt.output + "_intersect_%s.tsv" % (opt.name_d2))
    # merge d1 and d2
    try:  # requires a recent bedtools
        d_union = d1.cat(d2, c=4, delim="|", o="collapse")
    except Exception:
        print("[ERROR] bedtools merge must be v2.25.0 or more recent; some options are missing in this bedtools merge")
        print("[LOG] file _mergebed.tsv generated without value column")
        d_union = d1.cat(d2)
    d_union.saveas(opt.output + "_mergebed.tsv")

    # Jaccard
    print("[LOG] jaccard test between bed files:")
    jaccard_res = subprocess.getstatusoutput("bedtools jaccard -a %s -b %s " %
                                             (f1_sorted, f2_sorted))
    if jaccard_res[0] == 0 and len(jaccard_res) > 1:
        jaccard_res = [line.split("\t") for line in jaccard_res[1].split("\n")]
        print(pandas.DataFrame(jaccard_res))
    else:
        print("[ERROR] Jaccard couldn't be computed")
        print(jaccard_res)
    # represent score by area in venn
    score1_spe = [float(I.fields[opt.sc1]) for I in d1_specifique]
    score2_spe = [float(I.fields[opt.sc2]) for I in d2_specifique]
    score1_int = [float(I.fields[opt.sc1]) for I in d1_intersect]
    score2_int = [float(I.fields[opt.sc2]) for I in d2_intersect]
    data = [score1_spe, score1_int, score2_int, score2_spe]
    from difflib import SequenceMatcher
    match = SequenceMatcher(None, opt.name_d1, opt.name_d2).find_longest_match(
        0, len(opt.name_d1), 0, len(opt.name_d2))
    longeststring = opt.name_d1[match.a:match.a + match.size]
    xname = [
        (opt.name_d1 + "_spe").replace(longeststring, ""),
        (opt.name_d1 + "_int").replace(longeststring, ""),
        (opt.name_d2 + "_int").replace(longeststring, ""),
        (opt.name_d2 + "_spe").replace(longeststring, "")
    ]
    color = ["grey", "lightblue", "lightblue", "blue"]
    if flag_graph:
        graph.boxplot(list_of_list=data,
                      name_out=opt.output + "_boxplotscore.png",
                      xlab="",
                      ylab="",
                      title="",
                      xname=xname,
                      color=color)
    # represent distribution of number bp overlapping
    nb_overlap = [int(I.fields[-1]) for I in d_intersect]
    if flag_graph:
        graph.hist(x=nb_overlap,
                   label_x="overlap %s vs %s" % (opt.name_d1, opt.name_d2),
                   name_out=opt.output + "_nbOverlap.png",
                   cum=False,
                   xline=None,
                   yline=None,
                   limit_x=[],
                   limit_y=[],
                   bins=50)
        # represent venn
        graph.my_venn2(a_specific=len(d1_specifique),
                       b_specific=len(d2_specifique),
                       nb_intersect=len(d1_intersect),
                       a_label=opt.name_d1,
                       b_label=opt.name_d2,
                       main="total %s:%i, total %s:%i" %
                       (opt.name_d1, len(d1), opt.name_d2, len(d2)),
                       name_out=opt.output + "_venn_%s.png" % (opt.name_d1))
        graph.my_venn2(a_specific=len(d1_specifique),
                       b_specific=len(d2_specifique),
                       nb_intersect=len(d2_intersect),
                       a_label=opt.name_d1,
                       b_label=opt.name_d2,
                       main="total %s:%i, total %s:%i" %
                       (opt.name_d1, len(d1), opt.name_d2, len(d2)),
                       name_out=opt.output + "_venn_%s.png" % (opt.name_d2))
        # representation pie
        graph.pie_fast(
            absolute_values=[len(d1) - len(d1_intersect),
                             len(d1_intersect)],
            labels=["specific", "intersect"],
            main="venn %s vs %s" % (opt.name_d1, opt.name_d2),
            explode=None,
            name_out=opt.output + "_pie_%s.png" % (opt.name_d1),
            colors=["green", "red"])
        graph.pie_fast(
            absolute_values=[len(d2) - len(d2_intersect),
                             len(d2_intersect)],
            labels=["specific", "intersect"],
            main="venn %s vs %s" % (opt.name_d2, opt.name_d1),
            explode=None,
            name_out=opt.output + "_pie_%s.png" % (opt.name_d2),
            colors=["green", "red"])
    # represent repartition of score for each
    score_d1 = [float(I.fields[opt.sc1]) for I in d1]
    if flag_graph:
        graph.hist(x=score_d1,
                   label_x=opt.name_d1,
                   name_out=opt.output + "_hist-allscore_%s.png" %
                   (opt.name_d1),
                   cum=False,
                   xline=None,
                   yline=None,
                   limit_x=[],
                   limit_y=[],
                   bins=50)
        score_d2 = [float(I.fields[opt.sc2]) for I in d2]
        graph.hist(x=score_d2,
                   label_x=opt.name_d2,
                   name_out=opt.output + "_hist-allscore_%s.png" %
                   (opt.name_d2),
                   cum=False,
                   xline=None,
                   yline=None,
                   limit_x=[],
                   limit_y=[],
                   bins=50)
    # represent size of region
    size_d1 = [int(I.fields[opt.sp1]) - int(I.fields[opt.st1]) for I in d1]
    size_d2 = [int(I.fields[opt.sp2]) - int(I.fields[opt.st2]) for I in d2]
    if flag_graph:
        graph.hist(x=size_d1,
                   label_x=opt.name_d1,
                   name_out=opt.output + "_hist-size_%s.png" % (opt.name_d1),
                   cum=False,
                   xline=None,
                   yline=None,
                   limit_x=[],
                   limit_y=[],
                   bins=50)
        graph.hist(x=size_d2,
                   label_x=opt.name_d2,
                   name_out=opt.output + "_hist-size_%s.png" % (opt.name_d2),
                   cum=False,
                   xline=None,
                   yline=None,
                   limit_x=[],
                   limit_y=[],
                   bins=50)
    ## represent specifique
    #score_d1_spe=[ float(I.fields[opt.sc1]) for I in d1_specifique ]
    #graph.hist(x=score_d1_spe, label_x=opt.name_d1,
    #    name_out=opt.output+"_d1-hist-spescored1.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50)
    #score_d2_spe=[ float(I.fields[opt.sc2]) for I in d2_specifique ]
    #graph.hist(x=score_d2_spe, label_x=opt.name_d2,
    #    name_out=opt.output+"_d2-hist-spescored2.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50)
    ## represent intersect
    #score_d1_int=[ float(I.fields[opt.sc1]) for I in d1_intersect ]
    #graph.hist(x=score_d1_int, label_x=opt.name_d1,
    #    name_out=opt.output+"_d1-hist-intscored1.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50)
    #score_d2_int=[ float(I.fields[opt.sc2]) for I in d2_intersect ]
    #graph.hist(x=score_d2_int, label_x=opt.name_d2,
    #    name_out=opt.output+"_d2-hist-intscored2.png", cum=False, xline=None, yline=None, limit_x=[], limit_y=[], bins=50)

    # extract information
    # - number of column in d1 to get first value of d2 in d_intersect
    nb_col1 = len(d1[0].fields)
    # - number of column in d2 to get overlap value in d_intersect
    nb_col2 = len(d2[0].fields)
    # - extract values from d_intersect
    dico_d1 = {}  # how many d2 intervals per d1 interval, and the d2 scores vs the d1 score
    dico_d2 = {}  # how many d1 intervals per d2 interval
    dico_d1andd2 = {}  # for each pair, score d1 and score d2
    # - scan intersection
    for I in d_intersect:
        I = I.fields
        # build key for dico
        try:
            k1 = "%s:%i-%i" % (I[opt.chr1], int(I[opt.st1]), int(I[opt.sp1]))
            k2 = "%s:%i-%i" % (I[opt.chr2 + nb_col1], int(
                I[opt.st2 + nb_col1]), int(I[opt.sp2 + nb_col1]))
        except Exception:
            print("[ERROR]", I)
            print(opt.chr1, opt.st1, opt.sp1, opt.chr2 + nb_col1,
                  opt.st2 + nb_col1, opt.sp2 + nb_col1)
            print(I[opt.chr1], I[opt.st1], I[opt.sp1],
                  I[opt.chr2 + nb_col1], I[opt.st2 + nb_col1],
                  I[opt.sp2 + nb_col1])
        else:
            k12 = k1 + "VS" + k2
            # extract for dico_d1
            if k1 not in dico_d1:
                dico_d1[k1] = {"nb_d2": 0, "score_d2": [], "score_d1": 0}
            else:
                print("[WARNING] duplicate key for d1:", k1)
            dico_d1[k1]["nb_d2"] = dico_d1[k1]["nb_d2"] + 1
            dico_d1[k1]["score_d2"].append(float(I[opt.sc2 + nb_col1]))
            dico_d1[k1]["score_d1"] = I[opt.sc1]
            # extract for dico_d2
            if k2 not in dico_d2:
                dico_d2[k2] = {"nb_d1": 0, "score_d1": [], "score_d2": 0}
            else:
                print("[WARNING] duplicate key for d2:", k2)
            dico_d2[k2]["nb_d1"] = dico_d2[k2]["nb_d1"] + 1
            dico_d2[k2]["score_d1"].append(float(I[opt.sc1]))
            dico_d2[k2]["score_d2"] = I[opt.sc2 + nb_col1]
    ## - scan specifique d1
    #for I in d1_specifique:
    #    I=I.fields
    #    # build key for dico
    #    k1="%s:%i-%i" %(I[opt.chr1], int(I[opt.st1]),int(I[opt.sp1]))
    #    if not dico_d1.has_key(k1):
    #        dico_d1[k1]={"nb_d2":0,"score_d2":[], "score_d1":0}
    #    else:
    #        print "[WARNING] several key for d1:",k1
    #    dico_d1[k1]["score_d1"]=I[opt.sc1]
    ## - scan specifique d2
    #for I in d2_specifique:
    #    I=I.fields
    #    # build key for dico
    #    k2="%s:%i-%i" %(I[opt.chr2], int(I[opt.st2]),int(I[opt.sp2]))
    #    if not dico_d2.has_key(k2):
    #        dico_d2[k2]={"nb_d1":0,"score_d1":[], "score_d2":0}
    #    else:
    #        print "[WARNING] several key for d2:",k2
    #    dico_d2[k2]["score_d2"]=I[opt.sc2]
    # - format dico_d1
    x = []
    y = []
    z = []
    for k in dico_d1:
        x.append(float(dico_d1[k]["score_d1"]))
        if len(dico_d1[k]["score_d2"]) > 0:
            y.append(float(numpy.max(dico_d1[k]["score_d2"])))
        else:
            y.append(-.09)
        z.append(int(dico_d1[k]["nb_d2"]))
    # build graph
    if flag_graph:
        graph.scatter_hist(x,
                           y,
                           xlabel="score %s" % (opt.name_d1),
                           ylabel="score %s" % (opt.name_d2),
                           main="",
                           marker="o",
                           color="black",
                           alpha=0.5,
                           size_mark=20,
                           name_out=opt.output + "_scatterhist_%s.png" %
                           (opt.name_d1))
    # - format dico_d2
    x = []
    y = []
    z = []
    for k in dico_d2:
        x.append(float(dico_d2[k]["score_d2"]))
        if len(dico_d2[k]["score_d1"]) > 0:
            y.append(float(numpy.median(dico_d2[k]["score_d1"])))
        else:
            y.append(-999.0)
        z.append(int(dico_d2[k]["nb_d1"]))
    if flag_graph:
        graph.scatter_hist(x,
                           y,
                           xlabel="score %s" % (opt.name_d2),
                           ylabel="score %s" % (opt.name_d1),
                           main="",
                           marker="o",
                           color="black",
                           alpha=0.5,
                           size_mark=20,
                           name_out=opt.output + "_scatterhist_%s.png" %
                           (opt.name_d2))
    # close log
    sys.stdout = saveout
    fsock.close()
Example #20
def main():
    parser = argparse.ArgumentParser(
        description='Use a sliding window to aggregate breaks in bed file')
    parser.add_argument('genome',
                        help='Genome assembly name used to build the sliding windows (e.g. mm9)')
    parser.add_argument('input', help='Input .bed file with detected breaks')
    parser.add_argument(
        'annotations',
        help='Annotation file. If the annotation file has a gtf or gff '
             'extension (possibly .gz) then only transcripts are selected. '
             'If a .bed file is provided then all annotations from the bed '
             'file are used')
    parser.add_argument('output',
                        help='Output .bed file with longest transcripts')
    parser.add_argument('-w', '--window-size',
                        dest="window_size",
                        default=int(1e5),
                        type=int,
                        help='Window size at which to aggregate break counts')
    parser.add_argument('-s', '--window-step',
                        dest="window_step",
                        default=int(1e4),
                        type=int,
                        help='Step between successive windows')
    parser.add_argument('-f', '--features',
                        dest="features",
                        action="append",
                        nargs="*",
                        help='Additional features to annotate the input file')

    args = parser.parse_args()
    start = time.time()

    if args.features is None:
        features = []
    else:
        features = list(itertools.chain.from_iterable(args.features))

    output_dir = os.path.dirname(args.output)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(
        'Processing "{input}" using annotation="{annotation}" window {window}/{step}. Writing output to "{output}"...'
        .format(input=args.input,
                window=args.window_size,
                step=args.window_step,
                output=args.output,
                annotation=args.annotations))

    # Create temporary files
    tmp = {
        n: tempfile.NamedTemporaryFile(delete=False).name
        for n in [
            "genome_bin_pos", "genome_bin_neg", "genome_bin", "breaks_bin",
            "results", "all_transcripts", "transcripts"
        ]
    }

    # Create windows template for sliding window
    genome_bin_pos = BedTool().window_maker(
        genome=args.genome, w=args.window_size,
        s=args.window_step).each(strand, "+").saveas(tmp["genome_bin_pos"])
    genome_bin_neg = BedTool().window_maker(
        genome=args.genome, w=args.window_size,
        s=args.window_step).each(strand, "-").saveas(tmp["genome_bin_neg"])
    genome_bin = genome_bin_pos.cat(
        genome_bin_neg, postmerge=False).sort().saveas(tmp["genome_bin"])

    # Read input file
    dna_breaks = BedTool(args.input)

    # Read annotation file
    if re.search(r"\.(gtf|gff)(\.gz)?$", args.annotations):
        annotations = BedTool(args.annotations)
        annotations = annotations.filter(filter_transcript).\
            each(gff2bed, name_field="gene_id").sort().\
            saveas(tmp["all_transcripts"]).\
            groupby(g="1,2,3,6", c="4,5", o="distinct").\
            cut([0,1,2,4,5,3]).\
            saveas(tmp["transcripts"])
    elif re.search(r"\.bed$", args.annotations):
        annotations = BedTool(args.annotations)
    else:
        parser.error(
            "Annotation have to be either in gtf/gff or in bed format")

    bin_breaks = BedTool().intersect(a=genome_bin, b=dna_breaks, wa=True, c=True, s=True). \
        saveas(tmp["breaks_bin"])

    # Map breaks statistics to annotation file
    results = BedTool().map(a=bin_breaks, b=annotations, c="4",
                            o="distinct").cut([0, 1, 2, 7, 6,
                                               5]).sort().saveas(tmp["results"])
    results_df = splitDataFrameList(results.to_dataframe(), "name", ",")
    results_df = results_df[results_df.name != "."]
    results_df.to_csv(args.output, sep="\t", header=True, index=False)

    # Remove old temporary files
    for f in tmp.values():
        os.remove(f)

    end = time.time()
    print("Total time: {:.1f} minutes".format((end - start) / 60))
Example #21
		sorting.wait()
	return(0)

def newber(binner,factor):
	"""Returns bed record with all binding for a factor merged and renamed."""
	newbie = BedTool(binner).sort().merge(nms=True).each(featurefuncs.midpoint)
	newbie = newbie.each(featurefuncs.rename, factor)
	return(newbie)

print "Loading bed files..."
tssbed = BedTool(tsss)
jacker = BedTool(jacked)
chiper = BedTool(chiped)

print "Combining binding files..."
comber = chiper.cat(jacker, force_truncate=False, postmerge=False).moveto('./combtemp.bed')

#print "Generating oldstyle bed file..."
#oldstyle = comber.each(featurefuncs.midpoint)
#oldstyle2 = bedmaker(oldstyle,tssbed,oldout)

print "Generating newstyle bed files..."
for factor in sorted(pwmlist.keys()):
	print factor
	for model in pwmlist[factor]:
		print model
		grepper = 'grep ' + model + ' ./combtemp.bed >> ./temp1.bed'
		grepping = subprocess.Popen(grepper,shell=True)
		grepping.wait()
	newbie = newber('./temp1.bed', factor)
	newbie2 = bedmaker(newbie, tssbed, outdir + factor + '_midpoint_sorted.bed', unmatched=False)