Esempio n. 1
0
def construct_training_data(genome_sizes_file, peaks_file, genome_fasta_file,
                            blacklist_file, to_keep, to_filter,
                            window_length, acc_regions_file, out_prefix, chromatin_track_list, nbins):
    """
    Construct a training or validation data set and save it to disk.

    Which chromosomes are used is controlled by the to_keep and to_filter
    arguments; both are forwarded unchanged to utils.get_genome_sizes and
    utils.load_chipseq_data.
    The training set uses the to_filter argument, whereas to_keep=None.
    For example:
    training set:  to_filter=['chr10', 'chr17', 'chrUn', 'chrM', 'random']
    i.e. In this construction; chr10 and chr17 can be used for testing/validation.
    The validation set uses the to_keep argument, whereas to_filter=None.
    For example:
    validation set: to_keep=['chr17']
    i.e. In this construction; chr17 data is used for validation.

    Parameters:
        genome_sizes_file: sizes
        peaks_file: multiGPS formatted *events* file
        genome_fasta_file: fasta file for the whole genome
        blacklist_file: BED format blacklist file
        to_keep: chromosomes to keep (or None), see above
        to_filter: chromosomes to filter out (or None), see above
        window_length (int): the length of the windows, forwarded to
            ConstructTrainingData
        acc_regions_file: forwarded to ConstructTrainingData
        out_prefix (str): prefix of every output file written here
        chromatin_track_list: list of chromatin track paths (strings); the
            part of each file name before its first '.' names the matching
            output file
        nbins: forwarded to ConstructTrainingData

    Writes (with np.savetxt):
        <out_prefix>.seq                 the sequences (X_seq)
        <out_prefix>.<track>.chromatin   one tab-delimited file per entry of
                                         chromatin_track_list
        <out_prefix>.labels              the labels (y)

    Returns:
        training_coords: the coordinates returned by
        ConstructTrainingData.get_data()
    """
    # Load the genome_sizes_file (Filtering out the validation and test chromosomes):
    curr_genome_bed = utils.get_genome_sizes(genome_sizes_file, to_keep=to_keep,
                                             to_filter=to_filter)
    genome_bed_df = curr_genome_bed.to_dataframe()

    # Loading the chip-seq bed file (Filtering out the validation and test chromosomes):
    chip_seq_coordinates = utils.load_chipseq_data(peaks_file,
                                                   genome_sizes_file=genome_sizes_file,
                                                   to_keep=to_keep,
                                                   to_filter=to_filter)

    # Loading the exclusion bed file (Blacklist + ChIP-seq peaks, use for constructing negative sets):
    exclusion_windows_bdt = utils.exclusion_regions(blacklist_file,
                                                    chip_seq_coordinates)
    exclusion_windows_df = exclusion_windows_bdt.to_dataframe()

    # constructing the training set
    construct_sets = ConstructTrainingData(genome_sizes_file=genome_sizes_file,
                                           genome_fasta_file=genome_fasta_file,
                                           blacklist_file=blacklist_file,
                                           chip_coords=chip_seq_coordinates,
                                           exclusion_df=exclusion_windows_df,
                                           window_length=window_length,
                                           curr_genome_bed=genome_bed_df,
                                           acc_regions_file=acc_regions_file,
                                           chromatin_track_list=chromatin_track_list,
                                           nbins=nbins)

    X_seq, X_chromatin_list, y, training_coords = construct_sets.get_data()

    # saving the data
    np.savetxt(out_prefix + '.seq', X_seq, fmt='%s')
    for idx, chromatin_track in enumerate(chromatin_track_list):
        # Derive the output name from this track only (file name up to its
        # first '.') instead of rebuilding the whole name list per iteration.
        track_name = chromatin_track.split('/')[-1].split('.')[0]
        np.savetxt(out_prefix + '.' + track_name + '.chromatin',
                   X_chromatin_list[idx], delimiter='\t', fmt='%1.3f')
    np.savetxt(out_prefix + '.labels', y, fmt='%s')
    return training_coords
Esempio n. 2
0
    def define_coordinates(self):
        """
        Build the labelled test-set windows for the chromosome self.to_keep[0].

        Steps:
        1. Tile the test chromosome with windows of self.window_len bp,
           starting at 0 and advancing by self.stride, for as long as the
           window end is strictly smaller than the chromosome length read
           from self.genome_sizes_file.
        2. Load the peaks with utils.load_chipseq_data and expand every
           peak to a window: start - int(window_len / 2) and
           end + int(window_len / 2 - 1).
        3. Label the windows:
           bound (label 1): the expanded peak windows.
           unbound (label 0): the tiled windows that have no overlap at all
           with an expanded peak window.
           If self.blacklist_file is not None, windows overlapping the
           blacklist are removed from both sets.

        The assumption here is that to_keep is a single chromosome list;
        only its first entry is used to pick the chromosome.

        Returns:
            test_coords (pd dataFrame): the bound windows followed by the
            unbound windows, with the interval columns produced by
            BedTool.to_dataframe() plus a 'label' column. The row index is
            not reset after concatenation.

        Raises:
            ValueError: if self.stride is not positive (the tiling would
            otherwise never terminate).
        """
        if self.stride <= 0:
            raise ValueError(
                'stride must be a positive integer, got %r' % (self.stride,))

        genome_sizes = pd.read_csv(self.genome_sizes_file,
                                   sep="\t",
                                   names=['chr', 'len'])
        # subset the test chromosome:
        genome_test = genome_sizes[genome_sizes['chr'] == self.to_keep[0]]
        chromosome = genome_test.iloc[0, 0]
        end_idx = int(genome_test.iloc[0, 1])

        # A window is kept only while start + window_len < end_idx, i.e.
        # while start < end_idx - window_len.
        test_set = [
            [chromosome, start_idx, start_idx + self.window_len]
            for start_idx in range(0, end_idx - self.window_len, self.stride)
        ]

        test_df = pd.DataFrame(test_set, columns=['chr', 'start', 'stop'])
        test_bdt_obj = BedTool.from_dataframe(test_df)

        chip_peaks = utils.load_chipseq_data(
            chip_peaks_file=self.peaks_file,
            to_keep=self.to_keep,
            genome_sizes_file=self.genome_sizes_file)
        # note: multiGPS reports 1 bp separated start and end,
        # centered on the ChIP-seq peak.
        chip_peaks['start'] = chip_peaks['start'] - int(self.window_len / 2)
        # (i.e. 250 if window_len=500 )
        chip_peaks['end'] = chip_peaks['end'] + int(self.window_len / 2 - 1)
        # (i.e. 249 if window_len=500); multiGPS reports 1bp intervals

        chip_peaks = chip_peaks[['chr', 'start', 'end']]
        chip_peaks_bdt_obj = BedTool.from_dataframe(chip_peaks)

        # v=True: report only the tiled windows with no overlap with any
        # expanded peak window.
        unbound_data = test_bdt_obj.intersect(chip_peaks_bdt_obj, v=True)
        if self.blacklist_file is None:
            bound_data = chip_peaks_bdt_obj
        else:
            # Only build the blacklist BedTool when a blacklist was given.
            blacklist_exclusion_windows = BedTool(self.blacklist_file)
            # removing blacklist windows from both sets
            unbound_data = unbound_data.intersect(blacklist_exclusion_windows,
                                                  v=True)
            bound_data = chip_peaks_bdt_obj.intersect(
                blacklist_exclusion_windows, v=True)
        # i.e. the entire expanded peak window is the positive window.

        # making data-frames
        bound_data_df = bound_data.to_dataframe()
        bound_data_df['label'] = 1
        unbound_data_df = unbound_data.to_dataframe()
        unbound_data_df['label'] = 0

        test_coords = pd.concat([bound_data_df, unbound_data_df])
        return test_coords
Esempio n. 3
0
def data_generator(genome_sizes_file, peaks_file, genome_fasta_file,
                   blacklist_file, to_keep, to_filter, window_lenght,
                   batch_size, acc_regions_file, ratios):
    """
    This generator can either generate training data or validation data based on
    the to_keep and to_filter arguments.

    The train generator uses the to_filter argument, whereas to_keep=None
    For example:
    train_generator:  to_filter=['chr10', 'chr17', 'chrUn', 'chrM', 'random']
    i.e. In this construction; chr10 and chr17 can be used for testing/validation.

    The val generator uses the to_keep argument, whereas to_filter=None.
    For example:
    val_generator: to_keep=['chr17']
    i.e. In this construction; chr17 data is used for validation.

    Because this is a generator function, all of the loading below runs on
    the first next() call, not when data_generator(...) is called.

    Parameters:
        genome_sizes_file: sizes
        peaks_file: multiGPS formatted *events* file
        genome_fasta_file: fasta file for the whole genome
        blacklist_file: BED format blacklist file
        to_keep: chromosomes to keep (or None), see above
        to_filter: chromosomes to filter out (or None), see above
        window_lenght (int): the length of windows used for training and
            testing (the spelling is part of the public signature);
            forwarded to ConstructSets as window_length
        batch_size (int): batch size used for training and validation batches
        acc_regions_file: forwarded to ConstructSets
        ratios: forwarded to ConstructSets

    Yields:
        (X, y): the first two items returned by ConstructSets.get_data(),
        produced endlessly; the third item (coords) is discarded.
    """
    # load the genome_sizes_file:
    genome_bed_val = utils.get_genome_sizes(genome_sizes_file,
                                            to_keep=to_keep,
                                            to_filter=to_filter)
    genome_bed_df = genome_bed_val.to_dataframe()
    # loading the chip-seq bed file
    chip_seq_coordinates = utils.load_chipseq_data(
        peaks_file,
        genome_sizes_file=genome_sizes_file,
        to_keep=to_keep,
        to_filter=to_filter)

    def make_flanks(lower_lim, upper_lim):
        """
        Return (left, right) copies of chip_seq_coordinates whose intervals
        are the flanks lying lower_lim..upper_lim bp away from each peak
        'start', on the left and on the right side respectively.
        """
        # (can be a separate fn in utils)
        flanks_left = chip_seq_coordinates.copy()
        flanks_right = chip_seq_coordinates.copy()
        flanks_left['start'] = chip_seq_coordinates['start'] - upper_lim
        flanks_left['end'] = chip_seq_coordinates['start'] - lower_lim
        flanks_right['start'] = chip_seq_coordinates['start'] + lower_lim
        flanks_right['end'] = chip_seq_coordinates['start'] + upper_lim
        return flanks_left, flanks_right

    left_1, right_1 = make_flanks(lower_lim=250, upper_lim=750)
    left_2, right_2 = make_flanks(lower_lim=200, upper_lim=700)
    left_3, right_3 = make_flanks(lower_lim=1500, upper_lim=2000)
    left_4, right_4 = make_flanks(lower_lim=1000, upper_lim=1500)
    # The row order (note right_3 before left_3) is kept unchanged, in case
    # anything downstream depends on it.
    flanks = pd.concat(
        [left_1, right_1, left_2, right_2, right_3, left_3, left_4, right_4])
    # flanks is handed to ConstructSets as a plain DataFrame.
    # NOTE(review): it is presumably converted to a BedTool inside
    # ConstructSets so that pybedtools temp files can be cleaned up there,
    # and it is not filtered against the peaks here -- confirm.

    # loading the exclusion coords (only the second returned item is used):
    _, exclusion_windows_bdt = utils.exclusion_regions(
        blacklist_file, chip_seq_coordinates)
    exclusion_windows_df = exclusion_windows_bdt.to_dataframe()
    # constructing the training set
    construct_sets = ConstructSets(genome_sizes_file=genome_sizes_file,
                                   genome_fasta_file=genome_fasta_file,
                                   blacklist_file=blacklist_file,
                                   chip_coords=chip_seq_coordinates,
                                   exclusion_df=exclusion_windows_df,
                                   window_length=window_lenght,
                                   curr_genome_bed=genome_bed_df,
                                   batch_size=batch_size,
                                   acc_regions_file=acc_regions_file,
                                   flanks=flanks,
                                   ratios=ratios)
    while True:
        X, y, _ = construct_sets.get_data()
        yield X, y