Code Example #1
def mnll(true_counts, logits=None, probs=None):
    """
        Compute the multinomial negative log-likelihood between true
        counts and predicted values of a BPNet-like profile model
        
        One of `logits` or `probs` must be given. If both are
        given `logits` takes preference.

        Args:
            true_counts (numpy.array): observed counts values
            
            logits (numpy.array): predicted logits values
            
            probs (numpy.array): predicted values as probabilities
          
        Returns:
            float: multinomial negative log-likelihood
    
    """

    dist = None

    if logits is not None:

        # check for length mismatch
        if len(logits) != len(true_counts):
            raise quietexception.QuietException(
                "Length of logits does not match length of true_counts")

        # convert logits to softmax probabilities
        probs = logits - logsumexp(logits)
        probs = np.exp(probs)

    elif probs is not None:

        # check for length mismatch
        if len(probs) != len(true_counts):
            raise quietexception.QuietException(
                "Length of probs does not match length of true_counts")

        # check if probs sums to 1
        if abs(1.0 - np.sum(probs)) > 1e-3:
            raise quietexception.QuietException(
                "'probs' array does not sum to 1")

    else:

        # both 'probs' and 'logits' are None
        raise quietexception.QuietException(
            "At least one of probs or logits must be provided. "
            "Both are None.")

    # compute the multinomial distribution
    mnom = multinomial(np.sum(true_counts), probs)
    return -(mnom.logpmf(true_counts) / len(true_counts))
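
The same computation as a minimal standalone sketch on toy data (the count and logit arrays below are made up for illustration; `numpy` and `scipy` are assumed to be available):

import numpy as np
from scipy.special import logsumexp
from scipy.stats import multinomial

# toy observed counts and predicted logits (illustrative values only)
true_counts = np.array([2, 5, 1, 8, 4])
logits = np.array([0.1, 1.2, -0.3, 1.8, 0.5])

# softmax via log-sum-exp for numerical stability
probs = np.exp(logits - logsumexp(logits))

# multinomial negative log-likelihood, averaged over positions
dist = multinomial(np.sum(true_counts), probs)
print(-dist.logpmf(true_counts) / len(true_counts))
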
Code Example #2
def profile_cross_entropy(true_counts, logits=None, probs=None):
    """
        Compute the cross entropy between true counts and predicted 
        values of a BPNet-like profile model
        
        One of `logits` or `probs` must be given. If both are
        given `logits` takes preference.

        Args:
            true_counts (numpy.array): observed counts values
            
            logits (numpy.array): predicted logits values
            
            probs (numpy.array): predicted values as probabilities
          
        Returns:
            float: cross entropy
    
    """

    if logits is not None:

        # check for length mismatch
        if len(logits) != len(true_counts):
            raise quietexception.QuietException(
                "Length of logits does not match length of true_counts")

        # convert logits to softmax probabilities
        probs = logits - logsumexp(logits)
        probs = np.exp(probs)

    elif probs is not None:

        # check for length mismatch
        if len(probs) != len(true_counts):
            raise quietexception.QuietException(
                "Length of probs does not match length of true_counts")

        # check if probs sums to 1
        if abs(1.0 - np.sum(probs)) > 1e-3:
            raise quietexception.QuietException(
                "'probs' array does not sum to 1")
    else:

        # both 'probs' and 'logits' are None
        raise quietexception.QuietException(
            "At least one of probs or logits must be provided. "
            "Both are None.")

    # convert true_counts to probabilities
    true_counts_prob = true_counts / np.sum(true_counts)

    return -np.sum(np.multiply(true_counts_prob, np.log(probs + 1e-7)))
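
As a rough sanity check, a minimal standalone sketch (toy values, not from the project) comparing the cross entropy of a matching versus a uniform prediction:

import numpy as np

# toy observed counts and two candidate probability predictions
true_counts = np.array([10.0, 30.0, 40.0, 20.0])
good_probs = np.array([0.1, 0.3, 0.4, 0.2])   # matches the observed shape
flat_probs = np.full(4, 0.25)                  # uniform prediction

true_probs = true_counts / np.sum(true_counts)

# same formula as above: -sum(p_true * log(p_pred + eps))
print(-np.sum(true_probs * np.log(good_probs + 1e-7)))  # lower (~1.28)
print(-np.sum(true_probs * np.log(flat_probs + 1e-7)))  # higher (~1.39)
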
Code Example #3
File: predict.py  Project: erankotler/basepairmodels
def predict_main():
    # parse the command line arguments
    parser = argparsers.predict_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_dir):
        logging.error("Directory {} does not exist".format(args.output_dir))

        return

    if args.automate_filenames:
        # create a new directory using current date/time to store the
        # predictions and logs
        date_time_str = local_datetime_str(args.time_zone)
        pred_dir = '{}/{}'.format(args.output_dir, date_time_str)
        os.mkdir(pred_dir)
    elif os.path.isdir(args.output_dir):
        pred_dir = args.output_dir
    else:
        logging.error("Directory does not exist {}.".format(args.output_dir))
        return

    # filename to write debug logs
    logfname = "{}/predict.log".format(pred_dir)

    # set up the loggers
    logger.init_logger(logfname)

    # make sure the input_data json file exists
    if not os.path.isfile(args.input_data):
        raise quietexception.QuietException(
            "File not found: {} OR you may have accidentally "
            "specified a directory path.".format(args.input_data))

    # load the json file
    with open(args.input_data, 'r') as inp_json:
        try:
            #: dictionary of tasks for training
            input_data = json.loads(inp_json.read())
        except json.decoder.JSONDecodeError:
            raise quietexception.QuietException(
                "Unable to load json file {}. Valid json expected. "
                "Check the file for syntax errors.".format(args.input_data))

    logging.info("INPUT DATA -\n{}".format(input_data))

    # predict
    logging.info("Loading {}".format(args.model))
    with CustomObjectScope(
        {'MultichannelMultinomialNLL': MultichannelMultinomialNLL}):

        predict(args, input_data, pred_dir)
Code Example #4
File: bounds.py  Project: erankotler/basepairmodels
def get_average_profile(input_bigWig, peaks_df, peak_width):
    """
        Function to compute the average profile across all peaks
        
        Args:
            input_bigWig (str): path to bigWig file
            
            peaks_df (pandas.DataFrame): dataframe containing peak
                information.

                The dataframe should have 'chrom', 'start', and 'end'
                as its first 3 columns. Each peak should have the same
                width (equal to peak_width), i.e. 'end' - 'start' is
                the same for all rows in the dataframe.
                
            peak_width (int): width of each peak.
        
        Returns:
            np.array: numpy array of length peak_width
                    
    """
    
    # open the bigWig file for reading
    bw = pyBigWig.open(input_bigWig)
    
    # initialize numpy array for average profile
    average_profile = np.zeros(peak_width)

    # iterate through all peaks and compute the average
    for idx, row in peaks_df.iterrows():
        # raise exception if 'end' - 'start' is not equal to peak_width
        if (row['end'] - row['start']) != peak_width:
            raise quietexception.QuietException(
                "Inconsistent peak width found at: {}:{}-{}".format(
                    row['chrom'], row['start'], row['end']))
        
        # read values from bigWig
        average_profile += np.nan_to_num(
            bw.values(row['chrom'], row['start'], row['end']))

    # average profile
    average_profile /= peaks_df.shape[0]
    
    # close bigWig file
    bw.close()
    
    return average_profile
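
A hypothetical invocation sketch follows; the bigWig path and peak coordinates below are placeholders, not files from the project:

import pandas as pd

# placeholder peaks; in practice these come from a BED/narrowPeak file
peaks_df = pd.DataFrame({
    'chrom': ['chr1', 'chr1', 'chr2'],
    'start': [10000, 25000, 40000],
    'end':   [11000, 26000, 41000],
})

# every peak above is 1000 bp wide, so peak_width must be 1000
avg_profile = get_average_profile('signal.bw', peaks_df, peak_width=1000)
print(avg_profile.shape)  # (1000,)
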
Code Example #5
def motif_discovery_main():
    parser = motif_discovery_argsparser()
    args = parser.parse_args()

    if not os.path.exists(args.scores_path):
        raise quietexception.QuietException(
            "Score file {} does not exist".format(args.scores_path))

    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Output directiry {} does not exist".format(args.output_directory))

    # Load the scores
    scores = h5py.File(args.scores_path, 'r')

    # window start and end based on modisco_window_size
    center = scores['hyp_scores'].shape[1] // 2
    start = center - args.modisco_window_size // 2
    end = center + args.modisco_window_size // 2

    print("Shap scores shape - {}".format(scores['hyp_scores'].shape))

    shap_scores = scores['hyp_scores'][:, start:end, :]
    one_hot_seqs = scores['input_seqs'][:, start:end, :]
    print("Done slicing shap scores and one hot seqs")

    proj_shap_scores = np.multiply(one_hot_seqs, shap_scores)
    print("Done computing projected shap scores")

    scores.close()

    tasks = ['task0']
    task_to_scores = OrderedDict()
    task_to_hyp_scores = OrderedDict()
    task_to_scores['task0'] = proj_shap_scores
    task_to_hyp_scores['task0'] = shap_scores

    tfmodisco_workflow = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        sliding_window_size=21, flank_size=10, target_seqlet_fdr=0.05,
        seqlets_to_patterns_factory=\
        modisco.tfmodisco_workflow.seqlets_to_patterns
            .TfModiscoSeqletsToPatternsFactory(
                n_cores=10,
                embedder_factory=\
                modisco.seqlet_embedding.advanced_gapped_kmer
                .AdvancedGappedKmerEmbedderFactory(),
            trim_to_window_size=30, initial_flank_to_add=10,
            final_min_cluster_size=30))

    tfmodisco_results = tfmodisco_workflow(
        task_names=["task0"],
        contrib_scores=task_to_scores,
        hypothetical_contribs=task_to_hyp_scores,
        one_hot=one_hot_seqs)

    modisco_results_path = '{}/modisco_results.h5'.format(
        args.output_directory)

    tfmodisco_results.save_hdf5(h5py.File(modisco_results_path, 'w'))
    print("Saved modisco results to file {}".format(str(modisco_results_path)))

    seqlet_path = '{}/seqlets.txt'.format(args.output_directory)
    print("Saving seqlets to %s" % seqlet_path)
    seqlets = \
        tfmodisco_results.metacluster_idx_to_submetacluster_results[0].seqlets
    bases = np.array(["A", "C", "G", "T"])
    with open(seqlet_path, "w") as f:
        for seqlet in seqlets:
            sequence = "".join(bases[np.argmax(seqlet["sequence"].fwd,
                                               axis=-1)])
            example_index = seqlet.coor.example_idx
            start, end = seqlet.coor.start, seqlet.coor.end
            f.write(">example%d:%d-%d\n" % (example_index, start, end))
            f.write(sequence + "\n")

    print("Saving pattern visualizations")

    patterns = (tfmodisco_results.metacluster_idx_to_submetacluster_results[0].
                seqlets_to_patterns_result.patterns)

    # generate .pngs of each motif and write motif seqlet to
    # individual files
    for idx, pattern in enumerate(patterns):
        print(pattern)
        print("pattern idx", idx)
        print(len(pattern.seqlets))

        pattern_seqlet_path = os.path.join(args.output_directory,
                                           'pattern{}_seqlets.txt'.format(idx))
        with open(pattern_seqlet_path, "w") as f:
            for seqlet in pattern.seqlets:
                sequence = "".join(bases[np.argmax(seqlet["sequence"].fwd,
                                                   axis=-1)])
                example_index = seqlet.coor.example_idx
                start, end = seqlet.coor.start, seqlet.coor.end
                f.write(">example%d:%d-%d\n" % (example_index, start, end))
                f.write(sequence + "\n")

        save_plot(pattern["task0_contrib_scores"].fwd,
                  '{}/contrib_{}.png'.format(args.output_directory, idx))
        save_plot(pattern["sequence"].fwd,
                  '{}/sequence_{}.png'.format(args.output_directory, idx))
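
The seqlet-to-string conversion above relies on argmax over the one-hot base axis; a tiny standalone sketch of that idea on a made-up one-hot array:

import numpy as np

bases = np.array(["A", "C", "G", "T"])

# made-up one-hot encoded 5-bp sequence, shape (length, 4)
one_hot = np.array([
    [1, 0, 0, 0],   # A
    [0, 0, 1, 0],   # G
    [0, 1, 0, 0],   # C
    [0, 0, 0, 1],   # T
    [1, 0, 0, 0],   # A
])

# argmax over the base axis picks the hot base at each position
print("".join(bases[np.argmax(one_hot, axis=-1)]))  # AGCTA
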
Code Example #6
def metrics_main():
    # parse the command line arguments
    parser = metrics_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_dir):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_dir))

    # check if the peaks file exists
    if args.peaks is not None and not os.path.exists(args.peaks):
        raise quietexception.QuietException("File {} does not exist".format(
            args.peaks))

    # check if the bounds file exists
    if args.bounds_csv is not None and not os.path.exists(args.bounds_csv):
        raise quietexception.QuietException("File {} does not exist".format(
            args.bounds_csv))

    # check if profile A exists
    if not os.path.exists(args.profileA):
        raise quietexception.QuietException("File {} does not exist".format(
            args.profileA))

    # check if profile B exists
    if not os.path.exists(args.profileB):
        raise quietexception.QuietException("File {} does not exist".format(
            args.profileB))

    # check if counts A exists
    if args.countsA is not None and not os.path.exists(args.countsA):
        raise quietexception.QuietException("File {} does not exist".format(
            args.countsA))

    # check if counts B exists
    if args.countsB is not None and not os.path.exists(args.countsB):
        raise quietexception.QuietException("File {} does not exist".format(
            args.countsB))

    # check if we need to auto generate the output directory
    if args.automate_filenames:
        # create a new directory using current date/time to store the
        # metrics outputs & logs
        date_time_str = local_datetime_str(args.time_zone)
        metrics_dir = '{}/{}'.format(args.output_dir, date_time_str)
        os.mkdir(metrics_dir)
    elif os.path.isdir(args.output_dir):
        metrics_dir = args.output_dir
    else:
        raise quietexception.QuietException("{} is not a directory".format(
            args.output_dir))

    # filename to write debug logs
    logfname = "{}/metrics.log".format(metrics_dir)

    # set up the loggers
    init_logger(logfname)

    # read the bounds csv into a pandas DataFrame
    if args.bounds_csv is not None:
        logging.info("Loading lower and upper bounds ...")
        bounds_df = pd.read_csv(args.bounds_csv, header=0)
    else:
        bounds_df = None

    # check if peaks file has been supplied
    if args.peaks is not None:
        peaks_df = pd.read_csv(args.peaks,
                               sep='\t',
                               header=None,
                               names=[
                                   'chrom', 'st', 'end', 'name', 'score',
                                   'strand', 'signal', 'p', 'q', 'summit'
                               ])

        # keep only those rows corresponding to the required
        # chromosomes
        peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]

        # create new column for peak pos
        peaks_df['summit_pos'] = peaks_df['st'] + peaks_df['summit']

        # create new column for start pos
        peaks_df['start_pos'] = peaks_df['summit_pos'] - \
                                    args.metrics_seq_len // 2

        # create new column for end pos
        peaks_df['end_pos'] = peaks_df['summit_pos'] + \
                                    args.metrics_seq_len // 2

        # select only the chrom & summit positon columns
        allPositions = peaks_df[['chrom', 'start_pos', 'end_pos']]

        allPositions = allPositions.reset_index(drop=True)

    # else generate genome-wide positions
    else:

        allPositions = getChromPositions(args.chroms,
                                         args.chrom_sizes,
                                         args.metrics_seq_len // 2,
                                         args.step_size,
                                         mode='sequential',
                                         num_positions=-1)

    # check that there are exactly the same number of rows in the
    # bounds dataframe as compared to allPositions
    if bounds_df is not None and (bounds_df.shape[0] != allPositions.shape[0]):
        raise quietexception.QuietException(
            "Bounds row count does not match chrom positions row count")

    # open the two bigWig files
    try:
        bigWigProfileA = pyBigWig.open(args.profileA)
        bigWigProfileB = pyBigWig.open(args.profileB)

        if args.countsA:
            bigWigCountsA = pyBigWig.open(args.countsA)
        if args.countsB:
            bigWigCountsB = pyBigWig.open(args.countsB)

    except Exception as e:
        logging.error("Problems occurred when opening one of the input files: "
                      "{}".format(str(e)))

    # for pearson on counts
    countsA = []
    countsB = []

    # initialize arrays to hold metrics values
    array_len = len(allPositions.index)
    multinomial_nll = np.zeros(array_len, dtype=np.float64)
    ce = np.zeros(array_len, dtype=np.float64)
    jsd = np.zeros(array_len, dtype=np.float64)
    pearson = np.zeros(array_len, dtype=np.float64)
    spearman = np.zeros(array_len, dtype=np.float64)
    mse = np.zeros(array_len, dtype=np.float64)

    for idx, row in tqdm(allPositions.iterrows(), total=allPositions.shape[0]):

        chrom = row['chrom']
        start = row['start_pos']
        end = row['end_pos']

        # get all the bounds values
        if bounds_df is not None:
            mnll_min = bounds_df.loc[idx, 'mnll_self']
            mnll_max = bounds_df.loc[idx, 'mnll_uniform']
            ce_min = bounds_df.loc[idx, 'ce_self']
            ce_max = bounds_df.loc[idx, 'ce_uniform']
            jsd_min = bounds_df.loc[idx, 'jsd_self']
            jsd_max = bounds_df.loc[idx, 'jsd_uniform']
            pearson_min = bounds_df.loc[idx, 'pearson_uniform']
            pearson_max = bounds_df.loc[idx, 'pearson_self']
            spearman_min = bounds_df.loc[idx, 'spearman_uniform']
            spearman_max = bounds_df.loc[idx, 'spearman_self']
        try:
            profileA = np.nan_to_num(
                np.array(bigWigProfileA.values(chrom, start, end)))
            profileB = np.nan_to_num(
                np.array(bigWigProfileB.values(chrom, start, end)))
        except Exception as e:
            raise quietexception.QuietException(
                "Error retrieving values {}, {}, {}".format(chrom, start, end))

        if args.countsA:
            # since every base is assigned the total counts in the
            # region we have to take the mean
            valsCountsA = np.mean(
                np.nan_to_num(np.array(bigWigCountsA.values(chrom, start,
                                                            end))))
        else:
            valsCountsA = np.sum(profileA)

        if args.countsB:
            # since every base is assigned the total counts in the
            # region we have to take the mean
            valsCountsB = np.mean(
                np.nan_to_num(np.array(bigWigCountsB.values(chrom, start,
                                                            end))))
        else:
            valsCountsB = np.sum(profileB)

        # check to see if we fetched the correct number of values
        # if the two array lengths don't match we can't compute the
        # metrics
        if len(profileA) != (end - start) or \
            len(profileB) != (end - start):
            logging.warning("Unable to fetch {} values on chrom {} from "
                            "{} to {}. Skipping.".format(
                                end - start, chrom, start, end))
            continue

        if sum(profileA) != 0:
            if args.apply_softmax_to_profileA:
                # we use log softmax to circumvent numerical instability
                # and then exponentiate
                probProfileA = profileA - logsumexp(profileA)
                probProfileA = np.exp(probProfileA)

                # we need actual counts to compute mse
                valsProfileA = np.multiply(valsCountsA, probProfileA)

                if len(args.smooth_profileA) > 0:
                    sigma = float(args.smooth_profileA[0])
                    width = float(args.smooth_profileA[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    valsProfileA = gaussian_filter1d(valsProfileA,
                                                     sigma=sigma,
                                                     truncate=truncate)

                    # recompute probabilities
                    probProfileA = valsProfileA / sum(valsProfileA)

            else:
                if args.smooth_profileA:
                    sigma = float(args.smooth_profileA[0])
                    width = float(args.smooth_profileA[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    profileA = gaussian_filter1d(profileA,
                                                 sigma=sigma,
                                                 truncate=truncate)

                # convert to probabilities by dividing by sum
                probProfileA = profileA / sum(profileA)

                # if we are in the else block it implies profileA has
                # actual counts
                valsProfileA = profileA

        elif args.exclude_zero_profiles:
            continue

        else:
            # uniform distribution
            probProfileA = 1.0 / len(profileA) * np.ones(len(profileA),
                                                         dtype=np.float32)

        if sum(profileB) != 0:
            if args.apply_softmax_to_profileB:
                # we use log softmax to circumvent numerical instability
                # and then exponentiate
                probProfileB = profileB - logsumexp(profileB)
                probProfileB = np.exp(probProfileB)

                # we need actual counts to compute mse
                valsProfileB = np.multiply(valsCountsB, probProfileB)

                if len(args.smooth_profileB) > 0:
                    sigma = float(args.smooth_profileB[0])
                    width = float(args.smooth_profileB[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    valsProfileB = gaussian_filter1d(valsProfileB,
                                                     sigma=sigma,
                                                     truncate=truncate)

                    # recompute probabilities
                    probProfileB = valsProfileB / sum(valsProfileB)
            else:
                if args.smooth_profileB:
                    sigma = float(args.smooth_profileB[0])
                    width = float(args.smooth_profileB[1])
                    truncate = (((width - 1) / 2) - 0.5) / sigma

                    profileB = gaussian_filter1d(profileB,
                                                 sigma=sigma,
                                                 truncate=truncate)

                # convert to probabilities by dividing by sum
                probProfileB = profileB / sum(profileB)

                # if we are in the else block it implies profileB has
                # actual counts
                valsProfileB = profileB

        elif args.exclude_zero_profiles:
            continue

        else:
            # uniform distribution
            probProfileB = 1.0 / len(profileB) * np.ones(len(profileB),
                                                         dtype=np.float32)

        # pearson & spearman
        # with pearson we need to check if either of the arrays
        # has zero standard deviation (i.e. having all the same elements,
        # a zero or any other value). Unfortunately np.std
        # returns a very small non-zero value, so we'll use a
        # different approach to check if the array has the same value.
        # If true then pearson correlation is undefined
        if np.unique(probProfileA).size == 1 or \
            np.unique(probProfileB).size == 1:
            pearson[idx] = 0
            spearman[idx] = 0
        else:
            pearson[idx] = pearsonr(valsProfileA, valsProfileB)[0]
            spearman[idx] = spearmanr(valsProfileA, valsProfileB)[0]

        # mnll
        multinomial_nll[idx] = mnll(valsProfileA, probs=probProfileB)
        # cross entropy
        ce[idx] = profile_cross_entropy(valsProfileA, probs=probProfileB)
        # jsd
        jsd[idx] = jensenshannon(probProfileA, probProfileB)

        # apply min-max normalization
        if bounds_df is not None:
            multinomial_nll[idx] = get_min_max_normalized_value(
                multinomial_nll[idx], mnll_min, mnll_max)
            ce[idx] = get_min_max_normalized_value(ce[idx], ce_min, ce_max)
            jsd[idx] = get_min_max_normalized_value(jsd[idx], jsd_min, jsd_max)
            pearson[idx] = get_min_max_normalized_value(
                pearson[idx], pearson_min, pearson_max)
            spearman[idx] = get_min_max_normalized_value(
                spearman[idx], spearman_min, spearman_max)

        # mse
        mse[idx] = np.square(np.subtract(valsProfileA, valsProfileB)).mean()

        # add to the counts list
        countsA.append(np.sum(valsProfileA))
        countsB.append(np.sum(valsProfileB))

    counts_pearson = pearsonr(countsA, countsB)[0]
    counts_spearman = spearmanr(countsA, countsB)[0]

    logging.info("\t\tmin\t\tmax\t\tmedian")
    logging.info("mnll\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(multinomial_nll), np.max(multinomial_nll),
        np.median(multinomial_nll)))
    logging.info("cross_entropy\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(ce), np.max(ce), np.median(ce)))
    logging.info("jsd\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(jsd), np.max(jsd), np.median(jsd)))
    logging.info("pearson\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(pearson), np.max(pearson), np.median(pearson)))
    logging.info("spearman\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(spearman), np.max(spearman), np.median(spearman)))
    logging.info("mse\t\t{:0.3f}\t\t{:0.3f}\t\t{:0.3f}".format(
        np.min(mse), np.max(mse), np.median(mse)))
    logging.info("==============================================")
    logging.info("counts pearson: {}".format(counts_pearson))
    logging.info("counts spearman: {}".format(counts_spearman))

    np.savez_compressed('{}/mnll'.format(metrics_dir), mnll=multinomial_nll)
    np.savez_compressed('{}/cross_entropy'.format(metrics_dir),
                        cross_entropy=ce)
    np.savez_compressed('{}/mse'.format(metrics_dir), mse=mse)
    np.savez_compressed('{}/pearson'.format(metrics_dir), pearson=pearson)
    np.savez_compressed('{}/spearman'.format(metrics_dir), spearman=spearman)
    np.savez_compressed('{}/jsd'.format(metrics_dir), jsd=jsd)
    np.savez_compressed('{}/counts_pearson'.format(metrics_dir),
                        counts_pearson=counts_pearson)
    np.savez_compressed('{}/counts_spearman'.format(metrics_dir),
                        counts_spearman=counts_spearman)

    # write all the command line arguments to a json file
    config_file = '{}/config.json'.format(metrics_dir)
    with open(config_file, 'w') as fp:
        json.dump(vars(args), fp)
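
get_min_max_normalized_value is referenced above but not shown in this snippet; a plausible minimal sketch, assuming it performs plain min-max scaling (an assumption, not the project's actual implementation):

def get_min_max_normalized_value(value, min_value, max_value):
    # assumed behavior: linearly rescale `value` to [0, 1] using the
    # per-example lower and upper bounds; guard against a zero-width range
    if max_value == min_value:
        return 0.0
    return (value - min_value) / (max_value - min_value)
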
Code Example #7
def interpret_main():
    # parse the command line arguments
    parser = interpret_argsparser()
    args = parser.parse_args()

    # check if the output directory exists
    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_directory))

    # check if the output directory is a directory path
    if not os.path.isdir(args.output_directory):
        raise quietexception.QuietException("{} is not a directory".format(
            args.output_directory))

    # check if the reference genome file exists
    if not os.path.exists(args.reference_genome):
        raise quietexception.QuietException("File {} does not exist".format(
            args.reference_genome))

    # check if the model file exists
    if not os.path.exists(args.model):
        raise quietexception.QuietException("File {} does not exist".format(
            args.model))

    # check if the bed file exists
    if not os.path.exists(args.bed_file):
        raise quietexception.QuietException("File {} does not exist".format(
            args.bed_file))

    # if controls are specified check if the control_info json exists
    if args.control_info is not None:
        if not os.path.exists(args.control_info):
            raise quietexception.QuietException(
                "Input data file {} does not exist".format(args.control_info))

    # check if both args.chroms and args.sample are specified, only
    # one of the two is allowed
    if args.chroms is not None and args.sample is not None:
        raise quietexception.QuietException(
            "Only one of [--chroms, --sample]  is allowed")

    if args.automate_filenames:
        # create a new directory using current date/time to store the
        # interpretation scores
        date_time_str = local_datetime_str(args.time_zone)
        interpret_dir = '{}/{}'.format(args.output_directory, date_time_str)
        os.mkdir(interpret_dir)
    else:
        interpret_dir = args.output_directory

    # filename to write debug logs
    logfname = "{}/interpret.log".format(interpret_dir)

    # set up the loggers
    init_logger(logfname)

    # interpret
    logging.info("Loading {}".format(args.model))
    with CustomObjectScope(
        {'MultichannelMultinomialNLL': MultichannelMultinomialNLL}):

        interpret(args, interpret_dir)
Code Example #8
def interpret(args, interpret_dir):
    # load the model
    model = load_model(args.model)

    # read all the peaks into a pandas dataframe
    peaks_df = pd.read_csv(args.bed_file,
                           sep='\t',
                           header=None,
                           names=[
                               'chrom', 'st', 'end', 'name', 'score', 'strand',
                               'signalValue', 'p', 'q', 'summit'
                           ])

    if args.chroms is not None:
        # keep only those rows corresponding to the required
        # chromosomes
        peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]

    if args.sample is not None:
        # randomly sample rows
        logging.info("Sampling {} rows from {}".format(args.sample,
                                                       args.bed_file))
        peaks_df = peaks_df.sample(n=args.sample, random_state=args.seed)

    if args.presort_bed_file:
        # sort the bed file in descending order of peak strength
        peaks_df = peaks_df.sort_values(['signalValue'], ascending=False)

    # reset index (if any of the above 3 filters have been applied,
    # no harm if they haven't)
    peaks_df = peaks_df.reset_index(drop=True)

    # get final number of peaks
    num_peaks = peaks_df.shape[0]

    # reference file to fetch sequences
    logging.info("Opening reference file ...")
    fasta_ref = pysam.FastaFile(args.reference_genome)

    # if controls have been specified we need to open the control files
    # for reading
    control_bigWigs = []
    if args.control_info is not None:
        # load the control info json file
        with open(args.control_info, 'r') as inp_json:
            try:
                input_data = json.loads(inp_json.read())
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                raise quietexception.QuietException(exc_type.__name__ + ' ' +
                                                    str(exc_value))

        logging.info("Opening control bigWigs ...")
        # get the control bigWig for each task
        for task in input_data:
            if input_data[task]['task_id'] == args.task_id:
                if 'control' in input_data[task].keys():
                    control_bigWig_path = input_data[task]['control']

                    # check if the file exists
                    if not os.path.exists(control_bigWig_path):
                        raise quietexception.QuietException(
                            "File {} does not exist".format(
                                control_bigWig_path))

                    logging.info(control_bigWig_path)

                    # open the bigWig and add the file object to the
                    # list
                    control_bigWigs.append(pyBigWig.open(control_bigWig_path))

    # log of sum of counts of the control track
    # if multiple control files are specified this would be
    # log(sum(position_wise_sum_from_all_files))
    bias_counts_input = np.zeros((num_peaks, 1))

    # the control profile and the smoothed version of the control
    # profile (1 + 1 = 2, always :) )
    # if multiple control files are specified, the control profile for
    # each sample would be position_wise_sum_from_all_files
    bias_profile_input = np.zeros((num_peaks, args.control_len, 2))

    ## IF NO CONTROL BIGWIGS ARE SPECIFIED THEN THE TWO NUMPY ARRAYS
    ## bias_counts_input AND bias_profile_input WILL REMAIN ZEROS

    # list to hold all the sequences for the peaks
    sequences = []

    # get a list of valid rows to store only the peaks on which
    # the contribution scores are computed, excluding those that
    # may have raised exceptions; later we'll convert these rows
    # to a dataframe and write it out to a new file
    rows = []

    # iterate through all the peaks
    for idx, row in peaks_df.iterrows():

        # peak interval based on 'summit' position
        start = row['st'] + row['summit'] - (args.input_seq_len // 2)
        end = row['st'] + row['summit'] + (args.input_seq_len // 2)

        # fetch the reference sequence at the peak location
        try:
            seq = fasta_ref.fetch(row['chrom'], start, end).upper()
        except ValueError:  # start/end out of range
            logging.warn("Unable to fetch reference sequence at peak: "
                         "{} {}-{}. Skipped.".format(row['chrom'], start, end))
            continue

        # check if we have the required length
        if len(seq) != args.input_seq_len:
            logging.warn("Reference genome doesn't have required sequence "
                         "length ({}) at peak: {} {}-{}. Returned length {}. "
                         "Skipped.".format(args.input_seq_len, row['chrom'],
                                           start, end, len(seq)))
            continue

        # fetch control values
        if len(control_bigWigs) > 0:
            # a different start and end for controls since control_len
            # is usually not the same as input_seq_len
            start = row['st'] + row['summit'] - (args.control_len // 2)
            end = row['st'] + row['summit'] + (args.control_len // 2)

            # read the values from the control bigWigs
            for i in range(len(control_bigWigs)):
                vals = np.nan_to_num(control_bigWigs[i].values(
                    row['chrom'], start, end))
                bias_counts_input[idx, 0] += np.sum(vals)
                bias_profile_input[idx, :, 0] += vals

            # we need to take the log of the sum of counts
            # we add 1 to avoid taking log of 0
            # same as mseqgen does while generating batches
            bias_counts_input[idx, 0] = np.log(bias_counts_input[idx, 0] + 1)

            # compute the smoothed control profile
            sigma = float(args.control_smoothing[0])
            window_width = int(args.control_smoothing[1])
            bias_profile_input[idx, :, 1] = gaussian1D_smoothing(
                bias_profile_input[idx, :, 0], sigma, window_width)

        sequences.append(seq)

        # row passes all exception handling
        rows.append(dict(row))

    # if null distribution is requested
    null_sequences = []
    if args.gen_null_dist:
        logging.info("generating null sequences ...")
        rng = np.random.RandomState(args.seed)

        # iterate over sequences and get the dinucleotide shuffled
        # sequence for each of them
        for seq in sequences:
            # get a list of shuffled seqs. Since we are setting
            # num_shufs to 1, the returned list will be of size 1
            shuffled_seqs = dinuc_shuffle(seq, 1, rng)
            null_sequences.append(shuffled_seqs[0])

        # null sequences are now our actual sequences
        sequences = null_sequences[:]

    # one hot encode all the sequences
    X = one_hot_encode(sequences)
    print(X.shape)

    # inline function to handle dinucleotide shuffling
    def data_func(model_inputs):
        rng = np.random.RandomState(args.seed)
        return [dinuc_shuffle(model_inputs[0], args.num_shuffles, rng)] + \
        [
            np.tile(
                np.zeros_like(model_inputs[i]),
                (args.num_shuffles,) + (len(model_inputs[i].shape) * (1,))
            ) for i in range(1, len(model_inputs))
        ]

    # shap explainer for the counts head
    profile_model_counts_explainer = shap.explainers.deep.TFDeepExplainer(
        ([model.input[0], model.input[1]
          ], tf.reduce_sum(model.outputs[1], axis=-1)),
        data_func,
        combine_mult_and_diffref=combine_mult_and_diffref)

    # explainer for the profile head
    weightedsum_meannormed_logits = get_weightedsum_meannormed_logits(
        model, task_id=args.task_id, stranded=True)
    profile_model_profile_explainer = shap.explainers.deep.TFDeepExplainer(
        ([model.input[0], model.input[2]], weightedsum_meannormed_logits),
        data_func,
        combine_mult_and_diffref=combine_mult_and_diffref)

    logging.info("Generating 'counts' shap scores")
    counts_shap_scores = profile_model_counts_explainer.shap_values(
        [X, bias_counts_input], progress_message=100)

    # construct a dictionary for the 'counts' shap scores & the
    # projected 'counts' shap scores
    # MODISCO workflow expects one hot sequences with shape (?,4,1000)
    projected_shap_scores = np.multiply(X, counts_shap_scores[0])
    counts_scores = {
        'raw': {
            'seq': np.transpose(X, (0, 2, 1))
        },
        'shap': {
            'seq': np.transpose(counts_shap_scores[0], (0, 2, 1))
        },
        'projected_shap': {
            'seq': np.transpose(projected_shap_scores, (0, 2, 1))
        }
    }

    # save the dictionary in HDF5 format
    logging.info("Saving 'counts' scores")
    dd.io.save('{}/counts_scores.h5'.format(interpret_dir), counts_scores)

    logging.info("Generating 'profile' shap scores")
    profile_shap_scores = profile_model_profile_explainer.shap_values(
        [X, bias_profile_input], progress_message=100)

    # construct a dictionary for the 'profile' shap scores & the
    # projected 'profile' shap scores
    projected_shap_scores = np.multiply(X, profile_shap_scores[0])
    profile_scores = {
        'raw': {
            'seq': np.transpose(X, (0, 2, 1))
        },
        'shap': {
            'seq': np.transpose(profile_shap_scores[0], (0, 2, 1))
        },
        'projected_shap': {
            'seq': np.transpose(projected_shap_scores, (0, 2, 1))
        }
    }

    # save the dictionary in HDF5 format
    logging.info("Saving 'profile' scores")
    dd.io.save('{}/profile_scores.h5'.format(interpret_dir), profile_scores)

    # create dataframe from all rows that were successfully processed
    df_valid_scores = pd.DataFrame(rows)

    # save the dataframe as a new .bed file
    df_valid_scores.to_csv('{}/peaks_valid_scores.bed'.format(interpret_dir),
                           sep='\t',
                           header=False,
                           index=False)

    # write all the command line arguments to a json file
    config_file = '{}/config.json'.format(interpret_dir)
    with open(config_file, 'w') as fp:
        config = vars(args)
        json.dump(config, fp)
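
one_hot_encode is called above but not defined in this snippet; a minimal sketch of what such a helper might look like for DNA sequences (an assumption, shown for illustration only):

import numpy as np

def one_hot_encode(sequences):
    # map A/C/G/T to channel indices; any other base (e.g. N) stays all-zeros
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded = np.zeros((len(sequences), len(sequences[0]), 4),
                       dtype=np.float32)
    for i, seq in enumerate(sequences):
        for j, base in enumerate(seq.upper()):
            if base in mapping:
                encoded[i, j, mapping[base]] = 1.0
    return encoded

print(one_hot_encode(["ACGT"]).shape)  # (1, 4, 4): (examples, length, bases)
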
Code Example #9
File: generators.py  Project: annashcherbina/mseqgen
    def __init__(self, input_params, batch_gen_params, reference_genome, 
                 chrom_sizes, chroms, num_threads, epochs, batch_size):
        
        # sampling mode to get chromosome positions
        self.sampling_mode = batch_gen_params['sampling_mode']

        # ML task mode "train", "val" or "test"
        self.mode = batch_gen_params['mode']
        
        # check if at least one of the two input modes is present
        if not os.path.isdir(input_params['data']) and \
            os.path.splitext(input_params['data'])[1] != '.json':
            raise quietexception.QuietException(
                "Either input directory or input json must be specified. "
                "None found.")
        
        # load the input tasks either from the input dir or from
        # the input json
        if os.path.isdir(input_params['data']):
            self.tasks = sequtils.getInputTasks(
                input_params['data'], stranded=input_params['stranded'], 
                has_control=input_params['has_control'],
                require_peaks=(self.sampling_mode == 'peaks'), 
                mode=self.mode)
        else:
            # make sure the input_data json file exists
            if not os.path.isfile(input_params['data']):
                raise quietexception.QuietException(
                    "File not found: {}".format(input_params['data']))
        
            with open(input_params['data'], 'r') as inp_json:
                self.tasks = json.loads(inp_json.read())

        # check if the reference genome file exists
        if not os.path.isfile(reference_genome):
            raise quietexception.QuietException(
                "File not found: {}".format(reference_genome))
        
        # check if the chrom_sizes file exists
        if not os.path.isfile(chrom_sizes):
            raise quietexception.QuietException(
                "File not found: {}".format(chrom_sizes))

        self.num_tasks = len(list(self.tasks.keys()))
        self.reference = reference_genome

        # read the chrom sizes into a dataframe 
        self.chrom_sizes_df = pd.read_csv(chrom_sizes, sep = '\t', 
                              header=None, names = ['chrom', 'size']) 

        # chromosome list
        self.chroms = chroms
        
        # keep only those chrom_sizes rows corresponding to the 
        # required chromosomes
        self.chrom_sizes_df = self.chrom_sizes_df[
            self.chrom_sizes_df['chrom'].isin(self.chroms)]

        # generate a new column for sampling weights of the chromosomes
        self.chrom_sizes_df['weights'] = (self.chrom_sizes_df['size'] / 
                                          self.chrom_sizes_df['size'].sum())

        self.num_threads = num_threads
        self.epochs = epochs
        self.batch_size = batch_size

        # rest of batch generation parameters
        self.input_flank =  batch_gen_params['input_seq_len'] // 2
        self.output_flank = batch_gen_params['output_len'] // 2        
        self.max_jitter = batch_gen_params['max_jitter']
        self.negative_sampling_rate = batch_gen_params['negative_sampling_rate']
        self.rev_comp_aug = batch_gen_params['rev_comp_aug']
        self.shuffle = batch_gen_params['shuffle']
        
        # control batch generation for next epoch
        # if the value is not set to True, batches are not generated
        # Use an external controller to set value to True/False
        self.ready_for_next_epoch = False
        
        # (early) stopping flag
        self.stop = False
        
        if self.sampling_mode == 'peaks':
            
            # get a pandas dataframe for the peak positions
            # Note - we need the 'tasks' dictionary so we can access
            # the peaks.bed files from the paths available in the 
            # dictionary
            self.data = sequtils.getPeakPositions(
                self.tasks, self.chroms, 
                self.chrom_sizes_df[['chrom', 'size']], self.input_flank)  
            
        elif self.sampling_mode == 'sequential':
            
            if 'num_positions' not in batch_gen_params:
                raise quietexception.QuietException(
                    "Key not found in batch_gen_params_json: 'num_positions'. " 
                    "Required for sequential sampling mode")

            if 'step_size' not in batch_gen_params:
                raise quietexception.QuietException(
                    "Key not found in batch_gen_params_json: 'step_size'. " 
                    "Required for sequential sampling mode")

            # get a pandas dataframe with sequential positions at 
            # regular intervals
            self.data = sequtils.getChromPositions(
                self.chroms, self.chrom_sizes_df[['chrom', 'size']], 
                self.input_flank, mode=self.sampling_mode, 
                num_positions=batch_gen_params['num_positions'], 
                step=batch_gen_params['step_size'])

            self.max_jitter = 0

        elif self.sampling_mode == 'random':
            
            if 'num_positions' not in batch_gen_params:
                raise quietexception.QuietException(
                    "Key not found in batch_gen_params_json: 'num_positions'. "
                    "Required for random sampling mode")
            
            # get a pandas dataframe with random positions
            self.data = sequtils.getChromPositions(
                self.chroms, self.chrom_sizes_df[['chrom', 'size']], 
                self.input_flank, mode=self.sampling_mode, 
                num_positions=batch_gen_params['num_positions'])

            self.max_jitter = 0
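
For orientation, a hypothetical batch_gen_params dictionary built only from the keys this constructor reads; the values are illustrative placeholders, not defaults from mseqgen:

batch_gen_params = {
    'sampling_mode': 'peaks',        # 'peaks', 'sequential' or 'random'
    'mode': 'train',                 # 'train', 'val' or 'test'
    'input_seq_len': 2114,
    'output_len': 1000,
    'max_jitter': 128,
    'negative_sampling_rate': 0.8,
    'rev_comp_aug': True,
    'shuffle': True,
    # required only for the 'sequential' and 'random' sampling modes
    'num_positions': 10000,
    'step_size': 50,
}
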
Code Example #10
def modisco_main():
    parser = modisco_argsparser()
    args = parser.parse_args()

    if not os.path.exists(args.scores_path):
        raise quietexception.QuietException(
            "Score file {} does not exist".format(args.scores_path))


#     if not os.path.exists(args.scores_locations):
#         raise quietexception.QuietException(
#             "Scores locations file {} does not exist".format(
#                 args.scores_locations))

    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Output directiry {} does not exist".format(args.output_directory))

    # Load the scores
    scores = deepdish.io.load(args.scores_path)
    shap_scores_seq = []
    proj_shap_scores_seq = []
    one_hot_seqs = []

    center = int(scores['shap']['seq'].shape[-1] / 2)
    start = center - 200
    end = center + 200
    for i in scores['shap']['seq']:
        shap_scores_seq.append(i[:, start:end].transpose())

    for i in scores['projected_shap']['seq']:
        proj_shap_scores_seq.append(i[:, start:end].transpose())

    for i in scores['raw']['seq']:
        one_hot_seqs.append(i[:, start:end].transpose())

    tasks = ['task0']
    task_to_scores = OrderedDict()
    task_to_hyp_scores = OrderedDict()

    onehot_data = one_hot_seqs
    task_to_scores['task0'] = proj_shap_scores_seq
    task_to_hyp_scores['task0'] = shap_scores_seq

    # track_set = modisco.tfmodisco_workflow.workflow.prep_track_set(
    #     task_names=["task0"],
    #     contrib_scores=task_to_scores,
    #     hypothetical_contribs=task_to_hyp_scores,
    #     one_hot=onehot_data)

    tfmodisco_workflow = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        sliding_window_size=21,
        flank_size=10,
        target_seqlet_fdr=0.05,
        seqlets_to_patterns_factory=modisco.tfmodisco_workflow.
        seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            embedder_factory=modisco.seqlet_embedding.advanced_gapped_kmer.
            AdvancedGappedKmerEmbedderFactory(),
            trim_to_window_size=30,
            initial_flank_to_add=10,
            final_min_cluster_size=30))

    tfmodisco_results = tfmodisco_workflow(
        task_names=["task0"],
        contrib_scores=task_to_scores,
        hypothetical_contribs=task_to_hyp_scores,
        one_hot=onehot_data)

    modisco_results_path = '{}/modisco_results.h5'.format(
        args.output_directory)

    tfmodisco_results.save_hdf5(h5py.File(modisco_results_path, 'w'))
    print("Saved modisco results to file {}".format(str(modisco_results_path)))

    seqlet_path = '{}/seqlets.txt'.format(args.output_directory)
    print("Saving seqlets to %s" % seqlet_path)
    seqlets = \
        tfmodisco_results.metacluster_idx_to_submetacluster_results[0].seqlets
    bases = np.array(["A", "C", "G", "T"])
    with open(seqlet_path, "w") as f:
        for seqlet in seqlets:
            sequence = "".join(bases[np.argmax(seqlet["sequence"].fwd,
                                               axis=-1)])
            example_index = seqlet.coor.example_idx
            start, end = seqlet.coor.start, seqlet.coor.end
            f.write(">example%d:%d-%d\n" % (example_index, start, end))
            f.write(sequence + "\n")

    print("Saving pattern visualizations")

    patterns = (tfmodisco_results.metacluster_idx_to_submetacluster_results[0].
                seqlets_to_patterns_result.patterns)

    # generate .pngs of each motif and write motif seqlet to
    # individual files
    for idx, pattern in enumerate(patterns):
        print(pattern)
        print("pattern idx", idx)
        print(len(pattern.seqlets))

        pattern_seqlet_path = os.path.join(args.output_directory,
                                           'pattern{}_seqlets.txt'.format(idx))
        with open(pattern_seqlet_path, "w") as f:
            for seqlet in pattern.seqlets:
                sequence = "".join(bases[np.argmax(seqlet["sequence"].fwd,
                                                   axis=-1)])
                example_index = seqlet.coor.example_idx
                start, end = seqlet.coor.start, seqlet.coor.end
                f.write(">example%d:%d-%d\n" % (example_index, start, end))
                f.write(sequence + "\n")

        save_plot(pattern["task0_contrib_scores"].fwd,
                  '{}/contrib_{}.png'.format(args.output_directory, idx))
        save_plot(pattern["sequence"].fwd,
                  '{}/sequence_{}.png'.format(args.output_directory, idx))
Code Example #11
def logits2profile_main():
    # parse the command line arguments
    parser = logits2profile_argsparser()
    args = parser.parse_args()
    
    # check if the output directory exists
    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_dir))
        return
    
    # check if the logits file exists
    if not os.path.exists(args.logits_file):
        raise quietexception.QuietException(
            "Logits file {} does not exist".format(args.logits_file))
        return

    # check if the counts file exists
    if not os.path.exists(args.counts_file):
        raise quietexception.QuietException(
            "Counts file {} does not exist".format(args.counts_file))
        return

    # check if the peaks file exists
    if not os.path.exists(args.peaks):
        raise quietexception.QuietException(
            "Peaks file {} does not exist".format(args.peaks))
        return
    
    # check if the chrom sizes file exists
    if not os.path.exists(args.chrom_sizes):
        raise quietexception.QuietException(
            "Peaks file {} does not exist".format(args.chrom_sizes))
        return

    # construct header for the output bigWig file
    header = []
    # dataframe with chromosome sizes
    chrom_sizes_df = pd.read_csv(args.chrom_sizes, sep = '\t', header=None, 
                                 names = ['chrom', 'size'])
    chrom_sizes_df = chrom_sizes_df.set_index('chrom')
    # sort chromosomes, to be consistent with how pandas sorts
    # chromosomes ... e.g. chrom21 is < chrom8
    chroms = args.chroms[:]
    chroms.sort()
    for chrom in chroms:
        size = chrom_sizes_df.at[chrom, 'size']
        header.append((chrom, int(size)))

    logging.debug("bigWig HEADER - {}".format(header))
    
    # open logits bigWig for reading
    logits_bigWig = pyBigWig.open(args.logits_file)

    # open counts bigWig for reading
    counts_bigWig = pyBigWig.open(args.counts_file)

    # open output bigWig for writing 
    output_bigWig_fname = '{}/{}.bw'.format(args.output_directory, 
                                            args.output_filename)
    output_bigWig = pyBigWig.open(output_bigWig_fname, 'w')
    
    # add the header to the bigWig files
    output_bigWig.addHeader(header, maxZooms=0)    
    
    # read the peaks file into a dataframe 
    peaks_df = pd.read_csv(args.peaks, usecols=[0, 1, 2], 
                           names=['chrom', 'start', 'end'], header=None,
                           sep='\t')
    peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]
    peaks_df['_start'] = peaks_df['start'] + \
                         (peaks_df['end'] - peaks_df['start']) // 2 - \
                         args.window_size // 2 
    peaks_df['_end'] = peaks_df['_start'] + args.window_size
    peaks_df = peaks_df.sort_values(by=['chrom', '_start'])    
    print(peaks_df)
    
    # maintain a dictionary to record chrom coordinates that are
    # written to the output bigWig, this will make inserting 
    # overlapping coordinates easy to handle. pyBigWig's addEntries
    # function will scream if you write to a position to which
    # an entry was already added previously 
    # Note: since chromosomes are sorted we can delete the
    # previous chromosome entries to save memory
    write_log = {}
    
    prev_chrom = ''
    for index, row in tqdm(peaks_df.iterrows(), total=peaks_df.shape[0]):
        chrom = row['chrom']
        start = row['_start']
        end = row['_end']

        # delete write log entries of the previous chromosome
        if chrom != prev_chrom:
            write_log.pop(prev_chrom, None)
            # new dict for new chrom
            write_log[chrom] = {}
        prev_chrom = chrom
            
        try:
            logits_vals = np.nan_to_num(logits_bigWig.values(chrom, start, end))
        except RuntimeError as e:
            # Get current system exception
            ex_type, ex_value, ex_traceback = sys.exc_info()
            print("Skipping peak ({}, {}, {}) in logits bigWig. No data "
                  "found. Make sure to use the same peaks and "
                  "output-window-size that were used in the predict "
                  "step".format(chrom, start, end))
            continue

        try:
            counts_vals = np.nan_to_num(counts_bigWig.values(chrom, start, end))
        except RuntimeError as e:
            # Get current system exception
            ex_type, ex_value, ex_traceback = sys.exc_info()
            print("Skipping peak ({}, {}, {}) in counts bigWig. No data "
                  "found. Make sure to use the same peaks and "
                  "output-window-size that were used in the predict "
                  "step".format(chrom, start, end))
            continue

        chroms = [chrom] * args.window_size
        starts = list(range(start, end, 1))
        ends = list(range(start + 1, end + 1, 1))

        # scale logits: first softmax, then multiply by counts
        probVals = logits_vals - logsumexp(logits_vals)
        probVals = np.exp(probVals)
        profile = np.multiply(counts_vals, probVals)
        
        for i in range(len(chroms)):
            try:
                _ = write_log[chroms[i]][starts[i]]
            except KeyError as e:
                # write to bigWig only if the location was not written to
                # before
                output_bigWig.addEntries(
                    [chroms[i]], [starts[i]], ends=[ends[i]], 
                    values=[profile[i]])

                # add entry into write log
                write_log[chroms[i]][starts[i]] = 0
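
The core transform in the loop above (softmax the logits, then scale by the per-base counts track) as a minimal standalone sketch on toy arrays:

import numpy as np
from scipy.special import logsumexp

# toy per-base logits and a counts track in which every base holds the
# total predicted counts for the window (illustrative values only)
logits_vals = np.array([0.2, 1.5, -0.4, 0.9])
counts_vals = np.full(4, 100.0)

# softmax via log-sum-exp, then scale probabilities back up to counts
prob_vals = np.exp(logits_vals - logsumexp(logits_vals))
profile = counts_vals * prob_vals

print(np.sum(prob_vals))  # ~1.0
print(profile)            # per-base predicted counts
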
Code Example #12
def main():
    # change the way processes are started, default = 'fork'
    # had to do this to prevent Keras multi gpu from deadlocking
    mp.set_start_method('forkserver')

    # inform user of the keras stderr log file
    logging.warning("For all keras related error logs refer to "
                    "keras.stderr in your local directory")

    # parse the command line arguments
    parser = argparsers.training_argsparser()
    args = parser.parse_args()

    # input params
    input_params = {}
    input_params['data'] = args.input_data
    input_params['stranded'] = args.stranded
    input_params['has_control'] = args.has_control

    # output params
    output_params = {}
    output_params['automate_filenames'] = args.automate_filenames
    output_params['time_zone'] = args.time_zone
    output_params['tag_length'] = args.tag_length
    output_params['output_dir'] = args.output_dir
    output_params['model_output_filename'] = args.model_output_filename

    # genome params
    genome_params = {}
    genome_params['reference_genome'] = args.reference_genome
    genome_params['chrom_sizes'] = args.chrom_sizes
    genome_params['chroms'] = args.chroms
    genome_params['exclude_chroms'] = args.exclude_chroms

    # batch generation parameters
    batch_gen_params = {}
    batch_gen_params['sequence_generator_name'] = args.sequence_generator_name
    batch_gen_params['input_seq_len'] = args.input_seq_len
    batch_gen_params['output_len'] = args.output_len
    batch_gen_params['sampling_mode'] = args.sampling_mode
    batch_gen_params['rev_comp_aug'] = args.reverse_complement_augmentation
    batch_gen_params['negative_sampling_rate'] = args.negative_sampling_rate
    batch_gen_params['max_jitter'] = args.max_jitter
    batch_gen_params['shuffle'] = args.shuffle

    # hyper parameters
    hyper_params = {}
    hyper_params['epochs'] = args.epochs
    hyper_params['batch_size'] = args.batch_size
    hyper_params['learning_rate'] = args.learning_rate
    hyper_params['min_learning_rate'] = args.min_learning_rate
    hyper_params['early_stopping_patience'] = args.early_stopping_patience
    hyper_params['early_stopping_min_delta'] = args.early_stopping_min_delta
    hyper_params['reduce_lr_on_plateau_patience'] = \
        args.reduce_lr_on_plateau_patience

    # parallelization params
    parallelization_params = {}
    parallelization_params['threads'] = args.threads
    parallelization_params['gpus'] = args.gpus

    # network params
    network_params = {}
    network_params['name'] = args.model_arch_name
    network_params['filters'] = args.filters
    network_params['counts_loss_weight'] = args.counts_loss_weight
    network_params['control_smoothing'] = args.control_smoothing

    if not os.path.exists(output_params['output_dir']):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(output_params['output_dir']))

    if not output_params['automate_filenames'] and \
        output_params['model_output_filename'] is None:
        raise quietexception.QuietException(
            "Model output filename not specified")

    if not os.path.exists(genome_params['reference_genome']):
        raise quietexception.QuietException(
            "Reference genome file {} does not exist".format(
                genome_params['reference_genome']))

    if not os.path.exists(genome_params['chrom_sizes']):
        raise quietexception.QuietException(
            "Chromosome sizes file {} does not exist".format(
                genome_params['chrom_sizes']))

    try:
        get_model = getattr(model_archs, network_params['name'])
    except AttributeError:
        raise quietexception.QuietException(
            "Network {} not found in model definitions".format(
                network_params['name']))

    if not os.path.isfile(args.splits):
        raise quietexception.QuietException("File not found: {}", args.splits)

    # load splits from json file
    with open(args.splits, "r") as splits_json:
        splits = json.loads(splits_json.read())

    # training and validation
    training.train_and_validate_ksplits(input_params, output_params,
                                        genome_params, batch_gen_params,
                                        hyper_params, parallelization_params,
                                        network_params, splits)
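The four path checks in main() repeat the same pattern; a hypothetical helper, not part of basepairmodels and assuming the module's quietexception import is in scope, could factor them out:

# Hypothetical helper, not part of basepairmodels; assumes the module-level
# `quietexception` import is in scope
import os

def check_paths_exist(paths_and_messages):
    """ paths_and_messages: iterable of (path, error message template) """
    for path, message in paths_and_messages:
        if not os.path.exists(path):
            raise quietexception.QuietException(message.format(path))

# usage sketch mirroring the checks performed above
# check_paths_exist([
#     (output_params['output_dir'], "Directory {} does not exist"),
#     (genome_params['reference_genome'],
#      "Reference genome file {} does not exist"),
#     (genome_params['chrom_sizes'],
#      "Chromosome sizes file {} does not exist")])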
Code example #13
def getChromPositions(chroms,
                      chrom_sizes,
                      flank,
                      mode='sequential',
                      num_positions=-1,
                      step=50):
    """Chromosome positions spanning the entire chromosome at 
       a) regular intervals or b) random locations
        
        Args:
            chroms (space separated list): The list of required 
                chromosomes 
            chrom_sizes (pandas.Dataframe): dataframe of chromosome 
                sizes with 'chrom' and 'size' columns
            flank (int): buffer size before & after the position to
                ensure we don't fetch values at index < 0 or 
                > chrom size
            mode (str): mode of returned positions, 'sequential' (from
                the beginning) or 'random'
            num_positions (int): number of chromosome positions
                to return on each chromosome, use -1 to return 
                positions across the entire chromosome for all given
                chromosomes in `chroms`. mode='random' cannot be used
                with num_positions=-1
            step (int): the interval between consecutive chromosome
                positions
            
        Returns:
            pandas.DataFrame: two column dataframe of chromosome 
                positions (chrom, pos)
            
    """

    if mode == 'random' and num_positions == -1:
        raise quietexception.QuietException(
            "Incompatible parameter pairing: 'mode' = random, "
            "'num_positions' = -1")

    # check if chrom_sizes has a column called 'chrom'
    if 'chrom' not in chrom_sizes.columns:
        logging.error("Expected column 'chrom' not found in chrom_sizes")
        return None

    chrom_sizes = chrom_sizes.set_index('chrom')

    # initialize an empty dataframe with 'chrom' and 'pos' columns
    positions = pd.DataFrame(columns=['chrom', 'pos'])

    # for each chromosome in the list
    for i in range(len(chroms)):
        chrom_size = chrom_sizes.at[chroms[i], 'size']

        # keep start & end within bounds
        start = flank
        end = chrom_size - flank + 1

        if mode == 'random':
            # randomly sample positions
            pos_array = np.random.randint(start, end, num_positions)

        elif mode == 'sequential':
            _end = end
            if num_positions != -1:
                # change the last position based on the number of
                # required positions
                _end = start + step * num_positions

                # if the newly computed 'end' goes beyond the
                # chromosome end (we could throw an error here)
                if _end > end:
                    _end = end

            # positions at regular intervals
            pos_array = list(range(start, _end, step))

        # construct a dataframe for this chromosome
        chrom_df = pd.DataFrame({
            'chrom': [chroms[i]] * len(pos_array),
            'pos': pos_array
        })

        # concatenate to existing df
        positions = pd.concat([positions, chrom_df])

    return positions
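A short usage sketch for getChromPositions with a toy chrom_sizes dataframe; the chromosome sizes below are made up for illustration:

# Usage sketch for getChromPositions; chromosome sizes are made up
import pandas as pd

toy_chrom_sizes = pd.DataFrame({'chrom': ['chr1', 'chr2'],
                                'size': [100000, 80000]})

# positions every 50 bases across both chromosomes, keeping a 500 bp flank
sequential_positions = getChromPositions(
    ['chr1', 'chr2'], toy_chrom_sizes, flank=500, mode='sequential', step=50)

# 100 randomly sampled positions per chromosome
random_positions = getChromPositions(
    ['chr1', 'chr2'], toy_chrom_sizes, flank=500, mode='random',
    num_positions=100)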
Code example #14
File: bounds.py  Project: erankotler/basepairmodels
def bounds_main():
    """
        The main entry point for the bounds computation script
    """
    
    # parse the command line arguments
    parser = bounds_argsparser()
    args = parser.parse_args()
    
    # check if the output directory exists
    if not os.path.exists(args.output_directory):
        raise quietexception.QuietException(
            "Directory {} does not exist".format(args.output_directory))

    # check to make sure at least one input profile was provided
    if len(args.input_profiles) == 0:
        raise quietexception.QuietException(
            "At least one input file is required to compute upper and "
            "lower bound")

    # check to see if the number of output names is equal to the number
    # of input profiles that were provided
    if len(args.output_names) != len(args.input_profiles):
        raise quietexception.QuietException(
            "There should be the same number of output names as the "
            "number of input files")

    # check if each input profile bigWig file exists
    for fname in args.input_profiles:
        if not os.path.exists(fname):
            raise quietexception.QuietException(
                "File not found! {}".format(fname))

    # check if the peaks file exists
    if not os.path.exists(args.peaks):
        raise quietexception.QuietException(
            "Peaks file {} does not exist".format(args.peaks))

    # read the peaks bed file into a pandas dataframe
    peaks_df = pd.read_csv(args.peaks, sep='\t', header=None, 
                           names=['chrom', 'st', 'en', 'name', 'score',
                                  'strand', 'signalValue', 'p', 'q', 'summit'])
    
    # if the --chroms parameter is provided, filter the dataframe rows
    if args.chroms is not None:
        peaks_df = peaks_df[peaks_df['chrom'].isin(args.chroms)]
    
    # modified start and end based on summit & specified peak_width
    peaks_df['start'] = peaks_df['st'] + peaks_df['summit'] - \
                            (args.peak_width // 2)
    peaks_df['end'] = peaks_df['st'] + peaks_df['summit'] + \
                            (args.peak_width // 2)
    
    print("Peaks shape", peaks_df.shape[0])
    # reset index in case rows have been filtered
    peaks_df = peaks_df.reset_index()
    
    # iterate through each input profile
    for i in range(len(args.input_profiles)):
        
        # path to input profile bigWig
        input_profile_bigWig = args.input_profiles[i]
        
        print("Processing ... ", input_profile_bigWig)
        
        # compute upper & lower bounds, and avg profile performance
        average_profile, bounds_df = bounds(
            input_profile_bigWig, peaks_df, args.peak_width, 
            args.smoothing_params)

        # path to output average profile file
        average_profile_filename = "{}/{}_average_profile.csv".format(
            args.output_directory, args.output_names[i])
        
        # write average profile to csv file
        print("Saving average profile ...")
        np.savetxt(average_profile_filename, average_profile,
                   delimiter=",")
        
        # path to the output bounds file
        output_fname = "{}/{}.bds".format(args.output_directory, 
                                          args.output_names[i])
        
        # write the dataframe to a csv file
        print("Saving bounds file ...")
        bounds_df.to_csv(output_fname, index=False)
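A sketch of calling bounds() directly (it is defined in the next example) rather than via the script above; the paths are hypothetical, and peaks_df is assumed to carry fixed-width 'chrom', 'start', 'end' columns as constructed above:

# Direct-call sketch for bounds(); paths are hypothetical and peaks_df is
# assumed to have fixed-width 'chrom', 'start', 'end' columns as above
import numpy as np

peak_width = 1000   # must match end - start for every row of peaks_df
average_profile, bounds_df = bounds(
    "/path/to/profile.bw", peaks_df, peak_width, smoothing_params=[7, 81])

# persist the two outputs the same way bounds_main does
np.savetxt("/path/to/output/task0_average_profile.csv", average_profile,
           delimiter=",")
bounds_df.to_csv("/path/to/output/task0.bds", index=False)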
Code example #15
File: bounds.py  Project: erankotler/basepairmodels
def bounds(input_bigWig, peaks_df, peak_width, smoothing_params=[7, 81]):
    """
        Function to compute lower & upper bounds, and average profile
        performance for cross entropy and jsd metrics
        
        Args:
            input_bigWig (str): path to bigWig file
            
            peaks_df (pandas.DataFrame): dataframe containing peaks 
                information. 
                
                The dataframe should have 'chrom', 'start', and 'end'
                as its first 3 columns. Each peak should have the same
                width (equal to peak_width), i.e. 'end' - 'start' is
                the same for all rows in the dataframe.
                
            peak_width (int): width of each peak.
            
            smoothing_params (list): list of length 2, containing sigma
                and window_size values for 1D gaussian smoothing of 
                profiles
        
        Returns:
            tuple: (numpy array of average profile, pandas dataframe
                with bounds values in columns)
                
    """
    
    # compute the average profile
    print("Computing average profile ...")
    avg_profile = get_average_profile(input_bigWig, peaks_df, peak_width)
    
    # get average profile as probabilities
    avg_profile_prob = avg_profile / np.sum(avg_profile)
    
    # open the bigWig file for reading
    bw = pyBigWig.open(input_bigWig)
        
    # arrays to hold metrics values for mnll, cross entropy, jsd, 
    # pearson and spearman correlation of the peak profile computed 
    # against uniform, average and self(observed peak) profile

    # mnll
    mnll_uniform = np.zeros(peaks_df.shape[0])
    mnll_average = np.zeros(peaks_df.shape[0])
    mnll_self = np.zeros(peaks_df.shape[0])    
    
    # cross entropy
    ce_uniform = np.zeros(peaks_df.shape[0])
    ce_average = np.zeros(peaks_df.shape[0])
    ce_self = np.zeros(peaks_df.shape[0])
    
    # jsd
    jsd_uniform = np.zeros(peaks_df.shape[0])
    jsd_average = np.zeros(peaks_df.shape[0])
    jsd_self = np.zeros(peaks_df.shape[0])
    
    # pearson
    pearson_uniform = np.zeros(peaks_df.shape[0])
    pearson_average = np.zeros(peaks_df.shape[0])
    pearson_self = np.zeros(peaks_df.shape[0])
    
    # spearman
    spearman_uniform = np.zeros(peaks_df.shape[0])
    spearman_average = np.zeros(peaks_df.shape[0])
    spearman_self = np.zeros(peaks_df.shape[0])

    print("Computing bounds ...")

    # iterate through all peaks
    for idx, row in tqdm(peaks_df.iterrows(), desc='peak', 
                         total=peaks_df.shape[0]):

        # raise exception if 'end' - 'start' is not equal to peak_width
        if (row['end'] - row['start']) != peak_width:

            raise quietexception.QuietException(
                "Inconsistent peak width found at: {}:{}-{}".format(
                    row['chrom'], row['start'], row['end']))

        # get bigWig profile
        profile = np.nan_to_num(
            bw.values(row['chrom'], row['start'], row['end']))

        # if we find that the profile at this peak is all zeros
        if sum(profile) == 0:

            print("Found 'zero' profile at {}: ({}, {})".format(
                row['chrom'], row['start'], row['end']))

            # assign nans to all 
            mnll_uniform[idx] = np.nan
            mnll_average[idx] = np.nan
            mnll_self[idx] = np.nan

            ce_uniform[idx] = np.nan
            ce_average[idx] = np.nan
            ce_self[idx] = np.nan

            jsd_uniform[idx] = np.nan
            jsd_average[idx] = np.nan
            jsd_self[idx] = np.nan

            pearson_uniform[idx] = np.nan
            pearson_average[idx] = np.nan
            pearson_self[idx] = np.nan

            spearman_uniform[idx] = np.nan
            spearman_average[idx] = np.nan
            spearman_self[idx] = np.nan

            continue

        # uniform distribution profile
        uniform_profile = np.ones(peak_width) * (1.0 / peak_width)

        # smoothed profile 
        profile_smooth = gaussian1D_smoothing(profile, smoothing_params[0], 
                                              smoothing_params[1])

        # smoothed profile as probabilities 
        profile_smooth_prob = profile_smooth / np.sum(profile_smooth)

        # profile as probabilities
        profile_prob = profile / np.sum(profile)

        # mnll of profile with uniform profile
        mnll_uniform[idx] = mnll(profile, probs=uniform_profile)

        # mnll of profile with average profile
        mnll_average[idx] = mnll(profile, probs=avg_profile_prob)

        # mnll of profile with itself
        mnll_self[idx] = mnll(profile, probs=profile_prob)

        # cross entropy of profile with uniform profile
        ce_uniform[idx] = profile_cross_entropy(profile, 
                                                probs=uniform_profile)

        # cross entropy of profile with average profile
        ce_average[idx] = profile_cross_entropy(profile, 
                                                probs=avg_profile_prob)

        # cross entropy of profile with itself
        ce_self[idx] = profile_cross_entropy(profile, probs=profile_prob)

        # jsd of profile with uniform profile
        jsd_uniform[idx] = jensenshannon(profile_prob, uniform_profile)

        # jsd of profile with average profile
        jsd_average[idx] = jensenshannon(profile_prob, avg_profile_prob)

        # jsd of profile with itself (upper bound)
        jsd_self[idx] = 0.0

        # pearson of profile with the uniform profile is undefined
        # (constant input), so leave it as zeros

        # pearson of profile with average profile
        pearson_average[idx] = pearsonr(profile, avg_profile_prob)[0]
        
        # pearson of profile with itself
        pearson_self[idx] = pearsonr(profile, profile)[0]
        
        # spearman of profile with the uniform profile is undefined
        # (constant input), so leave it as zeros

        # spearman of profile with average profile
        spearman_average[idx] = spearmanr(profile, avg_profile_prob)[0]

        # spearman of profile with itself
        spearman_self[idx] = spearmanr(profile, profile)[0]

    # column names for the upper & lower bound, and avg profile 
    # performance values 
    column_names = ['mnll_uniform', 'mnll_average', 'mnll_self',
                    'ce_uniform', 'ce_average', 'ce_self',
                    'jsd_uniform', 'jsd_average', 'jsd_self',
                    'pearson_uniform', 'pearson_average', 'pearson_self', 
                    'spearman_uniform', 'spearman_average', 'spearman_self']
    
    # create a pandas dataframe to store all the bounds values
    bounds_df = pd.DataFrame(columns = column_names)
        
    # assign values to the dataframe columns
    bounds_df['mnll_uniform'] = np.nan_to_num(mnll_uniform)
    bounds_df['mnll_average'] = np.nan_to_num(mnll_average)
    bounds_df['mnll_self'] = np.nan_to_num(mnll_self)
    bounds_df['ce_uniform'] = np.nan_to_num(ce_uniform)
    bounds_df['ce_average'] = np.nan_to_num(ce_average)
    bounds_df['ce_self'] = np.nan_to_num(ce_self)
    bounds_df['jsd_uniform'] = np.nan_to_num(jsd_uniform)
    bounds_df['jsd_average'] = np.nan_to_num(jsd_average)
    bounds_df['jsd_self'] = np.nan_to_num(jsd_self)
    bounds_df['pearson_uniform'] = np.nan_to_num(pearson_uniform)
    bounds_df['pearson_average'] = np.nan_to_num(pearson_average)
    bounds_df['pearson_self'] = np.nan_to_num(pearson_self)
    bounds_df['spearman_uniform'] = np.nan_to_num(spearman_uniform)
    bounds_df['spearman_average'] = np.nan_to_num(spearman_average)
    bounds_df['spearman_self'] = np.nan_to_num(spearman_self)

    return avg_profile, bounds_df
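A toy sketch of the per-peak JSD bound computation above, with a made-up five-base profile:

# Toy sketch of the lower/upper JSD bound computed per peak above; the
# profile values are made up for illustration
import numpy as np
from scipy.spatial.distance import jensenshannon

toy_profile = np.array([2.0, 8.0, 30.0, 8.0, 2.0])
peak_width = len(toy_profile)

toy_profile_prob = toy_profile / np.sum(toy_profile)
uniform_profile = np.ones(peak_width) * (1.0 / peak_width)

# lower bound: observed profile against the uniform profile
print(jensenshannon(toy_profile_prob, uniform_profile))

# upper bound: observed profile against itself, 0 by definition
print(jensenshannon(toy_profile_prob, toy_profile_prob))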
Code example #16
def counts_loss_weight_main():
    """
        main function for counts loss weight computation
    """

    # parse the command line arguments
    parser = counts_loss_weight_argsparser()
    args = parser.parse_args()

    # check if the input data file exists
    if not os.path.exists(args.input_data):
        # output the default value to stdout
        print(args.default)

        raise quietexception.QuietException(
            "Input data file {} does not exist. Using default weight "
            "{}".format(args.input_data, args.default))

    with open(args.input_data, 'r') as inp_json:
        try:
            input_data = json.loads(inp_json.read())
        except Exception as e:
            # output the default value to stdout
            print(args.default)

            exc_type, exc_value, exc_traceback = sys.exc_info()
            raise quietexception.QuietException(
                "{} {}. Using default weight {}".format(
                    exc_type.__name__, str(exc_value), args.default))

    # get all the bigWigs and peaks from the input_data
    bigWigs = []
    peaks = []
    for task in input_data:
        if 'signal' in input_data[task].keys():
            bigWigs.append(input_data[task]['signal'])

        if 'peaks' in input_data[task].keys():
            peaks.append(input_data[task]['peaks'])

    # if no bigWigs found
    if len(bigWigs) == 0:
        # output the default value to stdout
        print(args.default)

        raise quietexception.QuietException(
            "No 'signal' bigWigs found. Using default weight {}".format(
                args.default))
    else:
        # check to see if all are valid paths
        for bigWig in bigWigs:
            if not os.path.exists(bigWig):
                # output the default value to stdout
                print(args.default)

                raise quietexception.QuietException(
                    "File {} does not exist. Using default weight "
                    "{}".format(bigWig, args.default))

    # if no peaks found
    if len(peaks) == 0:
        # output the default value to stdout
        print(args.default)

        raise quietexception.QuietException(
            "No 'peaks' files found. Using default weight {}".format(
                args.default))
    else:
        # check to see if all are valid paths
        for peak_file in peaks:
            if not os.path.exists(peak_file):
                # output the default value to stdout
                print(args.default)

                raise quietexception.QuietException(
                    "File {} does not exist. Using default weight "
                    "{}".format(peak_file, args.default))

    # list of all peaks dataframes to be passed to stats function
    peaks_dfs = []

    # load each peak file and compute the correct 'start' and 'end'
    # intervals
    for peak_file in peaks:
        peaks_df = pd.read_csv(peak_file,
                               sep='\t',
                               header=None,
                               names=[
                                   'chrom', 'st', 'e', 'name', 'score',
                                   'strand', 'signal', 'p', 'q', 'summit'
                               ])

        # create new column for peak start
        peaks_df['start'] = peaks_df['st'] + \
                            peaks_df['summit'] - \
                            args.peak_width//2

        # create new column for peak end
        peaks_df['end'] = peaks_df['st'] + \
                            peaks_df['summit'] + \
                            args.peak_width//2

        # append to the list of peaks dataframes
        peaks_dfs.append(peaks_df[['chrom', 'start', 'end']])

    # compute the counts loss weight using the stats module function
    clw = stats.get_recommended_counts_loss_weight(bigWigs, peaks_dfs,
                                                   args.alpha)

    print(clw)
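The script above expects an input_data JSON whose tasks each carry 'signal' (bigWig) and 'peaks' (peaks bed) entries; a minimal hypothetical example, with made-up task name and paths (real configs may carry additional keys):

# Hypothetical input_data JSON for the script above; task name and paths
# are made up, and real configs may carry additional keys
import json

input_data = {
    "task0": {
        "signal": "/path/to/task0_signal.bw",
        "peaks": "/path/to/task0_peaks.bed"
    }
}

with open("input_data.json", "w") as f:
    json.dump(input_data, f, indent=4)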