Ejemplo n.º 1
0
    def calc_self_complementarity(self,
                                  score_self_comp,
                                  backbone_regions,
                                  pam,
                                  replace_5_prime=None):
        """ Count GC-rich stems that could pair with the guide itself or the backbone.

        Every window of STEM_LEN characters of the (PAM-less) guide whose
        gc_content is at least 0.5 is counted when it also occurs in the
        leading part of the guide's reverse complement or in any of the
        backbone regions. The count is stored in ``self.folding`` and
        ``self.folding * config.score('FOLDING')`` is added to ``self.score``.

        :param score_self_comp: not read by this method.
        :param backbone_regions: iterable of sequences searched for each stem.
        :param pam: PAM motif; only its length is used, to skip the PAM prefix.
        :param replace_5_prime: optional replacement string. When given, the
            last ``len(replace_5_prime)`` characters of the stranded guide are
            dropped and ``replace_5_prime`` is appended in their place.
        """
        pam_len = len(pam)
        if replace_5_prime:
            # Swap the tail of the stranded guide for the replacement, e.g. "GG"
            kept = self.stranded_guide_seq[pam_len:-len(replace_5_prime)]
            fwd = kept + replace_5_prime
        else:
            # Do not include PAM motif in folding calculations
            fwd = self.guide_seq[pam_len:]

        rvs = str(Seq(fwd).reverse_complement())
        n_windows = len(fwd) - STEM_LEN
        search_limit = n_windows - 1

        self.folding = 0

        for start in range(n_windows):
            stem = fwd[start:start + STEM_LEN]
            if not (gc_content(stem) >= 0.5):
                continue

            # The searched prefix of the reverse complement shrinks as the
            # window moves along the guide.
            in_reverse = stem in rvs[:search_limit - start]
            if in_reverse or any([stem in region
                                  for region in backbone_regions]):
                self.folding += 1

        self.score += self.folding * config.score('FOLDING')
Ejemplo n.º 2
0
def _add_scores(dataset,dataset_fits):
    """Attach fit and leave-one-out (LOO) scores to each gene/region fit.

    For every ``(gene, region)`` entry of ``dataset_fits`` whose gene is not
    None, sets ``fit.fit_score`` and ``fit.LOO_score``. Scoring is best-effort:
    if computing a score raises, that score is set to None. When the fit has a
    ``with_correlations`` attribute, each level in it also gets a ``LOO_score``.

    Args:
        dataset: object providing ``get_one_series(gene, region)``.
        dataset_fits: mapping from ``(gene, region)`` to a fit object. Entries
            with ``gene is None`` are region fits and are skipped.

    Returns:
        The same ``dataset_fits`` mapping, with scores set on its fit objects.
    """
    # Python 2 mappings expose iteritems(); a plain Python 3 dict only has items().
    iter_items = getattr(dataset_fits, 'iteritems', None) or dataset_fits.items
    for (g,r),fit in iter_items():
        if g is None:
            continue  # it's a region fit

        series = dataset.get_one_series(g,r)

        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            if fit.fit_predictions is None:
                fit.fit_score = None
            else:
                fit.fit_score = cfg.score(series.single_expression, fit.fit_predictions)
        except Exception:
            fit.fit_score = None
        try:
            fit.LOO_score = loo_score(series.single_expression, fit.LOO_predictions)
        except Exception:
            fit.LOO_score = None

        # add score for correlation LOO fits
        correlation_levels = getattr(fit, 'with_correlations', None)
        if correlation_levels is not None:
            for level in correlation_levels:
                y_real = series.single_expression
                # match the predictions to the indices of the single series after NaN are removed from it
                y_pred = level.LOO_predictions[series.original_inds]
                level.LOO_score = loo_score(y_real, y_pred)

    return dataset_fits
Ejemplo n.º 3
0
def _add_scores(dataset, dataset_fits):
    """Attach fit and leave-one-out (LOO) scores to each gene/region fit.

    For every ``(gene, region)`` entry of ``dataset_fits`` whose gene is not
    None, sets ``fit.fit_score`` and ``fit.LOO_score``. Scoring is best-effort:
    if computing a score raises, that score is set to None. When the fit has a
    ``with_correlations`` attribute, each level in it also gets a ``LOO_score``.

    Args:
        dataset: object providing ``get_one_series(gene, region)``.
        dataset_fits: mapping from ``(gene, region)`` to a fit object. Entries
            with ``gene is None`` are region fits and are skipped.

    Returns:
        The same ``dataset_fits`` mapping, with scores set on its fit objects.
    """
    # Python 2 mappings expose iteritems(); a plain Python 3 dict only has
    # items(), so fall back to it when iteritems is missing.
    iter_items = getattr(dataset_fits, 'iteritems', None) or dataset_fits.items
    for (g, r), fit in iter_items():
        if g is None:
            continue  # it's a region fit

        series = dataset.get_one_series(g, r)

        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            if fit.fit_predictions is None:
                fit.fit_score = None
            else:
                fit.fit_score = cfg.score(series.single_expression,
                                          fit.fit_predictions)
        except Exception:
            fit.fit_score = None
        try:
            fit.LOO_score = loo_score(series.single_expression,
                                      fit.LOO_predictions)
        except Exception:
            fit.LOO_score = None

        # add score for correlation LOO fits
        correlation_levels = getattr(fit, 'with_correlations', None)
        if correlation_levels is not None:
            for level in correlation_levels:
                y_real = series.single_expression
                # match the predictions to the indices of the single series
                # after NaN are removed from it
                y_pred = level.LOO_predictions[series.original_inds]
                level.LOO_score = loo_score(y_real, y_pred)

    return dataset_fits
Ejemplo n.º 4
0
def plot_one_series(series, shape=None, theta=None, LOO_predictions=None, change_distribution=None, minimal_annotations=False, ax=None, show_legend=True):
    """Plot one expression series, optionally with its fit and LOO residuals.

    The data points (``series.ages`` vs ``series.single_expression``) are
    always drawn. When ``ax`` is None a new figure is created and axis labels
    and a title are added; when an axes is supplied it is treated as a subplot
    and axis labels and title are not set here.

    Args:
        series: object providing ``ages``, ``single_expression``,
            ``gene_name``, ``region_name`` and ``age_scaler``.
        shape: fitted shape. The fit curve (and LOO residuals) are drawn only
            when both ``shape`` and ``theta`` are given.
        theta: fit parameters passed to ``shape``.
        LOO_predictions: optional per-point leave-one-out predictions. Entries
            that are None or NaN are skipped.
        change_distribution: optional object with ``centers`` and ``weights``,
            drawn as bars scaled to 90% of the current y-range. Its ``weights``
            are not modified.
        minimal_annotations: use the smaller font and markers, and skip the
            legend, y tick label sizing and title.
        ax: axes to draw on, or None to create a new figure.
        show_legend: draw a legend for the fit (ignored with
            ``minimal_annotations``).

    Returns:
        The figure that owns the axes.
    """
    x = series.ages
    y = series.single_expression
    b_subplot = ax is not None
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
    fontsize = cfg.minimal_annotation_fontsize if minimal_annotations else cfg.fontsize

    # plot the data points
    markersize = 4 if minimal_annotations else 8
    ax.plot(x, y, 'ks', markersize=markersize)
    if not b_subplot:
        ax.set_ylabel('expression level', fontsize=fontsize)
        ax.set_xlabel('age', fontsize=fontsize)
    ttl = '{}@{}'.format(series.gene_name, series.region_name)
    add_age_ticks(ax, series.age_scaler, fontsize)

    # plot change distribution if provided
    if change_distribution:
        ymin, ymax = ax.get_ylim()
        centers = change_distribution.centers
        width = centers[1] - centers[0]
        raw_weights = change_distribution.weights
        # Scale into a new array: an in-place `*=` would rescale the caller's
        # weights every time the series is plotted.
        weights = raw_weights * (0.9 * (ymax - ymin) / raw_weights.max())
        ax.bar(centers, weights, width=width, bottom=ymin, color='g', alpha=0.5)

    if shape is not None and theta is not None:
        # add fit parameters to title
        ttl = '{}, {} fit'.format(ttl, shape)
        more_ttl = shape.format_params(theta, series.age_scaler, latex=True)
        if more_ttl:
            ttl = '\n'.join([ttl, more_ttl])

        # draw the overall fit
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            # loo_score returns None when too few valid points remain; a None
            # score cannot be formatted with '{:.3g}', so skip the label then.
            loo_label = None
            if score is not None:
                loo_label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
            for xi, yi, y_loo in zip(x, y, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                # Label only the first residual that is actually drawn, so the
                # legend entry survives a missing first prediction.
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5', label=loo_label)
                loo_label = None
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        if show_legend and not minimal_annotations:
            ax.legend(fontsize=fontsize, frameon=False)

    if not minimal_annotations:
        ax.tick_params(axis='y', labelsize=fontsize)
        if not b_subplot:
            ax.set_title(ttl, fontsize=fontsize)
    return ax.figure
Ejemplo n.º 5
0
    def calc_gc_content(self, score_gc):
        """ Compute the guide's GC percentage and optionally penalise extremes.

        When both ``self.pam`` and ``self.stranded_guide_seq`` are set, the
        percentage of 'G' and 'C' characters in the guide after the first
        ``len(self.pam)`` characters is stored in ``self.gc_content``.

        :param score_gc: when true, add ``config.score('CRISPR_BAD_GC')`` to
            ``self.score`` if ``self.gc_content`` is above GC_HIGH or below
            GC_LOW.
        """
        has_sequence = (self.pam is not None
                        and self.stranded_guide_seq is not None)
        if has_sequence:
            target = self.stranded_guide_seq[len(self.pam):]
            gc_bases = target.count('G') + target.count('C')
            self.gc_content = 100 * (float(gc_bases) / len(target))

        if not score_gc:
            return

        out_of_range = (self.gc_content > GC_HIGH
                        or self.gc_content < GC_LOW)
        if out_of_range:
            self.score += config.score('CRISPR_BAD_GC')
Ejemplo n.º 6
0
def plot_one_exon(series,
                  shape=None,
                  theta=None,
                  LOO_predictions=None,
                  ax=None,
                  y_range=None):
    """Plot the expression series of one exon, optionally with its fit.

    The y values, fit curve and LOO predictions are passed through the scaler
    built from ``cfg.plots_scaling`` when one is configured. The title is the
    part of ``series.gene_name`` after the first ``cfg.exon_separator``, with
    remaining separators replaced by '-'.

    Args:
        series: object providing ``ages``, ``single_expression``,
            ``gene_name`` and ``age_scaler``.
        shape: fitted shape. The fit curve, LOO residuals and legend are drawn
            only when both ``shape`` and ``theta`` are given.
        theta: fit parameters passed to ``shape``.
        LOO_predictions: optional per-point leave-one-out predictions. Entries
            that are None or NaN are skipped.
        ax: axes to draw on, or None to create a new figure.
        y_range: optional (unscaled) y limits for the axes.

    Returns:
        The figure that owns the axes.
    """
    x = series.ages
    y = series.single_expression

    if ax is None:
        # ax defaults to None, so create axes instead of failing on ax.plot
        fig = plt.figure()
        ax = fig.add_subplot(111)

    fontsize = cfg.minimal_annotation_fontsize
    markersize = 8
    y_scaler = scalers.build_scaler(cfg.plots_scaling, None)
    scaled = y_scaler is not None

    y_scaled = y_scaler.scale(y) if scaled else y
    if scaled and y_range is not None:
        y_range = y_scaler.scale(y_range)

    if y_range is not None:
        # Set the limits on the axes being drawn; plt.ylim would act on
        # whatever pyplot considers the current axes.
        ax.set_ylim(y_range)
    ax.plot(x, y_scaled, 'ks', markersize=markersize)
    ax.set_xlabel('age', fontsize=fontsize)
    add_age_ticks(ax, series.age_scaler, fontsize)
    exon = series.gene_name[series.gene_name.index(cfg.exon_separator) + 1:]
    ax.set_title(exon.replace(cfg.exon_separator, '-'), fontsize=14)

    if shape is not None and theta is not None:
        # the fit score is computed on the unscaled values
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        if scaled:
            y_smooth = y_scaler.scale(y_smooth)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            if scaled:
                LOO_predictions = y_scaler.scale(LOO_predictions)
            # loo_score may return None, in which case no label is shown
            loo_label = None
            if score is not None:
                loo_label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
            for xi, yi, y_loo in zip(x, y_scaled, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                # Label only the first residual that is actually drawn, so the
                # legend entry survives a missing first prediction.
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5',
                        label=loo_label)
                loo_label = None
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)

        ax.legend(fontsize=fontsize, frameon=False)
    return ax.figure
Ejemplo n.º 7
0
def loo_score(y_real, y_pred):
    """Score leave-one-out (LOO) predictions against the real values.

       The first and last points are dropped before scoring: their LOO fits
       are evaluated outside the range they were trained on, which can overfit
       badly for functions like a sigmoid when the data is basically flat.
       Fits on the whole data are only evaluated inside their training range,
       so leaving out the two end points should give a better estimate of the
       generalization error.
       Positions where either sequence is NaN are ignored as well.

       Returns None when fewer than 3 points remain, otherwise the result of
       cfg.score on the remaining points.
    """
    inner_real = y_real[1:-1]
    inner_pred = y_pred[1:-1]

    # keep only positions where neither value is NaN
    usable = ~(np.isnan(inner_real) | np.isnan(inner_pred))
    inner_real = inner_real[usable]
    inner_pred = inner_pred[usable]

    if len(inner_real) >= 3:
        return cfg.score(inner_real, inner_pred)
    return None
Ejemplo n.º 8
0
def run_chari_2015(guides: [Cas9], info) -> [Cas9]:
    """
    Runs the chopchop_chari_2015 docker image on the supplied guides.

    The guides are converted to keyed tuples, pickled (protocol 2), base64
    encoded and sent to the container on stdin. The container's stdout is
    decoded the same way and each returned ``(key, score, coefficients_score)``
    tuple is written back onto the guide with the matching key.

    :param guides: A list of Cas9 objects to score.
    :param info: Run settings; ``info.pam``, ``info.genome`` and
        ``info.scoring_method.name`` are passed to the container.
    :return: Returns the same list of Cas9 objects, scored in place.
    :raises RuntimeError: If the docker command exits with a non-zero status.
    """
    keyed_tuples = [
        convert_cas9_to_tuple(key, guide) for key, guide in enumerate(guides)
    ]

    encoded = codecs.encode(pickle.dumps(keyed_tuples, protocol=2),
                            'base64').decode()

    command = [
        'docker', 'run', '-i', 'chopchop_chari_2015', '-c',
        str(config.score('COEFFICIENTS')), '-p',
        str(info.pam), '-g',
        str(info.genome), '-s',
        str(info.scoring_method.name)
    ]
    chari_2015 = subprocess.run(command,
                                capture_output=True,
                                text=True,
                                input=encoded)

    # Report a failed run with the container's stderr instead of letting it
    # surface later as an obscure base64/pickle decoding error.
    if chari_2015.returncode != 0:
        raise RuntimeError(
            "chopchop_chari_2015 exited with status %d: %s" %
            (chari_2015.returncode, chari_2015.stderr))

    # NOTE(security): pickle.loads can execute arbitrary code. This is only
    # acceptable while the chopchop_chari_2015 image is trusted.
    # encoding='latin1' is for backwards compatibility.
    results = pickle.loads(codecs.decode(chari_2015.stdout.encode(), 'base64'),
                           encoding='latin1')

    # Index results by key once instead of rescanning them for every guide.
    # A later result with the same key overrides an earlier one.
    results_by_key = {t[0]: t for t in results}
    for key, guide in enumerate(guides):
        t = results_by_key.get(key)
        if t is not None:
            _, guide.score, guide.coefficients_score = t

    return guides
Ejemplo n.º 9
0
    def add_off_target(self, hit, check_mismatch, max_off_targets,
                       count_mm_pos):
        """ Add off target hits (and not original hit) to list for each guide RNA

        A hit at the guide's own chrom/start (or, with ``config.isoforms``, on
        the guide's own isoform with an identical sequence) is stored as
        ``self.correct_hit`` and not counted. Any other hit is stored in
        ``self.off_target_hash`` under the key ``"chrom:start"``; a hit whose
        key is already present is ignored. Counted hits update
        ``self.off_targets_mm`` and ``self.score``.

        :param hit: alignment hit; ``chrom``, ``start``, ``matchSeq``,
            ``flag_sum``, ``mismatch_pos`` and ``opts`` are read here.
        :param check_mismatch: when true, the hit is discarded if any of its
            mismatch positions has a falsy entry in ``count_mm_pos``.
        :param max_off_targets: an opt equal to ``"XM:i:<max_off_targets>"``
            adds ``config.score('MAX_OFFTARGETS')`` to the score and adds
            ``max_off_targets`` to the counts for 0-3 mismatches.
        :param count_mm_pos: sequence indexed by mismatch position.
        """

        hit_id = "%s:%s" % (hit.chrom, hit.start)
        n_miss = 0
        # Matches the mismatch-count tag among the hit's opts, e.g. "NM:i:2"
        mm_pattern = re.compile(r'NM:i:(\d+)')

        # If the hit is identical to the guide coord it is the original correct hit
        if self.chrom == hit.chrom and self.start == hit.start:  # never true for isoforms
            # This is the original/main hit
            self.correct_hit = hit
            return

        if config.isoforms and self.isoform == hit.chrom and self.stranded_guide_seq == hit.matchSeq:
            # This is the original/main hit
            self.correct_hit = hit
            return

        # Do not count off targets twice, e.g. for TALENs valid on both strands.
        if hit_id in self.off_target_hash:
            return

        # Reverse the per-position flags when flag_sum == 0 (outside isoform mode).
        # NOTE(review): the earlier comment said "if on the reverse strand";
        # confirm which strand flag_sum == 0 denotes.
        if check_mismatch and hit.flag_sum == 0 and not config.isoforms:
            count_mm_pos = count_mm_pos[::-1]

        self.off_target_hash[hit_id] = hit
        if check_mismatch:
            # The first 5 characters of mismatch_pos are skipped before parsing
            # (presumably a tag prefix - TODO confirm).
            mms = get_mismatch_pos(hit.mismatch_pos[5:])
            for mm in mms:
                # A mismatch at a position with a falsy flag disqualifies the hit
                if not count_mm_pos[mm]:
                    del (self.off_target_hash[hit_id])
                    return

                # NOTE(review): this branch repeats the condition above and is
                # therefore unreachable, so n_miss always stays 0. Confirm the
                # intended condition before changing it.
                elif not count_mm_pos[mm]:
                    n_miss += 1

        # Calculate score
        for opt in hit.opts:
            m = mm_pattern.match(opt)
            if m:
                # Number of mismatches reported for this hit, minus n_miss
                mm = int(m.group(1)) - n_miss

                # ugly repeat to save time from iterating all isoforms
                if config.isoforms and check_mismatch:
                    if hit.chrom in self.gene_isoforms:  # and hit.chrom not in self.offTargetsIso[mm]:
                        self.off_targets_iso[mm].add(hit.chrom)
                        # don't count/score isoform mismatches but display which isoforms have them
                    else:
                        self.off_targets_mm[mm] += 1
                        self.score += SINGLE_OFFTARGET_SCORE[mm]
                else:
                    self.off_targets_mm[mm] += 1
                    self.score += SINGLE_OFFTARGET_SCORE[mm]

            # The hit reports exactly max_off_targets further alignments:
            # apply the MAX_OFFTARGETS score and bump every mismatch bucket.
            if opt == "XM:i:" + str(max_off_targets):
                self.score += config.score('MAX_OFFTARGETS')
                self.off_targets_mm[0] += max_off_targets
                self.off_targets_mm[1] += max_off_targets
                self.off_targets_mm[2] += max_off_targets
                self.off_targets_mm[3] += max_off_targets

        # Mark the cached ordering of off targets as stale
        self.off_targets_sorted = False
Ejemplo n.º 10
0
    def __init__(self, tale1, tale2, spacer_seq, spacer_size, off_target_pairs,
                 enzyme_co, max_off_targets, min_res_site_len):
        """ Build a pair from two guides and score it.

        The pair spans ``tale1.start`` to ``tale2.end``. Its score combines the
        number of different-strand off-target pairs, both members' own scores,
        the per-hit score collected from the off-target pairs and a penalty
        applied when ``tale1.strand`` is "+". Restriction sites found in the
        spacer are stored as a ';'-separated string.

        :param tale1: first guide of the pair.
        :param tale2: second guide of the pair.
        :param spacer_seq: sequence between the two guides.
        :param spacer_size: stored as both ``target_size`` and ``spacer_size``.
        :param off_target_pairs: iterable of ``(hit1, hit2)`` off-target pairs.
        :param enzyme_co: passed to ``find_restriction_sites``.
        :param max_off_targets: not read by this constructor.
        :param min_res_site_len: passed to ``find_restriction_sites``.
        """
        self.tale1, self.tale2 = tale1, tale2
        self.chrom, self.strand = tale1.chrom, tale1.strand
        self.id = ""
        self.restriction_sites = ""

        # Region covered by the pair: start of the first guide, end of the second
        self.start, self.end = tale1.start, tale2.end

        self.spacer_seq = spacer_seq
        self.target_size = self.spacer_size = spacer_size
        self.off_target_pairs = off_target_pairs
        self.diff_strand_off_target = 0
        self.same_strand_off_target = 0

        # Start cluster as -1, but will increment from 1
        self.cluster = -1
        self.spacer_start = tale1.start + tale1.guide_size
        self.spacer_end = tale2.start - 1

        self.enzyme_co = enzyme_co
        left_seq = str(self.tale1.guide_seq)
        right_seq = str(self.tale2.guide_seq)
        self.stranded_guide_seq = left_seq + "\n" + self.spacer_seq + "\n" + right_seq
        self.off_target_pair_count = 0

        # Tag for 0, 1, 2 and 3 mismatches; the position in the tuple is the
        # index into SINGLE_OFFTARGET_SCORE.
        mismatch_tags = ("NM:i:0", "NM:i:1", "NM:i:2", "NM:i:3")
        indiv_score = 0

        for hit1, hit2 in off_target_pairs:
            # No flag bit in common: count the pair as being on different strands
            # (bad = good cutting ability)
            if hit2.flag_sum & hit1.flag_sum == 0:
                self.diff_strand_off_target += 1

            # NOTE(review): each `opts` value is compared as a whole with a
            # single tag string; if `opts` is a list of tags this never
            # matches - confirm.
            for opt in (hit1.opts, hit2.opts):
                for n_mismatch, tag in enumerate(mismatch_tags):
                    if opt == tag:
                        indiv_score += SINGLE_OFFTARGET_SCORE[n_mismatch]

        # Compute penalties (scores) for off-target hits.
        # Worst = off-target pair, Not so bad = off-target single tale
        pair_penalty = (self.diff_strand_off_target *
                        config.score('OFFTARGET_PAIR_DIFF_STRAND'))
        pam_penalty = (tale1.strand == "+") * config.score('PAM_IN_PENALTY')
        self.score = (pair_penalty + tale1.score + tale2.score - indiv_score +
                      pam_penalty)

        res_sites = find_restriction_sites(self.spacer_seq, enzyme_co,
                                           min_res_site_len)
        self.restriction_sites = ";".join(
            "%s:%s" % (str(site), ",".join(map(str, res_sites[site])))
            for site in res_sites)
Ejemplo n.º 11
0
def plot_one_series(series,
                    shape=None,
                    theta=None,
                    LOO_predictions=None,
                    change_distribution=None,
                    minimal_annotations=False,
                    ax=None,
                    show_legend=True):
    """Plot one expression series, optionally with its fit and LOO residuals.

    The data points (``series.ages`` vs ``series.single_expression``) are
    always drawn. When ``ax`` is None a new figure is created and axis labels
    and a title are added; when an axes is supplied it is treated as a subplot
    and axis labels and title are not set here.

    Args:
        series: object providing ``ages``, ``single_expression``,
            ``gene_name``, ``region_name`` and ``age_scaler``.
        shape: fitted shape. The fit curve (and LOO residuals) are drawn only
            when both ``shape`` and ``theta`` are given.
        theta: fit parameters passed to ``shape``.
        LOO_predictions: optional per-point leave-one-out predictions. Entries
            that are None or NaN are skipped.
        change_distribution: optional object with ``centers`` and ``weights``,
            drawn as bars scaled to 90% of the current y-range. Its ``weights``
            are not modified.
        minimal_annotations: use the smaller font and markers, and skip the
            legend, y tick label sizing and title.
        ax: axes to draw on, or None to create a new figure.
        show_legend: draw a legend for the fit (ignored with
            ``minimal_annotations``).

    Returns:
        The figure that owns the axes.
    """
    x = series.ages
    y = series.single_expression
    b_subplot = ax is not None
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
    fontsize = cfg.minimal_annotation_fontsize if minimal_annotations else cfg.fontsize

    # plot the data points
    markersize = 4 if minimal_annotations else 8
    ax.plot(x, y, 'ks', markersize=markersize)
    if not b_subplot:
        ax.set_ylabel('expression level', fontsize=fontsize)
        ax.set_xlabel('age', fontsize=fontsize)
    ttl = '{}@{}'.format(series.gene_name, series.region_name)
    add_age_ticks(ax, series.age_scaler, fontsize)

    # plot change distribution if provided
    if change_distribution:
        ymin, ymax = ax.get_ylim()
        centers = change_distribution.centers
        width = centers[1] - centers[0]
        raw_weights = change_distribution.weights
        # Scale into a new array: an in-place `*=` would rescale the caller's
        # weights every time the series is plotted.
        weights = raw_weights * (0.9 * (ymax - ymin) / raw_weights.max())
        ax.bar(centers,
               weights,
               width=width,
               bottom=ymin,
               color='g',
               alpha=0.5)

    if shape is not None and theta is not None:
        # add fit parameters to title
        ttl = '{}, {} fit'.format(ttl, shape)
        more_ttl = shape.format_params(theta, series.age_scaler, latex=True)
        if more_ttl:
            ttl = '\n'.join([ttl, more_ttl])

        # draw the overall fit
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            # loo_score returns None when too few valid points remain; a None
            # score cannot be formatted with '{:.3g}', so skip the label then.
            loo_label = None
            if score is not None:
                loo_label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
            for xi, yi, y_loo in zip(x, y, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                # Label only the first residual that is actually drawn, so the
                # legend entry survives a missing first prediction.
                ax.plot([xi, xi], [yi, y_loo],
                        '-',
                        color='0.5',
                        label=loo_label)
                loo_label = None
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        if show_legend and not minimal_annotations:
            ax.legend(fontsize=fontsize, frameon=False)

    if not minimal_annotations:
        ax.tick_params(axis='y', labelsize=fontsize)
        if not b_subplot:
            ax.set_title(ttl, fontsize=fontsize)
    return ax.figure
Ejemplo n.º 12
0
    def __init__(self, tale1, tale2, spacer_seq, spacer_size, off_target_pairs,
                 enzyme_co, max_off_targets, g_rvd, min_res_site_len):
        """ Build a TALE pair, derive each TALE's RVD string and score the pair.

        The pair spans ``tale1.start`` to ``tale2.end``. ``tale1.rvd`` is built
        from ``tale1.guide_seq`` and ``tale2.rvd`` from the reverse complement
        of ``tale2.guide_seq``. The score combines the same-strand and
        different-strand off-target pair counts, both TALEs' own scores and
        the per-hit score collected from the off-target pairs. Restriction
        sites found in the spacer are stored as a ';'-separated string.

        :param tale1: first TALE of the pair.
        :param tale2: second TALE of the pair.
        :param spacer_seq: sequence between the two TALEs.
        :param spacer_size: stored as both ``target_size`` and ``spacer_size``.
        :param off_target_pairs: iterable of ``(hit1, hit2)`` off-target pairs.
        :param enzyme_co: passed to ``find_restriction_sites``.
        :param max_off_targets: not read by this constructor.
        :param g_rvd: RVD text appended for every "G" base.
        :param min_res_site_len: passed to ``find_restriction_sites``.
        """
        self.tale1, self.tale2 = tale1, tale2
        self.chrom, self.strand = tale1.chrom, tale1.strand
        self.id = ""
        self.tale1.rvd = ""
        self.tale2.rvd = ""
        self.restriction_sites = ""

        # Region covered by the pair: start of the first TALE, end of the second
        self.start, self.end = tale1.start, tale2.end

        self.spacer_seq = spacer_seq
        self.target_size = self.spacer_size = spacer_size
        self.off_target_pairs = off_target_pairs
        self.diff_strand_off_target = 0
        self.same_strand_off_target = 0

        # Start cluster as -1, but will increment from 1
        self.cluster = -1
        self.spacer_start = tale1.start + tale1.guide_size
        self.spacer_end = tale2.start - 1

        self.enzyme_co = enzyme_co
        left_seq = str(self.tale1.guide_seq)
        right_seq = str(self.tale2.guide_seq)
        self.stranded_guide_seq = left_seq + "\n" + self.spacer_seq + "\n" + right_seq

        # Base -> RVD text; bases not listed here contribute nothing
        rvd_for_base = {"A": "NI ", "T": "NG ", "C": "HD ", "G": g_rvd}

        for base in tale1.guide_seq:
            if base in rvd_for_base:
                tale1.rvd += rvd_for_base[base]

        # The second TALE is read from the reverse complement of its sequence
        for base in Seq(tale2.guide_seq).reverse_complement():
            if base in rvd_for_base:
                tale2.rvd += rvd_for_base[base]

        self.offTargetPairCount = 0

        # Mismatch tag paired with the config key of its in-pair score
        tag_score_keys = (
            ("NM:i:0", 'INPAIR_OFFTARGET_0'),
            ("NM:i:1", 'INPAIR_OFFTARGET_1'),
            ("NM:i:2", 'INPAIR_OFFTARGET_2'),
            ("NM:i:3", 'INPAIR_OFFTARGET_3'),
        )
        indiv_score = 0

        for hit1, hit2 in off_target_pairs:
            shared_flags = hit2.flag_sum & hit1.flag_sum
            if shared_flags == 0:
                # counted as an off-target pair on different strands
                # (bad = good cutting ability)
                self.diff_strand_off_target += 1
            elif shared_flags == 16:
                # counted as an off-target pair on the same strand (not so
                # bad = FokI domains probably too far apart to cut)
                self.same_strand_off_target += 1

            # NOTE(review): each `opts` value is compared as a whole with a
            # single tag string; if `opts` is a list of tags this never
            # matches - confirm.
            for opt in (hit1.opts, hit2.opts):
                for tag, score_key in tag_score_keys:
                    if opt == tag:
                        indiv_score += config.score(score_key)

        # Compute penalties (scores) for off-target hits.
        # Worst = off-target pair, Not so bad = off-target single tale
        same_strand_penalty = (self.same_strand_off_target *
                               config.score('OFFTARGET_PAIR_SAME_STRAND'))
        diff_strand_penalty = (self.diff_strand_off_target *
                               config.score('OFFTARGET_PAIR_DIFF_STRAND'))
        self.score = same_strand_penalty + diff_strand_penalty + \
                     (tale1.score + tale2.score + indiv_score)

        res_sites = find_restriction_sites(self.spacer_seq, enzyme_co,
                                           min_res_site_len)
        self.restriction_sites = ";".join(
            "%s:%s" % (str(site), ",".join(map(str, res_sites[site])))
            for site in res_sites)