def calc_self_complementarity(self, score_self_comp, backbone_regions, pam, replace_5_prime=None):
    """ Count self-complementary stems in the guide and add the folding penalty to the score.

    A window of STEM_LEN bases counts as a fold when its GC fraction is at
    least 0.5 and it also occurs either in the leading part of the guide's
    reverse complement or in any of `backbone_regions`.

    :param score_self_comp: Not consulted by this method.
    :param backbone_regions: Iterable of sequences that each window is searched in.
    :param pam: PAM motif; only its length is used, to skip that many leading bases.
    :param replace_5_prime: Optional bases appended in place of the last
        len(replace_5_prime) bases of the stranded guide sequence.
    """
    skip = len(pam)
    if replace_5_prime:
        # Replace the 2 first bases with e.g. "GG"
        trimmed = self.stranded_guide_seq[skip:-len(replace_5_prime)]
        fwd = trimmed + replace_5_prime
    else:
        # Do not include PAM motif in folding calculations
        fwd = self.guide_seq[skip:]

    rvs = str(Seq(fwd).reverse_complement())
    n_windows = len(fwd) - STEM_LEN

    self.folding = 0
    for start in range(n_windows):
        stem = fwd[start:start + STEM_LEN]
        if gc_content(stem) < 0.5:
            continue
        # The searched prefix of the reverse complement shrinks by one base per window.
        rvs_prefix = rvs[:n_windows - 1 - start]
        if stem in rvs_prefix or any([stem in region for region in backbone_regions]):
            self.folding += 1

    self.score += self.folding * config.score('FOLDING')
def _add_scores(dataset, dataset_fits):
    """Attach fit and leave-one-out scores to every gene fit in `dataset_fits`.

    Entries whose gene key is None (region fits) are skipped. For each
    remaining fit this sets `fit.fit_score` and `fit.LOO_score`, and, when the
    fit has a `with_correlations` attribute, `LOO_score` on each of its levels.
    Scoring is best effort: a score that cannot be computed is stored as None.

    Returns the same `dataset_fits` object, modified in place.
    """
    # .items() is available on both Python 2 and Python 3 dicts (.iteritems() is Python 2 only).
    for (g, r), fit in dataset_fits.items():
        if g is None:
            continue  # it's a region fit

        series = dataset.get_one_series(g, r)

        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            if fit.fit_predictions is None:
                fit.fit_score = None
            else:
                fit.fit_score = cfg.score(series.single_expression, fit.fit_predictions)
        except Exception:
            fit.fit_score = None

        try:
            fit.LOO_score = loo_score(series.single_expression, fit.LOO_predictions)
        except Exception:
            fit.LOO_score = None

        # add score for correlation LOO fits
        correlation_levels = getattr(fit, 'with_correlations', None)
        if correlation_levels is not None:
            for level in correlation_levels:
                y_real = series.single_expression
                # match the predictions to the indices of the single series after NaN are removed from it
                y_pred = level.LOO_predictions[series.original_inds]
                level.LOO_score = loo_score(y_real, y_pred)
    return dataset_fits
def _add_scores(dataset, dataset_fits):
    """Attach fit and leave-one-out scores to every gene fit in `dataset_fits`.

    Entries whose gene key is None (region fits) are skipped. For each
    remaining fit this sets `fit.fit_score` and `fit.LOO_score`, and, when the
    fit has a `with_correlations` attribute, `LOO_score` on each of its levels.
    Scoring is best effort: a score that cannot be computed is stored as None.

    Returns the same `dataset_fits` object, modified in place.
    """
    # .items() is available on both Python 2 and Python 3 dicts (.iteritems() is Python 2 only).
    for (g, r), fit in dataset_fits.items():
        if g is None:
            continue  # it's a region fit

        series = dataset.get_one_series(g, r)

        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit still propagate.
        try:
            if fit.fit_predictions is None:
                fit.fit_score = None
            else:
                fit.fit_score = cfg.score(series.single_expression, fit.fit_predictions)
        except Exception:
            fit.fit_score = None

        try:
            fit.LOO_score = loo_score(series.single_expression, fit.LOO_predictions)
        except Exception:
            fit.LOO_score = None

        # add score for correlation LOO fits
        correlation_levels = getattr(fit, 'with_correlations', None)
        if correlation_levels is not None:
            for level in correlation_levels:
                y_real = series.single_expression
                # match the predictions to the indices of the single series after NaN are removed from it
                y_pred = level.LOO_predictions[series.original_inds]
                level.LOO_score = loo_score(y_real, y_pred)
    return dataset_fits
def plot_one_series(series, shape=None, theta=None, LOO_predictions=None, change_distribution=None,
                    minimal_annotations=False, ax=None, show_legend=True):
    """Plot one expression series, optionally with its fit and LOO residuals.

    series -- object providing `ages`, `single_expression`, `gene_name`,
        `region_name` and `age_scaler`.
    shape, theta -- when both are given, the fitted curve is drawn and the
        fit parameters are appended to the title.
    LOO_predictions -- optional leave-one-out predictions, one per data
        point; None/NaN entries are skipped. Only drawn together with a fit.
    change_distribution -- optional object with `centers` and `weights`;
        drawn as bars rescaled to 90% of the current y range.
    minimal_annotations -- use the small font and markers, no legend and no
        y tick relabelling.
    ax -- axes to draw into. When None a new figure is created and axis
        labels plus a title are added as well.
    show_legend -- draw the legend (ignored when minimal_annotations is set).

    Returns the figure that owns the axes.
    """
    x = series.ages
    y = series.single_expression
    b_subplot = ax is not None
    if ax is None:
        ax = plt.figure().add_subplot(111)

    fontsize = cfg.minimal_annotation_fontsize if minimal_annotations else cfg.fontsize

    # plot the data points
    markersize = 4 if minimal_annotations else 8
    ax.plot(x, y, 'ks', markersize=markersize)
    if not b_subplot:
        ax.set_ylabel('expression level', fontsize=fontsize)
        ax.set_xlabel('age', fontsize=fontsize)
    ttl = '{}@{}'.format(series.gene_name, series.region_name)
    add_age_ticks(ax, series.age_scaler, fontsize)

    # plot change distribution if provided
    if change_distribution:
        ymin, ymax = ax.get_ylim()
        centers = change_distribution.centers
        width = centers[1] - centers[0]
        weights = change_distribution.weights
        # Build a rescaled copy: an in-place `*=` would overwrite the
        # caller's weights every time the series is plotted.
        bar_heights = weights * (0.9 * (ymax - ymin) / weights.max())
        ax.bar(centers, bar_heights, width=width, bottom=ymin, color='g', alpha=0.5)

    if shape is not None and theta is not None:
        # add fit parameters to title
        ttl = '{}, {} fit'.format(ttl, shape)
        more_ttl = shape.format_params(theta, series.age_scaler, latex=True)
        if more_ttl:
            ttl = '\n'.join([ttl, more_ttl])

        # draw the overall fit
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            # loo_score may return None (too few valid points); '{:.3g}'
            # cannot format None, so no legend entry is made in that case.
            loo_label = None
            if score is not None:
                loo_label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
            for xi, yi, y_loo in zip(x, y, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5', label=loo_label)
                # Only the first residual actually drawn carries the legend
                # entry, even if earlier points were skipped.
                loo_label = None
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        if show_legend and not minimal_annotations:
            ax.legend(fontsize=fontsize, frameon=False)

    if not minimal_annotations:
        ax.tick_params(axis='y', labelsize=fontsize)
    if not b_subplot:
        ax.set_title(ttl, fontsize=fontsize)
    return ax.figure
def calc_gc_content(self, score_gc):
    """ Calculate the GC content of the guide.

    When both `self.pam` and `self.stranded_guide_seq` are set, stores in
    `self.gc_content` the percentage (0-100) of 'G'/'C' characters in the
    stranded guide sequence after its first len(self.pam) characters. An
    empty remainder gives 0.0 instead of a division by zero.

    :param score_gc: When truthy, add the 'CRISPR_BAD_GC' score to `self.score`
        if the GC content is above GC_HIGH or below GC_LOW.
    """
    if self.pam is not None and self.stranded_guide_seq is not None:
        g_seq = self.stranded_guide_seq[len(self.pam):]
        seq_len = len(g_seq)
        if seq_len:
            gc_count = g_seq.count('G') + g_seq.count('C')
            self.gc_content = 100 * (float(gc_count) / seq_len)
        else:
            # Nothing left after the PAM-length prefix: no G/C bases to count.
            self.gc_content = 0.0

    if score_gc:
        if self.gc_content > GC_HIGH or self.gc_content < GC_LOW:
            self.score += config.score('CRISPR_BAD_GC')
def plot_one_exon(series, shape=None, theta=None, LOO_predictions=None, ax=None, y_range=None):
    """Plot the expression of one exon, optionally with its fit and LOO residuals.

    series -- object providing `ages`, `single_expression`, `gene_name` and
        `age_scaler`. The title is the part of `gene_name` after the first
        `cfg.exon_separator`, with remaining separators shown as '-'.
    shape, theta -- when both are given, the fitted curve is drawn.
    LOO_predictions -- optional leave-one-out predictions, one per data
        point; None/NaN entries are skipped. Only drawn together with a fit.
    ax -- axes to draw into; a new figure is created when None.
    y_range -- optional y limits, given in unscaled expression units.

    Plotted values go through the scaler built from `cfg.plots_scaling` when
    there is one; scores are always computed on the unscaled values.

    Returns the figure that owns the axes.
    """
    if ax is None:
        # The default used to fail on the first ax.plot call; create axes instead.
        ax = plt.figure().add_subplot(111)

    x = series.ages
    y = series.single_expression
    fontsize = cfg.minimal_annotation_fontsize
    markersize = 8

    y_scaler = scalers.build_scaler(cfg.plots_scaling, None)
    scaled = y_scaler is not None
    y_scaled = y_scaler.scale(y) if scaled else y

    if y_range is not None:
        if scaled:
            y_range = y_scaler.scale(y_range)
        # Set the limits on the axes being drawn; plt.ylim acts on pyplot's
        # current axes, which need not be `ax`.
        ax.set_ylim(y_range)

    ax.plot(x, y_scaled, 'ks', markersize=markersize)
    ax.set_xlabel('age', fontsize=fontsize)
    add_age_ticks(ax, series.age_scaler, fontsize)
    exon = series.gene_name[series.gene_name.index(cfg.exon_separator) + 1:]
    ax.set_title(exon.replace(cfg.exon_separator, '-'), fontsize=14)

    if shape is not None and theta is not None:
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        if scaled:
            y_smooth = y_scaler.scale(y_smooth)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            if scaled:
                LOO_predictions = y_scaler.scale(LOO_predictions)
            # No legend entry when loo_score could not produce a score.
            loo_label = None
            if score is not None:
                loo_label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
            for xi, yi, y_loo in zip(x, y_scaled, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5', label=loo_label)
                # Only the first residual actually drawn carries the legend
                # entry, even if earlier points were skipped.
                loo_label = None
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        ax.legend(fontsize=fontsize, frameon=False)
    return ax.figure
def loo_score(y_real, y_pred):
    """Score leave-one-out predictions against the measured values.

    The first and last points are left out of the score. The fits made with
    those points held out are evaluated outside the range they were trained
    on, which can overfit badly (e.g. a sigmoid on essentially flat data).
    That kind of overfitting does not affect the fit on the whole data within
    its training range, so dropping the two end points should give a better
    estimate of the generalization error.

    Positions where either sequence is NaN are ignored as well. Returns None
    when fewer than three points remain, otherwise cfg.score of the rest.
    """
    inner_real, inner_pred = y_real[1:-1], y_pred[1:-1]

    # Keep only the positions where both values are real numbers.
    keep = ~(np.isnan(inner_real) | np.isnan(inner_pred))
    inner_real, inner_pred = inner_real[keep], inner_pred[keep]

    if len(inner_real) >= 3:
        return cfg.score(inner_real, inner_pred)
    return None
def run_chari_2015(guides: [Cas9], info) -> [Cas9]:
    """
    Runs chopchop_chari_2015 docker image using the supplied guides & run settings.

    The guides are converted to keyed tuples, pickled (protocol 2), base64-encoded and sent to the container
    on stdin; the container's stdout is decoded the same way and the scores are copied back onto the guides.

    :param guides: A list of Cas9 objects to score.
    :param info: Run settings; its `pam`, `genome` and `scoring_method.name` are passed to the container.
    :return: Returns the same list of Cas9 objects, updated in place with `score` and `coefficients_score`.
    """
    keyed_tuples = [convert_cas9_to_tuple(key, guide) for key, guide in enumerate(guides)]

    encoded = codecs.encode(pickle.dumps(keyed_tuples, protocol=2), 'base64').decode()

    command = [
        'docker', 'run', '-i', 'chopchop_chari_2015',
        '-c', str(config.score('COEFFICIENTS')),
        '-p', str(info.pam),
        '-g', str(info.genome),
        '-s', str(info.scoring_method.name),
    ]

    # NOTE(review): the exit status and stderr of the container are not inspected; a failed run only
    # shows up as a decode/unpickle error below.
    chari_2015 = subprocess.run(command, capture_output=True, text=True, input=encoded)

    # NOTE(security): pickle.loads executes whatever the payload describes, so this is only safe while
    # the chopchop_chari_2015 image is trusted.
    # encoding='latin1' is for backwards compatibility.
    results = pickle.loads(codecs.decode(chari_2015.stdout.encode(), 'base64'), encoding='latin1')

    # Index the returned tuples by key once instead of rescanning all results for every guide.
    # As with the scan, the last tuple for a given key is the one that is applied.
    results_by_key = {t[0]: t for t in results}
    for key, guide in enumerate(guides):
        scored = results_by_key.get(key)
        if scored is not None:
            _, guide.score, guide.coefficients_score = scored

    return guides
def add_off_target(self, hit, check_mismatch, max_off_targets, count_mm_pos):
    """ Add off target hits (and not original hit) to list for each guide RNA

    A hit recognised as the guide's own location is stored in
    `self.correct_hit` and not counted. Any other hit not seen before is
    recorded in `self.off_target_hash` under "chrom:start"; the per-mismatch
    counters and `self.score` are then updated from the hit's option tags.

    :param hit: Alignment hit; reads `chrom`, `start`, `flag_sum`, `mismatch_pos`, `opts`
        and, in isoform mode, `matchSeq`.
    :param check_mismatch: When truthy, hits with a mismatch at a position whose
        `count_mm_pos` entry is falsy are discarded.
    :param max_off_targets: Value matched against the "XM:i:" tag; when it matches,
        it is added to the counters for 0-3 mismatches.
    :param count_mm_pos: Per-position flags indexed by mismatch position.
    """
    hit_id = "%s:%s" % (hit.chrom, hit.start)
    n_miss = 0
    mm_pattern = re.compile(r'NM:i:(\d+)')

    # If the hit is identical to the guide coord it is the original correct hit
    if self.chrom == hit.chrom and self.start == hit.start:  # never true for isoforms
        # This is the original/main hit
        self.correct_hit = hit
        return

    if config.isoforms and self.isoform == hit.chrom and self.stranded_guide_seq == hit.matchSeq:
        # This is the original/main hit
        self.correct_hit = hit
        return

    # Do not count off targets twice, e.g. for TALENs valid on both strands.
    if hit_id in self.off_target_hash:
        return

    # Reverse count+allowed arrays if on the reverse strand
    # (the slice makes a reversed copy; the caller's sequence is left untouched)
    if check_mismatch and hit.flag_sum == 0 and not config.isoforms:
        count_mm_pos = count_mm_pos[::-1]

    # Record the hit first; it is removed again below if a mismatch falls on
    # a position that is not counted.
    self.off_target_hash[hit_id] = hit
    if check_mismatch:
        # The first 5 characters of mismatch_pos are skipped before parsing
        # (presumably a tag prefix -- confirm against get_mismatch_pos).
        mms = get_mismatch_pos(hit.mismatch_pos[5:])
        for mm in mms:
            if not count_mm_pos[mm]:
                del (self.off_target_hash[hit_id])
                return
            # NOTE(review): this condition repeats the `if` above, so the
            # branch can never run and n_miss always stays 0.
            elif not count_mm_pos[mm]:
                n_miss += 1

    # Calculate score
    for opt in hit.opts:
        # "NM:i:<n>" tag: n (minus n_miss) selects the mismatch bucket
        m = mm_pattern.match(opt)
        if m:
            mm = int(m.group(1)) - n_miss

            # ugly repeat to save time from iterating all isoforms
            if config.isoforms and check_mismatch:
                if hit.chrom in self.gene_isoforms:  # and hit.chrom not in self.offTargetsIso[mm]:
                    self.off_targets_iso[mm].add(hit.chrom)
                    # don't count/score isoform mismatches but display which isoforms have them
                else:
                    self.off_targets_mm[mm] += 1
                    self.score += SINGLE_OFFTARGET_SCORE[mm]
            else:
                self.off_targets_mm[mm] += 1
                self.score += SINGLE_OFFTARGET_SCORE[mm]

        # "XM:i:<max_off_targets>" tag: apply the MAX_OFFTARGETS score and add
        # max_off_targets to each of the 0-3 mismatch counters.
        if opt == "XM:i:" + str(max_off_targets):
            self.score += config.score('MAX_OFFTARGETS')
            self.off_targets_mm[0] += max_off_targets
            self.off_targets_mm[1] += max_off_targets
            self.off_targets_mm[2] += max_off_targets
            self.off_targets_mm[3] += max_off_targets

    # Mark the off-target list as needing a re-sort.
    self.off_targets_sorted = False
def __init__(self, tale1, tale2, spacer_seq, spacer_size, off_target_pairs, enzyme_co, max_off_targets,
             min_res_site_len):
    """ Build a pair from two guides, score it and list the restriction sites in its spacer.

    :param tale1: First member; supplies chrom, strand, start, guide_size, guide_seq and score.
    :param tale2: Second member; supplies end, start, guide_seq and score.
    :param spacer_seq: Sequence between the two members.
    :param spacer_size: Stored as both target_size and spacer_size.
    :param off_target_pairs: Iterable of (hit1, hit2) off-target pairs.
    :param enzyme_co: Passed through to find_restriction_sites.
    :param max_off_targets: Not used by this constructor.
    :param min_res_site_len: Passed through to find_restriction_sites.
    """
    self.tale1, self.tale2 = tale1, tale2
    self.chrom, self.strand = tale1.chrom, tale1.strand
    self.id = ""
    self.restriction_sites = ""

    # Region covered by the pair runs from tale1's start to tale2's end
    self.start = tale1.start
    self.end = tale2.end

    self.spacer_seq = spacer_seq
    self.target_size = self.spacer_size = spacer_size

    self.off_target_pairs = off_target_pairs
    self.diff_strand_off_target = 0
    self.same_strand_off_target = 0

    # Start cluster as -1, but will increment from 1
    self.cluster = -1
    self.spacer_start = tale1.start + tale1.guide_size
    self.spacer_end = tale2.start - 1

    self.enzyme_co = enzyme_co
    left_seq = str(self.tale1.guide_seq)
    right_seq = str(self.tale2.guide_seq)
    self.stranded_guide_seq = left_seq + "\n" + self.spacer_seq + "\n" + right_seq

    self.off_target_pair_count = 0

    # Tag value -> index into SINGLE_OFFTARGET_SCORE
    nm_tags = (("NM:i:0", 0), ("NM:i:1", 1), ("NM:i:2", 2), ("NM:i:3", 3))

    # Use bitwise operator to compare flag sum to see whether off-target TALEs are on different strands
    # (bad = good cutting ability) or on the same strand (not so bad = FokI domains probably too far apart to cut)
    indiv_score = 0
    for hit1, hit2 in off_target_pairs:
        # Count number of offtarget pairs on different strands
        if (hit2.flag_sum & hit1.flag_sum) == 0:
            self.diff_strand_off_target += 1

        # NOTE(review): each `opt` is the whole `opts` attribute of a hit, compared to one tag string;
        # if `opts` is a collection of tags this never matches -- confirm.
        for opt in (hit1.opts, hit2.opts):
            for tag, n_mismatches in nm_tags:
                if opt == tag:
                    indiv_score += SINGLE_OFFTARGET_SCORE[n_mismatches]

    # Compute penalties (scores) for off-target hits. Worst = off-target pair, Not so bad = off-target single tale
    pair_penalty = self.diff_strand_off_target * config.score('OFFTARGET_PAIR_DIFF_STRAND')
    # The boolean multiplies as 0/1, so the PAM penalty applies only when tale1 is on the "+" strand
    pam_in_penalty = (tale1.strand == "+") * config.score('PAM_IN_PENALTY')
    self.score = pair_penalty + tale1.score + tale2.score - indiv_score + pam_in_penalty

    # Format the sites as "key:pos,pos;key:pos,..."
    res_sites = find_restriction_sites(self.spacer_seq, enzyme_co, min_res_site_len)
    self.restriction_sites = ";".join(
        "%s:%s" % (str(site), ",".join(map(str, res_sites[site]))) for site in res_sites)
def plot_one_series(series, shape=None, theta=None, LOO_predictions=None, change_distribution=None,
                    minimal_annotations=False, ax=None, show_legend=True):
    """Plot one expression series, optionally with its fit and LOO residuals.

    series -- object providing `ages`, `single_expression`, `gene_name`,
        `region_name` and `age_scaler`.
    shape, theta -- when both are given, the fitted curve is drawn and the
        fit parameters are appended to the title.
    LOO_predictions -- optional leave-one-out predictions, one per data
        point; None/NaN entries are skipped. Only drawn together with a fit.
    change_distribution -- optional object with `centers` and `weights`;
        drawn as bars rescaled to 90% of the current y range.
    minimal_annotations -- use the small font and markers, no legend and no
        y tick relabelling.
    ax -- axes to draw into. When None a new figure is created and axis
        labels plus a title are added as well.
    show_legend -- draw the legend (ignored when minimal_annotations is set).

    Returns the figure that owns the axes.
    """
    x = series.ages
    y = series.single_expression
    b_subplot = ax is not None
    if ax is None:
        ax = plt.figure().add_subplot(111)

    fontsize = cfg.minimal_annotation_fontsize if minimal_annotations else cfg.fontsize

    # plot the data points
    markersize = 4 if minimal_annotations else 8
    ax.plot(x, y, 'ks', markersize=markersize)
    if not b_subplot:
        ax.set_ylabel('expression level', fontsize=fontsize)
        ax.set_xlabel('age', fontsize=fontsize)
    ttl = '{}@{}'.format(series.gene_name, series.region_name)
    add_age_ticks(ax, series.age_scaler, fontsize)

    # plot change distribution if provided
    if change_distribution:
        ymin, ymax = ax.get_ylim()
        centers = change_distribution.centers
        width = centers[1] - centers[0]
        weights = change_distribution.weights
        # Build a rescaled copy: an in-place `*=` would overwrite the
        # caller's weights every time the series is plotted.
        bar_heights = weights * (0.9 * (ymax - ymin) / weights.max())
        ax.bar(centers, bar_heights, width=width, bottom=ymin, color='g', alpha=0.5)

    if shape is not None and theta is not None:
        # add fit parameters to title
        ttl = '{}, {} fit'.format(ttl, shape)
        more_ttl = shape.format_params(theta, series.age_scaler, latex=True)
        if more_ttl:
            ttl = '\n'.join([ttl, more_ttl])

        # draw the overall fit
        score = cfg.score(y, shape.f(theta, x))
        x_smooth, y_smooth = shape.high_res_preds(theta, x)
        label = 'fit ({}={:.3g})'.format(cfg.score_type, score)
        ax.plot(x_smooth, y_smooth, 'b-', linewidth=3, label=label)

        # draw LOO predictions and residuals
        if LOO_predictions is not None:
            score = loo_score(y, LOO_predictions)
            # loo_score may return None (too few valid points); '{:.3g}'
            # cannot format None, so no legend entry is made in that case.
            loo_label = None
            if score is not None:
                loo_label = 'LOO ({}={:.3g})'.format(cfg.score_type, score)
            for xi, yi, y_loo in zip(x, y, LOO_predictions):
                if y_loo is None or np.isnan(y_loo):
                    continue
                ax.plot([xi, xi], [yi, y_loo], '-', color='0.5', label=loo_label)
                # Only the first residual actually drawn carries the legend
                # entry, even if earlier points were skipped.
                loo_label = None
                ax.plot(xi, y_loo, 'x', color='0.5', markeredgewidth=2)
        if show_legend and not minimal_annotations:
            ax.legend(fontsize=fontsize, frameon=False)

    if not minimal_annotations:
        ax.tick_params(axis='y', labelsize=fontsize)
    if not b_subplot:
        ax.set_title(ttl, fontsize=fontsize)
    return ax.figure
def __init__(self, tale1, tale2, spacer_seq, spacer_size, off_target_pairs, enzyme_co, max_off_targets, g_rvd,
             min_res_site_len):
    """ Build a TALE pair, derive each member's RVD string, score the pair and list its restriction sites.

    :param tale1: First member; supplies chrom, strand, start, guide_size, guide_seq and score.
    :param tale2: Second member; supplies end, start, guide_seq and score.
    :param spacer_seq: Sequence between the two members.
    :param spacer_size: Stored as both target_size and spacer_size.
    :param off_target_pairs: Iterable of (hit1, hit2) off-target pairs.
    :param enzyme_co: Passed through to find_restriction_sites.
    :param max_off_targets: Not used by this constructor.
    :param g_rvd: RVD text appended for every "G" base.
    :param min_res_site_len: Passed through to find_restriction_sites.
    """
    self.tale1, self.tale2 = tale1, tale2
    self.chrom, self.strand = tale1.chrom, tale1.strand
    self.id = ""
    self.tale1.rvd = ""
    self.tale2.rvd = ""
    self.restriction_sites = ""

    # Region covered by the pair runs from tale1's start to tale2's end
    self.start = tale1.start
    self.end = tale2.end  # + tale2.guideSize

    self.spacer_seq = spacer_seq
    self.target_size = self.spacer_size = spacer_size

    self.off_target_pairs = off_target_pairs
    self.diff_strand_off_target = 0
    self.same_strand_off_target = 0

    # Start cluster as -1, but will increment from 1
    self.cluster = -1
    self.spacer_start = tale1.start + tale1.guide_size
    self.spacer_end = tale2.start - 1

    self.enzyme_co = enzyme_co
    left_seq = str(self.tale1.guide_seq)
    right_seq = str(self.tale2.guide_seq)
    self.stranded_guide_seq = left_seq + "\n" + self.spacer_seq + "\n" + right_seq

    # Calculate RVD for TALEs: one entry per base, bases other than A/T/C/G add nothing.
    # tale2 is read along its reverse complement.
    rvd_by_base = {"A": "NI ", "T": "NG ", "C": "HD ", "G": g_rvd}
    tale1.rvd += "".join(rvd_by_base.get(base, "") for base in tale1.guide_seq)
    tale2.rvd += "".join(rvd_by_base.get(base, "") for base in Seq(tale2.guide_seq).reverse_complement())

    self.offTargetPairCount = 0

    # Tag value -> name of the config score added for it
    nm_tags = (
        ("NM:i:0", 'INPAIR_OFFTARGET_0'),
        ("NM:i:1", 'INPAIR_OFFTARGET_1'),
        ("NM:i:2", 'INPAIR_OFFTARGET_2'),
        ("NM:i:3", 'INPAIR_OFFTARGET_3'),
    )

    # Use bitwise operator to compare flag sum to see whether off-target TALEs are on different strands
    # (bad = good cutting ability) or on the same strand (not so bad = FokI domains probably too far apart to cut)
    indiv_score = 0
    for hit1, hit2 in off_target_pairs:
        strand_bits = hit2.flag_sum & hit1.flag_sum
        if strand_bits == 0:
            # offtarget pair on different strands
            self.diff_strand_off_target += 1
        elif strand_bits == 16:
            # offtarget pair on same strand
            self.same_strand_off_target += 1

        # NOTE(review): each `opt` is the whole `opts` attribute of a hit, compared to one tag string;
        # if `opts` is a collection of tags this never matches -- confirm.
        for opt in (hit1.opts, hit2.opts):
            for tag, score_name in nm_tags:
                if opt == tag:
                    indiv_score += config.score(score_name)

    # Compute penalties (scores) for off-target hits. Worst = off-target pair, Not so bad = off-target single tale
    same_strand_penalty = self.same_strand_off_target * config.score('OFFTARGET_PAIR_SAME_STRAND')
    diff_strand_penalty = self.diff_strand_off_target * config.score('OFFTARGET_PAIR_DIFF_STRAND')
    self.score = same_strand_penalty + diff_strand_penalty + (tale1.score + tale2.score + indiv_score)

    # Format the sites as "key:pos,pos;key:pos,..."
    res_sites = find_restriction_sites(self.spacer_seq, enzyme_co, min_res_site_len)
    self.restriction_sites = ";".join(
        "%s:%s" % (str(site), ",".join(map(str, res_sites[site]))) for site in res_sites)