def flip(blockades, model_file):
    """
    Orients blockades along the peptide

    Every blockade is preprocessed as its own cluster (cluster_size=1), its
    consensus is discretized to the peptide length and compared against both
    the peptide and the reversed peptide. When the forward distance is
    strictly larger than the reverse one, the ``eventTrace`` of the cluster's
    first blockade is reversed in place. A per-cluster table and a summary
    line are printed to stderr.

    Returns the list made of the first blockade of every cluster.
    """
    identifier = Identifier(load_model(model_file))

    peptide = blockades[0].peptide
    reversed_peptide = peptide[::-1]
    singles = sp.preprocess_blockades(blockades, cluster_size=1,
                                      min_dwell=0.0, max_dwell=1000)

    print("Num\tFwd_dst\tRev_dst\t\tNeeds_flip", file=sys.stderr)

    row_format = "{0}\t{1:5.2f}\t{2:5.2f}\t\t{3}"
    oriented = []
    flip_count = 0
    for index, single in enumerate(singles, 1):
        signal = sp.discretize(single.consensus, len(peptide))
        forward = identifier.signal_protein_distance(signal, peptide)
        backward = identifier.signal_protein_distance(signal,
                                                      reversed_peptide)
        needs_flip = forward > backward
        print(row_format.format(index, forward, backward, needs_flip),
              file=sys.stderr)

        blockade = single.blockades[0]
        oriented.append(blockade)
        if needs_flip:
            # The blockade object itself is modified, not a copy
            blockade.eventTrace = blockade.eventTrace[::-1]
            flip_count += 1

    print("Reversed:", flip_count, "of", len(blockades), file=sys.stderr)
    return oriented
def get_bias(blockades_file, model_file, cluster_size):
    """
    Gets AA-specific bias between the empirical and theoretical signals

    The peptide is taken from the first blockade of the first cluster and
    the theoretical signal is computed once from the loaded model. For every
    cluster and every window of WINDOW residues that contains no flanking
    "-", the difference (discretized signal - model signal) at the window
    index is recorded once for each residue of that window.

    Returns a defaultdict mapping a residue character to its list of
    recorded differences.
    """
    WINDOW = 4

    clusters = sp.preprocess_blockades(read_mat(blockades_file),
                                       cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide

    model = load_model(model_file)
    errors = defaultdict(list)
    model_signal = model.peptide_signal(peptide)

    # Neither of these depends on the cluster, so build them once
    flank = "-" * (WINDOW - 1)
    flanked_peptide = flank + peptide + flank
    num_peaks = len(peptide) + WINDOW - 1

    for cluster in clusters:
        discr_signal = sp.discretize(cluster.consensus, len(peptide))
        for pos in xrange(num_peaks):
            kmer = flanked_peptide[pos:pos + WINDOW]
            if "-" in kmer:
                continue
            delta = discr_signal[pos] - model_signal[pos]
            for aa in kmer:
                errors[aa].append(delta)

    return errors
def _detalize_cluster(identifier, cluster, top_id, target_id, ostream):
    """
    Prints information about each single blockade inside cluster

    Each blockade of ``cluster`` is re-processed as its own one-element
    cluster and ranked with ``identifier.rank_db_proteins``. For every such
    signal one line is written to ``ostream`` holding the rank positions of
    ``target_id`` and ``top_id`` (``None`` if the id is absent from that
    ranking). Afterwards the ten proteins with the lowest mean rank position
    over all signals are written.
    """
    single_blockades = sp.preprocess_blockades(cluster.blockades,
                                               cluster_size=1)
    positions_by_protein = defaultdict(list)
    # The loop variable deliberately does not reuse the name ``cluster``
    for num, single in enumerate(single_blockades):
        rankings = identifier.rank_db_proteins(single.consensus)

        # Reset for every signal: without this a missing id would report the
        # rank left over from the previous signal, or fail with
        # UnboundLocalError on the very first one.
        target_rank = None
        winner_rank = None
        for position, entry in enumerate(rankings):
            prot_id = entry[0]
            positions_by_protein[prot_id].append(position)
            if prot_id == target_id:
                target_rank = position
            if prot_id == top_id:
                winner_rank = position

        ostream.write("\tSignal {0}, target = {1}, consensus top = {2}\n"
                      .format(num, target_rank, winner_rank))

    mean_positions = [(prot, np.mean(positions))
                      for prot, positions in positions_by_protein.items()]
    mean_positions.sort(key=lambda item: item[1])

    ostream.write("\tRanking:\n")
    for prot, rank in mean_positions[:10]:
        ostream.write("\t\t{0}\t{1}\n".format(prot, rank))
def get_bias(blockades_file, model_file, cluster_size):
    """
    Gets AA-specific bias between the empirical and theoretical signals

    The peptide is taken from the first blockade of the first cluster and
    the theoretical signal is computed once from the loaded model. For every
    cluster and every window of WINDOW residues that contains no flanking
    "-", the difference (discretized signal - model signal) at the window
    index is recorded once for each residue of that window.

    Returns a defaultdict mapping a residue character to its list of
    recorded differences.
    """
    WINDOW = 4

    clusters = sp.preprocess_blockades(read_mat(blockades_file),
                                       cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide

    model = load_model(model_file)
    errors = defaultdict(list)
    model_signal = model.peptide_signal(peptide)

    # Neither of these depends on the cluster, so build them once
    flank = "-" * (WINDOW - 1)
    flanked_peptide = flank + peptide + flank
    num_peaks = len(peptide) + WINDOW - 1

    for cluster in clusters:
        discr_signal = sp.discretize(cluster.consensus, len(peptide))
        for pos in xrange(num_peaks):
            kmer = flanked_peptide[pos:pos + WINDOW]
            if "-" in kmer:
                continue
            delta = discr_signal[pos] - model_signal[pos]
            for aa in kmer:
                errors[aa].append(delta)

    return errors
def _detalize_cluster(identifier, cluster, top_id, target_id, ostream):
    """
    Prints information about each single blockade inside cluster

    Each blockade of ``cluster`` is re-processed as its own one-element
    cluster and ranked with ``identifier.rank_db_proteins``. For every such
    signal one line is written to ``ostream`` holding the rank positions of
    ``target_id`` and ``top_id`` (``None`` if the id is absent from that
    ranking). Afterwards the ten proteins with the lowest mean rank position
    over all signals are written.
    """
    single_blockades = sp.preprocess_blockades(cluster.blockades,
                                               cluster_size=1)
    positions_by_protein = defaultdict(list)
    # The loop variable deliberately does not reuse the name ``cluster``
    for num, single in enumerate(single_blockades):
        rankings = identifier.rank_db_proteins(single.consensus)

        # Reset for every signal: without this a missing id would report the
        # rank left over from the previous signal, or fail with
        # UnboundLocalError on the very first one.
        target_rank = None
        winner_rank = None
        for position, entry in enumerate(rankings):
            prot_id = entry[0]
            positions_by_protein[prot_id].append(position)
            if prot_id == target_id:
                target_rank = position
            if prot_id == top_id:
                winner_rank = position

        ostream.write("\tSignal {0}, target = {1}, consensus top = {2}\n"
                      .format(num, target_rank, winner_rank))

    mean_positions = [(prot, np.mean(positions))
                      for prot, positions in positions_by_protein.items()]
    mean_positions.sort(key=lambda item: item[1])

    ostream.write("\tRanking:\n")
    for prot, rank in mean_positions[:10]:
        ostream.write("\t\t{0}\t{1}\n".format(prot, rank))
def pvalues_test(blockades_file, cluster_size, blockade_model, db_file,
                 single_blockades, ostream):
    """
    Performs protein identification and report results

    The true peptide is read from the first blockade of the file. When
    ``db_file`` is None a random database of RANDOM_DB_SIZE entries is
    generated by the identifier and the target id is "target"; otherwise the
    database and the target id come from ``_make_database``.

    For every cluster one table row is written to ``ostream`` with the best
    hit, the target distance, the 1-based target rank and the p-value
    (0-based target rank divided by the database size). If
    ``single_blockades`` is true, per-blockade details follow each row.

    Returns (median p-value, median 0-based target rank as int).
    """
    RANDOM_DB_SIZE = 10000
    identifier = Identifier(blockade_model)

    blockades = read_mat(blockades_file)
    true_peptide = blockades[0].peptide
    if db_file is not None:
        database, target_id = _make_database(db_file, true_peptide)
        identifier.set_database(database)
        db_len = len(database)
    else:
        identifier.random_database(true_peptide, RANDOM_DB_SIZE)
        target_id = "target"
        db_len = RANDOM_DB_SIZE

    clusters = sp.preprocess_blockades(blockades, cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)

    ostream.write("\nNo\tSize\tBest_id\t\tBest_dst\tTrg_dst\t\tTrg_rank\t"
                  "Trg_pval\n")
    row_format = "{0}\t{1}\t{2:10}\t{3:5.2f}\t\t{4:5.2f}\t\t{5}\t\t{6:6.4}\n"

    p_values = []
    ranks = []
    for index, cluster in enumerate(clusters, 1):
        db_ranking = identifier.rank_db_proteins(cluster.consensus)

        # No early exit: if the id occurred more than once, the last
        # occurrence is the one that is kept.
        target_rank = None
        target_dist = None
        for position, (prot_id, prot_dist) in enumerate(db_ranking):
            if prot_id == target_id:
                target_rank, target_dist = position, prot_dist

        p_value = float(target_rank) / db_len
        p_values.append(p_value)
        ranks.append(target_rank)

        best_id, best_dist = db_ranking[0]
        ostream.write(row_format.format(index, len(cluster.blockades),
                                        best_id, best_dist, target_dist,
                                        target_rank + 1, p_value))
        if single_blockades:
            _detalize_cluster(identifier, cluster, best_id,
                              target_id, ostream)

    median_pvalue = np.median(p_values)
    ostream.write("\nMedian p-value: {0:7.4f}\n".format(median_pvalue))
    median_rank = int(np.median(ranks))
    ostream.write("Median target rank: {0:d}\n".format(median_rank))

    return median_pvalue, median_rank
def pvalues_test(blockades_file, cluster_size, blockade_model, db_file,
                 single_blockades, ostream):
    """
    Performs protein identification and report results

    The true peptide is read from the first blockade of the file. When
    ``db_file`` is None a random database of RANDOM_DB_SIZE entries is
    generated by the identifier and the target id is "target"; otherwise the
    database and the target id come from ``_make_database``.

    For every cluster one table row is written to ``ostream`` with the best
    hit, the target distance, the 1-based target rank and the p-value
    (0-based target rank divided by the database size). If
    ``single_blockades`` is true, per-blockade details follow each row.

    Returns (median p-value, median 0-based target rank as int).
    """
    RANDOM_DB_SIZE = 10000
    identifier = Identifier(blockade_model)

    blockades = read_mat(blockades_file)
    true_peptide = blockades[0].peptide
    if db_file is not None:
        database, target_id = _make_database(db_file, true_peptide)
        identifier.set_database(database)
        db_len = len(database)
    else:
        identifier.random_database(true_peptide, RANDOM_DB_SIZE)
        target_id = "target"
        db_len = RANDOM_DB_SIZE

    clusters = sp.preprocess_blockades(blockades, cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)

    ostream.write("\nNo\tSize\tBest_id\t\tBest_dst\tTrg_dst\t\tTrg_rank\t"
                  "Trg_pval\n")
    row_format = "{0}\t{1}\t{2:10}\t{3:5.2f}\t\t{4:5.2f}\t\t{5}\t\t{6:6.4}\n"

    p_values = []
    ranks = []
    for index, cluster in enumerate(clusters, 1):
        db_ranking = identifier.rank_db_proteins(cluster.consensus)

        # No early exit: if the id occurred more than once, the last
        # occurrence is the one that is kept.
        target_rank = None
        target_dist = None
        for position, (prot_id, prot_dist) in enumerate(db_ranking):
            if prot_id == target_id:
                target_rank, target_dist = position, prot_dist

        p_value = float(target_rank) / db_len
        p_values.append(p_value)
        ranks.append(target_rank)

        best_id, best_dist = db_ranking[0]
        ostream.write(row_format.format(index, len(cluster.blockades),
                                        best_id, best_dist, target_dist,
                                        target_rank + 1, p_value))
        if single_blockades:
            _detalize_cluster(identifier, cluster, best_id,
                              target_id, ostream)

    median_pvalue = np.median(p_values)
    ostream.write("\nMedian p-value: {0:7.4f}\n".format(median_pvalue))
    median_rank = int(np.median(ranks))
    ostream.write("Median target rank: {0:d}\n".format(median_rank))

    return median_pvalue, median_rank
def _get_peptides_signals(mat_files):
    """
    Collects peptides and discretized signals from a list of mat files

    Every file is read and preprocessed with cluster_size=TRAIN_AVG. The
    peptide of a file is taken from the first blockade of its first cluster
    and is repeated once per cluster, next to that cluster's consensus
    discretized to the peptide length.

    Returns two parallel lists: (peptides, signals).
    """
    TRAIN_AVG = 1

    peptides = []
    signals = []
    for mat_path in mat_files:
        clusters = sp.preprocess_blockades(read_mat(mat_path),
                                           cluster_size=TRAIN_AVG,
                                           min_dwell=0.5, max_dwell=20)
        mat_peptide = clusters[0].blockades[0].peptide
        peptide_len = len(mat_peptide)

        peptides += [mat_peptide] * len(clusters)
        signals += [sp.discretize(cluster.consensus, peptide_len)
                    for cluster in clusters]

    return peptides, signals
def plot_blockades(blockades_file, model_files, cluster_size, show_text):
    """
    Pretty plotting

    For every cluster of the file the consensus signal is drawn against the
    peptide positions, together with the signal predicted by each loaded
    model, linearly interpolated onto the sample points of the consensus.
    The value 1 - distance.correlation(consensus, model) is printed to
    stderr for each model. If ``show_text`` is true, the residue letters are
    drawn at the positions returned by ``_get_aa_positions``. ``plt.show()``
    is called once per cluster.
    """
    WINDOW = 4

    clusters = sp.preprocess_blockades(read_mat(blockades_file),
                                       cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide
    x_limit = len(peptide) + 1

    models = [load_model(model_file) for model_file in model_files]

    for cluster in clusters:
        consensus = cluster.consensus
        signal_length = len(consensus)
        x_axis = np.linspace(0, x_limit, signal_length)

        matplotlib.rcParams.update({"font.size": 16})
        axes = plt.subplot()

        # Keep only the left and bottom axis lines and ticks
        axes.spines["right"].set_visible(False)
        axes.spines["top"].set_visible(False)
        axes.get_xaxis().tick_bottom()
        axes.get_yaxis().tick_left()
        axes.set_xlim(0, x_limit)
        axes.set_xlabel("Putative AA position")
        axes.set_ylabel("Normalized signal")

        axes.plot(x_axis, consensus, label="Empirical signal",
                  linewidth=1.5)

        for model in models:
            model_signal = model.peptide_signal(peptide)
            num_points = len(model_signal)
            # Spread the model points over [0, signal_length]; the division
            # is written exactly as before so its semantics do not change
            model_grid = [i * signal_length / (num_points - 1)
                          for i in xrange(num_points)]

            interp_fun = interp1d(model_grid, model_signal, kind="linear")
            model_interp = interp_fun(xrange(signal_length))
            corr = 1 - distance.correlation(consensus, model_interp)
            print("{0} correlation: {1:5.2f}\t".format(model.name, corr),
                  file=sys.stderr)

            axes.plot(x_axis, model_interp, label=model.name, linewidth=2)

        legend = axes.legend(loc="lower left", frameon=False)
        for legend_line in legend.get_lines():
            legend_line.set_linewidth(2)
        for legend_text in legend.get_texts():
            legend_text.set_fontsize(16)

        if show_text:
            # Residue letters, drawn slightly below the mean signal level
            text_level = np.mean(consensus) - 2
            acids_pos = _get_aa_positions(peptide, WINDOW, x_axis[-1])
            for i, aa in enumerate(peptide):
                axes.text(acids_pos[i], text_level, aa, fontsize=16)

        plt.show()
def plot_blockades(blockades_file, model_files, cluster_size, show_text):
    """
    Pretty plotting

    For every cluster of the file the consensus signal is drawn against the
    peptide positions, together with the signal predicted by each loaded
    model, linearly interpolated onto the sample points of the consensus.
    The value 1 - distance.correlation(consensus, model) is printed to
    stderr for each model. If ``show_text`` is true, the residue letters are
    drawn at the positions returned by ``_get_aa_positions``. ``plt.show()``
    is called once per cluster.
    """
    WINDOW = 4

    clusters = sp.preprocess_blockades(read_mat(blockades_file),
                                       cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide
    x_limit = len(peptide) + 1

    models = [load_model(model_file) for model_file in model_files]

    for cluster in clusters:
        consensus = cluster.consensus
        signal_length = len(consensus)
        x_axis = np.linspace(0, x_limit, signal_length)

        matplotlib.rcParams.update({"font.size": 16})
        axes = plt.subplot()

        # Keep only the left and bottom axis lines and ticks
        axes.spines["right"].set_visible(False)
        axes.spines["top"].set_visible(False)
        axes.get_xaxis().tick_bottom()
        axes.get_yaxis().tick_left()
        axes.set_xlim(0, x_limit)
        axes.set_xlabel("Putative AA position")
        axes.set_ylabel("Normalized signal")

        axes.plot(x_axis, consensus, label="Empirical signal",
                  linewidth=1.5)

        for model in models:
            model_signal = model.peptide_signal(peptide)
            num_points = len(model_signal)
            # Spread the model points over [0, signal_length]; the division
            # is written exactly as before so its semantics do not change
            model_grid = [i * signal_length / (num_points - 1)
                          for i in xrange(num_points)]

            interp_fun = interp1d(model_grid, model_signal, kind="linear")
            model_interp = interp_fun(xrange(signal_length))
            corr = 1 - distance.correlation(consensus, model_interp)
            print("{0} correlation: {1:5.2f}\t".format(model.name, corr),
                  file=sys.stderr)

            axes.plot(x_axis, model_interp, label=model.name, linewidth=2)

        legend = axes.legend(loc="lower left", frameon=False)
        for legend_line in legend.get_lines():
            legend_line.set_linewidth(2)
        for legend_text in legend.get_texts():
            legend_text.set_fontsize(16)

        if show_text:
            # Residue letters, drawn slightly below the mean signal level
            text_level = np.mean(consensus) - 2
            acids_pos = _get_aa_positions(peptide, WINDOW, x_axis[-1])
            for i, aa in enumerate(peptide):
                axes.text(acids_pos[i], text_level, aa, fontsize=16)

        plt.show()