def correlation(mat_file_1, mat_file_2):
    """
    Draws a scatter plot of self- versus cross-correlation of blockades.

    Blockades of both files are converted with sp._fractional_blockades,
    filtered with sp._filter_by_duration(…, 0.5, 20) and every trimmed
    event trace is discretized into 20 points. For each blockade of the
    first file the plot shows its mean correlation with the blockades of
    the first file (x axis) against its mean correlation with the
    blockades of the second file (y axis). Red dashed lines mark the
    medians of both distributions.

    mat_file_1 -- path to the blockades file of the reference dataset
    mat_file_2 -- path to the blockades file of the dataset compared to it

    Returns nothing; the figure is displayed with plt.show().
    """
    def _discretized_traces(mat_file):
        # A real list is required here: the traces are walked repeatedly in
        # the nested loops below, and a lazy map() object (Python 3) would be
        # exhausted after the first pass, silently producing empty means.
        blockades = read_mat(mat_file)
        blockades = sp._fractional_blockades(blockades)
        blockades = sp._filter_by_duration(blockades, 0.5, 20)
        return [sp.discretize(sp._trim_flank_noise(b.eventTrace), 20)
                for b in blockades]

    blockades_1 = _discretized_traces(mat_file_1)
    blockades_2 = _discretized_traces(mat_file_2)

    self_corr = []
    cross_corr = []
    for blockade in blockades_1:
        # distance.correlation is turned into a similarity as 1 - distance.
        # NOTE(review): the "self" set contains the blockade itself, so every
        # self mean includes one comparison of a trace with itself.
        block_self = [1 - distance.correlation(blockade, other)
                      for other in blockades_1]
        block_cross = [1 - distance.correlation(blockade, other)
                       for other in blockades_2]
        self_corr.append(np.mean(block_self))
        cross_corr.append(np.mean(block_cross))

    median_self = np.median(self_corr)
    median_cross = np.median(cross_corr)

    matplotlib.rcParams.update({"font.size": 16})
    fig = plt.subplot()

    fig.spines["right"].set_visible(False)
    fig.spines["top"].set_visible(False)
    fig.get_xaxis().tick_bottom()
    fig.get_yaxis().tick_left()
    fig.set_xlim(-0.6, 0.6)
    fig.set_ylim(-0.6, 0.6)
    # NOTE(review): the axis labels are hard-coded for one particular pair
    # of datasets and do not depend on the files passed in.
    fig.set_xlabel("(H3 tail, H3 tail) correlation")
    fig.set_ylabel("(H3 tail, CCL5) correlation")

    # Thin black grid lines
    for y in [-0.4, -0.2, 0, 0.2, 0.4]:
        plt.plot((-0.6, 0.6), (y, y), "--", lw=0.5, color="black")
        plt.plot((y, y), (-0.6, 0.6), "--", lw=0.5, color="black")

    # Medians of the cross (horizontal) and self (vertical) correlations
    plt.plot((-0.6, 0.6), (median_cross, median_cross), "--",
             lw=1.5, color="red")
    plt.plot((median_self, median_self), (-0.6, 0.6), "--",
             lw=1.5, color="red")

    fig.scatter(self_corr, cross_corr, linewidth=0.5, c="dodgerblue",
                s=30, edgecolor="blue")

    plt.tight_layout()
    plt.show()
def frequency_distribution(blockades_file, detailed):
    """
    Plots the distribution of peak frequencies of blockades.

    For every blockade that passes sp._filter_by_duration(…, 0.5, 20), the
    number of peaks found by sp.find_peaks in the trimmed event trace is
    divided by the blockade's ms_Dwell and scaled by 5 / 4. Two subplots are
    shown: the deviation of each frequency from the mean against ms_Dwell,
    and a 100-bin histogram of the frequencies.

    blockades_file -- path to the file with blockades
    detailed       -- if true, detailed_plots() is called for each blockade

    Returns nothing; the figure is displayed with plt.show().
    """
    blockades = read_mat(blockades_file)
    blockades = sp._fractional_blockades(blockades)
    blockades = sp._filter_by_duration(blockades, 0.5, 20)

    # Parallel lists instead of a dict keyed by blockade objects: no
    # hashability requirement on blockades, a stable order, and plain lists
    # that numpy / matplotlib accept under both Python 2 and Python 3.
    frequencies = []
    lengths = []
    for blockade in blockades:
        if detailed:
            detailed_plots(blockade)

        # NOTE(review): 1000 samples are dropped on each side and the result
        # is scaled by 5 / 4 -- presumably to skip flanking signal and to
        # compensate for it; confirm against the recording format.
        signal = blockade.eventTrace[1000:-1000]
        peaks_x, _ = sp.find_peaks(signal)
        frequencies.append(len(peaks_x) / blockade.ms_Dwell * 5 / 4)
        lengths.append(blockade.ms_Dwell)

    mean = np.mean(frequencies)
    errors = [freq - mean for freq in frequencies]

    _, (s1, s2) = plt.subplots(2)
    s1.scatter(lengths, errors)
    s2.hist(frequencies, bins=100)
    plt.show()
def get_bias(blockades_file, model_file, cluster_size):
    """
    Gets AA-specific bias between the empirical and theoretical signals.

    Returns a dict that maps every amino acid to the list of differences
    (discretized consensus minus model signal) collected at all positions
    whose WINDOW-sized k-mer contains that amino acid. K-mers that touch
    the "-" padding are skipped.
    """
    WINDOW = 4

    clusters = sp.preprocess_blockades(read_mat(blockades_file),
                                       cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    # The peptide is taken from the very first blockade of the first cluster
    peptide = clusters[0].blockades[0].peptide

    blockade_model = load_model(model_file)
    errors = defaultdict(list)
    model_signal = blockade_model.peptide_signal(peptide)

    # Peptide padded with WINDOW - 1 gap symbols on both sides
    padding = "-" * (WINDOW - 1)
    flanked_peptide = padding + peptide + padding
    num_peaks = len(peptide) + WINDOW - 1

    for cluster in clusters:
        discr_signal = sp.discretize(cluster.consensus, len(peptide))

        for pos in xrange(num_peaks):
            kmer = flanked_peptide[pos:pos + WINDOW]
            if "-" in kmer:
                continue
            # Every amino acid of the k-mer receives the same difference
            for aa in kmer:
                errors[aa].append(discr_signal[pos] - model_signal[pos])

    return errors
def get_bias(blockades_file, model_file, cluster_size):
    """
    Collects, per amino acid, the deviations of the measured signal
    from the model signal.

    For each cluster the consensus is discretized to len(peptide) points.
    Every position whose WINDOW-long k-mer of the "-"-flanked peptide has
    no "-" contributes (empirical - model) at that position to the list of
    each amino acid in the k-mer. The result is a defaultdict(list) keyed
    by amino acid.
    """
    WINDOW = 4
    raw_blockades = read_mat(blockades_file)
    clusters = sp.preprocess_blockades(raw_blockades,
                                       cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide
    blockade_model = load_model(model_file)

    errors = defaultdict(list)
    model_signal = blockade_model.peptide_signal(peptide)
    for cluster in clusters:
        discr_signal = sp.discretize(cluster.consensus, len(peptide))

        flank = "-" * (WINDOW - 1)
        flanked_peptide = flank + peptide + flank
        last_start = len(peptide) + WINDOW - 1

        start = 0
        while start < last_start:
            window = flanked_peptide[start:start + WINDOW]
            # Windows overlapping the flanks carry no complete k-mer
            if "-" not in window:
                for residue in window:
                    delta = discr_signal[start] - model_signal[start]
                    errors[residue].append(delta)
            start += 1

    return errors
def pvalues_test(blockades_file, cluster_size, blockade_model, db_file,
                 single_blockades, ostream):
    """
    Performs protein identification and reports the results.

    The peptide of the first blockade is treated as the true target. With
    db_file set to None a random database of RANDOM_DB_SIZE entries is
    generated around it, otherwise the database is built from db_file.
    For every cluster one row is written to ostream; the p-value of a row
    is the zero-based rank of the target divided by the database size.

    Returns a tuple (median p-value, median target rank as int).
    """
    RANDOM_DB_SIZE = 10000

    identifier = Identifier(blockade_model)
    blockades = read_mat(blockades_file)
    true_peptide = blockades[0].peptide

    if db_file is not None:
        database, target_id = _make_database(db_file, true_peptide)
        identifier.set_database(database)
        db_len = len(database)
    else:
        identifier.random_database(true_peptide, RANDOM_DB_SIZE)
        target_id = "target"
        db_len = RANDOM_DB_SIZE

    clusters = sp.preprocess_blockades(blockades, cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)

    ostream.write("\nNo\tSize\tBest_id\t\tBest_dst\tTrg_dst\t\tTrg_rank\t"
                  "Trg_pval\n")
    row_format = ("{0}\t{1}\t{2:10}\t{3:5.2f}\t\t{4:5.2f}\t\t{5}"
                  "\t\t{6:6.4}\n")

    p_values = []
    ranks = []
    for row_no, cluster in enumerate(clusters, 1):
        db_ranking = identifier.rank_db_proteins(cluster.consensus)

        # Scan the whole ranking; if the id occurs more than once,
        # the last occurrence wins.
        target_rank, target_dist = None, None
        for rank, (prot_id, prot_dist) in enumerate(db_ranking):
            if prot_id == target_id:
                target_rank, target_dist = rank, prot_dist

        # NOTE(review): when the target is absent from the ranking,
        # target_rank stays None and float() raises TypeError here.
        p_value = float(target_rank) / db_len
        p_values.append(p_value)
        ranks.append(target_rank)

        best_id, best_dist = db_ranking[0][0], db_ranking[0][1]
        ostream.write(row_format.format(row_no, len(cluster.blockades),
                                        best_id, best_dist, target_dist,
                                        target_rank + 1, p_value))
        if single_blockades:
            _detalize_cluster(identifier, cluster, db_ranking[0][0],
                              target_id, ostream)

    median_pvalue = np.median(p_values)
    ostream.write("\nMedian p-value: {0:7.4f}\n".format(median_pvalue))
    median_rank = int(np.median(ranks))
    ostream.write("Median target rank: {0:d}\n".format(median_rank))

    return median_pvalue, median_rank
def main():
    """
    Merges multiple files with blockades into one.

    Command line: all arguments except the last are input mat files, the
    last argument is the output mat file.

    Returns 0 on success and 1 if too few arguments were given.
    """
    if len(sys.argv) < 3:
        # Usage goes to stderr (not stdout) so that it never mixes with
        # regular output; sys.stderr.write() is used because it needs no
        # print_function import.
        sys.stderr.write("usage: merge-mat.py mat_1[,mat_2..] out_mat\n\n"
                         "Merge multiple files with blockades into one\n")
        return 1

    blockades = []
    for mat_file in sys.argv[1:-1]:
        blockades.extend(read_mat(mat_file))

    write_mat(blockades, sys.argv[-1])
    return 0
def main():
    """
    Adds a protein sequence record to every blockade of a mat file.

    Command line: mat_file prot_sequence. The mat file is read, the
    peptide attribute of each blockade is set to the given sequence and
    the result is written back to the same path.

    Returns 0 on success and 1 on wrong number of arguments.
    """
    args = sys.argv
    if len(args) == 3:
        mat_path = args[1]
        labeled = read_mat(mat_path)
        for record in labeled:
            record.peptide = args[2]
        # The input file is overwritten in place
        write_mat(labeled, mat_path)
        return 0

    print("usage: protein-label.py mat_file prot_sequence\n\n"
          "Add protein sequence record into the mat file "
          "with blockades", file=sys.stderr)
    return 1
def pvalues_test(blockades_file, cluster_size, blockade_model, db_file,
                 single_blockades, ostream):
    """
    Identifies the protein behind each cluster and writes a report.

    blockades_file   -- path to the file with blockades
    cluster_size     -- passed to sp.preprocess_blockades
    blockade_model   -- model handed to Identifier
    db_file          -- database file, or None to use a random database
    single_blockades -- if true, _detalize_cluster is called per cluster
    ostream          -- object with write(), receives the report

    Returns (median of the p-values, int of the median target rank), where
    a p-value is the zero-based target rank divided by the database size.
    """
    RANDOM_DB_SIZE = 10000
    identifier = Identifier(blockade_model)

    blockades = read_mat(blockades_file)
    true_peptide = blockades[0].peptide

    if db_file is None:
        identifier.random_database(true_peptide, RANDOM_DB_SIZE)
        target_id, db_len = "target", RANDOM_DB_SIZE
    else:
        database, target_id = _make_database(db_file, true_peptide)
        identifier.set_database(database)
        db_len = len(database)

    clusters = sp.preprocess_blockades(blockades, cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)

    header = ("\nNo\tSize\tBest_id\t\tBest_dst\tTrg_dst\t\tTrg_rank\t"
              "Trg_pval\n")
    ostream.write(header)

    p_values, ranks = [], []
    for num, cluster in enumerate(clusters):
        db_ranking = identifier.rank_db_proteins(cluster.consensus)

        # All (rank, distance) hits of the target; the last one is used
        hits = [(rank, prot_dist)
                for rank, (prot_id, prot_dist) in enumerate(db_ranking)
                if prot_id == target_id]
        if hits:
            target_rank, target_dist = hits[-1]
        else:
            # NOTE(review): float(None) below raises TypeError in this case
            target_rank, target_dist = None, None

        p_value = float(target_rank) / db_len
        p_values.append(p_value)
        ranks.append(target_rank)

        line = "{0}\t{1}\t{2:10}\t{3:5.2f}\t\t{4:5.2f}\t\t{5}\t\t{6:6.4}\n".format(
            num + 1, len(cluster.blockades), db_ranking[0][0],
            db_ranking[0][1], target_dist, target_rank + 1, p_value)
        ostream.write(line)

        if not single_blockades:
            continue
        _detalize_cluster(identifier, cluster, db_ranking[0][0],
                          target_id, ostream)

    ostream.write("\nMedian p-value: {0:7.4f}\n".format(np.median(p_values)))
    ostream.write("Median target rank: {0:d}\n"
                  .format(int(np.median(ranks))))

    return np.median(p_values), int(np.median(ranks))
def _get_peptides_signals(mat_files):
    """
    Builds parallel lists of peptides and discretized consensus signals.

    Every mat file is clustered with cluster size TRAIN_AVG; each cluster
    contributes the file's peptide (taken from the first blockade of the
    first cluster) and its consensus discretized to len(peptide) points.

    Returns a tuple (peptides, signals) of equally long lists.
    """
    TRAIN_AVG = 1

    peptides, signals = [], []
    for mat_path in mat_files:
        clusters = sp.preprocess_blockades(read_mat(mat_path),
                                           cluster_size=TRAIN_AVG,
                                           min_dwell=0.5, max_dwell=20)
        mat_peptide = clusters[0].blockades[0].peptide

        # One peptide entry per cluster of this file
        peptides += [mat_peptide] * len(clusters)
        signals.extend([sp.discretize(cluster.consensus, len(mat_peptide))
                        for cluster in clusters])

    return peptides, signals
def main():
    """
    Orients blockade signals according to the AA order in the protein.

    Command line: blockades_in model_file flipped_out. The input blockades
    are read, passed through flip() together with the model file and the
    result is written to the output file.

    Returns 0 on success and 1 on wrong number of arguments.
    """
    if len(sys.argv) != 4:
        # Usage goes to stderr (not stdout) so that it never mixes with
        # regular output; sys.stderr.write() is used because it needs no
        # print_function import.
        sys.stderr.write(
            "usage: flip-blockades.py blockades_in model_file flipped_out\n\n"
            "Orients blockade signals according to the AA order "
            "in the protein of origin\n")
        return 1

    blockades_in = sys.argv[1]
    svr_file = sys.argv[2]
    blockades_out = sys.argv[3]

    blockades = read_mat(blockades_in)
    rev_blockades = flip(blockades, svr_file)
    write_mat(rev_blockades, blockades_out)

    return 0
def frequency_plot(blockade_files):
    """
    Draws kernel density estimates of peak frequencies, one per dataset.

    For every file the blockades are converted with
    sp._fractional_blockades and filtered with
    sp._filter_by_duration(…, 0.5, 20). The frequency of a blockade is the
    number of peaks found by sp.find_peaks in its trimmed event trace,
    divided by ms_Dwell and scaled by 5 / 4. Each dataset is labelled with
    the base name of its file up to the first dot.

    blockade_files -- iterable of paths to files with blockades

    Returns nothing; the figure is displayed with plt.show().
    """
    datasets_names = []
    frequencies = []
    for blockades_path in blockade_files:
        blockades = read_mat(blockades_path)
        blockades = sp._fractional_blockades(blockades)
        blockades = sp._filter_by_duration(blockades, 0.5, 20)

        dataset_freqs = []
        for blockade in blockades:
            # NOTE(review): 1000 samples are dropped on each side and the
            # result is scaled by 5 / 4 -- presumably related; confirm.
            peaks_x, _ = sp.find_peaks(blockade.eventTrace[1000:-1000])
            dataset_freqs.append(len(peaks_x) / blockade.ms_Dwell * 5 / 4)

        frequencies.append(dataset_freqs)
        datasets_names.append(os.path.basename(blockades_path).split(".")[0])

    # Common x grid that covers all datasets with a margin of 10
    all_freqs = [freq for dataset in frequencies for freq in dataset]
    x_axis = np.arange(min(all_freqs) - 10, max(all_freqs) + 10, 0.1)

    matplotlib.rcParams.update({"font.size": 16})
    fig = plt.subplot()

    # Axis styling does not depend on the dataset, so it is done once
    fig.spines["right"].set_visible(False)
    fig.spines["top"].set_visible(False)
    fig.get_xaxis().tick_bottom()
    fig.get_yaxis().tick_left()
    fig.set_ylim(0, 0.16)

    colors = ["blue", "green", "red", "cyan"]
    for idx, (distr, name) in enumerate(zip(frequencies, datasets_names)):
        # Colours are reused cyclically so that no dataset is dropped when
        # more files than colours are given.
        color = colors[idx % len(colors)]

        # A scalar bw_method is used directly as the KDE bandwidth factor.
        # Previously covariance_factor was overridden but
        # _compute_covariance was only referenced, never called, so the
        # override had no effect on the estimate.
        density = gaussian_kde(distr, bw_method=0.25)
        gauss_dens = density(x_axis)

        fig.plot(x_axis, gauss_dens, antialiased=True, linewidth=2,
                 color=color, alpha=0.7, label=name)
        fig.fill_between(x_axis, gauss_dens, alpha=0.5, antialiased=True,
                         color=color)

    fig.set_xlabel("Fluctuation frequency, 1/ms")
    legend = fig.legend(loc="upper left", frameon=False)
    for line in legend.get_lines():
        line.set_linewidth(3)
    for text in legend.get_texts():
        text.set_fontsize(16)

    plt.show()
def plot_blockades(blockades_file, model_files, cluster_size, show_text):
    """
    Plots the consensus of every cluster together with model signals.

    For each cluster a figure is shown with the empirical consensus and,
    for every model loaded from model_files, the model signal of the
    peptide linearly interpolated to the consensus length. The correlation
    (1 - distance.correlation) between the consensus and each interpolated
    model signal is printed to stderr. With show_text set, the amino acid
    letters of the peptide are drawn at the positions returned by
    _get_aa_positions.
    """
    WINDOW = 4

    clusters = sp.preprocess_blockades(read_mat(blockades_file),
                                       cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide
    models = [load_model(model_file) for model_file in model_files]

    for cluster in clusters:
        signal_length = len(cluster.consensus)
        x_axis = np.linspace(0, len(peptide) + 1, signal_length)

        matplotlib.rcParams.update({"font.size": 16})
        axes = plt.subplot()

        for side in ("right", "top"):
            axes.spines[side].set_visible(False)
        axes.get_xaxis().tick_bottom()
        axes.get_yaxis().tick_left()
        axes.set_xlim(0, len(peptide) + 1)
        axes.set_xlabel("Putative AA position")
        axes.set_ylabel("Normalized signal")

        axes.plot(x_axis, cluster.consensus, label="Empirical signal",
                  linewidth=1.5)

        # Model signals, stretched onto the sample grid of the consensus
        for model in models:
            model_signal = model.peptide_signal(peptide)
            num_points = len(model_signal)
            model_grid = [i * signal_length / (num_points - 1)
                          for i in xrange(num_points)]

            interp_fun = interp1d(model_grid, model_signal, kind="linear")
            model_interp = interp_fun(xrange(signal_length))

            corr = 1 - distance.correlation(cluster.consensus, model_interp)
            print("{0} correlation: {1:5.2f}\t".format(model.name, corr),
                  file=sys.stderr)
            axes.plot(x_axis, model_interp, label=model.name, linewidth=2)

        legend = axes.legend(loc="lower left", frameon=False)
        for legend_line in legend.get_lines():
            legend_line.set_linewidth(2)
        for legend_text in legend.get_texts():
            legend_text.set_fontsize(16)

        if show_text:
            # Amino acid letters, placed 2 units below the signal mean
            event_mean = np.mean(cluster.consensus)
            acids_pos = _get_aa_positions(peptide, WINDOW, x_axis[-1])
            for idx, aa in enumerate(peptide):
                axes.text(acids_pos[idx], event_mean - 2, aa, fontsize=16)

        plt.show()
def plot_blockades(blockades_file, model_files, cluster_size, show_text):
    """
    Pretty plotting of cluster consensuses against model predictions.

    blockades_file -- path to the file with blockades
    model_files    -- iterable of model files, each loaded with load_model
    cluster_size   -- passed to sp.preprocess_blockades
    show_text      -- if true, peptide letters are drawn on the plot

    One figure per cluster is shown with plt.show(); the correlation of
    every model with the consensus is printed to stderr.
    """
    WINDOW = 4
    blockades = read_mat(blockades_file)
    clusters = sp.preprocess_blockades(blockades, cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide

    models = []
    models.extend(load_model(path) for path in model_files)

    for cluster in clusters:
        signal_length = len(cluster.consensus)
        x_max = len(peptide) + 1
        x_axis = np.linspace(0, x_max, signal_length)

        matplotlib.rcParams.update({"font.size": 16})
        fig = plt.subplot()

        fig.spines["right"].set_visible(False)
        fig.spines["top"].set_visible(False)
        fig.get_xaxis().tick_bottom()
        fig.get_yaxis().tick_left()
        fig.set_xlim(0, x_max)
        fig.set_xlabel("Putative AA position")
        fig.set_ylabel("Normalized signal")

        fig.plot(x_axis, cluster.consensus, label="Empirical signal",
                 linewidth=1.5)

        for model in models:
            model_signal = model.peptide_signal(peptide)

            # Grid that spreads the model points over the consensus samples
            model_grid = []
            for i in xrange(len(model_signal)):
                model_grid.append(i * signal_length / (len(model_signal) - 1))

            interp_fun = interp1d(model_grid, model_signal, kind="linear")
            model_interp = interp_fun(xrange(signal_length))

            corr = 1 - distance.correlation(cluster.consensus, model_interp)
            message = "{0} correlation: {1:5.2f}\t".format(model.name, corr)
            print(message, file=sys.stderr)

            fig.plot(x_axis, model_interp, label=model.name, linewidth=2)

        legend = fig.legend(loc="lower left", frameon=False)
        for handle in legend.get_lines():
            handle.set_linewidth(2)
        for caption in legend.get_texts():
            caption.set_fontsize(16)

        if not show_text:
            plt.show()
            continue

        # Adding AAs text
        event_mean = np.mean(cluster.consensus)
        acids_pos = _get_aa_positions(peptide, WINDOW, x_axis[-1])
        for i, aa in enumerate(peptide):
            fig.text(acids_pos[i], event_mean - 2, aa, fontsize=16)

        plt.show()
def correlation(mat_file_1, mat_file_2):
    """
    Draws the self- versus cross-correlation scatter plot of two datasets.

    Both files go through the same pipeline: sp._fractional_blockades,
    sp._filter_by_duration(…, 0.5, 20), then each event trace is trimmed
    with sp._trim_flank_noise and discretized into 20 points. Every
    blockade of the first file becomes one point: x is its mean correlation
    with the blockades of the first file, y its mean correlation with the
    blockades of the second file. The medians of x and y are drawn as red
    dashed lines.

    mat_file_1 -- path to the blockades file of the reference dataset
    mat_file_2 -- path to the blockades file of the dataset compared to it

    Returns nothing; the figure is displayed with plt.show().
    """
    datasets = []
    for mat_file in (mat_file_1, mat_file_2):
        blockades = read_mat(mat_file)
        blockades = sp._fractional_blockades(blockades)
        blockades = sp._filter_by_duration(blockades, 0.5, 20)
        # Materialized as a list on purpose: both datasets are iterated
        # once per blockade below, which a lazy map() object (Python 3)
        # would not survive.
        datasets.append(
            [sp.discretize(sp._trim_flank_noise(b.eventTrace), 20)
             for b in blockades])
    blockades_1, blockades_2 = datasets

    def _mean_correlation(blockade, others):
        # distance.correlation is converted to a similarity as 1 - distance
        return np.mean([1 - distance.correlation(blockade, other)
                        for other in others])

    # NOTE(review): the self set includes the blockade itself.
    self_corr = [_mean_correlation(b, blockades_1) for b in blockades_1]
    cross_corr = [_mean_correlation(b, blockades_2) for b in blockades_1]

    median_self = np.median(self_corr)
    median_cross = np.median(cross_corr)

    matplotlib.rcParams.update({"font.size": 16})
    fig = plt.subplot()

    fig.spines["right"].set_visible(False)
    fig.spines["top"].set_visible(False)
    fig.get_xaxis().tick_bottom()
    fig.get_yaxis().tick_left()
    fig.set_xlim(-0.6, 0.6)
    fig.set_ylim(-0.6, 0.6)
    # NOTE(review): labels are hard-coded for one particular dataset pair.
    fig.set_xlabel("(H3 tail, H3 tail) correlation")
    fig.set_ylabel("(H3 tail, CCL5) correlation")

    # Thin black grid lines
    for y in [-0.4, -0.2, 0, 0.2, 0.4]:
        plt.plot((-0.6, 0.6), (y, y), "--", lw=0.5, color="black")
        plt.plot((y, y), (-0.6, 0.6), "--", lw=0.5, color="black")

    # Median markers
    plt.plot((-0.6, 0.6), (median_cross, median_cross), "--",
             lw=1.5, color="red")
    plt.plot((median_self, median_self), (-0.6, 0.6), "--",
             lw=1.5, color="red")

    fig.scatter(self_corr, cross_corr, linewidth=0.5, c="dodgerblue",
                s=30, edgecolor="blue")

    plt.tight_layout()
    plt.show()