def run_threshold_analysis(output_directory, title, input_repertoires,
                           trim_first=True, rawtext=True, k=1):
    """Run a clustering-threshold analysis over a set of repertoire files.

    Reads every file in *input_repertoires* with txttoseqlist(), pools all
    sequences, records which repertoires each sequence appears in, and prints
    the result of generate_threshold_plot() for the pooled data.

    Parameters
    ----------
    output_directory : str
        Directory the threshold plot is written to.
    title : str
        Title/filename stem for the plot.
    input_repertoires : list of str
        Paths to the repertoire files to analyse.
    trim_first, rawtext : bool
        Passed through to txttoseqlist().
    k : int
        Passed through to generate_threshold_plot().
    """
    all_seqs = []
    repertoire_dict = {}        # sequence -> list of repertoire numbers containing it
    cluster_number_to_rep = {}  # repertoire number -> source file path
    # read in the clusters; repertoires are numbered from 1
    for i, file in enumerate(input_repertoires, start=1):
        cluster_number_to_rep[i] = file
        content = txttoseqlist(file, trim_first=trim_first, rawtext=rawtext)
        all_seqs += content
        for cdr in content:
            # record repertoire i at most once per sequence
            # (replaces the original O(n) manual membership scan)
            membership = repertoire_dict.setdefault(cdr, [])
            if i not in membership:
                membership.append(i)
    num_clusters = len(input_repertoires)
    print(generate_threshold_plot(all_seqs, repertoire_dict, k, num_clusters,
                                  output_directory, title, step=.05,
                                  cluster_number_to_rep=cluster_number_to_rep))
def twins_threshold_analysis(output_directory, twinA1, twinA2, twinC1, twinC2,
                             twinD1, twinD2, beta=True):
    """Run a threshold analysis for each twin pair against the other two pairs.

    Each twin pair's pooled sequences are compared against the pooled
    sequences of the remaining two pairs via
    run_threshold_analysis_repertoire_lists().

    Parameters
    ----------
    output_directory : str
        Directory the per-pair plots are written to.
    twinA1, twinA2, twinC1, twinC2, twinD1, twinD2 : str
        Repertoire file paths; each pair of arguments is one twin pair.
    beta : bool
        If True the plot titles use the "Beta" suffix, otherwise "Alpha".
        (Both branches of the original ran identical calls; only the title
        suffix differed, so the duplication is collapsed here.)
    """
    twin_pair_A = txttoseqlist(twinA1, trim_first=True) + txttoseqlist(twinA2, trim_first=True)
    twin_pair_C = txttoseqlist(twinC1, trim_first=True) + txttoseqlist(twinC2, trim_first=True)
    twin_pair_D = txttoseqlist(twinD1, trim_first=True) + txttoseqlist(twinD2, trim_first=True)
    chain = "Beta" if beta else "Alpha"
    # run threshold analysis on each twin pair vs. the pooled other pairs
    run_threshold_analysis_repertoire_lists(
        output_directory, "TwA_" + chain,
        [twin_pair_A, twin_pair_C + twin_pair_D], k=1)
    run_threshold_analysis_repertoire_lists(
        output_directory, "TwC_" + chain,
        [twin_pair_C, twin_pair_A + twin_pair_D], k=1)
    run_threshold_analysis_repertoire_lists(
        output_directory, "TwD_" + chain,
        [twin_pair_D, twin_pair_A + twin_pair_C], k=1)
# NOTE(review): fragment — the enclosing definition and the names fig, plt,
# sns, self_scores, non_self_scores, output_directory, title and
# cluster_file_list are all defined outside this view; confirm against the
# full file before relying on these comments.
fig.savefig(output_directory + title + "_kde_updated.pdf")
plt.figure()  # new figure
# overlay cumulative KDE curves for the self vs. non-self score distributions
figure = sns.kdeplot(self_scores, shade=True, cumulative=True)
figure = sns.kdeplot(non_self_scores, shade=True, cumulative=True)
figure.set(ylim=(0, 1))  # cumulative plot, so y is bounded to [0, 1]
fig = figure.get_figure()
fig.savefig(output_directory + title + "_kde_cumulative_updated.pdf")
print("calculating self-scores...")
# calculate self-scores: each repertoire's unique sequences scored against
# the same repertoire via calculate_min_distances_within
all_self_scores = []
num_self_scores = 0
for file in cluster_file_list:
    cdrs = txttoseqlist(file)
    cdrs = set(cdrs)  # de-duplicate sequences before scoring
    cdrs = list(cdrs)
    scores = calculate_min_distances_within(cdrs, cdrs)
    all_self_scores += scores
    num_self_scores += len(scores)
print(all_self_scores)
print("calculating non-self scores...")
# calculate non-self scores over every ordered pair of distinct repertoires
all_non_self_scores = []
num_non_self_scores = 0
for file1 in cluster_file_list:
    for file2 in cluster_file_list:
        if file1 != file2:
            # NOTE(review): this loop body continues beyond the visible chunk
            cdrs1 = list(set(txttoseqlist(file1)))
"need the following arguments: output directory, threshold to cluster to, " "human/mice, which epitope to get the motifs of, data directory") print("human epitopes: BMLF1, p65, M1") print("mice epitopes: PB1, PB1-F2, NP, PA, M38, M45, m139") sys.exit() output_dir = sys.argv[1] distance = float(sys.argv[2]) human = str(sys.argv[3]) epitope = sys.argv[4] data_dir = sys.argv[5] # datasets/dash_data/ epitope_filename = data_dir + epitope + "_beta.txt" print(epitope_filename) Xlabels = txttoseqlist(epitope_filename, trim_first=True, remove_special=True) human_files = [ data_dir + "BMLF1_beta.txt", data_dir + "p65_beta.txt", data_dir + "M1_beta.txt" ] mice_files = [ data_dir + "PB1_beta.txt", data_dir + "PB1-F2_beta.txt", data_dir + "NP_beta.txt", data_dir + "PA_beta.txt", data_dir + "M38_beta.txt", data_dir + "M45_beta.txt", data_dir + "m139_beta.txt" ] other_files = [] if human.lower() == 'true': for file in human_files:
# NOTE(review): fragment — this sys.exit() closes an if-branch (likely a
# usage/argument-count check) whose header is outside this view; the else:
# below belongs to that same if. Confirm against the full file.
sys.exit()
else:
    # parse command-line arguments
    output_directory = sys.argv[1]
    cdr_type = sys.argv[2]
    repertoire_file_list = sys.argv[3].split(',')  # comma-separated paths
    all_seqs = []
    # read in the clusters
    seq_to_rep = {}  # sequence -> list of repertoire numbers containing it
    rep_to_seq = {}  # repertoire number -> that repertoire's sequence list
    i = 0
    for file in repertoire_file_list:
        i += 1  # repertoires are numbered from 1
        content = txttoseqlist(file, trim_first=False)
        print(file)
        print(len(content))
        rep_to_seq[i] = content
        all_seqs += content
        for cdr in content:
            # record repertoire i at most once per sequence
            if cdr in seq_to_rep:
                found = False
                for nums in seq_to_rep[cdr]:
                    if nums == i:
                        found = True
                if not found:
                    seq_to_rep[cdr].append(i)
            else:
                seq_to_rep[cdr] = [i]
def cluster_and_get_distance_scores(subject_file, subject_twin_file, other_files,
                                    clustering_distance, output_dir):
    """Cluster a subject, their twin, and unrelated individuals, then log
    cluster-to-cluster distance scores and write sequence motifs.

    Parameters
    ----------
    subject_file, subject_twin_file : str
        Paths to the subject's and the twin's repertoire files.
    other_files : list of str
        Paths to the unrelated individuals' repertoire files.
    clustering_distance : float
        Maximum intra-cluster distance passed to cluster_data().
    output_dir : str
        Directory that receives "output.txt" and the motif files.

    Side effects: writes output_dir + "output.txt", creates motif files via
    create_motif(), and prints progress to stdout.
    """
    # 'with' guarantees the log is flushed and closed; the original opened
    # the file and never closed it.
    with open(output_dir + "output.txt", mode='w') as log_file:
        # NOTE(review): [11:] strips a fixed-length path prefix — TODO confirm
        log_file.write(subject_file[11:])
        Xlabels = txttoseqlist(subject_file, trim_first=True, remove_special=True)
        Xlabelstwin = txttoseqlist(subject_twin_file, trim_first=True, remove_special=True)

        print("clustering subject and their twin...")
        clusters = cluster_data(Xlabels, max_distance=clustering_distance)
        clusters_twin = cluster_data(Xlabelstwin, max_distance=clustering_distance)
        print("clustered subject and their twin")

        # convert the cluster dicts to plain lists of clusters
        clusters_list = [clusters[key] for key in clusters]
        twin_clusters_list = [clusters_twin[key] for key in clusters_twin]

        print("clustering other individuals...")
        # cluster each of the other twins' repertoires
        other_clusterings = []
        for file in other_files:
            Xlabelsother = txttoseqlist(file, trim_first=True, remove_special=True)
            clusters2other = cluster_data(Xlabelsother, max_distance=clustering_distance)
            other_clusterings.append([clusters2other[key] for key in clusters2other])
        print("clustered others")

        print("getting scores...")
        scores = score_repertoire_twins(clusters_list, twin_clusters_list,
                                        other_clusterings, min_clust_size=4)
        print(scores)

        for n, score_data in enumerate(scores, start=1):
            log_file.write("\n\ncluster #" + str(n))
            # score_data layout (as consumed here): [0] = (score, closest twin
            # cluster), [1] = the subject cluster, [2:] = one entry per other
            # file with the score at index 0 and the cluster at index 2.
            cluster = score_data[1]
            twin_cluster_score = score_data[0][0]
            twin_cluster = score_data[0][1]
            log_file.write("\ncluster is :" + str(cluster))
            log_file.write("\nthe closest twin cluster is " + str(twin_cluster))
            log_file.write("\ntheir score is: " + str(twin_cluster_score))
            # create motifs for the subject cluster and its twin match
            create_motif(cluster, str(n) + "_subject", output_dir)
            create_motif(twin_cluster, str(n) + "_subject_twin", output_dir)

            # calculate average distance to other clusters
            other_scores = []
            for i, other_file in enumerate(other_files):
                data = score_data[i + 2]
                # NOTE(review): [22:] strips a fixed-length path prefix — TODO confirm
                repertoire = other_file[22:]
                other_score = data[0]
                other_cluster = data[2]
                log_file.write("\nscore with " + str(repertoire) + " is " + str(other_score))
                log_file.write("\ncluster: " + str(other_cluster))
                create_motif(other_cluster, str(n) + "_" + str(repertoire), output_dir)
                other_scores.append(other_score)
            # guard: the original divided by the count and took min() even when
            # other_files was empty, which would raise
            if other_scores:
                avg_score = sum(other_scores) / len(other_scores)
                log_file.write("\naverage score to another cluster is " + str(avg_score))
                log_file.write("\nthe closest score to another cluster is " + str(min(other_scores)))
            log_file.write(' ')