def run_threshold_analysis(output_directory, title, input_repertoires, trim_first=True, rawtext=True, k=1):
    """Read repertoire files and plot a threshold analysis over all sequences.

    Builds a mapping from each CDR sequence to the (1-based) indices of the
    repertoires that contain it, then delegates plotting to
    generate_threshold_plot and prints its return value.

    Args:
        output_directory: directory where the plot is written.
        title: title/label used for the plot output.
        input_repertoires: list of repertoire file paths.
        trim_first: forwarded to txttoseqlist.
        rawtext: forwarded to txttoseqlist.
        k: forwarded to generate_threshold_plot.
    """
    all_seqs = []

    # read in the clusters
    repertoire_dict = {}        # CDR sequence -> list of repertoire indices containing it
    cluster_number_to_rep = {}  # repertoire index (1-based) -> source file path

    for i, file in enumerate(input_repertoires, start=1):
        cluster_number_to_rep[i] = file
        content = txttoseqlist(file, trim_first=trim_first, rawtext=rawtext)
        all_seqs += content
        for cdr in content:
            # record that repertoire i contains this CDR, without duplicates
            if cdr not in repertoire_dict:
                repertoire_dict[cdr] = [i]
            elif i not in repertoire_dict[cdr]:
                repertoire_dict[cdr].append(i)

    num_clusters = len(input_repertoires)

    print(generate_threshold_plot(all_seqs, repertoire_dict, k, num_clusters,
                                  output_directory, title, step=.05,
                                  cluster_number_to_rep=cluster_number_to_rep))
def twins_threshold_analysis(output_directory,
                             twinA1,
                             twinA2,
                             twinC1,
                             twinC2,
                             twinD1,
                             twinD2,
                             beta=True):
    """Run one-vs-rest threshold analysis for three twin pairs (A, C, D).

    Each twin pair's repertoire is the concatenation of its two individual
    files; every pair is then analysed against the union of the other two
    pairs via run_threshold_analysis_repertoire_lists.

    Args:
        output_directory: directory where results are written.
        twinA1, twinA2, twinC1, twinC2, twinD1, twinD2: repertoire file
            paths, two per twin pair.
        beta: if True the outputs are labelled "*_Beta", otherwise
            "*_Alpha"; the computation itself is identical either way.
    """

    # concatenate both individuals of each twin pair into one repertoire
    twin_pair_A = txttoseqlist(twinA1, trim_first=True) + txttoseqlist(
        twinA2, trim_first=True)
    twin_pair_C = txttoseqlist(twinC1, trim_first=True) + txttoseqlist(
        twinC2, trim_first=True)
    twin_pair_D = txttoseqlist(twinD1, trim_first=True) + txttoseqlist(
        twinD2, trim_first=True)

    # run threshold analysis on each twin pair
    if beta:
        run_threshold_analysis_repertoire_lists(
            output_directory,
            "TwA_Beta", [twin_pair_A, twin_pair_C + twin_pair_D],
            k=1)
        run_threshold_analysis_repertoire_lists(
            output_directory,
            "TwC_Beta", [twin_pair_C, twin_pair_A + twin_pair_D],
            k=1)
        run_threshold_analysis_repertoire_lists(
            output_directory,
            "TwD_Beta", [twin_pair_D, twin_pair_A + twin_pair_C],
            k=1)
    else:
        run_threshold_analysis_repertoire_lists(
            output_directory,
            "TwA_Alpha", [twin_pair_A, twin_pair_C + twin_pair_D],
            k=1)
        run_threshold_analysis_repertoire_lists(
            output_directory,
            "TwC_Alpha", [twin_pair_C, twin_pair_A + twin_pair_D],
            k=1)
        run_threshold_analysis_repertoire_lists(
            output_directory,
            "TwD_Alpha", [twin_pair_D, twin_pair_A + twin_pair_C],
            k=1)
    # NOTE(review): everything below references names that are not defined
    # in this function (fig, title, plt, sns, self_scores, non_self_scores).
    # This looks like a fragment of a different plotting function pasted in
    # here; reaching these lines raises NameError. Confirm against the
    # original source and remove or relocate.
    fig.savefig(output_directory + title + "_kde_updated.pdf")

    plt.figure()  # new figure
    figure = sns.kdeplot(self_scores, shade=True, cumulative=True)
    figure = sns.kdeplot(non_self_scores, shade=True, cumulative=True)
    figure.set(ylim=(0, 1))
    fig = figure.get_figure()
    fig.savefig(output_directory + title + "_kde_cumulative_updated.pdf")


print("calculating self-scores...")
# calculate self-scores: for each repertoire file, the minimum distances of
# each deduplicated sequence to the other sequences from the SAME file
all_self_scores = []
num_self_scores = 0
for file in cluster_file_list:
    cdrs = txttoseqlist(file)
    # deduplicate, then convert back to a list for the distance helper
    cdrs = set(cdrs)
    cdrs = list(cdrs)
    scores = calculate_min_distances_within(cdrs, cdrs)
    all_self_scores += scores
    num_self_scores += len(scores)
print(all_self_scores)

print("calculating non-self scores...")
# calculate non-self scores: compare every ordered pair of DISTINCT files
all_non_self_scores = []
num_non_self_scores = 0
for file1 in cluster_file_list:
    for file2 in cluster_file_list:
        if file1 != file2:
            # NOTE(review): the source fragment is truncated here — the
            # remainder of the inner-loop body is missing from this file.
            cdrs1 = list(set(txttoseqlist(file1)))
# ---- Example #4 (scraped-page marker, original text "Exemple #4", vote count 0;
# the fragments on either side of this marker are truncated) ----
        "need the following arguments: output directory, threshold to cluster to, "
        "human/mice, which epitope to get the motifs of, data directory")
    print("human epitopes: BMLF1, p65, M1")
    print("mice epitopes: PB1, PB1-F2, NP, PA, M38, M45, m139")
    sys.exit()

# parse command-line arguments (see the usage message printed above)
output_dir = sys.argv[1]
distance = float(sys.argv[2])  # clustering distance threshold
human = str(sys.argv[3])       # "true"/"false": choose human or mice epitope files
epitope = sys.argv[4]          # epitope name, e.g. "BMLF1" or "PB1"
data_dir = sys.argv[5]  # datasets/dash_data/

# repertoire file for the requested epitope (beta chain)
epitope_filename = data_dir + epitope + "_beta.txt"

print(epitope_filename)
Xlabels = txttoseqlist(epitope_filename, trim_first=True, remove_special=True)

# candidate "other" repertoires, split by species
human_files = [
    data_dir + "BMLF1_beta.txt", data_dir + "p65_beta.txt",
    data_dir + "M1_beta.txt"
]
mice_files = [
    data_dir + "PB1_beta.txt", data_dir + "PB1-F2_beta.txt",
    data_dir + "NP_beta.txt", data_dir + "PA_beta.txt",
    data_dir + "M38_beta.txt", data_dir + "M45_beta.txt",
    data_dir + "m139_beta.txt"
]

other_files = []
if human.lower() == 'true':
    for file in human_files:
# ---- Example #5 (scraped-page marker, original text "Exemple #5", vote count 0;
# the fragments on either side of this marker are truncated) ----
    sys.exit()
else:
    output_directory = sys.argv[1]
    cdr_type = sys.argv[2]
    repertoire_file_list = sys.argv[3].split(',')


all_seqs = []

# read in the clusters
seq_to_rep = {}  # CDR sequence -> list of repertoire indices containing it
rep_to_seq = {}  # repertoire index (1-based) -> list of sequences in that file
i = 0            # keeps its original meaning (count of files read) after the loop
for i, file in enumerate(repertoire_file_list, start=1):
    content = txttoseqlist(file, trim_first=False)
    print(file)
    print(len(content))
    rep_to_seq[i] = content
    all_seqs += content
    for cdr in content:
        # record that repertoire i contains this CDR, without duplicates
        if cdr not in seq_to_rep:
            seq_to_rep[cdr] = [i]
        elif i not in seq_to_rep[cdr]:
            seq_to_rep[cdr].append(i)
# ---- Example #6 (scraped-page marker, original text "Exemple #6", vote count 0;
# the fragments on either side of this marker are truncated) ----
def cluster_and_get_distance_scores(subject_file, subject_twin_file,
                                    other_files, clustering_distance,
                                    output_dir):
    """Cluster a subject, their twin, and several unrelated individuals,
    then score each subject cluster against the twin and the others.

    For every scored subject cluster this writes a section to
    output_dir + "output.txt" and emits sequence motifs via create_motif
    for the subject cluster, the closest twin cluster, and the closest
    cluster from each other individual.

    Args:
        subject_file: path to the subject's repertoire file.
        subject_twin_file: path to the subject's twin's repertoire file.
        other_files: list of repertoire file paths for unrelated individuals.
        clustering_distance: max distance passed to cluster_data.
        output_dir: output directory; must end with a path separator since
            filenames are built by string concatenation.
    """
    # NOTE(review): log_file is never closed in the code visible here —
    # consider `with open(...)` if the function ends below.
    log_file = open(output_dir + "output.txt", mode='w')
    # [11:] presumably strips a fixed-length directory prefix from the
    # path — TODO confirm against the caller's path layout.
    log_file.write(subject_file[11:])

    Xlabels = txttoseqlist(subject_file, trim_first=True, remove_special=True)
    Xlabelstwin = txttoseqlist(subject_twin_file,
                               trim_first=True,
                               remove_special=True)

    print("clustering subject and their twin...")

    clusters = cluster_data(Xlabels, max_distance=clustering_distance)
    clusters_twin = cluster_data(Xlabelstwin, max_distance=clustering_distance)

    print("clustered subject and their twin")

    # convert clusters to list of clusters
    clusters_list = []
    for key in clusters:
        clusters_list.append(clusters[key])

    twin_clusters_list = []
    for key in clusters_twin:
        twin_clusters_list.append(clusters_twin[key])

    print("clustering other individuals...")

    other_clusterings = []
    # get the other twins reperotires
    for file in other_files:
        Xlabelsother = txttoseqlist(file, trim_first=True, remove_special=True)
        clusters2other = cluster_data(Xlabelsother,
                                      max_distance=clustering_distance)

        other_clusters = []
        for key in clusters2other:
            other_clusters.append(clusters2other[key])

        other_clusterings.append(other_clusters)

    print("clustered others")
    print("getting scores...")

    # get the scores; clusters smaller than min_clust_size are skipped
    scores = score_repertoire_twins(clusters_list,
                                    twin_clusters_list,
                                    other_clusterings,
                                    min_clust_size=4)
    print(scores)

    n = 1  # 1-based cluster counter used in log labels and motif filenames

    for score_data in scores:
        log_file.write("\n\ncluster #" + str(n))
        # print(score_data)
        # print("parsing data...\n")

        # parse score data: [0] is (closest twin cluster score, cluster),
        # [1] is the subject cluster, [2:] are per-other-individual entries
        cluster = score_data[1]
        twin_cluster = score_data[0][1]
        twin_cluster_score = score_data[0][0]
        log_file.write("\ncluster is :" + str(cluster))
        log_file.write("\nthe closest twin cluster is " + str(twin_cluster))
        log_file.write("\ntheir score is: " + str(twin_cluster_score))

        # create motifs
        create_motif(cluster, str(n) + "_subject", output_dir)
        create_motif(twin_cluster, str(n) + "_subject_twin", output_dir)

        # calculate average distance to other clusters
        total_dist = 0
        count = 0
        other_scores = []

        for i in range(len(other_files)):
            data = score_data[i + 2]
            # [22:] presumably strips a fixed-length path prefix to get a
            # short repertoire label — TODO confirm path layout.
            repertoire = other_files[i][22:]
            other_score = data[0]
            other_cluster = data[2]
            log_file.write("\nscore with " + str(repertoire) + " is " +
                           str(other_score))
            log_file.write("\ncluster: " + str(other_cluster))

            create_motif(other_cluster,
                         str(n) + "_" + str(repertoire), output_dir)

            total_dist += other_score
            count += 1
            other_scores.append(other_score)

        # NOTE(review): raises ZeroDivisionError / ValueError on min() if
        # other_files is empty — confirm callers always pass at least one.
        avg_score = total_dist / count
        log_file.write("\naverage score to another cluster is " +
                       str(avg_score))
        log_file.write("\nthe closest score to another cluster is " +
                       str(min(other_scores)))

        log_file.write(' ')

        n += 1