Code example #1
def get_invariants_under_ignored_edge_ends(seq_to_acc_list_sorted, params):
    if params.nr_cores == 1:
        best_edit_distances = get_nearest_neighbors(seq_to_acc_list_sorted, 0, 0, seq_to_acc_list_sorted, params.neighbor_search_depth, params.ignore_ends_len)

        # implement check here to see that all seqs got a nearest_neighbor; if not, print which ones did not get a nearest_neighbor computed

    else:
        ####### parallelize alignment #########
        # Ignore SIGINT while the worker processes are forked so that Ctrl-C is
        # only delivered to the parent process, then restore the original handler.
        original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        pool = Pool(processes=mp.cpu_count())
        signal.signal(signal.SIGINT, original_sigint_handler)

        # here we split the input into chunks
        chunk_size = max(int(len(seq_to_acc_list_sorted) / (10*mp.cpu_count())), 20 )
        ref_seq_chunks = [ ( max(0, i - params.neighbor_search_depth -1), seq_to_acc_list_sorted[max(0, i - params.neighbor_search_depth -1) : i + chunk_size + params.neighbor_search_depth +1 ]) for i in range(0, len(seq_to_acc_list_sorted), chunk_size) ]
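        # Each reference chunk overlaps its query chunk by neighbor_search_depth (+1)
        # sequences on both sides, so a query can still find its nearest neighbors
        # even when they fall just outside its own chunk.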
        chunks = [(i, seq_to_acc_list_sorted[i:i + chunk_size]) for i in range(0, len(seq_to_acc_list_sorted), chunk_size)] 

        if params.verbose:
            write_output.logger(str([j for j, ch in ref_seq_chunks]), params.develop_logfile, timestamp=False)
            write_output.logger("reference chunks:" + str([len(ch) for j,ch in ref_seq_chunks]), params.develop_logfile, timestamp=False)
            # print([j for j, ch in ref_seq_chunks])
            # print("reference chunks:", [len(ch) for j,ch in ref_seq_chunks])
            write_output.logger(str([i for i,ch in chunks]), params.develop_logfile, timestamp=False)
            write_output.logger("query chunks:" + str([len(ch) for i,ch in chunks]), params.develop_logfile, timestamp=False)

            print([i for i,ch in chunks])
            print("query chunks:", [len(ch) for i,ch in chunks])

        # get_nearest_neighbors takes three sub containers:
        #  chunk - a container with (sequence, accession)-tuples to be aligned (queries)
        #  ref_seq_chunks - a container with (sequence, accession)-tuples to be aligned to (references)
        #  already_converged_chunks - a set of query sequences that have already converged

        try:
            res = pool.map_async(get_nearest_neighbors_helper, [ ((chunks[i][1],  chunks[i][0], chunks[i][0] - ref_seq_chunks[i][0], ref_seq_chunks[i][1], params.neighbor_search_depth, params.ignore_ends_len), {}) for i in range(len(chunks))] )
            best_edit_distances_results = res.get(999999999)  # Without the timeout this blocking call ignores all signals.
        except KeyboardInterrupt:
            print("Caught KeyboardInterrupt, terminating workers")
            pool.terminate()
            sys.exit()
        else:
            # print("Normal termination")
            pool.close()
        pool.join()
        best_edit_distances = {}
        for sub_graph in best_edit_distances_results:
            for seq in sub_graph:
                assert seq not in best_edit_distances
            best_edit_distances.update(sub_graph)
        
    # store only invariants here, i.e., edit distance 0 when ignoring ends!
    for acc1 in list(best_edit_distances.keys()):
        for acc2 in list(best_edit_distances[acc1].keys()):
            if best_edit_distances[acc1][acc2] != 0:
                del best_edit_distances[acc1][acc2]

    return best_edit_distances
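
The parallel branch above follows a common SIGINT-safe multiprocessing pattern: ignore SIGINT while the pool is created so the workers inherit that handler, restore it in the parent, and use map_async with a very long timeout so the blocking get() still responds to Ctrl-C. The sketch below is a simplified, stand-alone illustration of the same pattern; the work() function and run_chunked() name are placeholders and not part of the original module.

import multiprocessing as mp
import signal
import sys
from multiprocessing import Pool


def work(chunk):
    # Placeholder payload; stands in for get_nearest_neighbors_helper above.
    return [x * x for x in chunk]


def run_chunked(data, chunk_size=100):
    # Ignore SIGINT while the workers are forked so they inherit the "ignore"
    # handler, then restore the original handler in the parent process.
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    pool = Pool(processes=mp.cpu_count())
    signal.signal(signal.SIGINT, original_sigint_handler)

    chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
    try:
        res = pool.map_async(work, chunks)
        # Without the timeout this blocking call would ignore all signals.
        results = res.get(999999999)
    except KeyboardInterrupt:
        pool.terminate()
        sys.exit()
    else:
        pool.close()
    pool.join()
    return [item for chunk_result in results for item in chunk_result]
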
Code example #2
def find_candidate_transcripts(read_file, params):
    """
        input: a string pointing to a fasta file
        output: a string containing a path to a fasta formatted file with consensus_id_support as accession 
                    and the sequence as the read
    """
    if params.is_fastq:
        S = {
            acc: seq
            for (acc, seq, qual) in fastq_parser.readfq(open(read_file, 'r'))
        }
        ccs_dict = {}
    else:
        S = {
            acc: seq
            for (acc, seq) in fasta_parser.read_fasta(open(read_file, 'r'))
        }

        # using quality values for correction is still in beta; deactivated for now
        if False:  #params.ccs:
            ccs_file = pysam.AlignmentFile(params.ccs, "rb", check_sq=False)
            ccs_dict_raw = ccs_info.get_ccs(ccs_file)
            X_ids = {x_acc.split("/")[1]: x_acc for x_acc in S}
            ccs_dict = ccs_info.modify_strings_and_acc(ccs_dict_raw, X_ids, S)
            for x_acc in S:
                assert S[x_acc] == ccs_dict[x_acc].seq
        else:
            ccs_dict = {}

    step = 1
    print()
    print("ITERATION:", step)
    print()

    lengths = [len(seq) for seq in S.values()]
    C = Counter(lengths)

    if params.verbose:
        for l in sorted(C.keys()):
            write_output.logger("seq length {0}: {1} occurrences".format(
                l, C[l]),
                                params.develop_logfile,
                                timestamp=False)

    # print(sorted(lengths))
    max_len = max(lengths)
    min_len = min(lengths)
    print("Max transcript length:{0}, Min transcript length:{1}".format(
        max_len, min_len))
    exon_filtered = set()

    seq_to_acc = get_unique_seq_accessions(S)

    nearest_neighbor_start = time()
    G_star, graph_partition, M, converged = partitions.partition_strings(
        S, params)
    partition_alignments = get_partition_alignments(graph_partition, M, G_star,
                                                    exon_filtered, params)

    nearest_neighbor_elapsed = time() - nearest_neighbor_start
    write_output.logger(
        'Time for nearest_neighbors and partition, step 1:{0}'.format(
            str(nearest_neighbor_elapsed)), params.logfile)

    prev_edit_distances_2steps_ago = [2**28, 2**28, 2**28]  # prevents 2-cycles
    prev_edit_distances = [2**28]

    # homopolymer_mode = False

    while not converged:
        correction_start = time()
        edit_distances = [
            partition_alignments[s1][s2][0] for s1 in partition_alignments
            for s2 in partition_alignments[s1]
        ]
        edit_distances.sort()
        if params.verbose:
            print("edit distances from SSW:", edit_distances)

        ###### Different convergence criteria #########

        if prev_edit_distances_2steps_ago == edit_distances:
            # Only cyclic alignments are left; these are reads that jump between two optimal alignments of two different
            # target sequences. This is a product of our fast heuristics of defining a minmap score + SSW filtering to choose the best alignment
            print("CYCLE!!!")
            assert len(partition_alignments) == len(M)
            break
            # if homopolymer_mode:
            #     break
            # else:
            #     homopolymer_mode = True

        if sum(edit_distances) > sum(prev_edit_distances) and max(
                edit_distances) > max(prev_edit_distances):
            # return here if some sequence is alternating between best alignments and gets corrected and re-corrected to different candidate sequences
            assert len(partition_alignments) == len(M)
            print("exiting here!")
            break
            # if homopolymer_mode:
            #     print("exiting here!")
            #     break
            # else:
            #     homopolymer_mode = True

        has_converged = [ed == 0 for ed in edit_distances]
        if all(has_converged):
            # we return here if the data set contains isolated nodes.
            assert len(partition_alignments) == len(M)
            print("Normal convergence")
            break
            # if homopolymer_mode:
            #     print("Normal convergence")
            #     break
            # else:
            #     homopolymer_mode = True
        #######################################################

        S_prime, S_prime_quality_vector = correction_module.correct_strings(
            partition_alignments,
            seq_to_acc,
            ccs_dict,
            step,
            nr_cores=params.nr_cores,
            verbose=params.verbose)

        for acc, s_prime in S_prime.items():
            S[acc] = s_prime
            if ccs_dict:
                ccs_dict[acc].qual = S_prime_quality_vector[acc]

        print("Tot seqs:", len(S))
        seq_to_acc = get_unique_seq_accessions(S)
        step += 1
        print()
        print("ITERATION:", step)
        print()
        print(
            "Total number of unique reads put aside because of exon differences to all other strings:",
            len(exon_filtered))
        S_to_align = {
            acc: seq
            for acc, seq in S.items() if seq not in exon_filtered
        }
        G_star, graph_partition, M, converged = partitions.partition_strings(
            S_to_align, params)
        partition_alignments = get_partition_alignments(
            graph_partition, M, G_star, exon_filtered, params)
        out_file_name = os.path.join(params.outfolder,
                                     "candidates_step_" + str(step) + ".fa")
        out_file = open(out_file_name, "w")
        for i, m in enumerate(partition_alignments):
            N_t = sum([
                container_tuple[3]
                for s, container_tuple in partition_alignments[m].items()
            ])
            out_file.write(">{0}\n{1}\n".format(
                "read" + str(i) + "_support_" + str(N_t), m))

        prev_edit_distances_2steps_ago = prev_edit_distances
        prev_edit_distances = edit_distances

        correction_elapsed = time() - correction_start
        write_output.logger(
            'Time for correction, nearest_neighbors and partition, step {0}:{1}'
            .format(step, str(correction_elapsed)), params.logfile)

        # sys.exit()

    ######################
    ###### NEW ###########
    c_seq_to_read_acc = {}
    for read_acc, seq in S.items():
        if seq in c_seq_to_read_acc:
            c_seq_to_read_acc[seq].append(read_acc)
        else:
            c_seq_to_read_acc[seq] = [read_acc]

    c_acc_to_seq = {}
    c_acc_to_support = {}
    for i, m in enumerate(sorted(c_seq_to_read_acc)):
        if m in partition_alignments:
            N_t = partition_alignments[m][m][
                3]  #sum([container_tuple[3] for s, container_tuple in partition_alignments[m].items()])
        else:
            N_t = 1  # did not converge

        c_acc = "transcript_" + str(i) + "_support_" + str(N_t)
        c_acc_to_seq[c_acc] = m
        c_acc_to_support[c_acc] = N_t

    if params.ignore_ends_len > 0:
        remaining_c_after_invariant = end_invariant_functions.collapse_candidates_under_ends_invariant(
            c_acc_to_seq, c_acc_to_support, params)
        # print(remaining_c_after_invariant)
        # sys.exit()
        for c_acc in remaining_c_after_invariant:
            c_seq = c_acc_to_seq[c_acc]
            for removed_c_acc in remaining_c_after_invariant[c_acc]:
                removed_c_seq = c_acc_to_seq[removed_c_acc]
                reads_to_removed_c_acc = c_seq_to_read_acc[removed_c_seq]

                for read_acc in reads_to_removed_c_acc:
                    c_seq_to_read_acc[c_seq].append(read_acc)

                del c_acc_to_seq[removed_c_acc]
                del c_acc_to_support[removed_c_acc]
                del c_seq_to_read_acc[removed_c_seq]

    if params.is_fastq:
        original_reads = {
            acc: seq
            for (acc, seq, qual) in fastq_parser.readfq(open(read_file, 'r'))
        }
    else:
        original_reads = {
            acc: seq
            for (acc, seq) in fasta_parser.read_fasta(open(read_file, 'r'))
        }

    # original_reads = {acc: seq for (acc, seq) in  fasta_parser.read_fasta(open(read_file, 'r'))}
    print(len(S), len(original_reads))
    assert len(S) == len(original_reads)

    # to_realign = {}
    for c_acc in list(c_acc_to_seq.keys()):
        support = c_acc_to_support[c_acc]
        if support < params.min_candidate_support:
            c_seq = c_acc_to_seq[c_acc]
            if params.verbose:
                print(
                    "nearest_neighbor did not pass threshold. It had support of {0} reads."
                    .format(support))
                print(c_seq_to_read_acc[c_seq])
            del c_acc_to_seq[c_acc]
            del c_seq_to_read_acc[c_seq]
            del c_acc_to_support[c_acc]
            # for read_acc in c_seq_to_read_acc[c_seq]:
            #     to_realign[read_acc] = original_reads[read_acc]

    all_reads_assigned_to_candidates = set([
        read_acc for c_seq in c_seq_to_read_acc
        for read_acc in c_seq_to_read_acc[c_seq]
    ])
    unassigned_reads = set(
        original_reads.keys()) - all_reads_assigned_to_candidates
    to_realign = {
        read_acc: original_reads[read_acc]
        for read_acc in unassigned_reads
    }
    print("Reads assigned to candididates:",
          len(all_reads_assigned_to_candidates))
    print("Reads to realign:", len(to_realign))
    print("Number of initial reads:", len(original_reads))

    candidates_file_name = os.path.join(params.outfolder,
                                        "candidates_converged.fa")
    write_output.print_candidates_from_nearest_neighbors(
        candidates_file_name, c_acc_to_seq, params)
    # sys.exit()
    not_converged_reads = open(
        os.path.join(params.outfolder, "not_converged.fa"), "w")
    not_converged_reads.close()
    assert len(to_realign) + len(all_reads_assigned_to_candidates) == len(
        original_reads)
    assert len(c_acc_to_seq) == len(c_seq_to_read_acc)
    c_to_reads = {}
    for c_acc, c_seq in c_acc_to_seq.items():
        c_to_reads[c_acc] = {}
        for read_acc in c_seq_to_read_acc[c_seq]:
            c_to_reads[c_acc][read_acc] = (c_seq, original_reads[read_acc])

    c_to_reads_edit_distances = edlib_align_sequences_keeping_accession(
        c_to_reads, nr_cores=params.nr_cores)
    print(
        "Total reads in partition (assigned reads after edlib):",
        len([
            1 for c_acc in c_to_reads_edit_distances
            for read_acc in c_to_reads_edit_distances[c_acc]
        ]))
    read_partition = sw_align_sequences_keeping_accession(
        c_to_reads_edit_distances, nr_cores=params.nr_cores)
    filtered_reads = functions.filter_exon_differences(read_partition,
                                                       params.min_exon_diff,
                                                       params.ignore_ends_len)
    print(
        "DEVELOP: Number of read to candidate assignments removed because of exon differences: ",
        len(filtered_reads))
    # sys.exit()
    for read_acc in filtered_reads:
        to_realign[read_acc] = original_reads[read_acc]

    print(
        "Total reads in partition (assigned reads after SW):",
        len([
            1 for c_acc in read_partition for read_acc in read_partition[c_acc]
        ]))
    return candidates_file_name, read_partition, to_realign
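
The correction loop in find_candidate_transcripts stops on three conditions: the per-step edit-distance lists repeat with period two (a 2-cycle), both the total and the maximum edit distance grow (corrections are being undone), or all edit distances reach zero. Below is a minimal stand-alone sketch of that logic; the helper name is hypothetical and not part of the original code.

def convergence_reason(edit_distances, prev, prev_2steps_ago):
    # Reads alternating between two equally good targets produce period-2 repeats.
    if prev_2steps_ago == edit_distances:
        return "cycle"
    # Both the total and the maximum error grew: corrections are being undone.
    if sum(edit_distances) > sum(prev) and max(edit_distances) > max(prev):
        return "diverging"
    # Every read is now identical to its assigned candidate.
    if all(ed == 0 for ed in edit_distances):
        return "converged"
    return None
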
Code example #3
def stat_filter_candidates(read_file, candidate_file, read_partition,
                           to_realign, params):
    modified = True

    ############ GET READS AND CANDIDATES #################
    X_original = {
        acc: seq
        for (acc, seq) in fasta_parser.read_fasta(open(read_file, 'r'))
    }
    print("Total original reads", len(X_original))
    x_assigned_to_cluster = set(
        [x_acc for c_acc in read_partition for x_acc in read_partition[c_acc]])
    X = {
        acc: seq
        for (acc, seq) in X_original.items()
        if acc in x_assigned_to_cluster or acc in to_realign
    }  # just set X to read_partition + to_realign here

    print("Original reads in fasta file:", len(X_original))
    print("Reads included in statistical testing:", len(X))
    if os.stat(candidate_file).st_size == 0:
        out_file_name = os.path.join(params.outfolder, "final_candidates.fa")
        tsv_info = os.path.join(params.outfolder, "cluster_info.tsv")
        write_output.print_candidates(out_file_name, {}, {}, {}, {},
                                      params,
                                      final=True,
                                      reads_to_consensus_tsv=tsv_info)
        print("Candidate file is empty!")
        sys.exit(0)
    else:
        C = {
            acc: seq
            for (acc,
                 seq) in fasta_parser.read_fasta(open(candidate_file, 'r'))
        }

    ################################################################

    ### IF CCS file is provided ####
    if params.ccs:
        ccs_file = pysam.AlignmentFile(params.ccs, "rb", check_sq=False)
        ccs_dict_raw = ccs_info.get_ccs(ccs_file)
        X_ids = {x_acc.split("/")[1]: x_acc for x_acc in X}
        ccs_dict = ccs_info.modify_strings_and_acc(ccs_dict_raw, X_ids, X)
        for x_acc in X:
            assert X[x_acc] == ccs_dict[x_acc].seq

    else:
        ccs_dict = {}

    ################################

    print()
    print("STARTING STATISTICAL TESTING")
    print()
    print("Number of reads to realign:", len(to_realign))
    step = 1
    prefilter = True
    previous_partition_of_X = copy.deepcopy(
        read_partition)  #{ c_acc : set() for c_acc in C.keys()}
    previous_components = {c_acc: set() for c_acc in C.keys()}
    previous_edges = {c_acc: set() for c_acc in C.keys()}
    significance_values = {}
    realignment_to_avoid_local_max = 0
    remaining_to_align_read_file = os.path.join(params.outfolder,
                                                "remaining_to_align.fa")

    while modified:
        statistical_start = time()

        modified = False
        print()
        print("STEP NR: {0}".format(step))
        print()
        ########### Write current candidates to file ##########
        temp_candidate_name = os.path.join(
            params.outfolder, "temp_candidates_step_{0}.fa".format(step))
        temp_candidate_file = open(temp_candidate_name, "w")

        for c_acc, c_seq in C.items():
            temp_candidate_file.write(">{0}\n{1}\n".format(c_acc, c_seq))
        temp_candidate_file.close()
        #######################################################

        if params.verbose:
            for c_acc in read_partition:
                print(
                    c_acc, "has {0} reads assigned to it.".format(
                        len(read_partition[c_acc])))

        ############ GET READ SUPPORT AND ALIGNMENTS #################

        if realignment_to_avoid_local_max == 1:
            print("REALIGNING EVERYTHING FINAL STEP")
            to_realign = X
            read_partition = {c_acc: {} for c_acc in C.keys()}

        if to_realign:
            print(len(to_realign), "reads to realign.")
            write_output.print_reads(remaining_to_align_read_file, to_realign)
            # align reads that are not yet assigned to a candidate here
            G_star_rem, partition_of_realigned_reads = partitions.partition_strings_2set(
                to_realign, C, remaining_to_align_read_file,
                temp_candidate_file.name, params)
            reassigned_reads_to_candidates = {}
            for c_acc in partition_of_realigned_reads:
                reassigned_reads_to_candidates[c_acc] = {}
                for read_acc in partition_of_realigned_reads[c_acc]:
                    reassigned_reads_to_candidates[c_acc][read_acc] = (
                        C[c_acc], X[read_acc])

            edit_distances_of_c_to_reads = edlib_align_sequences_keeping_accession(
                reassigned_reads_to_candidates, nr_cores=params.nr_cores)
            alignments_of_c_to_reads = sw_align_sequences_keeping_accession(
                edit_distances_of_c_to_reads, nr_cores=params.nr_cores)
            # structure: read_partition[c_acc][read_acc] = (c_aln, read_aln, (matches, mismatches, indels))

            ##################################
            ssw_temp = [
                alignments_of_c_to_reads[c_acc][read_acc]
                for c_acc in alignments_of_c_to_reads
                for read_acc in alignments_of_c_to_reads[c_acc]
            ]
            pattern = r"[-]{{{min_exon_diff},}}".format(
                min_exon_diff=str(params.min_exon_diff))  # r"[-]{20,}"
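            # The pattern matches a run of at least params.min_exon_diff consecutive
            # gap characters ('-') in an alignment, i.e. a putative exon-level difference.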
            for c_acc in list(alignments_of_c_to_reads.keys()):
                for read_acc in list(alignments_of_c_to_reads[c_acc].keys()):
                    c_alignment, read_alignment, (
                        matches, mismatches,
                        indels) = alignments_of_c_to_reads[c_acc][read_acc]
                    missing_exon_s1 = re.search(pattern, c_alignment)
                    missing_exon_s2 = re.search(pattern, read_alignment)
                    if missing_exon_s1:
                        del alignments_of_c_to_reads[c_acc][read_acc]
                    elif missing_exon_s2:
                        del alignments_of_c_to_reads[c_acc][read_acc]
            ssw_after_exon_temp = [
                alignments_of_c_to_reads[c_acc][read_acc]
                for c_acc in alignments_of_c_to_reads
                for read_acc in alignments_of_c_to_reads[c_acc]
            ]
            print(
                "Number of alignments that were removed before statistical test because best match to candidate had exon difference larger than {0}bp: {1} "
                .format(str(params.min_exon_diff),
                        len(ssw_temp) - len(ssw_after_exon_temp)))
            #################################

            # add reads to best candidate given new alignments
            for c_acc in alignments_of_c_to_reads:
                for read_acc in alignments_of_c_to_reads[c_acc]:
                    read_partition[c_acc][read_acc] = alignments_of_c_to_reads[
                        c_acc][read_acc]

            for c_acc in list(read_partition.keys()):
                if len(read_partition[c_acc]) == 0:
                    print(c_acc, "removed as it has no supporting reads")
                    del C[c_acc]
                    del read_partition[c_acc]
                else:
                    if params.verbose:
                        print(
                            c_acc,
                            "Now has {0} reads assigned to it, after aligning reads that are not assigned."
                            .format(len(read_partition[c_acc])))

            # add the alignments to alignment structure
            # for x_acc in remaining_alignments_of_x_to_c.keys():
            #     alignments_of_x_to_c[x_acc] = remaining_alignments_of_x_to_c[x_acc]

        # C_seq_to_acc = {seq : acc for acc, seq in C.items()}
        ################################################################

        # check_exon_diffs(alignments_of_x_to_c, params)

        ############# GET THE CLOSEST HIGHEST SUPPORTED REFERENCE TO TEST AGAINST FOR EACH CANDIDATE ############

        if params.ignore_ends_len > 0:
            nearest_neighbor_graph = end_invariant_functions.get_nearest_neighbors_graph_under_ignored_ends(
                C, params)
        else:
            nearest_neighbor_graph = get_nearest_neighbor_graph(C)

        # print("EXTRA EDGES FROM HOMOPOLYMER IDENTICAL:", homopol_extra_added)

        ## save time if the nearest_neighbor and all candidates in a component have identical reads assigned to them as in the previous step
        # or heuristically: if a candidate has more than 2x more support than the reference itself (will give a highly significant p-value anyway), to save computation time
        # Since the input data is the same, the test is guaranteed to give the same significance values as in the previous step

        previous_significance_values = {}
        # print(nearest_neighbor_graph)
        for c_acc in list(nearest_neighbor_graph.keys()):
            # skip testing candidates with more reads than their respective references, because it is redundant computation that will lead to significant values anyway
            for t_acc in list(nearest_neighbor_graph[c_acc].keys()):
                if len(read_partition[c_acc]) >= params.min_test_ratio * len(
                        read_partition[t_acc]):
                    if params.verbose:
                        print(
                            "skipping test for dominant candidate {0} to ref {1}"
                            .format(c_acc, t_acc))
                    del nearest_neighbor_graph[c_acc][t_acc]

            previous_significance_values[c_acc] = {}
            to_remove = set()
            for t_acc in list(nearest_neighbor_graph[c_acc].keys()):
                if (c_acc, t_acc) in previous_edges[c_acc] and (
                        previous_partition_of_X[t_acc] == read_partition[t_acc]
                ) and (previous_partition_of_X[c_acc]
                       == read_partition[c_acc]):
                    # print("here", (c_acc, t_acc) in previous_edges[c_acc] and ( previous_partition_of_X[t_acc] == read_partition[t_acc] ) and  (previous_partition_of_X[c_acc] == read_partition[c_acc]))
                    previous_significance_values[c_acc][
                        t_acc] = significance_values[c_acc][t_acc]
                    to_remove.add((c_acc, t_acc))
                    if params.verbose:
                        print("TEST IDENTICAL TO PREVIOUS STEP, SKIPPING FOR",
                              t_acc, c_acc)
                else:
                    pass
                    # print("Modified")
            previous_edges[c_acc] = set([
                (c_acc, t_acc)
                for t_acc in list(nearest_neighbor_graph[c_acc].keys())
            ])
            for c_acc, t_acc in to_remove:
                del nearest_neighbor_graph[c_acc][t_acc]
        # print(nearest_neighbor_graph)
        #####################################################################################################

        # get all candidates that serve as null-hypothesis references and have neighbors subject to testing
        # these are all candidates that are nearest_neighbors to some other; isolated nodes are not tested
        # candidate in G_star_C
        nr_of_tests_this_round = len([
            1 for c_acc in nearest_neighbor_graph
            for t_acc in nearest_neighbor_graph[c_acc]
        ])
        print("NUMBER OF CANDIDATES LEFT:", len(C),
              ". Number statistical tests in this round:",
              nr_of_tests_this_round)
        if nr_of_tests_this_round > 0:
            new_significance_values = hypothesis_test_module.do_statistical_tests_per_edge(
                nearest_neighbor_graph, C, X, read_partition, ccs_dict, params)

            for c_acc in new_significance_values:
                for t_acc in new_significance_values[c_acc]:
                    previous_significance_values[c_acc][
                        t_acc] = new_significance_values[c_acc][t_acc]

            # previous_significance_values.update(new_significance_values)
            significance_values = copy.deepcopy(previous_significance_values)
        else:
            significance_values = copy.deepcopy(previous_significance_values)

        assert len(significance_values) == len(C)
        highest_significance_values = {}
        for c_acc in significance_values:
            p_val_max = 0.0
            highest = (c_acc, "", "not_tested",
                       1.0, len(read_partition[c_acc]),
                       len(read_partition[c_acc]), "")
            for t_acc in significance_values[c_acc]:
                (p_value, mult_factor_inv, k, N_t,
                 variants) = significance_values[c_acc][t_acc]
                if p_value >= p_val_max:
                    p_val_max = p_value
                    highest = (c_acc, t_acc, p_value, mult_factor_inv, k, N_t,
                               variants)
            highest_significance_values[c_acc] = highest

        if len(highest_significance_values) > 0:
            corrected_pvals = [
                p_value * mult_factor_inv for c_acc,
                (c_acc, t_acc, p_value, mult_factor_inv, k, N_t,
                 variants) in highest_significance_values.items()
                if p_value != "not_tested"
            ]
            if len(corrected_pvals) == 0:
                p_val_threshold = params.p_value_threshold  #1.0
            else:
                corrected_pvals.sort()
                if len(corrected_pvals) % 2 == 0:
                    corrected_pvals_median = (
                        corrected_pvals[int(len(corrected_pvals) / 2) - 1] +
                        corrected_pvals[int(len(corrected_pvals) / 2)]) / 2.0
                else:
                    corrected_pvals_median = corrected_pvals[int(
                        len(corrected_pvals) / 2)]
                print("Median corrected p-val:", corrected_pvals_median)
                print("Number of unique candidates tested:",
                      len(corrected_pvals))
                p_val_threshold = corrected_pvals_median if corrected_pvals_median > params.p_value_threshold else params.p_value_threshold
                print("Filtering threshold (p_val*mult_correction_factor):",
                      p_val_threshold)

        to_realign = {}
        for c_acc, (c_acc, t_acc, p_value, mult_factor_inv, k, N_t,
                    variants) in highest_significance_values.items():
            if p_value == "not_tested":
                if params.verbose:
                    print("Did not test", c_acc)

            elif k == 0:
                if params.verbose:
                    print("Support is 0 for", c_acc)
                print("removing", c_acc, "p-val:", p_value,
                      "correction factor:", mult_factor_inv, "k", k, "N_t",
                      N_t, "variants:", variants, "SUPPORT IS 0.")
                del C[c_acc]
                modified = True
                for x_acc in read_partition[c_acc]:
                    to_realign[x_acc] = X[x_acc]
                del read_partition[c_acc]

            elif p_value * mult_factor_inv >= p_val_threshold:
                print("removing", c_acc, "p-val:", p_value,
                      "correction factor:", mult_factor_inv, "k", k, "N_t",
                      N_t, "variants:", variants)
                del C[c_acc]
                modified = True
                for x_acc in read_partition[c_acc]:
                    to_realign[x_acc] = X[x_acc]
                del read_partition[c_acc]

        previous_partition_of_X = copy.deepcopy(read_partition)

        print("nr candidates left:", len(C))
        candidate_file = os.path.join(
            params.outfolder, "candidates_after_step_{0}.fa".format(step))
        step += 1

        # significance_values = new_significance_values.copy()

        if len(C) == 0:  # no candidates were significant!
            break

        # print("LEN SIGN:", len(significance_values), len(C))
        write_output.print_candidates(candidate_file, C,
                                      highest_significance_values,
                                      read_partition, X, params)

        # do a last realignment to avoid local maxima of reads

        if realignment_to_avoid_local_max == 1:  # we have already done a last realignment, keep going until everything is significant, never realign again
            realignment_to_avoid_local_max = 2
        elif not modified and realignment_to_avoid_local_max == 0:  # we have not yet done a final realignment and everything is significant, realign to escape a local-maximum alignment
            realignment_to_avoid_local_max = 1
            modified = True
            prefilter = False

        statistical_elapsed = time() - statistical_start
        write_output.logger(
            'Time for Statistical test, step {0}:{1}'.format(
                step, str(statistical_elapsed)), params.logfile)

    final_out_file_name = os.path.join(params.outfolder, "final_candidates.fa")
    tsv_info = os.path.join(params.outfolder, "cluster_info.tsv")
    write_output.print_candidates(final_out_file_name,
                                  C,
                                  highest_significance_values,
                                  read_partition,
                                  X,
                                  params,
                                  final=True,
                                  reads_to_consensus_tsv=tsv_info)

    return C
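
The filtering threshold used above is the median of the corrected p-values, floored at params.p_value_threshold. Here is a minimal equivalent sketch using statistics.median instead of the manual median computation in the loop; the helper name is hypothetical.

from statistics import median


def p_value_cutoff(corrected_pvals, p_value_threshold):
    # With no tested candidates, fall back to the user-supplied threshold.
    if not corrected_pvals:
        return p_value_threshold
    # Otherwise filter at the median corrected p-value, never below the threshold.
    return max(median(corrected_pvals), p_value_threshold)
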
Code example #4
def stat_filter_candidates(read_file, candidate_file, read_partition,
                           to_realign, params):
    modified = True

    ############ GET READS AND CANDIDATES #################
    if params.is_fastq:
        X_original = {
            acc: seq
            for (acc, seq, qual) in fastq_parser.readfq(open(read_file, 'r'))
        }
    else:
        X_original = {
            acc: seq
            for (acc, seq) in fasta_parser.read_fasta(open(read_file, 'r'))
        }
        # X_original = {acc: seq for (acc, seq) in  fasta_parser.read_fasta(open(read_file, 'r'))}

    print("Total original reads", len(X_original))
    x_assigned_to_cluster = set(
        [x_acc for c_acc in read_partition for x_acc in read_partition[c_acc]])
    X = {
        acc: seq
        for (acc, seq) in X_original.items()
        if acc in x_assigned_to_cluster or acc in to_realign
    }  # just set X to read_partition + to_realign here

    print("Original reads in fasta file:", len(X_original))
    print("Reads included in statistical testing:", len(X))
    if os.stat(candidate_file).st_size == 0:
        out_file_name = os.path.join(params.outfolder, "final_candidates.fa")
        tsv_info = os.path.join(params.outfolder, "cluster_info.tsv")
        write_output.print_candidates(out_file_name, {}, {}, {}, {},
                                      params,
                                      final=True,
                                      reads_to_consensus_tsv=tsv_info)
        print("Candidate file is empty!")
        sys.exit(0)
    else:
        C = {
            acc: seq
            for (acc,
                 seq) in fasta_parser.read_fasta(open(candidate_file, 'r'))
        }

    ################################################################

    ### IF quality values are provided ####
    if params.is_fastq:
        ccs_dict_raw = {
            x_acc.split(" ")[0]: ccs_info.CCS(
                x_acc.split(" ")[0], seq,
                [ord(ascii_char) - 33 for ascii_char in qual], "NA")
            for (x_acc, seq, qual) in fastq_parser.readfq(open(read_file, 'r'))
        }
        # int_quals = [ord(ascii_char) - 33 for ascii_char in qual]
        X_ids = {x_acc.split(" ")[0]: x_acc for x_acc in X}
        print(len(X_ids), len(X), len(ccs_dict_raw))
        for x_acc in X:
            # print(ccs_dict_raw[x_acc.split(" ")[0]].qual)
            # print(ccs_dict_raw[x_acc.split(" ")[0]].seq)
            assert X_ids[x_acc.split(" ")[0]] == x_acc

        ccs_dict = ccs_info.modify_strings_and_acc_fastq(
            ccs_dict_raw, X_ids, X)
        for x_acc in X:
            assert X[x_acc] == ccs_dict[x_acc].seq

    elif params.ccs:
        ccs_file = pysam.AlignmentFile(params.ccs, "rb", check_sq=False)
        ccs_dict_raw = ccs_info.get_ccs(ccs_file)
        X_ids = {"/".join(x_acc.split("/")[:2]): x_acc for x_acc in X}
        ccs_dict = ccs_info.modify_strings_and_acc(ccs_dict_raw, X_ids, X)
        for x_acc in X:
            assert X[x_acc] == ccs_dict[x_acc].seq

    else:
        ccs_dict = {}

    ################################
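    # Compute a static all-vs-all nearest-neighbor graph over the candidates once;
    # it is restricted to the surviving candidates in each iteration of the loop below.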
    all_neighbors_graph = end_invariant_functions.get_NN_graph_ignored_ends_edlib(
        C, params)
    print(
        "TOTAL EDGES G_ALL edlib:",
        len([1 for s in all_neighbors_graph for t in all_neighbors_graph[s]]))
    print(
        "TOTAL Edit distances G_ALL edlib:",
        sum([
            all_neighbors_graph[s][t] for s in all_neighbors_graph
            for t in all_neighbors_graph[s]
        ]))
    candidates_nn_graph_static = all_neighbors_graph

    # all_neighbors_graph_static = sw_align_sequences_keeping_accession(all_neighbors_graph, nr_cores = params.nr_cores)
    # no_alignments = set(C.keys()) - set(all_neighbors_graph_static.keys())
    # for c_acc in no_alignments:
    #     all_neighbors_graph_static[c_acc] = {}
    # # print("TOTAL EDGES G_STATIC parasail:", len([1 for s in all_neighbors_graph_static for t in all_neighbors_graph_static[s]]))
    # # print("TOTAL Edit distances G_STATIC parasail:", sum([ sum(all_neighbors_graph_static[s][t][2][1:]) for s in all_neighbors_graph_static for t in all_neighbors_graph_static[s]]))
    # candidates_nn_graph_static = {}
    # print(len(all_neighbors_graph_static), len(C), len(all_neighbors_graph))
    # for s in all_neighbors_graph_static:
    #     candidates_nn_graph_static[s] = {}
    #     for t in all_neighbors_graph_static[s]:
    #         s_aln, t_aln = all_neighbors_graph_static[s][t][0], all_neighbors_graph_static[s][t][1]
    #         mask_start, mask_end = functions.get_mask_start_and_end(s_aln, t_aln)
    #         ed = sum(all_neighbors_graph_static[s][t][2][1:]) -  min(mask_start, params.ignore_ends_len) - min(params.ignore_ends_len, (len(s_aln) - mask_end))
    #         # print(ed)
    #         if ed > 10:
    #             print(ed,"edlib:", all_neighbors_graph_static[s][t][2])
    #             print(s_aln)
    #             print(t_aln)
    #             continue
    #         else:
    #             candidates_nn_graph_static[s][t] = ed
    #         # print()
    # print("TOTAL EDGES G_STATIC parasail:", len([1 for s in candidates_nn_graph_static for t in candidates_nn_graph_static[s]]))
    # print("TOTAL Edit distances G_STATIC parasail after ignoring ends differences:", sum([ candidates_nn_graph_static[s][t] for s in candidates_nn_graph_static for t in candidates_nn_graph_static[s]]))
    print()
    # sys.exit()
    print()
    print("STARTING STATISTICAL TESTING")
    print()
    print("Number of reads to realign:", len(to_realign))
    step = 1
    prefilter = True
    previous_partition_of_X = copy.deepcopy(read_partition)
    previous_components = {c_acc: set() for c_acc in C.keys()}
    previous_edges = {c_acc: set() for c_acc in C.keys()}
    significance_values = {}
    realignment_to_avoid_local_max = 0
    remaining_to_align_read_file = os.path.join(params.outfolder,
                                                "remaining_to_align.fa")

    while modified:
        statistical_start = time()

        modified = False
        print()
        print("STEP NR: {0}".format(step))
        print()
        ########### Write current candidates to file ##########
        temp_candidate_name = os.path.join(
            params.outfolder, "temp_candidates_step_{0}.fa".format(step))
        temp_candidate_file = open(temp_candidate_name, "w")

        for c_acc, c_seq in C.items():
            temp_candidate_file.write(">{0}\n{1}\n".format(c_acc, c_seq))
        temp_candidate_file.close()
        #######################################################

        if params.verbose:
            for c_acc in read_partition:
                print(
                    c_acc, "has {0} reads assigned to it.".format(
                        len(read_partition[c_acc])))

        ############ GET READ SUPPORT AND ALIGNMENTS #################

        if realignment_to_avoid_local_max == 1:
            print("REALIGNING EVERYTHING FINAL STEP")
            to_realign = X
            read_partition = {c_acc: {} for c_acc in C.keys()}

        if to_realign:
            print(len(to_realign), "reads to realign.")
            write_output.print_reads(remaining_to_align_read_file, to_realign)
            # align reads that are not yet assigned to a candidate here
            G_star_rem, partition_of_realigned_reads = partitions.partition_strings_2set(
                to_realign, C, remaining_to_align_read_file,
                temp_candidate_file.name, params)
            reassigned_reads_to_candidates = {}
            for c_acc in partition_of_realigned_reads:
                reassigned_reads_to_candidates[c_acc] = {}
                for read_acc in partition_of_realigned_reads[c_acc]:
                    reassigned_reads_to_candidates[c_acc][read_acc] = (
                        C[c_acc], X[read_acc])

            edit_distances_of_c_to_reads = edlib_align_sequences_keeping_accession(
                reassigned_reads_to_candidates, nr_cores=params.nr_cores)
            alignments_of_c_to_reads = sw_align_sequences_keeping_accession(
                edit_distances_of_c_to_reads, nr_cores=params.nr_cores)
            # structure: read_partition[c_acc][read_acc] = (c_aln, read_aln, (matches, mismatches, indels))

            ############## REMOVE EXON LEVEL DIFFERENCES IN ALIGNMENTS ####################
            ssw_temp = [
                alignments_of_c_to_reads[c_acc][read_acc]
                for c_acc in alignments_of_c_to_reads
                for read_acc in alignments_of_c_to_reads[c_acc]
            ]
            _ = functions.filter_exon_differences(alignments_of_c_to_reads,
                                                  params.min_exon_diff,
                                                  params.ignore_ends_len)
            ssw_after_exon_temp = [
                alignments_of_c_to_reads[c_acc][read_acc]
                for c_acc in alignments_of_c_to_reads
                for read_acc in alignments_of_c_to_reads[c_acc]
            ]
            print(
                "Number of alignments that were removed before statistical test because best match to candidate had exon difference larger than {0}bp: {1} "
                .format(str(params.min_exon_diff),
                        len(ssw_temp) - len(ssw_after_exon_temp)))
            #################################

            # add reads to best candidate given new alignments
            for c_acc in alignments_of_c_to_reads:
                for read_acc in alignments_of_c_to_reads[c_acc]:
                    read_partition[c_acc][read_acc] = alignments_of_c_to_reads[
                        c_acc][read_acc]

            for c_acc in list(read_partition.keys()):
                if len(read_partition[c_acc]) == 0:
                    print(c_acc, "removed as it has no supporting reads")
                    del C[c_acc]
                    del read_partition[c_acc]
                else:
                    if params.verbose:
                        print(
                            c_acc,
                            "Now has {0} reads assigned to it, after aligning reads that are not assigned."
                            .format(len(read_partition[c_acc])))

            # add the alignments to alignment structure
            # for x_acc in remaining_alignments_of_x_to_c.keys():
            #     alignments_of_x_to_c[x_acc] = remaining_alignments_of_x_to_c[x_acc]

        # C_seq_to_acc = {seq : acc for acc, seq in C.items()}
        ################################################################

        # check_exon_diffs(alignments_of_x_to_c, params)

        ############# GET THE CLOSEST HIGHEST SUPPORTED REFERENCE TO TEST AGAINST FOR EACH CANDIDATE ############
        nearest_neighbor_graph = {}
        for c_acc in C.keys():
            nearest_neighbor_graph[c_acc] = {}
            if len(candidates_nn_graph_static[c_acc]) > 0:
                candidate_edit_distances = [
                    ed for c_nbr_acc, ed in
                    candidates_nn_graph_static[c_acc].items() if c_nbr_acc in C
                ]
                if candidate_edit_distances:
                    min_ed = min(candidate_edit_distances)
                    # print("new min:", min_ed)
                else:
                    print("no tests left")
                # here we get the relevant tests for the current iteration
                for c_nbr_acc in candidates_nn_graph_static[c_acc]:
                    if c_nbr_acc in C and candidates_nn_graph_static[c_acc][
                            c_nbr_acc] == min_ed:
                        nearest_neighbor_graph[c_acc][c_nbr_acc] = min_ed

        print(
            "Edges in NEW candidate NN graph:",
            len([
                1 for c_acc in nearest_neighbor_graph
                for t_acc in nearest_neighbor_graph[c_acc]
            ]))
        print(
            "Edit distances in NEW candidate NN graph:",
            sum([
                nearest_neighbor_graph[c_acc][t_acc]
                for c_acc in nearest_neighbor_graph
                for t_acc in nearest_neighbor_graph[c_acc]
            ]))

        # if params.ignore_ends_len > 0:
        #     nearest_neighbor_graph_old = end_invariant_functions.get_nearest_neighbors_graph_under_ignored_ends(C, params)
        # else:
        #     nearest_neighbor_graph_old = get_nearest_neighbor_graph(C)

        # for c_acc in nearest_neighbor_graph:
        #     for t_acc in nearest_neighbor_graph[c_acc]:
        #         if t_acc not in nearest_neighbor_graph_old[c_acc]:
        #             print("new test:", nearest_neighbor_graph[c_acc][t_acc])
        #             print(C[c_acc])
        #             print(C[t_acc])
        #             print(all_neighbors_graph_static[c_acc][t_acc])

        # print("Edges in candidate NN graph:", len([ 1 for c_acc in nearest_neighbor_graph_old for t_acc in nearest_neighbor_graph_old[c_acc] ]) )
        # print("Edit distances in candidate NN graph:", sum([ nearest_neighbor_graph_old[c_acc][t_acc] for c_acc in nearest_neighbor_graph_old for t_acc in nearest_neighbor_graph_old[c_acc] ]) )

        if realignment_to_avoid_local_max > 0:
            homopolymer_invariant_graph = functions.get_homopolymer_invariants(
                C)
            print(
                "Edges in candidate homopolymer invariant graph:",
                len([
                    1 for c_acc in homopolymer_invariant_graph
                    for t_acc in homopolymer_invariant_graph[c_acc]
                ]))
            for c_acc in homopolymer_invariant_graph:
                if c_acc not in nearest_neighbor_graph:
                    print(c_acc, "not in NN_candidates graph but added now.")
                    nearest_neighbor_graph[c_acc] = {}
                for t_acc in homopolymer_invariant_graph[c_acc]:
                    if t_acc not in nearest_neighbor_graph[c_acc]:
                        # print("Homopolymer edge added")
                        nearest_neighbor_graph[c_acc][t_acc] = 1
            print(
                "Total union of edges:",
                len([
                    1 for c_acc in nearest_neighbor_graph
                    for t_acc in nearest_neighbor_graph[c_acc]
                ]))

        # print("EXTRA EDGES FROM HOMOPOLYMER IDENTICAL:", homopol_extra_added)

        ## save time if the nearest_neighbor and all candidates in a component have identical reads assigned to them as in the previous step
        # or heuristically: if a candidate has more than 2x more support than the reference itself (will give a highly significant p-value anyway), to save computation time
        # Since the input data is the same, the test is guaranteed to give the same significance values as in the previous step

        previous_significance_values = {}
        # print(nearest_neighbor_graph)
        for c_acc in list(nearest_neighbor_graph.keys()):
            # skip testing candidates with more reads than their respective references, because it is redundant computation that will lead to significant values anyway
            for t_acc in list(nearest_neighbor_graph[c_acc].keys()):
                if len(read_partition[c_acc]) >= params.min_test_ratio * len(
                        read_partition[t_acc]):
                    if params.verbose:
                        print(
                            "skipping test for dominant candidate {0} to ref {1}"
                            .format(c_acc, t_acc))
                    del nearest_neighbor_graph[c_acc][t_acc]

            previous_significance_values[c_acc] = {}
            to_remove = set()
            for t_acc in list(nearest_neighbor_graph[c_acc].keys()):
                if (c_acc, t_acc) in previous_edges[c_acc] and (
                        previous_partition_of_X[t_acc] == read_partition[t_acc]
                ) and (previous_partition_of_X[c_acc]
                       == read_partition[c_acc]):
                    # print("here", (c_acc, t_acc) in previous_edges[c_acc] and ( previous_partition_of_X[t_acc] == read_partition[t_acc] ) and  (previous_partition_of_X[c_acc] == read_partition[c_acc]))
                    previous_significance_values[c_acc][
                        t_acc] = significance_values[c_acc][t_acc]
                    to_remove.add((c_acc, t_acc))
                    if params.verbose:
                        print("TEST IDENTICAL TO PREVIOUS STEP, SKIPPING FOR",
                              t_acc, c_acc)
                else:
                    pass
                    # print("Modified")
            previous_edges[c_acc] = set([
                (c_acc, t_acc)
                for t_acc in list(nearest_neighbor_graph[c_acc].keys())
            ])
            for c_acc, t_acc in to_remove:
                del nearest_neighbor_graph[c_acc][t_acc]
        # print(nearest_neighbor_graph)
        print(
            "Total edges after removing dominant candidates:",
            len([
                1 for c_acc in nearest_neighbor_graph
                for t_acc in nearest_neighbor_graph[c_acc]
            ]))
        # sys.exit()
        #####################################################################################################

        # get all candidates that serve as null-hypothesis references and have neighbors subject to testing
        # these are all candidates that are nearest_neighbors to some other; isolated nodes are not tested
        # candidate in G_star_C
        nr_of_tests_this_round = len([
            1 for c_acc in nearest_neighbor_graph
            for t_acc in nearest_neighbor_graph[c_acc]
        ])
        print("NUMBER OF CANDIDATES LEFT:", len(C),
              ". Number statistical tests in this round:",
              nr_of_tests_this_round)
        if nr_of_tests_this_round > 0:
            new_significance_values = hypothesis_test_module.do_statistical_tests_per_edge(
                nearest_neighbor_graph, C, X, read_partition, ccs_dict, params)

            for c_acc in new_significance_values:
                for t_acc in new_significance_values[c_acc]:
                    previous_significance_values[c_acc][
                        t_acc] = new_significance_values[c_acc][t_acc]

            # previous_significance_values.update(new_significance_values)
            significance_values = copy.deepcopy(previous_significance_values)
        else:
            significance_values = copy.deepcopy(previous_significance_values)

        assert len(significance_values) == len(C)
        highest_significance_values = {}
        for c_acc in significance_values:
            corrected_p_val_max = 0.0
            highest = (c_acc, "", "not_tested",
                       1.0, len(read_partition[c_acc]),
                       len(read_partition[c_acc]), "")
            for t_acc in significance_values[c_acc]:
                (p_value, mult_factor_inv, k, N_t,
                 variants) = significance_values[c_acc][t_acc]
                corr_p_value = product_with_check_overflow(
                    p_value, mult_factor_inv)
                if corr_p_value >= corrected_p_val_max:
                    corrected_p_val_max = corr_p_value
                    highest = (c_acc, t_acc, p_value, mult_factor_inv, k, N_t,
                               variants)
            highest_significance_values[c_acc] = highest

        if len(highest_significance_values) > 0:
            corrected_pvals = [
                product_with_check_overflow(p_value, mult_factor_inv)
                for c_acc,
                (c_acc, t_acc, p_value, mult_factor_inv, k, N_t,
                 variants) in highest_significance_values.items()
                if p_value != "not_tested"
            ]
            if len(corrected_pvals) == 0:
                p_val_threshold = params.p_value_threshold  #1.0
            else:
                corrected_pvals.sort()
                if len(corrected_pvals) % 2 == 0:
                    corrected_pvals_median = (
                        corrected_pvals[int(len(corrected_pvals) / 2) - 1] +
                        corrected_pvals[int(len(corrected_pvals) / 2)]) / 2.0
                else:
                    corrected_pvals_median = corrected_pvals[int(
                        len(corrected_pvals) / 2)]
                print("Median corrected p-val:", corrected_pvals_median)
                print("Number of unique candidates tested:",
                      len(corrected_pvals))
                p_val_threshold = corrected_pvals_median if corrected_pvals_median > params.p_value_threshold else params.p_value_threshold
                print("Filtering threshold (p_val*mult_correction_factor):",
                      p_val_threshold)

        to_realign = {}
        p_value_tsv_file = open(
            os.path.join(params.outfolder, "p_values_{0}.tsv".format(step)),
            "w")

        for c_acc, (c_acc, t_acc, p_value, mult_factor_inv, k, N_t,
                    variants) in highest_significance_values.items():
            if p_value == "not_tested":
                if params.verbose:
                    print("Did not test", c_acc)

            elif k == 0:
                if params.verbose:
                    print("Support is 0 for", c_acc)
                print("removing", c_acc, "p-val:", p_value,
                      "correction factor:", mult_factor_inv, "k", k, "N_t",
                      N_t, "variants:", variants, "SUPPORT IS 0.")
                del C[c_acc]
                modified = True
                for x_acc in read_partition[c_acc]:
                    to_realign[x_acc] = X[x_acc]
                del read_partition[c_acc]

            elif product_with_check_overflow(
                    p_value, mult_factor_inv) >= p_val_threshold:
                print("removing", c_acc, "p-val:", p_value,
                      "correction factor:", mult_factor_inv, "k", k, "N_t",
                      N_t, "variants:", variants)
                del C[c_acc]
                modified = True
                for x_acc in read_partition[c_acc]:
                    to_realign[x_acc] = X[x_acc]
                del read_partition[c_acc]

            if p_value != "not_tested":
                p_value_tsv_file.write("{0}\t{1}\n".format(
                    c_acc + "_" + str(k) + "_" + str(1.0 if k == 0 else min(
                        1.0,
                        product_with_check_overflow(p_value, mult_factor_inv)))
                    + "_" + str(N_t) + "_" + str(len(variants)), str(p_value)))
        p_value_tsv_file.close()

        previous_partition_of_X = copy.deepcopy(read_partition)

        print("nr candidates left:", len(C))
        candidate_file = os.path.join(
            params.outfolder, "candidates_after_step_{0}.fa".format(step))
        step += 1

        # significance_values = new_significance_values.copy()

        if len(C) == 0:  # no candidates were significant!
            break

        # print("LEN SIGN:", len(significance_values), len(C))
        write_output.print_candidates(candidate_file, C,
                                      highest_significance_values,
                                      read_partition, X, params)

        # do a last realignment to avoid local maxima of reads

        if realignment_to_avoid_local_max == 1:  # we have already done a last realignment, keep going until everything is significant, never realign again
            realignment_to_avoid_local_max = 2
        elif not modified and realignment_to_avoid_local_max == 0:  # we have not yet done a final realignment and everything is significant, realign to escape a local-maximum alignment
            realignment_to_avoid_local_max = 1
            modified = True
            prefilter = False

        statistical_elapsed = time() - statistical_start
        write_output.logger(
            'Time for Statistical test, step {0}:{1}'.format(
                step, str(statistical_elapsed)), params.logfile)

    if params.ignore_ends_len > 0:
        c_acc_to_support = {
            c_acc: len(all_candidate_assigned_reads)
            for c_acc, all_candidate_assigned_reads in read_partition.items()
        }
        remaining_c_after_invariant = end_invariant_functions.collapse_candidates_under_ends_invariant(
            C, c_acc_to_support, params)
        # print(remaining_c_after_invariant)
        # sys.exit()
        for c_acc in remaining_c_after_invariant:
            c_seq = C[c_acc]
            for removed_c_acc in remaining_c_after_invariant[c_acc]:
                removed_c_seq = C[removed_c_acc]
                reads_to_removed_c_acc = read_partition[removed_c_acc]

                for read_acc in reads_to_removed_c_acc:
                    read_partition[c_acc][read_acc] = reads_to_removed_c_acc[
                        read_acc]

                del C[removed_c_acc]
                del c_acc_to_support[removed_c_acc]
                del read_partition[removed_c_acc]

    final_out_file_name = os.path.join(params.outfolder, "final_candidates.fa")
    tsv_info = os.path.join(params.outfolder, "cluster_info.tsv")
    write_output.print_candidates(final_out_file_name,
                                  C,
                                  highest_significance_values,
                                  read_partition,
                                  X,
                                  params,
                                  final=True,
                                  reads_to_consensus_tsv=tsv_info)

    return C
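
In this version the per-round candidate nearest-neighbor graph is rebuilt from the static all-vs-all graph computed before the loop: for each surviving candidate, only edges to other surviving candidates at the minimum remaining edit distance are kept. Below is a stand-alone sketch of that restriction; the helper name is hypothetical and not part of the original module.

def restrict_nn_graph(static_graph, surviving_candidates):
    nn_graph = {}
    for c_acc in surviving_candidates:
        nn_graph[c_acc] = {}
        # Keep only neighbors that are still candidates in this round.
        distances = {t_acc: ed for t_acc, ed in static_graph.get(c_acc, {}).items()
                     if t_acc in surviving_candidates}
        if distances:
            min_ed = min(distances.values())
            # Retain every edge at the minimum edit distance (ties included).
            nn_graph[c_acc] = {t_acc: ed for t_acc, ed in distances.items()
                               if ed == min_ed}
    return nn_graph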