Ejemplo n.º 1
0
        # Read in alignments of intrachromosomal chimeras and proceed with ordering and orientation
        log('-- Reading intrachromosomal chimera broken alignments')
        alns = read_paf_alignments('chimera_break/intra_contigs_against_ref.paf')
        alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file)
        contigs_file = '/ragoo_output/chimera_break/' + out_intra_fasta
        log('-- The total number of interchromasomally chimeric contigs broken is %r' % total_inter_broken)
        log('-- The total number of intrachromasomally chimeric contigs broken is %r' % total_intra_broken)



    # Assign each contig to a corresponding reference chromosome.
    log('-- Assigning contigs')
    all_unique_contigs = dict()
    for i in alns.keys():
        all_unique_contigs[i] = UniqueContigAlignment(alns[i])

    # Add to this the list of headers that did not make it
    write_contig_clusters(all_unique_contigs, group_score_thresh, skip_ctg)

    log('-- Ordering and orienting contigs')
    order_orient_contigs(all_unique_contigs, alns)

    log('-- Creating Pseudomolecules')
    create_pseudomolecules(contigs_file, all_unique_contigs, g)

    if call_svs:
        log('-- Aligning pseudomolecules to reference')
        align_pms(minimap_path, t, reference_file)

        log('-- Getting structural variants')
Ejemplo n.º 2
0
def order_orient_contigs(in_unique_contigs, in_alns):
    """
    Order and orient the contigs assigned to each reference chromosome.

    For every contig in in_alns, the alignments are restricted to the
    reference header chosen by UniqueContigAlignment. From that filtered set
    the following are derived:

      * the orientation: the strand of the contig's LongestContigAlignment
      * the orientation confidence: the share of aligned bp whose alignment
        strand agrees with the chosen orientation
      * the location confidence: the result of get_location_confidence()

    Then, for every chromosome referenced by in_unique_contigs, the contigs
    listed in 'groupings/<chrom>_contigs.txt' are sorted by the reference
    start (then reference end) of their longest alignment and written to
    'orderings/<chrom>_orderings.txt', one tab-separated line per contig:
    contig, orientation, location confidence, orientation confidence.

    :param in_unique_contigs: dict keyed by contig header; each value exposes
        a ref_chrom attribute naming the chromosome the contig was assigned to
    :param in_alns: dict keyed by contig header; each value is an alignments
        object that can be deep-copied, supports filter_ref_chroms() and
        exposes the ref_headers, strands and aln_lens sequences
    :return: None. The results are written below the 'orderings' directory of
        the current working directory, which is created if it is missing.
    """
    current_path = os.getcwd()
    output_path = current_path + '/orderings'
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    longest_contigs = dict()
    final_orientations = dict()
    orientation_confidence = dict()
    location_confidence = dict()

    # A single pass per contig: the deep copy and chromosome filtering are the
    # expensive steps, so the filtered alignments are built once and reused
    # for the longest alignment and for both confidence scores.
    for ctg in in_alns.keys():
        # Only consider alignments to the assigned chromosome
        best_header = UniqueContigAlignment(in_alns[ctg]).ref_chrom
        ctg_alns = copy.deepcopy(in_alns[ctg])
        ctg_alns.filter_ref_chroms([best_header])

        longest_contigs[ctg] = LongestContigAlignment(ctg_alns)
        final_orientations[ctg] = longest_contigs[ctg].strand

        # Orientation confidence:
        # every base pair votes for the orientation of the alignment in which
        # it belongs; the score is the votes for the assigned orientation
        # over all votes.
        forward_bp = 0
        reverse_bp = 0
        for j in range(len(ctg_alns.ref_headers)):
            if ctg_alns.strands[j] == '+':
                forward_bp += ctg_alns.aln_lens[j]
            else:
                reverse_bp += ctg_alns.aln_lens[j]

        if final_orientations[ctg] == '+':
            orientation_confidence[ctg] = forward_bp / (forward_bp + reverse_bp)
        else:
            orientation_confidence[ctg] = reverse_bp / (forward_bp + reverse_bp)

        # Location confidence
        location_confidence[ctg] = get_location_confidence(ctg_alns)

    all_chroms = set([in_unique_contigs[i].ref_chrom for i in in_unique_contigs.keys()])

    for this_chrom in all_chroms:
        groupings_file = 'groupings/' + this_chrom + '_contigs.txt'
        contigs_list = get_contigs_from_groupings(groupings_file)

        # Order by the reference span of each contig's longest alignment.
        # sorted() is stable, so contigs with an identical (start, end) keep
        # the order in which they appear in the groupings file.
        final_order = sorted(
            contigs_list,
            key=lambda ctg: (longest_contigs[ctg].ref_start, longest_contigs[ctg].ref_end)
        )

        # NOTE: no ordering confidence is computed here; only the location and
        # orientation confidences are written out.
        with open('orderings/' + this_chrom + '_orderings.txt', 'w') as out_file:
            for ctg in final_order:
                out_file.write('\t'.join([
                    ctg,
                    final_orientations[ctg],
                    str(location_confidence[ctg]),
                    str(orientation_confidence[ctg]),
                ]) + '\n')
Ejemplo n.º 3
0
def get_intra_contigs(alns, l, d, c):
    """
    Decide whether a contig is an intrachromosomal chimera and where to break it.

    Works on a deep copy of alns restricted to the reference header chosen by
    UniqueContigAlignment and to alignments that pass filter_lengths(l). The
    surviving alignments are sorted by reference position. The largest
    difference between consecutive alignment starts is then checked, first on
    the reference (against d) and then on the contig (against c). The first
    threshold that is exceeded determines the break point.

    :param alns: alignments object of a single contig
    :param l: Minimum alignment length to consider
    :param d: Largest tolerated difference between the reference start
        positions of consecutive alignments. If larger than this, flag
    :param c: Largest tolerated absolute difference between the query start
        positions of consecutive alignments. If larger than this, flag
    :return: None if no alignment survives the filtering or no threshold is
        exceeded. Otherwise the tuple
        (contig, [(0, break_point), (break_point, contig_length)]).
    """

    # Keep only alignments to the header this contig mostly aligns to,
    # then drop the short ones.
    best_header = UniqueContigAlignment(alns).ref_chrom
    ctg_alns = copy.deepcopy(alns)
    ctg_alns.filter_ref_chroms([best_header])
    ctg_alns.filter_lengths(l)

    # Nothing left after filtering: nothing to break.
    if len(ctg_alns.ref_headers) == 0:
        return None

    # Everything below assumes the alignments are in reference order.
    ctg_alns.sort_by_ref()

    n_pairs = len(ctg_alns.ref_headers) - 1

    # Differences between consecutive alignment starts on the reference ...
    ref_gaps = [
        ctg_alns.ref_starts[k + 1] - ctg_alns.ref_starts[k]
        for k in range(n_pairs)
    ]
    # ... and on the contig (unsigned).
    ctg_gaps = [
        abs(ctg_alns.query_starts[k + 1] - ctg_alns.query_starts[k])
        for k in range(n_pairs)
    ]

    # True when the first alignment (in reference order) begins within the
    # first fifth of the contig.
    is_query_start = not (ctg_alns.query_starts[0] >= ctg_alns.query_lens[0] / 5)

    def split_contig(leading_strands, ends_index, starts_index):
        # Pick the break coordinate from where the first subcontig sits on
        # the query and from the majority strand of the leading alignments,
        # then describe the two resulting pieces.
        mostly_reverse = leading_strands.count('-') > leading_strands.count('+')
        if is_query_start:
            if mostly_reverse:
                break_point = ctg_alns.query_ends[0]
            else:
                break_point = ctg_alns.query_ends[ends_index]
        else:
            # The first subcontig starts at the end of the contig
            if mostly_reverse:
                break_point = ctg_alns.query_starts[starts_index]
            else:
                break_point = ctg_alns.query_starts[0]
        return (ctg_alns.contig,
                [(0, break_point), (break_point, ctg_alns.query_lens[0])])

    # A single alignment has no neighbours, hence no break point.
    if not ref_gaps:
        return None

    # Reference-based check takes precedence.
    widest_ref_gap = max(ref_gaps)
    if widest_ref_gap > d:
        gap_index = ref_gaps.index(widest_ref_gap)
        return split_contig(ctg_alns.strands[:gap_index], gap_index, gap_index)

    # Otherwise fall back to the contig-based check.
    widest_ctg_gap = max(ctg_gaps)
    if widest_ctg_gap > c:
        gap_index = ctg_gaps.index(widest_ctg_gap) + 1
        return split_contig(ctg_alns.strands[:gap_index], gap_index, gap_index - 1)

    return None