# Read in alignments of intrachromosomal chimeras and proceed with ordering and orientation log('-- Reading intrachromosomal chimera broken alignments') alns = read_paf_alignments('chimera_break/intra_contigs_against_ref.paf') alns = clean_alignments(alns, l=1000, in_exclude_file=exclude_file) contigs_file = '/ragoo_output/chimera_break/' + out_intra_fasta log('-- The total number of interchromasomally chimeric contigs broken is %r' % total_inter_broken) log('-- The total number of intrachromasomally chimeric contigs broken is %r' % total_intra_broken) # Assign each contig to a corresponding reference chromosome. log('-- Assigning contigs') all_unique_contigs = dict() for i in alns.keys(): all_unique_contigs[i] = UniqueContigAlignment(alns[i]) # Add to this the list of headers that did not make it write_contig_clusters(all_unique_contigs, group_score_thresh, skip_ctg) log('-- Ordering and orienting contigs') order_orient_contigs(all_unique_contigs, alns) log('-- Creating Pseudomolecules') create_pseudomolecules(contigs_file, all_unique_contigs, g) if call_svs: log('-- Aligning pseudomolecules to reference') align_pms(minimap_path, t, reference_file) log('-- Getting structural variants')
def order_orient_contigs(in_unique_contigs, in_alns):
    """
    Order and orient the contigs assigned to each reference chromosome.

    For every contig only the alignments to its assigned chromosome (the
    ref_chrom reported by UniqueContigAlignment) are considered. The strand of
    the longest such alignment, as chosen by LongestContigAlignment, becomes
    the contig's orientation. Contigs of one chromosome are then ordered by
    the (ref_start, ref_end) of that longest alignment.

    One file per chromosome is written to 'orderings/<chrom>_orderings.txt'
    (relative to the current working directory). Each line is tab separated:
    contig header, orientation, location confidence, orientation confidence.

    :param in_unique_contigs: dict of contig header -> object exposing ref_chrom
    :param in_alns: dict of contig header -> alignments object for that contig
    :return: None
    """
    output_path = os.getcwd() + '/orderings'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)

    longest_contigs = dict()
    final_orientations = dict()
    orientation_confidence = dict()
    location_confidence = dict()

    # Single pass per contig: the chromosome-filtered copy is built once and
    # shared by every computation below instead of being deep-copied twice.
    # NOTE(review): this assumes LongestContigAlignment and
    # get_location_confidence only read the alignments they are given -
    # confirm against their definitions.
    for header, contig_alns in in_alns.items():
        # Only consider alignments to the assigned chromosome.
        best_header = UniqueContigAlignment(contig_alns).ref_chrom
        ctg_alns = copy.deepcopy(contig_alns)
        ctg_alns.filter_ref_chroms([best_header])

        longest = LongestContigAlignment(ctg_alns)
        longest_contigs[header] = longest
        final_orientations[header] = longest.strand

        # Orientation confidence: every aligned base pair votes for the strand
        # of the alignment it belongs to. The score is the share of votes that
        # agree with the assigned orientation.
        forward_bp = 0
        reverse_bp = 0
        for strand, aln_len in zip(ctg_alns.strands, ctg_alns.aln_lens):
            if strand == '+':
                forward_bp += aln_len
            else:
                reverse_bp += aln_len

        total_bp = forward_bp + reverse_bp
        agreeing_bp = forward_bp if final_orientations[header] == '+' else reverse_bp
        # With no aligned bases there is nothing to vote with; report zero
        # confidence rather than dividing by zero.
        orientation_confidence[header] = agreeing_bp / total_bp if total_bp else 0.0

        location_confidence[header] = get_location_confidence(ctg_alns)

    all_chroms = {unique_aln.ref_chrom for unique_aln in in_unique_contigs.values()}

    for this_chrom in all_chroms:
        groupings_file = 'groupings/' + this_chrom + '_contigs.txt'
        contigs_list = get_contigs_from_groupings(groupings_file)

        # sorted() is stable, so contigs with identical reference coordinates
        # keep their order from the groupings file.
        final_order = sorted(
            contigs_list,
            key=lambda ctg: (longest_contigs[ctg].ref_start, longest_contigs[ctg].ref_end),
        )

        with open('orderings/' + this_chrom + '_orderings.txt', 'w') as out_file:
            for ctg in final_order:
                out_file.write(
                    ctg + '\t'
                    + final_orientations[ctg] + '\t'
                    + str(location_confidence[ctg]) + '\t'
                    + str(orientation_confidence[ctg]) + '\n'
                )
def get_intra_contigs(alns, l, d, c):
    """
    Decide whether a contig looks like an intrachromosomal chimera and, if so,
    where it should be broken.

    Only alignments to the contig's assigned chromosome that pass the length
    filter are used. They are sorted with respect to the reference, and the
    distances between the starts of neighbouring alignments are measured on
    the reference and on the contig. The reference distances are checked
    first, then the contig distances.

    :param alns: alignments object for a single contig
    :param l: Minimum alignment length to consider
    :param d: Largest tolerated distance between neighbouring alignments with
              respect to the reference. A larger distance flags the contig
    :param c: Largest tolerated distance between neighbouring alignments with
              respect to the query. A larger distance flags the contig
    :return: None when the contig is not flagged, otherwise a tuple
             (contig, [(0, break_point), (break_point, contig_length)])
    """
    # Keep only sufficiently long alignments to the assigned chromosome.
    assigned_chrom = UniqueContigAlignment(alns).ref_chrom
    chrom_alns = copy.deepcopy(alns)
    chrom_alns.filter_ref_chroms([assigned_chrom])
    chrom_alns.filter_lengths(l)

    # Nothing survived the length filter, so there is nothing to inspect.
    if len(chrom_alns.ref_headers) == 0:
        return None

    chrom_alns.sort_by_ref()

    # Distance between the starts of neighbouring alignments, first on the
    # reference, then (as an absolute value) on the contig.
    n_pairs = len(chrom_alns.ref_headers) - 1
    ref_gaps = [
        chrom_alns.ref_starts[k + 1] - chrom_alns.ref_starts[k]
        for k in range(n_pairs)
    ]
    ctg_gaps = [
        abs(chrom_alns.query_starts[k + 1] - chrom_alns.query_starts[k])
        for k in range(n_pairs)
    ]

    # Does the first alignment (in reference order) begin within the first
    # fifth of the contig?
    is_query_start = chrom_alns.query_starts[0] < chrom_alns.query_lens[0] / 5

    # Both gap lists have one entry per neighbouring pair, so a single
    # alignment leaves both empty and there is no possible break point.
    if not ref_gaps:
        return None

    def broken_at(break_point):
        # The two sub-contig intervals produced by cutting at break_point.
        return (chrom_alns.contig,
                [(0, break_point), (break_point, chrom_alns.query_lens[0])])

    widest_ref_gap = max(ref_gaps)
    if widest_ref_gap > d:
        gap_index = ref_gaps.index(widest_ref_gap)
        leading_strands = chrom_alns.strands[:gap_index]
        mostly_reverse = leading_strands.count('-') > leading_strands.count('+')
        if is_query_start:
            # Reverse strand breaks after the first alignment, forward strand
            # after the alignment preceding the gap.
            anchor = 0 if mostly_reverse else gap_index
            return broken_at(chrom_alns.query_ends[anchor])
        # The first subcontig starts at the end of the contig.
        anchor = gap_index if mostly_reverse else 0
        return broken_at(chrom_alns.query_starts[anchor])

    widest_ctg_gap = max(ctg_gaps)
    if widest_ctg_gap > c:
        gap_index = ctg_gaps.index(widest_ctg_gap) + 1
        leading_strands = chrom_alns.strands[:gap_index]
        mostly_reverse = leading_strands.count('-') > leading_strands.count('+')
        if is_query_start:
            anchor = 0 if mostly_reverse else gap_index
            return broken_at(chrom_alns.query_ends[anchor])
        # The first subcontig starts at the end of the contig.
        anchor = gap_index - 1 if mostly_reverse else 0
        return broken_at(chrom_alns.query_starts[anchor])

    # Neither distance threshold was exceeded.
    return None