def get_pileup_list(all_sample_read_aligns, sample_names, reference_graph,
                    node_grouping_list=None, show_insertions=False):
    """Build one pileup entry per requested sample on a shared repeat path.

    Reads whose graph cigar never touches a repeat node are discarded first.
    The filtered lists are written back into ``all_sample_read_aligns``, so
    the caller's dict is modified in place.

    Args:
        all_sample_read_aligns: dict mapping sample name to the list of read
            alignments of that sample (each exposing ``graph_cigar``)
        sample_names: ordered list of sample names in pileup
        reference_graph: ReferenceGraph for STR locus
        node_grouping_list: Ordered list of ReferenceNodes in decreasing order
            of priority; forwarded unchanged to get_pileup
        show_insertions: Boolean to show full sequence of insertions;
            forwarded to get_pileup

    Returns:
        List of 3-tuples, one per entry of ``sample_names`` and in that order:
        (sample_name, repeat path shared by all samples, element [1] of the
        get_pileup result). A sample that has no entry in
        ``all_sample_read_aligns`` is piled up from an empty read list.
    """
    def overlaps_repeat(read_align):
        # The full list is built on purpose (no short-circuit) so that every
        # node id in the cigar is looked up in the graph, as before.
        repeat_hits = [
            cigar_node
            for cigar_node in parse_graph_cigar(read_align.graph_cigar)
            if reference_graph[int(cigar_node[0])].is_repeat
        ]
        return len(repeat_hits) > 0

    # Only existing keys are reassigned, so mutating while iterating is safe.
    for sample in all_sample_read_aligns:
        all_sample_read_aligns[sample] = [
            read_align
            for read_align in all_sample_read_aligns[sample]
            if overlaps_repeat(read_align)
        ]

    # One path for all samples so that their pileups share coordinates.
    repeat_path = get_repeat_path(all_sample_read_aligns, reference_graph)

    pileup_list = []
    for sample_name in sample_names:
        if sample_name in all_sample_read_aligns:
            sample_aligns = all_sample_read_aligns[sample_name]
        else:
            sample_aligns = []
        sample_pileup = get_pileup(sample_aligns, reference_graph,
                                   repeat_path, node_grouping_list,
                                   show_insertions=show_insertions)
        pileup_list.append((sample_name, repeat_path, sample_pileup[1]))
    return pileup_list
def get_spanning_genotypes(read_align_list, reference_graph):
    '''Collect, per repeat node, the distinct copy counts seen in spanning reads.

    A read spans a node when its graph cigar moves across the node from a
    lower node id to a higher one, which clears both flank flags of that
    node. Nodes that are jumped over by such a move are also flagged, with a
    count of 0.

    Arguments:
        read_align_list: List of ReadAlignment to the locus in the sample
        reference_graph: ReferenceGraph for STR locus

    Returns:
        defaultdict mapping node_id to the ascending list of distinct counts
        (numpy floats) spanned by at least one read. Node ids that were never
        spanned are absent; looking one up yields (and stores) an empty list.
    '''
    spanning_genotypes = defaultdict(list)

    for read_alignment in read_align_list:
        graph_size = len(reference_graph)
        copies = np.zeros(graph_size)
        left_open = np.ones(graph_size)
        right_open = np.ones(graph_size)

        last_id = -1
        for cigar_node in parse_graph_cigar(read_alignment.graph_cigar):
            current_id = int(cigar_node[0])
            copies[current_id] += 1
            if last_id not in (-1, current_id):
                # Every node boundary crossed by this move is now covered.
                for crossed in range(last_id, current_id):
                    right_open[crossed] = 0
                    left_open[crossed + 1] = 0
            last_id = current_id

        for node in reference_graph:
            if not node.is_repeat:
                continue
            node_id = node.node_id
            if right_open[node_id] != 0 or left_open[node_id] != 0:
                continue
            seen = spanning_genotypes[node_id]
            if copies[node_id] not in seen:
                seen.append(copies[node_id])

    for genotypes in spanning_genotypes.values():
        genotypes.sort()
    return spanning_genotypes
def get_repeat_path(read_alignments, reference_graph):
    """Build the PathWithInsertions that the reads of every sample fit on.

    Each node of the graph is repeated as many times as the largest number of
    copies any single read has of it. The first and the last node of the graph
    are always present at least once. The node ids of the path are printed to
    stdout, and every read is then passed to the path's ``update``.

    Arguments:
        read_alignments: dict mapping sample name to list of ReadAlignments
        reference_graph: ReferenceGraph for repeat locus

    Returns:
        The PathWithInsertions, after it has been updated with every read.
    """
    # TODO: Infer repeat_graph from read_alignments if repeat_specs not provided.
    max_node_count = defaultdict(int)
    for sample_reads in read_alignments.values():
        for read_alignment in sample_reads:
            read_count = defaultdict(int)
            for cigar_node in parse_graph_cigar(read_alignment.graph_cigar):
                read_count[int(cigar_node[0])] += 1
            for node_id, count in read_count.items():
                if count > max_node_count[node_id]:
                    max_node_count[node_id] = count

    # Both ends of the graph must appear on the path even without reads.
    for boundary_node in (reference_graph[0], reference_graph[-1]):
        if max_node_count[boundary_node.node_id] == 0:
            max_node_count[boundary_node.node_id] = 1

    node_id_list = []
    for node in reference_graph:
        node_id_list.extend([node.node_id] * max_node_count[node.node_id])

    repeat_path = PathWithInsertions(node_id_list)
    path_ids = [path_node.node_id for path_node in repeat_path]
    print("Reference path = %s" % path_ids)

    for sample_reads in read_alignments.values():
        for read_alignment in sample_reads:
            repeat_path.update(read_alignment.graph_cigar,
                               read_alignment.offset)
    return repeat_path
def get_alignment_coordinates(self, query_sequence, reference_graph,
                              graph_cigar, offset=0, show_insertions=False):
    """Lay a read alignment out on this path and collect its coordinates.

    Each path node takes up the length of its reference sequence, plus its
    total insertion size when ``show_insertions`` is set, plus a one-column
    gap. The per-node work is delegated to the path node's own
    ``get_alignment_coordinates``; this method only tracks where each node
    starts and which part of the query has been consumed.

    Arguments:
        query_sequence: sequence of aligned read
        reference_graph: ReferenceGraph for locus
        graph_cigar: graph cigar string of read alignment.
        offset: 0-based position of first base in the first node in the
            alignment; applied to the first node of the cigar only
        show_insertions: Boolean to show full sequence of insertions

    Returns:
        alignment_coordinates: the per-node results concatenated in cigar
        order. The original documentation describes the entries as 3-tuples
        (coordinate, query_bp, ref_bp) with gaps represented by "-"; that
        layout is produced by the node-level method, not here.
    """
    INTERNODE_GAP = 1

    def slot_width(index):
        # Horizontal room taken by the path node at `index`, gap included.
        path_node = self[index]
        width = len(reference_graph[path_node.node_id].seq)
        if show_insertions:
            width += path_node.get_total_insertion_size()
        return width + INTERNODE_GAP

    graph_cigar_tuples = parse_graph_cigar(graph_cigar)
    path_index = self.get_alignment_start_index(graph_cigar)

    # Start coordinate of the node the alignment begins on.
    node_coordinate = 0
    for skipped_index in range(path_index):
        node_coordinate += slot_width(skipped_index)

    alignment_coordinates = []
    query_pos = 0
    for cigar_tuple_index, node_alignment in enumerate(graph_cigar_tuples):
        target_id = int(node_alignment[0])
        # Step over path nodes that this read does not visit.
        while (path_index < len(self)
               and self[path_index].node_id != target_id):
            node_coordinate += slot_width(path_index)
            path_index += 1

        cigar = node_alignment[1]
        # TODO: can we rely on graph cigar to always include bases from node
        # extremes? Then we do not need offset
        node_offset = offset if cigar_tuple_index == 0 else 0
        alignment_coordinates += self[path_index].get_alignment_coordinates(
            cigar, query_pos, query_sequence, node_coordinate, node_offset,
            reference_graph, show_insertions)

        query_pos += get_cigar_query_align_len(cigar)
        node_coordinate += slot_width(path_index)
        path_index += 1
    return alignment_coordinates
def get_genotype_classification(read_alignment, reference_graph,
                                spanning_genotypes, node_grouping_list=None):
    r'''Classify one read's repeat copy counts against the spanned genotypes.

    For every repeat node the read falls into one of three cases:
      * the node is not touched at all (count 0, both flanks uncovered):
        the genotype is -1 with status "=";
      * exactly one flank is covered, or the read lies inside the repeat
        (count > 0, no flank covered): the count is only a lower bound, so
        it is placed among the spanned genotypes with bisect. The status is
        "\leq" with the smallest spanned genotype that is >= the count, or
        ">" with the largest spanned genotype (0 if there is none);
      * both flanks are covered: the genotype is the count, status "=".

    Arguments:
        read_alignment: ReadAlignment whose ``graph_cigar`` is classified
        reference_graph: ReferenceGraph for STR locus
        spanning_genotypes: mapping from node_id to ascending list of spanned
            genotypes, e.g. the result of get_spanning_genotypes. It is
            indexed for every repeat node whose count is only a bound.
        node_grouping_list: Ordered list of node ids in decreasing order of
            priority. (Default: repeat nodes without 'ignore' in their name,
            then repeat nodes with it, then non-repeat nodes, each group in
            graph order)

    Returns:
        list of 4-tuples, one per repeat node of node_grouping_list, in order:
        (ReferenceNode, compare status ("=", ">" or "\leq"), genotype,
        label string). The label is "$gt<id> <status> <gt>$", or
        "$<lower> < gt<id> \leq <gt>$" when the status is "\leq", the
        genotype is positive and a smaller spanned genotype exists.
    '''
    # Raw string: '\l' is an invalid escape sequence in a normal literal.
    leq_status = r'\leq'

    graph_size = len(reference_graph)
    node_count = np.zeros(graph_size)
    left_flank_uncovered = np.ones(graph_size)
    right_flank_uncovered = np.ones(graph_size)

    previous_node_id = -1
    for cigar_node in parse_graph_cigar(read_alignment.graph_cigar):
        current_node_id = int(cigar_node[0])
        node_count[current_node_id] += 1
        if previous_node_id != -1 and previous_node_id != current_node_id:
            # Every node boundary crossed by this move is now covered.
            for i in range(previous_node_id, current_node_id):
                right_flank_uncovered[i] = 0
                left_flank_uncovered[i + 1] = 0
        previous_node_id = current_node_id

    if node_grouping_list is None:
        node_grouping_list = (
            [n.node_id for n in reference_graph
             if (n.is_repeat and 'ignore' not in n.node_name)] +
            [n.node_id for n in reference_graph
             if (n.is_repeat and 'ignore' in n.node_name)] +
            [n.node_id for n in reference_graph if not n.is_repeat])

    gt_class = []
    for node_id in node_grouping_list:
        graph_node = reference_graph[node_id]
        if not graph_node.is_repeat:
            continue

        repeat_id = graph_node.node_id
        count = node_count[repeat_id]
        left_open = left_flank_uncovered[repeat_id]
        right_open = right_flank_uncovered[repeat_id]

        gt_compare_status = "="
        lower_bound = None
        if (count, left_open, right_open) == (0, 1, 1):
            gt = -1
        elif (left_open != right_open
              or (count > 0 and (left_open, right_open) == (1, 1))):
            spanned = spanning_genotypes[repeat_id]
            spanning_index = bisect.bisect_left(spanned, count)
            if spanning_index >= len(spanned):
                gt_compare_status = '>'
                if len(spanned) == 0:
                    gt = 0
                else:
                    gt = spanned[-1]
            else:
                gt = spanned[spanning_index]
                gt_compare_status = leq_status
                if gt > 0 and spanning_index > 0:
                    lower_bound = spanned[spanning_index - 1]
        else:
            gt = count

        if lower_bound is None:
            gt_string = r'$gt%s %s %s$' % (repeat_id, gt_compare_status,
                                           int(gt))
        else:
            gt_string = r"$%s < gt%s \leq %s$" % (int(lower_bound),
                                                  repeat_id, int(gt))
        gt_class.append((graph_node, gt_compare_status, gt, gt_string))
    return gt_class
def update(self, graph_cigar, offset=0):
    """Update nodes with insertions in the read alignment.

    Every node cigar of the alignment is forwarded to the ``update`` method
    of the path node it aligns to. Path nodes are matched left to right,
    starting at the index given by get_alignment_start_index and skipping
    path nodes whose id differs from the cigar's.

    Arguments:
        graph_cigar: graph cigar string of read alignment.
        offset: 0-based position of first base in the first node in the
            alignment; passed on for the first node only
    """
    node_alignments = parse_graph_cigar(graph_cigar)
    path_index = self.get_alignment_start_index(graph_cigar)

    for position, node_alignment in enumerate(node_alignments):
        wanted_id = int(node_alignment[0])
        while path_index < len(self) and self[path_index].node_id != wanted_id:
            path_index += 1

        # Later nodes rely on the node-level default for the offset.
        extra_args = (offset,) if position == 0 else ()
        self[path_index].update(node_alignment[1], *extra_args)
        path_index += 1
def get_graph_cigar_seq(self, graph_cigar, offset=0):
    '''
    Get reference sequence corresponding to graph_cigar alignment

    The sequence is assembled node by node: each node named in the graph
    cigar contributes the result of its ``get_cigar_seq`` for that node's
    cigar.

    Args:
        graph_cigar: graph_cigar string
        offset: first matched position in the first node (0-based); passed
            on for the first node only
    Returns:
        reference sequence ('' when the graph cigar has no nodes)
    '''
    graph_cigar_seq = ''
    node_alignments = seq_util.parse_graph_cigar(graph_cigar)
    for position, node_alignment in enumerate(node_alignments):
        node = self[int(node_alignment[0])]
        # Later nodes rely on the node-level default for the offset.
        extra_args = (offset,) if position == 0 else ()
        graph_cigar_seq += node.get_cigar_seq(node_alignment[1], *extra_args)
    return graph_cigar_seq
def get_alignment_start_index(self, graph_cigar):
    """Find index of first node in alignment from graph_cigar string

    If the cigar starts and ends on the same node id (single node or
    in-repeat read), the read is left aligned: the first path entry with that
    id is returned. Otherwise the leading run of that id in the cigar is
    right aligned against the first run of the id on the path, so that the
    last copy in the cigar lands on the last copy of the run. If the id is
    not found on the path, the last path index is used as the end of the run.

    Arguments:
        graph_cigar: graph cigar string

    Returns:
        Integer index into this path; 0 for an empty graph cigar. The value
        is not clamped, so it is negative when the cigar's leading run is
        longer than the part of the path up to the matched run end.
    """
    cigar_nodes = parse_graph_cigar(graph_cigar)
    if len(cigar_nodes) == 0:
        return 0
    first_id = int(cigar_nodes[0][0])

    # left aligned if single node or in-repeat
    if first_id == int(cigar_nodes[-1][0]):
        for path_pos in range(len(self)):
            if self[path_pos].node_id == first_id:
                return path_pos

    # Number of consecutive copies of first_id at the start of the cigar.
    first_count = 0
    for cigar_node in cigar_nodes:
        if int(cigar_node[0]) != first_id:
            break
        first_count += 1

    # Find where the first run of first_id ends on the path. The loop always
    # stops through a break, at the latest on the final path index.
    last_pos = len(self) - 1
    current_id = self[0].node_id
    for run_end in range(len(self)):
        if run_end == last_pos:
            break
        following_id = self[run_end + 1].node_id
        if current_id == first_id and following_id != first_id:
            break
        current_id = following_id
    return run_end - first_count + 1