def get_pileup_list(all_sample_read_aligns, sample_names, reference_graph, node_grouping_list=None, show_insertions=False):
    """ Build one pileup per requested sample against a shared repeat path.

    Side effect: all_sample_read_aligns is filtered IN PLACE so that every
    sample keeps only the alignments that touch at least one repeat node.

    Args:
        all_sample_read_aligns: dict mapping sample name to the read alignments
            of that sample. Each value is iterated directly and every element
            must expose `graph_cigar`.
            NOTE(review): older documentation described a nested
            {repeat_id: [aligns]} mapping; the code does not treat it that way.
        sample_names: ordered list of sample names; fixes the output order
        reference_graph: ReferenceGraph for STR locus (indexed by node id)
        node_grouping_list: passed through unchanged to get_pileup
        show_insertions: Boolean to show full sequence of insertions
    Returns:
        List of 3-tuples, one per entry of sample_names:
            (sample_name, repeat_path, second element of get_pileup's result)
        Samples missing from all_sample_read_aligns get a pileup built from
        an empty alignment list.
    """
    # Keep only the alignments whose graph cigar visits a repeat node.
    for sample in all_sample_read_aligns:
        kept_aligns = []
        for align in all_sample_read_aligns[sample]:
            repeat_hits = [
                cigar_node
                for cigar_node in parse_graph_cigar(align.graph_cigar)
                if reference_graph[int(cigar_node[0])].is_repeat
            ]
            if repeat_hits:
                kept_aligns.append(align)
        all_sample_read_aligns[sample] = kept_aligns

    # The path is shared by all samples so their pileups line up.
    repeat_path = get_repeat_path(all_sample_read_aligns, reference_graph)

    pileup_list = []
    for sample_name in sample_names:
        sample_aligns = all_sample_read_aligns[sample_name] if sample_name in all_sample_read_aligns else []
        sample_pileup = get_pileup(sample_aligns, reference_graph, repeat_path, node_grouping_list, show_insertions=show_insertions)
        pileup_list.append((sample_name, repeat_path, sample_pileup[1]))
    return pileup_list
def get_spanning_genotypes(read_align_list, reference_graph):
    '''Collect, per repeat node, the distinct repeat counts seen in reads that span the node.

    A read spans a node when its graph cigar moves into the node from an
    earlier node and out of it to a later node, i.e. both flank boundaries of
    the node are crossed.

    Arguments:
        read_align_list: List of ReadAlignment to the locus in the sample
        reference_graph: ReferenceGraph for STR locus
    Returns:
        defaultdict mapping node_id to an ascending list of spanned counts.
        Counts are numpy floats because they are read out of a numpy array.
    '''
    spanned_counts = defaultdict(list)
    for alignment in read_align_list:
        graph_size = len(reference_graph)
        visits = np.zeros(graph_size)
        # 1 means the boundary on that side has not been crossed by this read.
        left_open = np.ones(graph_size)
        right_open = np.ones(graph_size)
        last_id = -1
        for cigar_node in parse_graph_cigar(alignment.graph_cigar):
            current_id = int(cigar_node[0])
            visits[current_id] += 1
            if last_id not in (-1, current_id):
                # Moving from last_id to current_id crosses every boundary in between.
                for idx in range(last_id, current_id):
                    right_open[idx] = 0
                    left_open[idx + 1] = 0
            last_id = current_id
        for ref_node in reference_graph:
            if not ref_node.is_repeat:
                continue
            nid = ref_node.node_id
            if right_open[nid] != 0 or left_open[nid] != 0:
                continue
            seen = spanned_counts[nid]
            if visits[nid] not in seen:
                seen.append(visits[nid])
    for counts in spanned_counts.values():
        counts.sort()
    return spanned_counts
def get_repeat_path(read_alignments, reference_graph):
    """Build a PathWithInsertions that repeats each node as often as the most any single read visits it.

    The first and last nodes of reference_graph are always included at least
    once. After construction the path is updated with every read alignment.
    The node ids of the resulting path are printed to stdout.

    Arguments:
        read_alignments: dict mapping sample name to list of ReadAlignments
        reference_graph: ReferenceGraph for repeat locus
    Returns:
        the populated PathWithInsertions
    """
    #TODO: Infer repeat_graph from read_alignments if repeat_specs not provided.
    peak_visits = defaultdict(int)
    for sample_aligns in read_alignments.values():
        for alignment in sample_aligns:
            visits = defaultdict(int)
            for cigar_node in parse_graph_cigar(alignment.graph_cigar):
                visits[int(cigar_node[0])] += 1
            for node_id, count in visits.items():
                if count > peak_visits[node_id]:
                    peak_visits[node_id] = count

    # The two terminal nodes always appear in the path.
    for terminal in (reference_graph[0], reference_graph[-1]):
        if peak_visits[terminal.node_id] == 0:
            peak_visits[terminal.node_id] = 1

    node_id_list = []
    for ref_node in reference_graph:
        node_id_list.extend([ref_node.node_id] * peak_visits[ref_node.node_id])

    repeat_path = PathWithInsertions(node_id_list)
    print ("Reference path = %s" % [n.node_id for n in repeat_path])
    for sample_aligns in read_alignments.values():
        for alignment in sample_aligns:
            repeat_path.update(alignment.graph_cigar, alignment.offset)
    return repeat_path
    def get_alignment_coordinates(self,
                                  query_sequence,
                                  reference_graph,
                                  graph_cigar,
                                  offset=0,
                                  show_insertions=False):
        """Lay one read alignment out along this path and return its coordinates.

        Every path node occupies the length of its reference sequence, plus
        its total insertion size when show_insertions is set, followed by a
        one-column gap before the next node.

        Arguments:
            query_sequence: sequence of aligned read
            reference_graph: ReferenceGraph for locus
            graph_cigar: graph cigar string of read alignment.
            offset: 0-based position of first base in the first node in the alignment
            show_insertions: Boolean to show full sequence of insertions
        Returns:
            alignment_coordinates: the concatenated per-node results of each
                path node's get_alignment_coordinates. Per the earlier
                documentation each entry is a 3-tuple
                (coordinate, query_bp, ref_bp) with gaps shown as "-";
                that shape is produced by the node class, not by this method.
        """
        INTERNODE_GAP = 1

        def occupied_width(index):
            # Columns taken by path node `index`, including the trailing gap.
            width = len(reference_graph[self[index].node_id].seq)
            if show_insertions:
                width += self[index].get_total_insertion_size()
            return width + INTERNODE_GAP

        coordinates = []
        cigar_tuples = parse_graph_cigar(graph_cigar)
        path_index = self.get_alignment_start_index(graph_cigar)
        query_pos = 0

        # Skip over all path nodes that precede the alignment start.
        node_coordinate = 0
        for skipped in range(path_index):
            node_coordinate += occupied_width(skipped)

        for tuple_index, node_alignment in enumerate(cigar_tuples):
            target_id = int(node_alignment[0])
            # Advance along the path until it reaches the node this cigar entry refers to.
            while path_index < len(self) and self[path_index].node_id != target_id:
                node_coordinate += occupied_width(path_index)
                path_index += 1
            cigar = node_alignment[1]
            #TODO: can we rely on graph cigar to always include bases from node extremes? Then we do not need offset
            # Only the first aligned node starts part-way through its sequence.
            node_offset = offset if tuple_index == 0 else 0
            coordinates += self[path_index].get_alignment_coordinates(
                cigar, query_pos, query_sequence, node_coordinate, node_offset,
                reference_graph, show_insertions)
            query_pos += get_cigar_query_align_len(cigar)
            node_coordinate += occupied_width(path_index)
            path_index += 1

        return coordinates
def get_genotype_classification(read_alignment, reference_graph, spanning_genotypes, node_grouping_list=None):
    r'''Classify one read alignment against the spanned genotypes of every repeat node.

    For each repeat node the read is compared with the sorted counts in
    spanning_genotypes:
        * read does not touch the node and crosses neither boundary: gt = -1, status "="
        * read crosses both boundaries (spans the node): gt = its own count, status "="
        * read covers the node only partially (one boundary, or inside the repeat):
          gt is the smallest spanned count >= the read's count with status "\leq",
          or, when no such count exists, the largest spanned count (0 if there
          are none) with status ">".

    Arguments:
        read_alignment: ReadAlignment to classify (its graph_cigar is parsed)
        reference_graph: ReferenceGraph for STR locus, indexable by node id
        spanning_genotypes: mapping from node_id to ascending list of spanned
            counts, in the shape returned by get_spanning_genotypes
        node_grouping_list: Ordered list of node ids in decreasing order of priority.
            (Default: repeat nodes without 'ignore' in their name, then repeat
            nodes with it, then non-repeat nodes; non-repeat nodes are skipped.)
    Returns:
        list of 4-tuples, one per repeat node in node_grouping_list:
            (ReferenceNode, CompareStatus ("=", ">" or "\leq"), genotype value, genotype LaTeX string)
    '''
    graph_size = len(reference_graph)
    node_count = np.zeros(graph_size)
    # 1 means the boundary on that side of the node was not crossed by the read.
    left_flank_uncovered = np.ones(graph_size)
    right_flank_uncovered = np.ones(graph_size)
    previous_node_id = -1
    for cigar_node in parse_graph_cigar(read_alignment.graph_cigar):
        current_node_id = int(cigar_node[0])
        node_count[current_node_id] += 1
        if previous_node_id != -1 and previous_node_id != current_node_id:
            # Moving between two nodes crosses every boundary in between.
            for i in range(previous_node_id, current_node_id):
                right_flank_uncovered[i] = 0
                left_flank_uncovered[i + 1] = 0
        previous_node_id = current_node_id

    if node_grouping_list is None:
        node_grouping_list = ([n.node_id for n in reference_graph if (n.is_repeat and 'ignore' not in n.node_name)]
                              + [n.node_id for n in reference_graph if (n.is_repeat and 'ignore' in n.node_name)]
                              + [n.node_id for n in reference_graph if not n.is_repeat])

    gt_class = []
    for node_id in node_grouping_list:
        graph_node = reference_graph[node_id]
        if not graph_node.is_repeat:
            continue
        repeat_id = graph_node.node_id
        count = node_count[repeat_id]
        left_open = left_flank_uncovered[repeat_id]
        right_open = right_flank_uncovered[repeat_id]

        gt_compare_status = "="
        # Largest spanned count strictly below the read's count; only set for "\leq".
        lower_bound = None
        if (count, left_open, right_open) == (0, 1, 1):
            gt = -1
        elif left_open != right_open or (count > 0 and (left_open, right_open) == (1, 1)):
            # spanning_genotypes is only consulted for partially covered nodes,
            # so a mapping without this key is left untouched otherwise.
            spanned = spanning_genotypes[repeat_id]
            spanning_index = bisect.bisect_left(spanned, count)
            if spanning_index >= len(spanned):
                gt_compare_status = '>'
                gt = spanned[-1] if len(spanned) > 0 else 0
            else:
                gt_compare_status = r'\leq'
                gt = spanned[spanning_index]
                if spanning_index > 0:
                    lower_bound = spanned[spanning_index - 1]
        else:
            gt = count

        if gt > 0 and lower_bound is not None:
            gt_string = r"$%s < gt%s \leq %s$" % (int(lower_bound), repeat_id, int(gt))
        else:
            gt_string = r'$gt%s %s %s$' % (repeat_id, gt_compare_status, int(gt))
        gt_class.append((graph_node, gt_compare_status, gt, gt_string))
    return gt_class
 def update(self, graph_cigar, offset=0):
     """Feed each node-level cigar of a read alignment to the matching path node.

     Starting at the alignment's start index, the path is walked forward to
     the next node whose id equals the cigar entry's node id, and that node's
     own update() is called with the entry's cigar.

     Arguments:
         graph_cigar: graph cigar string of read alignment.
         offset: 0-based position of first base in the first node in the alignment
             (only passed on for the first cigar entry)
     """
     cigar_tuples = parse_graph_cigar(graph_cigar)
     path_index = self.get_alignment_start_index(graph_cigar)
     for tuple_index, node_alignment in enumerate(cigar_tuples):
         target_id = int(node_alignment[0])
         while path_index < len(self) and self[path_index].node_id != target_id:
             path_index += 1
         path_node = self[path_index]
         if tuple_index:
             path_node.update(node_alignment[1])
         else:
             path_node.update(node_alignment[1], offset)
         path_index += 1
 # Example no. 7
 # 0
 def get_graph_cigar_seq(self, graph_cigar, offset=0):
     '''
     Concatenate the reference sequence covered by each node of a graph_cigar alignment.

     Each cigar entry's node is looked up in self by its integer node id and
     asked for the sequence matching that entry's cigar.
     Args:
         graph_cigar: graph_cigar string
         offset: first matched position in the first node (0-based);
             only applied to the first cigar entry
     Returns:
         reference sequence ('' when the graph_cigar has no entries)
     '''
     pieces = []
     for position, node_alignment in enumerate(seq_util.parse_graph_cigar(graph_cigar)):
         graph_node = self[int(node_alignment[0])]
         node_cigar = node_alignment[1]
         if position == 0:
             pieces.append(graph_node.get_cigar_seq(node_cigar, offset))
         else:
             pieces.append(graph_node.get_cigar_seq(node_cigar))
     return ''.join(pieces)
    def get_alignment_start_index(self, graph_cigar):
        """Find the index in this path of the first node of an alignment.

        Arguments:
            graph_cigar: graph cigar string
        Returns:
            Integer index into this path:
              * 0 when the graph cigar has no entries;
              * when the first and last cigar entries name the same node
                (single node or in-repeat), the first path index holding that
                node, i.e. the alignment is left-aligned;
              * otherwise the alignment is right-aligned against the first run
                of that node in the path: (index where the run ends)
                - (length of the leading run in the cigar) + 1. If no run end
                is found, the last path index is used as the run end.
        """
        cigar_tuples = parse_graph_cigar(graph_cigar)
        if len(cigar_tuples) == 0:
            return 0
        first_id = int(cigar_tuples[0][0])

        # left aligned if single node or in-repeat
        if first_id == int(cigar_tuples[-1][0]):
            for index in range(len(self)):
                if self[index].node_id == first_id:
                    return index

        # Number of consecutive cigar entries at the start that stay on first_id.
        first_count = 0
        for cigar_tuple in cigar_tuples:
            if int(cigar_tuple[0]) != first_id:
                break
            first_count += 1

        last_index = len(self) - 1
        current_id = self[0].node_id
        for index in range(len(self)):
            if index == last_index:
                return index - first_count + 1
            next_id = self[index + 1].node_id
            # The run of first_id ends where the following node differs.
            if current_id == first_id and next_id != first_id:
                return index - first_count + 1
            current_id = next_id
        # Only reached if the loop body never ran (zero-length path).
        return len(self) - first_count + 1