コード例 #1
0
ファイル: tandomicity.x.py プロジェクト: adamnovak/gwhatshap
with stream.open(str(trans_filename), "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        if l.snarl.start.backward == True:
            start_node = l.snarl.end.node_id
        else:
            start_node = l.snarl.start.node_id
        bubbles_start.add(start_node)

multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
#with stream.open('../out.new.gam', "rb") as istream:
with stream.open(str(gam_filename), "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        tmp = []
        for i in range(0, len(g.path.mapping)):
            node = g.path.mapping[i].position.node_id
            if node in bubbles_start:
                multiplicity_bubbles[node].append(g.name)
            if node in tmp:
                out_file.write(str(node) + '\n')
            tmp.append(node)

with stream.open(str(true_haps_filename), "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        tmp = []
コード例 #2
0
        'T': 'A',
        'a': 'T',
        'c': 'G',
        'g': 'C',
        't': 'A'
    }
    return "".join([seq_dict[base] for base in reversed(seq)])


nodes_list = set()
dummy_list = ['0'] * 100
orderalignment = defaultdict(list)
orderalignment = defaultdict(lambda: [-1] * 100, orderalignment)
with stream.open(str(file_input), "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        #read_info = g.name
        canu_name = g.name

        canu_chunk_num = int(g.query_position)
        orderalignment[canu_name].insert(canu_chunk_num, g)

new_orderalignment = defaultdict(list)
for k, v in orderalignment.items():
    new_orderalignment[k] = [x for x in v if x != -1]

print('hello')
ostream = stream.open(sys.argv[2], 'wb')

for k, v in new_orderalignment.items():
コード例 #3
0
ファイル: aux_contigs_new.py プロジェクト: sarangian/WHdenovo
def vg_reader(locus_file, gam_file):
    """
	input: sorted locus and sorted GAM file output from vg.
	output: sorted readset for core DP.
	assumptions: 
	1. locus file consists of linear ordering of simple bubbles only and hence sorted. Each locus file does not contain start and end vertex.
	2. paths in the locus should be covered by atleast one pacbio read.
	2. GAM file is sorted and restricted to locus file.
	3. files consists of all DAG connected components.
	4. add variant only when it identifies the branch uniquely.
	"""

    locus_count = 0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = OrderedDict()
    locus_count = 0
    prev_startsnarl = 0
    prev_startsnarl_orientation = -1
    prev_endsnarl = 0
    prev_endsnarl_orientation = -1
    reads_dict = defaultdict(list)
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            #TODO: make ordered doctionary locus_branch_mapping
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            path_in_bubble = []

            if len(l.visits) == 0:
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i + 1].node_id,
                                   l.visits[i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id,
                                   l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))

            if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                per_locus.append(path_in_bubble)
            else:
                locus_count = locus_count + 1
                per_locus = []
                per_locus.append(path_in_bubble)
            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation
            locus_branch_mapping[locus_count] = per_locus

    print('The number of hets:')
    het_count = 0
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:
            het_count = het_count + 1
    print(het_count)
    # keep branch of paths in each bubble.
    alleles_per_pos = defaultdict()
    for k, v in locus_branch_mapping.items():
        alleles_per_pos[k] = len(v)

    # both simple and complex bubbles: key is the values in locus_branch_mapping and value is triplet(locus, branch, alleles)
    reverse_mapping = defaultdict(list)
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:  # more than one branch
            for i, b in enumerate(v):
                if len(b) > 0:
                    for p, j in enumerate(b):
                        reverse_mapping[j].append(
                            [k, i, len(v)]
                        )  # in complex bubbles, a node can map to multiple branches.

    count = 0
    duplicated = 0
    #TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            val1 = True
            val2 = False

            count1 = 0
            count2 = 0
            score = g.score / len(g.sequence)

            #if score > 0.2:
            #       continue
            read = []  # create read for each read alignment

            prev_tmp = []
            prev_locus = -1
            locus = -1

            for i in range(0, len(g.path.mapping) - 1):
                #for i in g.path.mapping: # go over the mapping in a read
                # TODO: check for forward or reverse strand, we may not need it for DAG.
                edge1 = tuple((int(g.path.mapping[i].position.name),
                               int(g.path.mapping[i + 1].position.name)
                               ))  # go over nodes in a mapping
                edge2 = tuple((int(g.path.mapping[i + 1].position.name),
                               int(g.path.mapping[i].position.name)
                               ))  # go over nodes in a mapping
                if edge1 in reverse_mapping or edge2 in reverse_mapping:  # handle start and sink node.
                    if edge1 in reverse_mapping:
                        qualities = [10] * reverse_mapping[edge1][0][2]
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge1]
                        ]  # consider (locus, branch)
                    else:
                        qualities = [10] * reverse_mapping[edge2][0][2]
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge2]
                        ]
                    tmp = [x for x in node_inf]
                    if prev_locus != tmp[0][0]:
                        prev_tmp = tmp
                        prev_locus = tmp[0][0]

                    interset_tmp = list(set(tmp).intersection(set(prev_tmp)))
                    if len(prev_tmp) > 0 and len(
                            set(tmp).intersection(set(prev_tmp))
                    ) == 1:  # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                        qualities[interset_tmp[0][1]] = 0
                        if i == len(g.path.mapping) - 2:
                            #read.add_variant(interset_tmp[0][0], interset_tmp[0][1], qualities)
                            reads_dict[g.name + "_" +
                                       str(g.query_position)].append(
                                           interset_tmp[0][0])
                            read.append(interset_tmp[0][0])
                        else:
                            next_edge1 = tuple(
                                (int(g.path.mapping[i + 1].position.name),
                                 int(g.path.mapping[i + 2].position.name)))
                            next_edge2 = tuple(
                                (int(g.path.mapping[i + 2].position.name),
                                 int(g.path.mapping[i + 1].position.name)))

                            if next_edge1 not in reverse_mapping and next_edge2 not in reverse_mapping:
                                #read.add_variant(interset_tmp[0][0], interset_tmp[0][1], qualities)
                                reads_dict[g.name + "_" +
                                           str(g.query_position)].append(
                                               interset_tmp[0][0])
                                read.append(interset_tmp[0][0])
                        locus = interset_tmp[0][0]
                else:
                    read.append(int(g.path.mapping[i].position.name))
                    read.append(int(g.path.mapping[i + 1].position.name))
                    reads_dict[g.name + "_" + str(g.query_position)].append(
                        int(g.path.mapping[i].position.name))
                    reads_dict[g.name + "_" + str(g.query_position)].append(
                        int(g.path.mapping[i + 1].position.name))

            # for every pair of bubbles or bubble-node
            for k in range(0, len(read) - 1):
                pair1 = str(read[k]) + "_" + str(
                    read[k + 1])  # not taking care of reverse direction now
                pair2 = str(read[k + 1]) + "_" + str(read[k])
                # should take of direction, not adding pairs reverse of each other
                if pair2 in consec_pairs:
                    consec_pairs[pair2].add(g.name)
                else:
                    consec_pairs[pair1].add(g.name)

    return reads_dict, consec_pairs