Exemple #1
0
    def run(self):
        """Worker loop: take replay paths from the queue and process each one.

        A controller is started through ``self.run_config.start()`` and reused
        for up to ``FLAGS.batch_size`` replays; afterwards (or after a failed
        replay) the controller context is left and a new one is started.

        For every replay the matching sampled-action JSON file is loaded, and
        ``self.process_replay`` is called once per player listed in the replay
        info, with an output stream opened on that player's observation path.
        Players whose observation and global-info files both already exist
        are skipped.

        The method returns (ending the worker) when the queue raises
        ``Queue.Empty`` or when the sampled-action file of a replay is
        missing. ``task_done()`` is called for every replay taken from the
        queue, whatever the outcome.
        """
        signal.signal(signal.SIGTERM, lambda a, b: sys.exit())  # Exit quietly.
        while True:
            with self.run_config.start() as controller:
                for _ in range(FLAGS.batch_size):
                    try:
                        # NOTE(review): a blocking get() without a timeout does
                        # not normally raise Empty -- confirm how the worker is
                        # expected to terminate.
                        replay_path = self.replay_queue.get()
                    except Queue.Empty:
                        return
                    # Stream and files of the player currently being written.
                    # They are reset for every replay (and after every
                    # successfully finished player) so that the error handler
                    # never closes a stale stream or deletes finished outputs.
                    ostream = None
                    partial_outputs = ()
                    try:
                        with self.counter.get_lock():
                            self.counter.value += 1
                            print('Processing {}/{} ...'.format(self.counter.value, self.total_num))

                        sampled_action_path = os.path.join(FLAGS.save_path.replace(
                            'SampledObservations', 'SampledActions'), os.path.basename(replay_path))
                        if not os.path.isfile(sampled_action_path):
                            # NOTE(review): this ends the whole worker, not just
                            # this replay -- confirm that is intended.
                            return

                        with open(sampled_action_path) as f:
                            actions = json.load(f)
                        actions.insert(0, 0)  # prepend frame 0 to the loaded list

                        replay_data = self.run_config.replay_data(replay_path)
                        info = controller.replay_info(replay_data)
                        map_data = None
                        if info.local_map_path:
                            map_data = self.run_config.map_data(info.local_map_path)

                        for player_info in info.player_info:
                            race = sc_common.Race.Name(player_info.player_info.race_actual)
                            player_id = player_info.player_info.player_id

                            observation_path = os.path.join(FLAGS.save_path, race,
                                                            '{}@{}'.format(player_id, os.path.basename(replay_path)))
                            global_info_path = observation_path.replace('SampledObservations', 'GlobalInfos')

                            if os.path.isfile(observation_path) and os.path.isfile(global_info_path):
                                continue

                            partial_outputs = (observation_path, global_info_path)
                            ostream = stream.open(observation_path, 'wb', buffer_size=1000)
                            self.process_replay(controller, replay_data, map_data, player_id, actions,
                                                ostream, global_info_path)
                            ostream.close()
                            # This player is complete: nothing left to roll back.
                            ostream = None
                            partial_outputs = ()
                    except Exception as e:
                        # Best-effort rollback of the half-written player only.
                        if ostream is not None:
                            try:
                                ostream.close()
                            except Exception:
                                pass
                        for partial_path in partial_outputs:
                            try:
                                if os.path.isfile(partial_path):
                                    os.remove(partial_path)
                            except OSError:
                                pass

                        print(e)
                        # Leave the batch loop; the outer loop starts a new controller.
                        break
                    finally:
                        self.replay_queue.task_done()
def close_old_and_begin_new_stream(protobuf_out_stream, RASPI_IMAGE_GIT_HASH):
    """Close the current output stream, if any, and open a new one.

    The new file name is ``PROTOBUF_DATA_FOLDER`` followed by the current UTC
    time formatted as ``%Y-%m-%dT%H_%M_%S``, the literal ``GH``, the result of
    ``hex(RASPI_IMAGE_GIT_HASH)`` and the suffix ``.proto.gz``. The file is
    opened with mode ``'ab'``.

    Args:
        protobuf_out_stream: Stream currently in use, or ``None`` when no
            stream has been opened yet.
        RASPI_IMAGE_GIT_HASH: Value passed to ``hex()``, so it must be an
            integer.

    Returns:
        The newly opened stream.
    """
    # Identity test: ``!= None`` would go through a custom ``__ne__`` if the
    # stream class defined one.
    if protobuf_out_stream is not None:
        protobuf_out_stream.close()
    return stream.open(
        PROTOBUF_DATA_FOLDER +
        datetime.utcnow().strftime('%Y-%m-%dT%H_%M_%S') + 'GH' +
        hex(RASPI_IMAGE_GIT_HASH) + ".proto.gz", 'ab')
Exemple #3
0
def build_reads_dict(nodes, gam_file_path, min_cutoff):
    """Collect, per read name, the node list produced from its alignments.

    Every message of the GAM stream is parsed into an ``Alignment``.
    Alignments whose sequence is shorter than ``min_cutoff`` are ignored. The
    remaining ones are turned into a ``Mapping`` and added to the
    ``ReadMappings`` accumulator of the current read. When an alignment with a
    name not seen before arrives, the accumulator of the previous read is
    stored as ``list_nodes()`` and a new accumulator is started.

    An alignment whose name was already seen is added to the *current*
    accumulator without comparing names, so the input is presumably grouped
    by read name -- TODO confirm against the producer of the GAM file.

    Args:
        nodes: Passed through to ``Mapping.fill_mapping``.
        gam_file_path: Path of the GAM file; converted with ``str()``.
        min_cutoff: Minimum ``len(align.sequence)`` for an alignment to be used.

    Returns:
        dict mapping read name to the result of ``ReadMappings.list_nodes()``.
        Empty when no alignment passes the cutoff.
    """
    all_reads = dict()
    # Accumulator of the read currently being parsed; None until the first
    # alignment that passes the cutoff.
    read_mappings = None

    # reading gam file
    with stream.open(str(gam_file_path), "rb") as in_stream:
        for counter, data in enumerate(in_stream, start=1):
            if (counter % 10000000) == 0:
                logging.info("{} mappings processed".format(counter))

            align = Alignment()
            align.ParseFromString(data)

            # skipping alignments with less than minimum cutoff
            if len(align.sequence) < min_cutoff:
                continue

            # mapping now has the nodes and the length
            mapping = Mapping()
            mapping.fill_mapping(nodes, align, len(align.sequence))

            if align.name not in all_reads:
                # New read: store the finished previous read, then start a
                # fresh accumulator. The empty list only marks the name as
                # seen until the accumulator is flushed.
                if read_mappings is not None:
                    all_reads[read_mappings.name] = read_mappings.list_nodes()
                all_reads[align.name] = []
                read_mappings = ReadMappings(name=align.name)

            # add_mapping decides whether this mapping starts a new chain or
            # replaces a shorter one already seen for this read.
            read_mappings.add_mapping(mapping)

    # Flush the last read; previously its entry was left as the empty
    # placeholder because nothing followed it to trigger the store.
    if read_mappings is not None:
        all_reads[read_mappings.name] = read_mappings.list_nodes()

    return all_reads
Exemple #4
0
 def load_stream(self, stream_path):
     """Decode every ``Trial`` message stored in ``stream_path``.

     Returns a list with one ``(self.to_vec(trial), trial.damage_class)``
     tuple per message, in stream order. A message without the
     ``damage_class`` field trips the assertion.
     """

     def decode(raw):
         # One serialized message -> (feature vector, label) pair.
         trial = Trial()
         trial.ParseFromString(raw)
         assert trial.HasField("damage_class")
         return self.to_vec(trial), trial.damage_class

     with stream.open(stream_path, 'rb') as records:
         return [decode(raw) for raw in records]
def vg_graph_reader(inp):
    """Build a ``Graph`` from the ``vg_pb2.Graph`` messages stored in ``inp``.

    For each message a new ``Graph`` sized by the number of nodes in that
    message is created and every edge is added with ``addEdge(from, to)``.
    Because the graph is recreated per message, the returned object reflects
    the last message only.
    NOTE(review): confirm that inputs never span several messages, or that
    keeping only the last one is intended.

    Args:
        inp: Path of the serialized graph file; converted with ``str()``.

    Returns:
        The ``Graph`` built from the last message, or ``None`` when the
        stream contains no message (previously an UnboundLocalError).
    """
    g = None
    with stream.open(str(inp), "rb") as istream:
        for data in istream:
            l = vg_pb2.Graph()
            l.ParseFromString(data)
            g = Graph(len(l.node))
            for edge in l.edge:
                # "from" is a Python keyword, hence getattr.
                g.addEdge(getattr(edge, "from"), edge.to)

    return g
Exemple #6
0
def vg_graph_reader(vg_file):
    """Read node sequences and edge targets from a serialized vg graph file.

    All ``vg_pb2.Graph`` messages in the stream are merged into two mappings.

    Returns:
        A pair ``(node_seq_list, edge_connections)``: the first maps node id
        to its sequence, the second maps the ``from`` id of each edge to the
        list of its ``to`` ids, in stream order.
    """
    sequences = defaultdict()  # no default factory: missing keys still raise
    successors = defaultdict(list)
    with stream.open(str(vg_file), "rb") as chunks:
        for raw in chunks:
            graph = vg_pb2.Graph()
            graph.ParseFromString(raw)
            for node in graph.node:
                sequences[node.id] = node.sequence
            for edge in graph.edge:
                # "from" is a Python keyword, hence getattr.
                successors[getattr(edge, "from")].append(edge.to)
    return sequences, successors
Exemple #7
0
def main():
    """Schedule a reader for every MIDI device, start the HTTP server, run.

    Every entry of ``devices_dir`` whose name starts with ``midi`` is opened
    and handed to ``read_midi`` through ``schedule``. If there is no such
    entry, a message is printed and the process exits with status 1.
    Otherwise an HTTP server on port 9898 is scheduled and the scheduler is
    run.
    """
    midi_names = [
        name for name in os.listdir(devices_dir) if name.startswith('midi')
    ]
    for name in midi_names:
        schedule(read_midi, stream.open(os.path.join(devices_dir, name)))

    if not midi_names:
        print("no midi devices found")
        sys.exit(1)

    # Both logger arguments are stdout.
    log = Logger(sys.stdout, sys.stdout)
    schedule(http.Server(http_application, log, port=9898).run)

    schedule.run()
Exemple #8
0
    def run(self):
        """Worker loop: take replay paths from the queue and process each one.

        A controller is started through ``self.run_config.start()`` and reused
        for up to ``FLAGS.batch_size`` replays; afterwards (or after a failed
        replay) the controller context is left and a new one is started.

        For every replay the matching sampled-action JSON file is loaded, and
        ``self.process_replay`` is called once per player listed in the replay
        info, with an output stream opened on that player's observation path.
        Players whose observation and global-info files both already exist
        are skipped.

        The method returns (ending the worker) when the queue raises
        ``Queue.Empty`` or when the sampled-action file of a replay is
        missing. ``task_done()`` is called for every replay taken from the
        queue, whatever the outcome.
        """
        signal.signal(
            signal.SIGTERM,
            lambda a, b: sys.exit())  # Kill thread upon termination signal
        while True:
            with self.run_config.start() as controller:
                for _ in range(FLAGS.batch_size):
                    try:
                        # NOTE(review): a blocking get() without a timeout does
                        # not normally raise Empty -- confirm how the worker is
                        # expected to terminate.
                        replay_path = self.replay_queue.get()
                    except Queue.Empty:
                        return
                    # Stream and files of the player currently being written.
                    # They are reset for every replay (and after every
                    # successfully finished player) so that the error handler
                    # never closes a stale stream or deletes finished outputs.
                    ostream = None
                    partial_outputs = ()
                    try:
                        with self.counter.get_lock():
                            self.counter.value += 1
                            print('Processing {}/{} ...'.format(
                                self.counter.value, self.total_num))

                        sampled_action_path = os.path.join(
                            FLAGS.save_path.replace('SampledObservations',
                                                    'SampledActions'),
                            os.path.basename(replay_path))
                        if not os.path.isfile(sampled_action_path):
                            # Unable to find the sampled actions of the replay.
                            # NOTE(review): this ends the whole worker, not just
                            # this replay -- confirm that is intended.
                            print('Unable to locate', sampled_action_path)
                            return

                        # Get all macro action frames
                        with open(sampled_action_path) as f:
                            actions = json.load(f)
                        actions.insert(0, 0)  # Add 0th frame to the start

                        replay_data = self.run_config.replay_data(replay_path)
                        info = controller.replay_info(replay_data)
                        map_data = None
                        if info.local_map_path:  # Special handling for custom maps
                            map_data = self.run_config.map_data(
                                info.local_map_path)

                        # Parse replay from each player's point of view
                        for player_info in info.player_info:
                            race = common_pb.Race.Name(
                                player_info.player_info.race_actual)
                            player_id = player_info.player_info.player_id

                            observation_path = os.path.join(
                                FLAGS.save_path, race,
                                '{}@{}'.format(player_id,
                                               os.path.basename(replay_path)))
                            global_info_path = observation_path.replace(
                                'SampledObservations', 'GlobalInfos')

                            # Skip replay if it has already been processed
                            if os.path.isfile(
                                    observation_path) and os.path.isfile(
                                        global_info_path):
                                continue

                            partial_outputs = (observation_path,
                                               global_info_path)
                            ostream = stream.open(observation_path,
                                                  'wb',
                                                  buffer_size=1000)
                            self.process_replay(controller, replay_data,
                                                map_data, player_id, actions,
                                                ostream, global_info_path)
                            ostream.close()
                            # This player is complete: nothing left to roll back.
                            ostream = None
                            partial_outputs = ()
                    except Exception as e:
                        # Best-effort rollback of the half-written player only.
                        if ostream is not None:
                            try:
                                ostream.close()
                            except Exception:
                                pass
                        for partial_path in partial_outputs:
                            try:
                                if os.path.isfile(partial_path):
                                    os.remove(partial_path)
                            except OSError:
                                pass

                        print(e)
                        # Leave the batch loop; the outer loop starts a new controller.
                        break
                    finally:
                        self.replay_queue.task_done()
Exemple #9
0
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple
from collections import defaultdict
# assumption ... all S' and before L's
#filename = sys.argv[1]
#out = sys.argv[2]

# Module-level state; `d` and `count` are not used in the statements below.
d = {}
count = 1

# Collect one node id per snarl traversal found in the .trans file.
bubbles_start = set()
#with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans' ,"rb") as istream:
with stream.open('chrXIII.filtered.ordered.trans', "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        # When the snarl's start visit is flagged backward, the end node id
        # is recorded instead of the start node id.
        if l.snarl.start.backward == True:
            start_node = l.snarl.end.node_id
        else:
            start_node = l.snarl.start.node_id
        bubbles_start.add(start_node)

multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
with stream.open('../out.new.gam', "rb") as istream:
    #with stream.open('true_haps.chrXIII.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
Exemple #10
0
#@jit
def to_cigar(edit, extended):
    """Convert one edit into an ``(operation code, length)`` pair.

    The codes are presumably the numeric BAM CIGAR operations (0 = M,
    1 = I, 2 = D, 7 = "=", 8 = X) -- confirm against the consumer.

    Args:
        edit: Object exposing ``from_length``, ``to_length`` and ``sequence``.
        extended: When truthy, equal-length edits are reported as 7 (empty
            ``sequence``) or 8 (non-empty ``sequence``) instead of 0.

    Returns:
        ``(code, length)``; ``None`` only if the lengths differ and both
        are falsy.
    """
    ref_len = edit.from_length
    if ref_len != edit.to_length:
        if ref_len:
            return (2, ref_len)  # DEL
        if edit.to_length:
            return (1, edit.to_length)  # INS
        return None
    if not extended:
        return (0, ref_len)
    return (8, ref_len) if edit.sequence else (7, ref_len)


with stream.open(gamfile, 'rb') as istream:
    with pysam.AlignmentFile(bamfile, "wb", template=samheader,
                             threads=4) as outf:
        for data in istream:
            m = vg_pb2.Alignment()
            m.ParseFromString(data)
            read_pos = 0
            logger.debug(m)
            for s in m.path.mapping:
                cigar = tuple([to_cigar(i, False) for i in s.edit])
                read_end = read_pos + consume_cigar(cigar)
                a = pysam.AlignedSegment(header=samheader.header)
                a.query_sequence = m.sequence[
                    read_pos:
                    read_end]  # e.g. "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
                if s.position.is_reverse:
Exemple #11
0
import sys
import stream
import logging
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple
from collections import defaultdict
import networkx as nx

# bubble_nodes: node ids visited inside any snarl traversal.
# bubble_start: maps one boundary node id of each snarl to the other one
#               (defaultdict without a factory, so missing keys still raise).
# bubble_end:   not filled in the statements below.
bubble_nodes = set()
bubble_start = defaultdict()
bubble_end = set()
with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans',
                 "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        start = l.snarl.start.node_id
        end = l.snarl.end.node_id
        # A backward start visit swaps the roles: the end node becomes the key.
        if l.snarl.start.backward == True:
            bubble_start[end] = start
        else:
            bubble_start[start] = end

        # Record every node visited by this traversal.
        for i in range(0, len(l.visits)):
            bubble_nodes.add(l.visits[i].node_id)

# Number of distinct node ids seen inside traversals.
print(len(bubble_nodes))
Exemple #12
0
def vg_reader(locus_file, gam_file, sample):
    """
    Build a filtered, sorted ReadSet from a vg locus file and a GAM file.

    input: sorted locus and sorted GAM file output from vg.
    output: sorted readset for core DP.

    assumptions (as stated by the original author):
    1. locus file consists of a linear ordering of simple bubbles only and
       hence is sorted. Each locus file does not contain start and end vertex.
    2. paths in the locus should be covered by at least one pacbio read.
    3. GAM file is sorted and restricted to the locus file.
    4. files consist of all DAG connected components.
    5. a variant is added only when it identifies the branch uniquely.

    Args:
        locus_file: path of the stream of ``vg_pb2.SnarlTraversal`` messages.
        gam_file: path of the stream of ``vg_pb2.Alignment`` messages.
        sample: passed unchanged as the last argument of every ``Read``.

    Returns:
        ``(readset1, alleles_per_pos, locus_branch_mapping)`` where
        ``readset1`` is the filtered and sorted ReadSet,
        ``alleles_per_pos`` maps locus number to its number of stored paths,
        and ``locus_branch_mapping`` maps locus number to the list of paths,
        each path being a list of ``(node_id, node_id)`` edge tuples.
    """

    # NOTE(review): several of these initialisations are repeated verbatim.
    locus_count = 0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = OrderedDict()
    locus_count = 0
    prev_startsnarl = 0
    # -1 never equals a boolean orientation, so the first traversal always
    # opens a new locus.
    prev_startsnarl_orientation = -1
    prev_endsnarl = 0
    prev_endsnarl_orientation = -1

    # Pass 1: group consecutive traversals of the same snarl into one locus.
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            #TODO: make ordered dictionary locus_branch_mapping
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            # Edges (node pairs) of this traversal, boundary nodes included.
            path_in_bubble = []

            if len(l.visits) == 0:
                # No inner node: the path is the single edge between the
                # two boundary nodes.
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                #TODO: for now, assumed, all nodes in path are either forward or backward
                if l.snarl.start.backward == True:
                    # Backward snarl: edges are emitted from the end node
                    # towards the start node, each pair reversed.
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i + 1].node_id,
                                   l.visits[i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(0, len(l.visits) - 1):
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id,
                                   l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))

            # Same boundary nodes and orientations as the previous traversal
            # means another path of the same locus; otherwise a new locus.
            if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                per_locus.append(path_in_bubble)
            else:
                locus_count = locus_count + 1
                per_locus = []
                per_locus.append(path_in_bubble)
            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation
            # NOTE(review): the mapping stores the same list object as
            # per_locus, so paths appended later (third and beyond) are still
            # visible through the mapping -- confirm whether "< 3" was meant
            # to exclude such loci.
            if len(per_locus) < 3:
                locus_branch_mapping[locus_count] = per_locus
    # Report how many loci have more than one stored path.
    print('The number of hets:')
    het_count = 0
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:
            het_count = het_count + 1
    print(het_count)
    # keep branch of paths in each bubble.
    alleles_per_pos = defaultdict()
    for k, v in locus_branch_mapping.items():
        alleles_per_pos[k] = len(v)

    # both simple and complex bubbles: key is the values in locus_branch_mapping and value is triplet(locus, branch, alleles)
    reverse_mapping = defaultdict(list)
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:  # more than one branch
            for i, b in enumerate(v):
                if len(b) > 0:
                    for p, j in enumerate(b):
                        reverse_mapping[j].append(
                            [k, i, len(v)]
                        )  # in complex bubbles, a node can map to multiple branches.

    # both simple and complex bubbles: extract reads from GAM file associated with the locus and create a sorted readset.
    # in complex bubble, set of nodes uniquely determine the path.
    readset = ReadSet()
    count = 0
    duplicated = 0
    #TODO: consider reads with only positive score.
    # Pass 2: one Read per alignment; variants are added from the edges
    # between consecutive mappings that appear in reverse_mapping.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            # (val1, val2, count1 and count2 are not used below.)
            val1 = True
            val2 = False

            count1 = 0
            count2 = 0
            #score = g.score/len(g.sequence)

            #if score > 0.2:
            #   continue
            read = Read(g.name, 0, 0,
                        sample)  # create read for each read alignment

            prev_tmp = []
            prev_locus = -1
            locus = -1

            # NOTE: the list comprehensions below reuse the name `i`; the
            # loop index survives only under Python 3 comprehension scoping.
            for i in range(0, len(g.path.mapping) - 1):
                #for i in g.path.mapping: # go over the mapping in a read
                # TODO: check for forward or reverse strand, we may not need it for DAG.
                edge1 = tuple((int(g.path.mapping[i].position.node_id),
                               int(g.path.mapping[i + 1].position.node_id)
                               ))  # go over nodes in a mapping
                edge2 = tuple((int(g.path.mapping[i + 1].position.node_id),
                               int(g.path.mapping[i].position.node_id)
                               ))  # go over nodes in a mapping
                if edge1 in reverse_mapping or edge2 in reverse_mapping:  # handle start and sink node.
                    if edge1 in reverse_mapping:
                        #qualities = [10]* reverse_mapping[edge1][0][2]
                        qualities = 1
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge1]
                        ]  # consider (locus, branch)
                    else:
                        # qualities = [10]* reverse_mapping[edge2][0][2]
                        qualities = 1
                        node_inf = [
                            tuple(i[0:2]) for i in reverse_mapping[edge2]
                        ]
                    tmp = [x for x in node_inf]
                    # First edge of a new locus: start a fresh candidate set.
                    if prev_locus != tmp[0][0]:
                        prev_tmp = tmp
                        prev_locus = tmp[0][0]

                    # Candidates shared by this edge and the candidates
                    # stored for the locus.
                    interset_tmp = list(set(tmp).intersection(set(prev_tmp)))
                    if len(prev_tmp) > 0 and len(
                            set(tmp).intersection(set(prev_tmp))
                    ) == 1:  # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                        #qualities[interset_tmp[0][1]] = 0
                        qualities = 1
                        if i == len(g.path.mapping) - 2:
                            # Last edge of the read: add the variant if the
                            # branch index is 0 or 1.
                            if interset_tmp[0][1] == 0 or interset_tmp[0][
                                    1] == 1:
                                read.add_variant(interset_tmp[0][0],
                                                 interset_tmp[0][1], qualities)
                        else:
                            # Add the variant only if the next edge is not
                            # part of any bubble path.
                            next_edge1 = tuple(
                                (int(g.path.mapping[i + 1].position.node_id),
                                 int(g.path.mapping[i + 2].position.node_id)))
                            next_edge2 = tuple(
                                (int(g.path.mapping[i + 2].position.node_id),
                                 int(g.path.mapping[i + 1].position.node_id)))

                            if next_edge1 not in reverse_mapping and next_edge2 not in reverse_mapping:
                                if interset_tmp[0][1] == 0 or interset_tmp[0][
                                        1] == 1:
                                    read.add_variant(interset_tmp[0][0],
                                                     interset_tmp[0][1],
                                                     qualities)

                        locus = interset_tmp[0][0]

            readset.add(read)

    # Pass 3: keep only reads without repeated variant positions, with at
    # least 4 variants, and whose consecutive positions differ by at most 20.
    readset1 = ReadSet()
    tmp_duplicated = set()
    for read in readset:
        # A return value of 1 from read.sort() is treated as "duplicated
        # variant"; such reads are skipped after recording the positions.
        if read.sort() == 1:
            duplicated = duplicated + 1
            tmp = []
            for variant in read:
                tmp.append(variant.position)
            #print("duplicated variant")
            x = [
                item for item, count in collections.Counter(tmp).items()
                if count > 1
            ]
            for a in x:
                tmp_duplicated.add(a)
            continue
        else:
            tmp = []
            for variant in read:
                tmp.append(variant.position)
            #print("duplicated variant")
            x = [
                item for item, count in collections.Counter(tmp).items()
                if count > 1
            ]
            if len(x) > 0:
                continue
            if len(read) >= 4:
                tmp = []
                for variant in read:
                    tmp.append(variant.position)
                # flag becomes 1 when two consecutive positions are more
                # than 20 apart.
                flag = 0
                for i, x in enumerate(tmp):
                    if i > 0:
                        #print(int(x - tmp[i - 1]))
                        if int(x - tmp[i - 1]) > 20:
                            flag = 1
                            break
                if flag == 0:
                    #print(read)
                    readset1.add(read)
            #if len(read) >=5:
            #   readset1.add(read)
    #print("length of duplicated bubbles")
    #print(tmp_duplicated)
    #print(len(list(tmp_duplicated)))

    readset1.sort()

    #print("duplicated")
    #print(duplicated)
    print("reads considered before read-selection")
    print(len(readset1))
    #print(readset1)
    return readset1, alleles_per_pos, locus_branch_mapping
Exemple #13
0
# Snarl boundary node ids that the original code listed as ``cyclic_bubbles``.
# Traversals whose start or end node is in this set are ignored.  The ids are
# specific to one graph (presumably the assembly this script was tuned on --
# TODO confirm); pass ``cyclic_bubbles=`` to vg_reader to override them.
_CYCLIC_BUBBLE_NODES = frozenset((
    102838, 102840, 102846, 102850, 52424, 52430, 52708, 52711,
    54914, 54917, 60635, 60638, 60965, 60968, 61857, 61861, 61906,
    61909, 65760, 65762, 67841, 67844, 67858, 67862, 70509, 70513,
    73378, 73380, 83218, 83220, 83224, 83231, 83676, 83678, 86581,
    86586, 92007, 92012, 92467, 92474, 97403, 97405, 99187, 99190,
))


def _traversal_edges(traversal):
    """Return the list of (node_id, node_id) edges of one snarl traversal.

    The path is start -> visits... -> end.  When the snarl start is marked
    ``backward`` every edge is emitted with its two node ids swapped, in the
    same order the original inline code produced them.
    """
    # TODO: for now, assumed, all nodes in path are either forward or backward
    start = traversal.snarl.start.node_id
    end = traversal.snarl.end.node_id
    visits = [visit.node_id for visit in traversal.visits]

    if traversal.snarl.start.backward:
        if not visits:
            return [(end, start)]
        inner = [(nxt, cur) for cur, nxt in zip(visits, visits[1:])]
        return [(end, visits[-1])] + inner + [(visits[0], start)]

    if not visits:
        return [(start, end)]
    inner = list(zip(visits, visits[1:]))
    return [(start, visits[0])] + inner + [(visits[-1], end)]


def vg_reader(locus_file, gam_file, cyclic_bubbles=_CYCLIC_BUBBLE_NODES,
              min_score=0.75):
    """Build a sorted ReadSet from a vg locus (snarl traversal) file and a GAM file.

    Assumptions inherited from the original implementation:
    1. The locus file is a linear ordering of bubbles; consecutive records
       with the same start/end node ids and orientations are branches of the
       same locus.
    2. Paths in the locus should be covered by at least one read.
    3. The GAM file is sorted and restricted to the locus file.
    4. A variant is added only when the read identifies a branch uniquely.

    Args:
        locus_file: path of the stream file holding ``SnarlTraversal`` records.
        gam_file: path of the stream file holding ``Alignment`` records.
        cyclic_bubbles: container of node ids; traversals whose snarl start or
            end node is in it are skipped.  Defaults to the ids that used to
            be hard-coded inside the loop.
        min_score: alignments with ``score / len(sequence)`` below this value
            are ignored (previously the literal 0.75).

    Returns:
        ``(readset, alleles_per_pos, locus_branch_mapping)`` where ``readset``
        is the sorted ReadSet of reads without duplicated variant positions,
        ``alleles_per_pos`` maps locus index -> number of branches, and
        ``locus_branch_mapping`` maps locus index -> list of branches, each
        branch being a list of edge tuples.

    NOTE: the diagnostic ``print`` calls are kept verbatim because the output
    on stdout is part of the function's observable behaviour.
    """
    # --- 1. Group the traversals of the locus file into loci. ---------------
    # ``defaultdict()`` without a factory behaves like a plain dict; the type
    # is kept so that the returned objects are unchanged for callers.
    locus_branch_mapping = defaultdict()
    locus_count = 0
    prev_key = None  # never equals a real key, so the first record opens locus 1
    per_locus = []
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            traversal = vg_pb2.SnarlTraversal()
            traversal.ParseFromString(data)
            snarl = traversal.snarl
            if (snarl.end.node_id in cyclic_bubbles
                    or snarl.start.node_id in cyclic_bubbles):
                # Skipped records must not influence the grouping state.
                continue

            key = (snarl.start.node_id, snarl.start.backward,
                   snarl.end.node_id, snarl.end.backward)
            if key != prev_key:
                locus_count += 1
                per_locus = []
                prev_key = key
            per_locus.append(_traversal_edges(traversal))
            locus_branch_mapping[locus_count] = per_locus

    print('The number of hets:')
    print(sum(1 for branches in locus_branch_mapping.values()
              if len(branches) > 1))

    # Number of branches of every locus.
    alleles_per_pos = defaultdict()
    for locus_id, branches in locus_branch_mapping.items():
        alleles_per_pos[locus_id] = len(branches)

    # edge -> list of [locus, branch index, number of branches].  Only loci
    # with more than one branch are indexed; in complex bubbles an edge can
    # belong to several branches.
    reverse_mapping = defaultdict(list)
    for locus_id, branches in locus_branch_mapping.items():
        if len(branches) > 1:
            for branch_index, branch in enumerate(branches):
                for edge in branch:
                    reverse_mapping[edge].append(
                        [locus_id, branch_index, len(branches)])
    print(locus_branch_mapping)

    # --- 2. Turn every alignment of the GAM file into a Read. ---------------
    readset = ReadSet()
    # TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            if not g.sequence:
                # An empty sequence used to raise ZeroDivisionError below.
                continue
            if g.score / len(g.sequence) < min_score:
                continue

            # hard-coded source id, mapping quality and other values.
            read = Read(g.name, 0, 0, 0)
            print(g.name)

            # TODO: check for forward or reverse strand, we may not need it for DAG.
            nodes = [mapping.position.node_id for mapping in g.path.mapping]
            last_edge = len(nodes) - 2
            prev_tmp = []
            prev_locus = -1
            for i in range(len(nodes) - 1):
                edge1 = (nodes[i], nodes[i + 1])
                edge2 = (nodes[i + 1], nodes[i])
                print("edge")
                print(edge1)
                print(edge2)

                # An edge may be stored in either direction; edge1 wins.
                if edge1 in reverse_mapping:
                    hits = reverse_mapping[edge1]
                elif edge2 in reverse_mapping:
                    hits = reverse_mapping[edge2]
                else:
                    continue

                qualities = [10] * hits[0][2]
                tmp = [tuple(hit[0:2]) for hit in hits]  # (locus, branch)
                if prev_locus != tmp[0][0]:
                    # First edge seen for a new locus: remember its candidates.
                    prev_tmp = tmp
                    prev_locus = tmp[0][0]

                interset_tmp = list(set(tmp).intersection(prev_tmp))
                print("I am outside if")
                # TODO: handle case with prev_tmp =0
                print("prev_tmp")
                print(prev_tmp)
                print("tmp")
                print(tmp)
                # For complicated bubbles (Top-k paths) a combination of edges
                # determines the branch; accept only an unambiguous candidate.
                # A one-element intersection implies prev_tmp is non-empty.
                if len(interset_tmp) != 1:
                    continue

                print("I am inside if")
                locus_id, branch_index = interset_tmp[0]
                qualities[branch_index] = 0
                if i == last_edge:
                    read.add_variant(locus_id, branch_index, qualities)
                else:
                    print("i am in else")
                    next_edge1 = (nodes[i + 1], nodes[i + 2])
                    next_edge2 = (nodes[i + 2], nodes[i + 1])
                    # Emit the variant only on the last bubble edge the read
                    # walks through, i.e. when the next edge is not indexed.
                    if (next_edge1 not in reverse_mapping
                            and next_edge2 not in reverse_mapping):
                        read.add_variant(locus_id, branch_index, qualities)
            print(read)

            if len(read) >= 2:
                readset.add(read)

    # The counting code behind this figure was disabled in the original, so
    # the value reported here is always 0.
    non_shattered = 0
    print("non-shattered")
    print(non_shattered)

    # --- 3. Drop reads that report the same position more than once. --------
    readset1 = ReadSet()
    tmp_duplicated = set()
    duplicated = 0
    for read in readset:
        if read.sort() == 1:
            duplicated += 1
            positions = collections.Counter(
                variant.position for variant in read)
            print("duplicated variant")
            tmp_duplicated.update(
                position for position, seen in positions.items() if seen > 1)
        else:
            readset1.add(read)
    print("length of duplicated bubbles")
    print(tmp_duplicated)
    print(len(tmp_duplicated))

    readset1.sort()
    print("******")
    for read_index, read in enumerate(readset1):
        for variant in read:
            print(
                str(read_index) + " " + str(variant.position) + " " +
                str(variant.allele) + " " + "10")
    print("******")
    print("duplicated")
    print(duplicated)
    print("reads considered before read-selection")
    print(len(readset1))
    return readset1, alleles_per_pos, locus_branch_mapping
Exemple #14
0
# Script fragment: groups SnarlTraversal records read from `locus_file` into
# `locus_branch_mapping`.
# NOTE(review): `locus_file` is not assigned in the visible lines, and the
# fragment stops inside the else-branch below -- treat it as incomplete.
#trans = sys.argv[3]

count = 0

# State of the previously seen snarl (node ids and orientations).  The first
# three assignments are repeated a few lines further down with the same values.
locus_count = 0
prev_startsnarl = 0
prev_endsnarl = 0
locus_branch_mapping = OrderedDict()
locus_count = 0
prev_startsnarl = 0
prev_startsnarl_orientation = -1
prev_endsnarl = 0
prev_endsnarl_orientation = -1
reads_dict = defaultdict(list)
# Traversal messages collected for the locus currently being read.
path_in_bubble = []
with stream.open(str(locus_file), "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        #TODO: make ordered dictionary locus_branch_mapping
        # handle forward and backward case of nodes
        current_startsnarl = l.snarl.start.node_id
        current_startsnarl_orientation = l.snarl.start.backward
        current_endsnarl = l.snarl.end.node_id
        current_endsnarl_orientation = l.snarl.end.backward

        # Only the node ids are compared here; the orientations read above are
        # not part of the test.
        if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl:
            path_in_bubble.append(l)
        else:

            # NOTE(review): in the visible lines neither prev_*, locus_count
            # nor path_in_bubble is updated after this store -- presumably
            # that happens in the part of the script that is cut off; verify.
            locus_branch_mapping[locus_count] = path_in_bubble
Exemple #15
0
# Filler structures: per-batch input/label buffers that are reused for every
# batch.  np.ndarray() leaves the memory uninitialised, so the buffers must be
# filled before they are read.
feature_filler = np.ndarray(shape=(batch_size, n_input // 3, 3),
                            dtype=np.float32)
label_filler = np.ndarray(shape=(batch_size, n_classes), dtype=np.float32)

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Initializing the variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Read Data: load every serialized record of the training stream into memory.
print("Reading train data ...")
# The context manager closes the stream even when reading fails part-way.
with stream.open("output/train.gam", "rb") as istream:
    dataarray = list(istream)
print("... read %d events" % len(dataarray))
# Shuffle once, in place, so batches are drawn in random order.
np.random.shuffle(dataarray)
np.set_printoptions(linewidth=200)

# Train
event = neuland.Event()
for iteration in range(1, 20, 1):
    for n, batch in enumerate(chunks(dataarray, batch_size)):
        feature_filler.fill(0.)
        label_filler.fill(0.)
        for m, data in enumerate(batch):
Exemple #16
0
# Tandem and interspersed repeats from both aligned PacBio reads and true_haps.
# Command line: <trans file> <gam file> <true haps file>
#               <interspersed parameter> <output file>
# NOTE(review): this fragment ends inside the GAM loop below; the code that
# consumes these variables is not visible here.
trans_filename = sys.argv[1]
gam_filename = sys.argv[2]
true_haps_filename = sys.argv[3]
parameter_interspersed = sys.argv[4]
out_filename = sys.argv[5]

# NOTE(review): opened without a context manager and not closed in the visible
# lines -- confirm it is closed further down.
out_file = open(out_filename, 'w')

d = {}
count = 1

# Collect one "start" node id per snarl traversal: the end node when the snarl
# start is flagged backward, otherwise the start node.
bubbles_start = set()
#with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans' ,"rb") as istream:
with stream.open(str(trans_filename), "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        if l.snarl.start.backward == True:
            start_node = l.snarl.end.node_id
        else:
            start_node = l.snarl.start.node_id
        bubbles_start.add(start_node)

# Both default to an empty list per key; they are filled by code that is not
# visible in this fragment.
multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
#with stream.open('../out.new.gam', "rb") as istream:
with stream.open(str(gam_filename), "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
Exemple #17
0
def _traversal_edges(traversal):
    """Return the list of (node_id, node_id) edges of one snarl traversal.

    The path is start -> visits... -> end.  When the snarl start is marked
    ``backward`` every edge is emitted with its two node ids swapped, in the
    same order the original inline code produced them.
    """
    # TODO: for now, assumed, all nodes in path are either forward or backward
    start = traversal.snarl.start.node_id
    end = traversal.snarl.end.node_id
    visits = [visit.node_id for visit in traversal.visits]

    if traversal.snarl.start.backward:
        if not visits:
            return [(end, start)]
        inner = [(nxt, cur) for cur, nxt in zip(visits, visits[1:])]
        return [(end, visits[-1])] + inner + [(visits[0], start)]

    if not visits:
        return [(start, end)]
    inner = list(zip(visits, visits[1:]))
    return [(start, visits[0])] + inner + [(visits[-1], end)]


def vg_reader(locus_file, gam_file):
    """Summarise, per alignment, the bubbles and plain nodes a read walks through.

    Assumptions inherited from the original implementation:
    1. The locus file is a linear ordering of bubbles; consecutive records
       with the same start/end node ids and orientations are branches of the
       same locus.
    2. The GAM file is sorted and restricted to the locus file.
    3. A bubble is recorded only when the read identifies a branch uniquely.
    4. ``position.name`` of every mapping parses as an integer node id
       (presumably the node name of the graph -- TODO confirm).

    Args:
        locus_file: path of the stream file holding ``SnarlTraversal`` records.
        gam_file: path of the stream file holding ``Alignment`` records.

    Returns:
        ``(reads_dict, consec_pairs)``.  ``reads_dict`` maps
        ``"<read name>_<query_position>"`` to the sequence of locus indices
        and node ids met along the alignment.  ``consec_pairs`` maps
        ``"<a>_<b>"`` (two consecutive entries of that sequence) to the set of
        read names supporting the pair.
    """
    # --- 1. Group the traversals of the locus file into loci. ---------------
    locus_branch_mapping = OrderedDict()
    locus_count = 0
    prev_key = None  # never equals a real key, so the first record opens locus 1
    per_locus = []
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            traversal = vg_pb2.SnarlTraversal()
            traversal.ParseFromString(data)
            snarl = traversal.snarl
            key = (snarl.start.node_id, snarl.start.backward,
                   snarl.end.node_id, snarl.end.backward)
            if key != prev_key:
                locus_count += 1
                per_locus = []
                prev_key = key
            per_locus.append(_traversal_edges(traversal))
            locus_branch_mapping[locus_count] = per_locus

    print('The number of hets:')
    print(sum(1 for branches in locus_branch_mapping.values()
              if len(branches) > 1))

    # edge -> list of [locus, branch index, number of branches].  Only loci
    # with more than one branch are indexed; in complex bubbles an edge can
    # belong to several branches.
    reverse_mapping = defaultdict(list)
    for locus_id, branches in locus_branch_mapping.items():
        if len(branches) > 1:
            for branch_index, branch in enumerate(branches):
                for edge in branch:
                    reverse_mapping[edge].append(
                        [locus_id, branch_index, len(branches)])

    # ``consec_pairs`` was referenced without ever being defined here.  Keep
    # using a module-level accumulator when the surrounding script provides
    # one, otherwise fall back to a fresh mapping instead of raising NameError.
    try:
        pairs = consec_pairs
    except NameError:
        pairs = defaultdict(set)

    # --- 2. Walk every alignment of the GAM file. ---------------------------
    reads_dict = defaultdict(list)
    # TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            if len(g.path.mapping) < 2:
                # No edge to look at: the read contributes nothing.
                continue

            # TODO: check for forward or reverse strand, we may not need it for DAG.
            nodes = [int(mapping.position.name) for mapping in g.path.mapping]
            read_key = g.name + "_" + str(g.query_position)
            last_edge = len(nodes) - 2
            read = []  # loci / node ids met by this alignment, in order
            prev_tmp = []
            prev_locus = -1
            for i in range(len(nodes) - 1):
                edge1 = (nodes[i], nodes[i + 1])
                edge2 = (nodes[i + 1], nodes[i])

                # An edge may be stored in either direction; edge1 wins.
                if edge1 in reverse_mapping:
                    hits = reverse_mapping[edge1]
                elif edge2 in reverse_mapping:
                    hits = reverse_mapping[edge2]
                else:
                    # Edge outside every indexed bubble: record both nodes.
                    read.append(nodes[i])
                    read.append(nodes[i + 1])
                    reads_dict[read_key].append(nodes[i])
                    reads_dict[read_key].append(nodes[i + 1])
                    continue

                tmp = [tuple(hit[0:2]) for hit in hits]  # (locus, branch)
                if prev_locus != tmp[0][0]:
                    # First edge seen for a new locus: remember its candidates.
                    prev_tmp = tmp
                    prev_locus = tmp[0][0]

                # For complicated bubbles (Top-k paths) a combination of edges
                # determines the branch; accept only an unambiguous candidate.
                interset_tmp = list(set(tmp).intersection(prev_tmp))
                if len(interset_tmp) != 1:
                    continue
                locus_id = interset_tmp[0][0]

                if i == last_edge:
                    leaves_bubble = True
                else:
                    next_edge1 = (nodes[i + 1], nodes[i + 2])
                    next_edge2 = (nodes[i + 2], nodes[i + 1])
                    # Record the bubble only on the last bubble edge the read
                    # walks through, i.e. when the next edge is not indexed.
                    leaves_bubble = (next_edge1 not in reverse_mapping
                                     and next_edge2 not in reverse_mapping)
                if leaves_bubble:
                    reads_dict[read_key].append(locus_id)
                    read.append(locus_id)

            # For every pair of consecutive bubbles / nodes.  A pair and its
            # reverse share one key: whichever orientation was stored first.
            for first, second in zip(read, read[1:]):
                pair1 = str(first) + "_" + str(second)
                pair2 = str(second) + "_" + str(first)
                if pair2 in pairs:
                    pairs[pair2].add(g.name)
                else:
                    pairs[pair1].add(g.name)

    return reads_dict, pairs
Exemple #18
0
# Node ids (GFA 'S' records) and, per source node, the set of target nodes
# (GFA 'L' records) of the assembly graph.
nodes = set()
edge_connections = defaultdict(set)

# GFA is tab-separated text.  It is opened in text mode because splitting the
# bytes of a binary-mode line on the str '\t' raises TypeError on Python 3;
# the context manager also closes the file, which was previously left open.
with open('assembly_graph.P.int.remn2n.X_100.view.gfa', "r") as gfafile:
	for line in gfafile:
		var = line.split('\t')
		if var[0] == 'S':
			nodes.add(int(var[1]))
		if var[0] == 'L':
			edge_connections[int(var[1])].add(int(var[3]))


# node id -> names of the reads whose alignment visits that node
multiplicity_bubbles = defaultdict(list)
# read name -> visited nodes as "<node id>_<+|->" in path order
read_details = defaultdict(list)
with stream.open('out.new.gam', "rb") as istream:
	for data in istream:
		g = vg_pb2.Alignment()
		g.ParseFromString(data)
		for mapping in g.path.mapping:
			node = mapping.position.node_id
			if node in nodes:
				multiplicity_bubbles[node].append(g.name)
			# is_reverse was compared with the string 'True', which never
			# matched, so every node was labelled '+'.  Test the flag itself.
			tmp = '+'
			if mapping.position.is_reverse:
				tmp = '-'
			node_tmp = str(node)+ "_" + str(tmp)
			read_details[g.name].append(node_tmp)

count=0
# defaultdict() without a factory behaves like a plain dict.
repeaticity = defaultdict()
Exemple #19
0
    for neutron in rootevent.NeulandPrimaryNeutronInteractionPixels:
      n = protoevent.neutrons.add()
      n.x = neutron.GetX()
      n.y = neutron.GetY()
      n.z = neutron.GetZ()
      n.t = neutron.GetT()
    for digi in rootevent.NeulandDigis:
      b = protoevent.digis.add()
      b.id = digi.GetPaddle()
      b.tl = digi.GetTdcL()
      b.tr = digi.GetTdcR()
      b.e = digi.GetE()
    ostream.write(protoevent)


# Usage: <script> OUTPUT_STREAM DIGI_FILE [DIGI_FILE ...]
# Every digi file is paired with the simulation file whose name is obtained by
# replacing '.digi.' with '.sim.', and its events are appended to the stream.
print("Writing to protobuf stream %s" % sys.argv[1])
# The context manager closes the output stream even when one of the input
# files fails to load; previously close() was only reached on success.
with stream.open(sys.argv[1], 'wb') as ostream:
  for digifilename in sys.argv[2:]:
    simufilename = digifilename.replace('.digi.', '.sim.')

    print("Reading ROOT file %s" % digifilename)
    # NOTE(review): tfile is never closed explicitly -- confirm that relying
    # on ROOT's own cleanup is intended.
    tfile = ROOT.TFile.Open(digifilename)
    ttree = tfile.Get("evt")
    print("... and ROOT file %s" % simufilename)
    # Attach the simulation tree as a friend so both are read in lockstep.
    ttree.AddFriend("simtree = evt", simufilename)

    read_and_append(ttree, ostream)
Exemple #20
0
def vg_reader(locus_file, gam_file):
	"""
	Build a sorted ReadSet for the core DP from a vg locus file and a GAM file.

	input: sorted locus file (stream of SnarlTraversal messages) and sorted GAM
	file (stream of Alignment messages) output from vg.
	output: tuple (readset, alleles_per_pos, locus_branch_mapping) where
	- readset is the sorted ReadSet of reads carrying at least two variants,
	- alleles_per_pos maps locus number -> number of branches at that locus,
	- locus_branch_mapping maps locus number -> list of branches, each branch
	  being a list of (node_id, node_id) edges.
	assumptions:
	1. locus file consists of linear ordering of simple bubbles only and hence sorted. Each locus file does not contain start and end vertex.
	2. paths in the locus should be covered by at least one pacbio read.
	3. GAM file is sorted and restricted to locus file.
	4. files consist of all DAG connected components.
	5. add variant only when it identifies the branch uniquely.

	Side effects: prints progress and debug information to stdout.
	"""
	# create a dictionary of branches for each locus based on locus file.
	# NOTE(review): the first four assignments below are repeated verbatim a few
	# lines further down; the repetition is redundant but harmless.
	locus_branch_mapping=defaultdict()
	locus_count=0
	prev_startsnarl = 0
	prev_endsnarl = 0
	locus_branch_mapping=defaultdict()
	locus_count=0
	prev_startsnarl = 0
	# -1 is a sentinel; assuming `backward` is a boolean field it never equals
	# -1, so the first record always opens a new locus (and defines per_locus).
	prev_startsnarl_orientation = -1
	prev_endsnarl = 0
	prev_endsnarl_orientation = -1
	with stream.open(str(locus_file), "rb") as istream:
		for data in istream:
			l = vg_pb2.SnarlTraversal()
			l.ParseFromString(data)
			# handle forward and backward case of nodes
			current_startsnarl = l.snarl.start.node_id
			current_startsnarl_orientation = l.snarl.start.backward
			current_endsnarl = l.snarl.end.node_id
			current_endsnarl_orientation = l.snarl.end.backward
			path_in_bubble =[]
			# TODO: fix this properly -- edges are always recorded from the snarl
			# start towards the snarl end, regardless of the `backward` flags.
			# Turn the traversal into its list of consecutive edges:
			# start -> visit[0] -> ... -> visit[-1] -> end.
			if len(l.visits) ==0:
				path_in_bubble.append(tuple ((l.snarl.start.node_id,l.snarl.end.node_id)))
			else:
				path_in_bubble.append(tuple ((l.snarl.start.node_id,l.visits[0].node_id)))
				for i in range(0,len(l.visits)-1):
					path_in_bubble.append(tuple((l.visits[i].node_id, l.visits[i+1].node_id)))
				path_in_bubble.append(tuple ((l.visits[-1].node_id, l.snarl.end.node_id)))

			# Consecutive traversals with the same start/end node and the same
			# orientations belong to the same locus; anything else opens a new one.
			if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
				per_locus.append(path_in_bubble)
			else:
				locus_count=locus_count+1
				per_locus = []
				per_locus.append(path_in_bubble)
			prev_startsnarl = current_startsnarl
			prev_startsnarl_orientation = current_startsnarl_orientation
			prev_endsnarl = current_endsnarl
			prev_endsnarl_orientation = current_endsnarl_orientation
			# Stored on every record, so the current locus is always up to date.
			locus_branch_mapping[locus_count]=per_locus

	# A locus with more than one branch is counted as a het.
	print('The number of hets:')
	het_count= 0
	for k,v in locus_branch_mapping.items():
		if len(v) >1:
			het_count = het_count +1
	print(het_count)
	# keep branch of paths in each bubble.
	alleles_per_pos= defaultdict()
	for k,v in locus_branch_mapping.items():
		alleles_per_pos[k]=len(v)

	# both simple and complex bubbles: key is the values in locus_branch_mapping and value is triplet(locus, branch, alleles)
	reverse_mapping= defaultdict(list)
	for k,v in locus_branch_mapping.items():
		if len(v) > 1: # more than one branch
			for i,b in enumerate(v):
				if len(b) > 0:
					for p,j in enumerate(b):
						reverse_mapping[j].append([k,i, len(v)]) # in complex bubbles, a node can map to multiple branches.

	# both simple and complex bubbles: extract reads from GAM file associated with the locus and create a sorted readset.
	# in complex bubble, set of nodes uniquely determine the path. 
	readset=ReadSet()
	count =0
	duplicated = 0
	#TODO: consider reads with only positive score.
	with stream.open(str(gam_file), "rb") as istream:
		for data in istream:
			g = vg_pb2.Alignment()
			g.ParseFromString(data) 
			# hard-coded source id, mapping quality and other values.
			# val1 stays True only if every mapping is reverse; val2 stays False
			# only if every mapping is forward.
			val1 = True
			val2 = False

			count1 =0
			count2=0
			read=Read(g.name, 0, 0, 0) # create read for each read alignment
			prev_tmp=[]
			prev_locus= -1
			locus = -1
			# First pass: stop at the first mapping that is not reverse.
			for i in range(0,len(g.path.mapping)):
				if g.path.mapping[i].position.is_reverse != val1:
					val1 = False
					break
				else:
					count1 = count1 +1
					
			if count1 == len(g.path.mapping):
				count = count+1
				
			# Second pass: stop at the first mapping that is not forward.
			for i in range(0,len(g.path.mapping)):
				if g.path.mapping[i].position.is_reverse != val2:
					val2 = True
					break
				else:
					count2 = count2 +1
					
			if count2 == len(g.path.mapping):
				count = count+1
			# val1 == val2 holds exactly when the alignment is non-empty and all of
			# its mappings share one orientation; other alignments are skipped.
			# NOTE: an alignment without mappings increments `count` twice above.
			if val1 ==val2:
				for i in range(0,len(g.path.mapping)-1):
					# TODO: check for forward or reverse strand, we may not need it for DAG.
					# The edge between consecutive mapped nodes, in both directions.
					edge1 = tuple((g.path.mapping[i].position.node_id, g.path.mapping[i+1].position.node_id)) # go over nodes in a mapping
					edge2 = tuple((g.path.mapping[i+1].position.node_id, g.path.mapping[i].position.node_id)) # go over nodes in a mapping
					if edge1 in reverse_mapping or edge2 in reverse_mapping: # handle start and sink node.
						# qualities has one entry (10) per branch of the first locus
						# listed for this edge; node_inf keeps (locus, branch) pairs.
						if edge1 in reverse_mapping:
							qualities = [10]* reverse_mapping[edge1][0][2]
							node_inf= [tuple(i[0:2]) for i in reverse_mapping[edge1]] # consider (locus, branch)
						else:
							qualities = [10]* reverse_mapping[edge2][0][2]
							node_inf= [tuple(i[0:2]) for i in reverse_mapping[edge2]]
						tmp = [x for x in node_inf]
						interset_tmp= list(set(tmp).intersection(set(prev_tmp)))
						# Add a variant only when this edge and the previously seen
						# ones agree on exactly one (locus, branch) candidate.
						if len(prev_tmp) > 0 and len(set(tmp).intersection(set(prev_tmp)))==1: # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
							qualities[interset_tmp[0][1]] = 0
							read.add_variant(interset_tmp[0][0], interset_tmp[0][1], qualities)
							locus= interset_tmp[0][0]
							
						# Candidates are reset when the locus changed, otherwise
						# accumulated. NOTE: the inner loop rebinds `i`; the outer
						# range loop reassigns it on its next iteration.
						if prev_locus!=locus:
							prev_tmp = []
						else:
							for i in tmp:
								prev_tmp.append(i)
						prev_locus = locus
				# Debug output for every processed alignment.
				print(len(read))
				print(g.name)


				# Only reads covering at least two variants are kept.
				if len(read) >= 2:
					readset.add(read)
	print("non-shattered")
	print(count)
	# Drop reads for which read.sort() returns 1 (presumably signalling
	# duplicated variant positions -- confirm against the Read class).
	readset1=ReadSet()
	for read in readset:
		if read.sort() ==1:
			duplicated = duplicated +1
			continue
		else:
			readset1.add(read)

	readset1.sort()
	print("duplicated")
	print(duplicated)
	print("reads considered before read-selection")
	print(len(readset1))
	return readset1, alleles_per_pos, locus_branch_mapping
Exemple #21
0
def vg_reader(locus_file, gam_file):
    """
    Build a sorted ReadSet for the core DP from a vg locus file and a GAM file.

    input: sorted locus file (stream of SnarlTraversal messages) and sorted GAM
    file (stream of Alignment messages) output from vg.
    output: tuple (readset1, alleles_per_pos, locus_branch_mapping, readset)
    - readset1: sorted ReadSet of the reads kept after duplicate filtering,
    - alleles_per_pos: locus number -> number of branches at that locus,
    - locus_branch_mapping: OrderedDict locus number -> list of branches, each
      branch being a list of (node_id, node_id) edges,
    - readset: the unfiltered ReadSet.
    assumptions:
    1. locus file consists of linear ordering of simple bubbles only and hence sorted. Each locus file does not contain start and end vertex.
    2. paths in the locus should be covered by at least one pacbio read.
    3. GAM file is sorted and restricted to locus file.
    4. files consist of all DAG connected components.
    5. add variant only when it identifies the branch uniquely.

    Relies on `mergePath`, which is defined elsewhere in the file.
    """

    locus_count = 0
    prev_startsnarl = 0
    prev_endsnarl = 0
    locus_branch_mapping = OrderedDict()
    # -1 is a sentinel for "no previous record yet".
    prev_startsnarl_orientation = -1
    prev_endsnarl_orientation = -1
    # Collected below but not read again inside this function.
    start_end_bubblenods = set()
    # State of nested-bubble handling: 0 = none pending, 1 = a traversal with a
    # nested snarl was just seen, 2 = its first merged branch has been added.
    insidebubble = 0
    with stream.open(str(locus_file), "rb") as istream:
        for data in istream:
            l = vg_pb2.SnarlTraversal()
            l.ParseFromString(data)
            # handle forward and backward case of nodes
            current_startsnarl = l.snarl.start.node_id
            current_startsnarl_orientation = l.snarl.start.backward
            current_endsnarl = l.snarl.end.node_id
            current_endsnarl_orientation = l.snarl.end.backward
            path_in_bubble = []
            start_end_bubblenods.add(l.snarl.end.node_id)
            start_end_bubblenods.add(l.snarl.start.node_id)
            hasInBubble = False
            if len(l.visits) == 0:
                #TODO: for now, assumed, all nodes in path are either forward or backward
                # Direct start/end edge; flipped when the start is backward.
                if l.snarl.start.backward == True:
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.snarl.start.node_id)))
                else:
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.snarl.end.node_id)))
            else:
                #TODO: for now, assumed, all nodes in path are either forward or backward
                # Exactly one of start/end is backward: record the edges from
                # the end towards the start, walking the visits in reverse.
                if (l.snarl.start.backward == True and l.snarl.end.backward !=
                        True) or (l.snarl.start.backward != True
                                  and l.snarl.end.backward == True):
                    path_in_bubble.append(
                        tuple((l.snarl.end.node_id, l.visits[-1].node_id)))
                    local_path_back = -1
                    for i in range(len(l.visits)):
                        # A visit whose snarl start is set refers to a nested
                        # snarl (presumably; confirm against the vg schema).
                        if l.visits[i].snarl.start.node_id != 0:
                            pathBack = True
                            if l.visits[i].backward:
                                insideBack = True
                            else:
                                insideBack = False
                            insidebubble = 1
                            hasInBubble = True
                        # The last visit has no successor to pair with.
                        if i == len(l.visits) - 1:
                            break
                        path_in_bubble.append(
                            tuple((l.visits[-1 - i].node_id,
                                   l.visits[-2 - i].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[0].node_id, l.snarl.start.node_id)))
                else:
                    # Same orientation on both ends: record start -> ... -> end.
                    local_path_back = 1
                    path_in_bubble.append(
                        tuple((l.snarl.start.node_id, l.visits[0].node_id)))
                    for i in range(len(l.visits)):
                        if l.visits[i].snarl.start.node_id != 0:
                            pathBack = False
                            if l.visits[i].backward:
                                insideBack = True
                            else:
                                insideBack = False
                            insidebubble = 1
                            hasInBubble = True
                        if i == len(l.visits) - 1:
                            break
                        path_in_bubble.append(
                            tuple((l.visits[i].node_id,
                                   l.visits[i + 1].node_id)))
                    path_in_bubble.append(
                        tuple((l.visits[-1].node_id, l.snarl.end.node_id)))

            if hasInBubble:
                # Keep the outer path so later traversals can be merged into it;
                # the path itself is not added as a branch here.
                tempPath = path_in_bubble.copy()

                if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                    pass
                else:
                    # Store the finished locus before opening a new one. On the
                    # very first record per_locus does not exist yet, hence the
                    # NameError guard.
                    try:
                        locus_branch_mapping[locus_count] = per_locus
                    except NameError:
                        pass
                    locus_count += 1
                    per_locus = []
                    # trans_raw is not read again inside this function.
                    trans_raw = []
                    trans_raw.append(l)
            else:
                if current_startsnarl == prev_startsnarl and current_endsnarl == prev_endsnarl and current_endsnarl_orientation == prev_endsnarl_orientation and prev_startsnarl_orientation == current_startsnarl_orientation:
                    # Same snarl as the previous record: another branch.
                    if insidebubble == 2:
                        path_in_bubble = mergePath(tempPath, path_in_bubble,
                                                   insideBack, pathBack,
                                                   local_path_back)
                        per_locus.append(path_in_bubble)
                        insidebubble = 0
                        insideBack = False
                        pathBack = False
                    else:
                        per_locus.append(path_in_bubble)
                else:

                    if insidebubble == 1:
                        # First traversal after a nested-snarl record: merge it
                        # into the saved outer path instead of opening a locus.
                        insidebubble = 2
                        path_in_bubble = mergePath(tempPath, path_in_bubble,
                                                   insideBack, pathBack,
                                                   local_path_back)
                        per_locus.append(path_in_bubble)
                    else:
                        try:
                            locus_branch_mapping[locus_count] = per_locus
                        except NameError:
                            pass
                        locus_count += 1
                        per_locus = []
                        per_locus.append(path_in_bubble)

            prev_startsnarl = current_startsnarl
            prev_startsnarl_orientation = current_startsnarl_orientation
            prev_endsnarl = current_endsnarl
            prev_endsnarl_orientation = current_endsnarl_orientation

    # NOTE(review): per_locus is written to locus_branch_mapping only when the
    # next locus begins, and nothing stores it after the loop, so the last locus
    # of the file never reaches the mapping -- confirm whether that is intended.

    # A locus with more than one branch is counted as a het.
    print('The number of hets:')
    het_count = 0
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:
            het_count = het_count + 1
    print(het_count)
    # keep branch of paths in each bubble.
    alleles_per_pos = defaultdict()
    for k, v in locus_branch_mapping.items():
        alleles_per_pos[k] = len(v)

    # both simple and complex bubbles: key is the values in locus_branch_mapping and value is triplet(locus, branch, alleles)
    reverse_mapping = defaultdict(list)
    for k, v in locus_branch_mapping.items():
        if len(v) > 1:  # more than one branch
            for i, b in enumerate(v):
                if len(b) > 0:
                    for p, j in enumerate(b):
                        reverse_mapping[j].append(
                            [k, i, len(v)]
                        )  # in complex bubbles, a node can map to multiple branches.

    # both simple and complex bubbles: extract reads from GAM file associated with the locus and create a sorted readset.
    # in complex bubble, set of nodes uniquely determine the path.
    readset = ReadSet()
    count = 0
    duplicated = 0
    #TODO: consider reads with only positive score.
    with stream.open(str(gam_file), "rb") as istream:
        for data in istream:
            g = vg_pb2.Alignment()
            g.ParseFromString(data)
            # hard-coded source id, mapping quality and other values.
            # val1, val2, count1 and count2 are not used further in this version.
            val1 = True
            val2 = False

            count1 = 0
            count2 = 0
            read = Read(g.name, 0, 0, 0)  # create read for each read alignment

            prev_tmp = []
            prev_locus = -1
            locus = -1
            # Counted below but not used afterwards.
            n_variant = 0
            for i in range(0, len(g.path.mapping) - 1):
                # TODO: check for forward or reverse strand, we may not need it for DAG.
                # The edge between consecutive mapped nodes, in both directions.
                edge1 = tuple((g.path.mapping[i].position.node_id,
                               g.path.mapping[i + 1].position.node_id
                               ))  # go over nodes in a mapping
                edge2 = tuple((g.path.mapping[i + 1].position.node_id,
                               g.path.mapping[i].position.node_id
                               ))  # go over nodes in a mapping
                if edge1 in reverse_mapping or edge2 in reverse_mapping:  # handle start and sink node.
                    # qualities has one entry (10) per branch of the first locus
                    # listed for this edge; node_inf keeps the full
                    # (locus, branch, number of branches) triples.
                    if edge1 in reverse_mapping:

                        qualities = [10] * reverse_mapping[edge1][0][2]
                        node_inf = [
                            tuple(i[0:3]) for i in reverse_mapping[edge1]
                        ]
                    else:
                        qualities = [10] * reverse_mapping[edge2][0][2]
                        node_inf = [
                            tuple(i[0:3]) for i in reverse_mapping[edge2]
                        ]
                    tmp = node_inf.copy()
                    # On entering a different locus remember its candidates and
                    # restart the count of matched edges; otherwise count on.
                    if prev_locus != tmp[0][0]:
                        prev_tmp = tmp.copy()
                        prev_locus = tmp[0][0]
                        len_in_path = 1
                    else:
                        len_in_path += 1

                    interset_tmp = list(set(tmp).intersection(set(prev_tmp)))
                    # A variant is added when exactly one candidate is shared
                    # with the first edge of this locus and its third element
                    # equals the number of edges matched so far.
                    if len(interset_tmp) == 1 and interset_tmp[0][
                            2] == len_in_path:  # for complicated bubbles, but with Top-k paths. combination of some nodes uniquely determine branch.
                        qualities[interset_tmp[0][1]] = 0
                        read.add_variant(interset_tmp[0][0],
                                         interset_tmp[0][1], qualities)
                        n_variant += 1

            # Only reads covering at least two variants are kept.
            if len(read) >= 2:
                readset.add(read)

    # Drop reads for which read.sort() returns 1 (presumably signalling
    # duplicated variant positions -- confirm against the Read class).
    readset1 = ReadSet()
    # Positions occurring more than once in a dropped read; collected but not
    # returned.
    tmp_duplicated = set()
    for read in readset:
        if read.sort() == 1:
            duplicated = duplicated + 1
            tmp = []
            for variant in read:
                tmp.append(variant.position)
            x = [
                item for item, count in collections.Counter(tmp).items()
                if count > 1
            ]
            for a in x:
                tmp_duplicated.add(a)
            continue
        else:
            if len(read) >= 2:
                readset1.add(read)

    readset1.sort()

    return readset1, alleles_per_pos, locus_branch_mapping, readset
Exemple #22
0
#!/usr/bin/env python

import sys, io
import packet_pb2 as packet
import job_pb2 as job
import stream

# Dump a protobuf job stream: print the packet header(s), then every JobInfo.
# This script uses Python 2 print statements.
# Example from https://developers.google.com/protocol-buffers/docs/pythontutorial
if len(sys.argv) != 2:
    print "Usage:", sys.argv[0], "jobs.pb.gz"
    sys.exit(-1)

# Open the file and discard the header
istream = stream.open(sys.argv[1], "rb")

# NOTE(review): with the break below commented out, this loop parses and
# prints every message of the stream as a PacketHeader, not only the first
# one. Whether the second loop still receives any messages depends on how the
# stream object resumes iteration -- confirm.
for msg in istream:
    hdr = packet.PacketHeader()
    hdr.ParseFromString(msg)
    print hdr
    #break

# Remaining messages are decoded as JobInfo records.
for msg in istream:
    jobinfo = job.JobInfo()
    jobinfo.ParseFromString(msg)
    print jobinfo

# Close the file
istream.close()

Exemple #23
0
import sys
import stream
import logging
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple
from collections import defaultdict
# assumption ... all S' and before L's

# d: node ID -> names of the alignments that visit it (one entry per visit, so
# a read crossing a node several times is listed several times).
d = defaultdict(list)
count = 1

with stream.open('true_hap1.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        #if g.name != "S1_SK1_110":
        #	continue
        # tmp is not filled anywhere in the visible part of the script.
        tmp = []
        for i in range(0, len(g.path.mapping)):
            node = g.path.mapping[i].position.node_id
            d[node].append(g.name)
        #print(tmp)

count = 0
max_val = 0
# For every node, tally how often each read name occurs. NOTE: the inner loop
# rebinds the module-level `count` to the per-read tally.
for k, v in d.items():
    for item, count in collections.Counter(v).items():
        if len(v) > 1:
# Reusable per-batch buffers: features are (batch_size, 60*50, 3) and labels
# are (batch_size, n_classes), both float32. np.ndarray leaves them
# uninitialised; they are zeroed with fill(0.) at the start of every batch.
feature_filler = np.ndarray(
    shape=(batch_size, 60*50, 3), dtype=np.float32)
label_filler = np.ndarray(
    shape=(batch_size, n_classes), dtype=np.float32)

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Initializing the variables
sess = tf.Session()
sess.run(tf.global_variables_initializer())


# Read Data
# All serialized events are loaded into memory so they can be shuffled.
print("Reading train data ...")
istream = stream.open("output/train.gam", "rb")
dataarray = []
for data in istream:
    dataarray.append(data)
istream.close()
print("... read %d events" % len(dataarray))
np.random.shuffle(dataarray)
np.set_printoptions(linewidth=200)


# Train
# A single Event message object is created once, before the batch loop.
event = neuland.Event()
for n, batch in enumerate(chunks(dataarray, batch_size)):
    feature_filler.fill(0.)
    label_filler.fill(0.)
    for m, data in enumerate(batch):
Exemple #25
0
from collections import OrderedDict, namedtuple
from collections import defaultdict
# assumption ... all S' and before L's
# Path of the edge list: read two lines at a time, each pair forming one edge.
filename = sys.argv[1]
#out = sys.argv[2]

# d: "<first line>_<second line>" edge key -> per-edge record.
d = {}
count = 1
with open(filename) as fp:
    for line in fp:
        var = line.rstrip()
        # Consume the following line as the second endpoint.
        # NOTE(review): next(fp) raises StopIteration if the file has an odd
        # number of lines.
        edge = var + "_" + next(fp).rstrip()
        #print(edge)
        # NOTE: a defaultdict without a factory behaves like a plain dict.
        d[edge] = defaultdict()  # periodicity in a read, read support

with stream.open('ouralns.SK1.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        #if g.name != "S1_SK1_110":
        #	continue
        tmp = []
        # Build the key of every pair of consecutive mapped nodes in both
        # directions, in the same "<node>_<node>" format as the keys of d.
        for i in range(0, len(g.path.mapping) - 1):
            edge1 = str(g.path.mapping[i].position.node_id) + "_" + str(
                g.path.mapping[
                    i + 1].position.node_id)  # go over nodes in a mapping
            edge2 = str(g.path.mapping[i + 1].position.node_id) + "_" + str(
                g.path.mapping[i].position.node_id
            )  # go over nodes in a mapping
            #if edge1 in d or edge2 in d:
            #print(edge1)
Exemple #26
0
from collections import OrderedDict, namedtuple
from collections import defaultdict
# assumption ... all S' and before L's
#filename = sys.argv[1]
#out = sys.argv[2]

d={}
count=1






# nodes: IDs of every graph node visited by at least one alignment in the
# canu contig GAM file.
nodes = set()
with stream.open('canu_new.contigs.gam', "rb") as istream:
	for data in istream:
		g = vg_pb2.Alignment()
		g.ParseFromString(data)
		for i in range(0,len(g.path.mapping)):
			node = g.path.mapping[i].position.node_id
			nodes.add(node)


# Filled while scanning the snarl traversals below (the filling code lies
# beyond the visible part of the script).
bubbles_start = set()
covered_by_canu =set()
with stream.open('assembly_graph.P.int.remn2n.X_100.trans' ,"rb") as istream:
	for data in istream:
		l = vg_pb2.SnarlTraversal()
		l.ParseFromString(data)
		if l.snarl.start.backward == True:
Exemple #27
0
def _reverse_traversal(trav):
    """Return the (node ID, orientation) pair *trav* with its orientation flipped."""
    return (trav[0], not trav[1])


def _reverse_path(to_reverse):
    """Return *to_reverse* in reverse order with every traversal flipped."""
    return [_reverse_traversal(t) for t in reversed(to_reverse)]


def _load_traversals(vg_file):
    """Read the graph stream *vg_file* and index its edges by traversal.

    Returns a dict from a (node ID, orientation) pair, where True is reverse
    (leftward) and False is forward (rightward), to the set of
    (node ID, orientation) pairs reached when reading off of that node in that
    orientation. Every edge is stored as read and also flipped, so it can be
    followed in either direction.
    """
    traversals_after = defaultdict(set)
    with stream.open(str(vg_file), "rb") as istream:
        for data in istream:
            graph = vg_pb2.Graph()
            graph.ParseFromString(data)
            for edge in graph.edge:
                # "from" is a Python keyword, hence getattr.
                from_traversal = (getattr(edge, "from"), edge.from_start)
                to_traversal = (edge.to, edge.to_end)
                traversals_after[from_traversal].add(to_traversal)
                traversals_after[_reverse_traversal(to_traversal)].add(
                    _reverse_traversal(from_traversal))
    return traversals_after


def _dfs_path(traversals_after, start, goal, tmp, max_expansions=5000):
    """Search a path of traversals from node *start* to node *goal*.

    The search starts from *start* in both orientations, only steps onto nodes
    listed in *tmp*, and accepts a path only if it consists of exactly
    len(tmp) traversals. Returns that path as a list of
    (node ID, orientation) pairs, or [] if none is found.
    """
    stack = [((start, True), [(start, True)]),
             ((start, False), [(start, False)])]
    visited = set()
    # NOTE(review): this seeds `visited` with the bare node ID, while membership
    # is tested with (node ID, orientation) pairs, so the start node is not
    # actually excluded from revisits. Kept as-is to preserve search results.
    visited.add(start)
    count = 0
    while stack:
        traversal, path = stack.pop()
        for successor in traversals_after[traversal]:
            # Once the budget is used up no traversal is expanded any more; the
            # remaining stack entries are merely drained.
            if count > max_expansions:
                break
            if successor[0] in tmp and successor not in visited:
                if successor[0] == goal:
                    if len(path) == len(tmp) - 1:
                        return path + [successor]
                else:
                    count += 1
                    visited.add(successor)
                    stack.append((successor, path + [successor]))
    return []


def _bubble_paths(sample_superreads, locus_branch_mapping, traversals_after,
                  haptype):
    """Find the oriented path through every phased bubble for one haplotype.

    Returns (haplotype_over_bubbles, start_node_to_bubble). Both are keyed by
    the traversal a bubble is entered with, once for walking it forward and
    once for walking it backward; the first maps to the oriented path through
    the bubble, the second to the bubble's variant position.
    """
    haplotype_over_bubbles = {}
    start_node_to_bubble = {}
    for superreads in sample_superreads.values():
        for v1, v2 in zip(*superreads):
            v = v1 if haptype == 0 else v2
            b = locus_branch_mapping[v.position][v.allele]
            # tmp stores the nodes over the haplotype path in a bubble: the
            # first node of the first edge, then the last node of every edge.
            tmp = [b[0][0]]
            tmp.extend(edge[-1] for edge in b)

            path = _dfs_path(traversals_after, tmp[0], tmp[-1], tmp)
            if len(path) != len(tmp):
                # Nothing found start -> end; try the opposite direction.
                path = _dfs_path(traversals_after, tmp[-1], tmp[0], tmp)
            if len(path) != len(tmp):
                continue

            # store the haplotype path with start or end as key
            backward_key = _reverse_traversal(path[-1])
            haplotype_over_bubbles[path[0]] = path  # from start
            haplotype_over_bubbles[backward_key] = _reverse_path(path)
            start_node_to_bubble[path[0]] = v.position
            start_node_to_bubble[backward_key] = v.position
    return haplotype_over_bubbles, start_node_to_bubble


def _contig_blocks(save_nodes, haplotype_over_bubbles, start_node_to_bubble,
                   components):
    """Split one Canu contig walk into blocks, one per run of a component.

    *save_nodes* is the contig's walk as (node ID, is_reverse) pairs. Inside a
    bubble the phased haplotype path replaces the Canu walk. Returns a list of
    blocks, each a list of (node ID, is_reverse) pairs.
    """
    contig_nodes = []
    current_block = []
    # What component was the last bubble in, if there was a last bubble
    prev_component = None
    # Index of the next Canu visit that still has to be processed.
    it_val = 0
    already_done = set()
    for i, traversal_key in enumerate(save_nodes):
        if i < it_val:
            # Don't do this Canu visit, it's part of a bubble we already did.
            continue
        index1, orientation_canu = traversal_key

        # to take care of components, break when the component of the previous
        # and the current bubble is not equal
        if traversal_key in start_node_to_bubble:
            component = components[start_node_to_bubble[traversal_key]]
            if prev_component is not None and component != prev_component:
                contig_nodes.append(current_block)
                current_block = []
            prev_component = component

        if traversal_key not in haplotype_over_bubbles:
            # Outside of bubbles simply follow Canu.
            already_done.add(index1)
            current_block.append((index1, bool(orientation_canu)))
            continue

        bubble_path = haplotype_over_bubbles[traversal_key]
        if bubble_path[-1] not in save_nodes:
            # The bubble's last traversal is not on this contig; skip the visit.
            continue

        # Put each traversal that appears in the bubble in the current block,
        # except for the last one, which will be in the next bubble or in Canu
        # again.
        for node_id, is_reverse in bubble_path[:-1]:
            if node_id not in already_done:
                current_block.append((node_id, bool(is_reverse)))
                already_done.add(node_id)

        # Skip to the last traversal in the bubble (also shared by Canu), but
        # only if Canu reaches it after the bubble's first traversal.
        end_index = save_nodes.index(bubble_path[-1])
        if end_index > save_nodes.index(bubble_path[0]):
            it_val = end_index

    contig_nodes.append(current_block)  # for the last one.
    return contig_nodes


def generate_haplotigs(sample_superreads, components, node_seq_list,
                       locus_branch_mapping, canu_alignments, vg_file,
                       pred_haplotigs, locus_file):
    """Write the predicted haplotigs of both haplotypes to *pred_haplotigs*.

    The underlying graph is treated as bidirected. Canu contigs are walked and
    broken wherever the phasing component changes; inside bubbles the phased
    haplotype path is used, elsewhere the Canu walk itself.

    Parameters:
        sample_superreads: mapping whose values unpack with zip(*...) into
            pairs of variants (haplotype 1, haplotype 2), each with
            `.position` and `.allele`.
        components: maps a variant position to its phasing component.
        node_seq_list: maps a node ID to its sequence.
        locus_branch_mapping: locus_branch_mapping[position][allele] is the
            list of (node, node) edges of that branch.
        canu_alignments: path of the GAM stream with the Canu contig alignments.
        vg_file: path of the vg graph stream.
        pred_haplotigs: path of the FASTA-like output file (overwritten).
        locus_file: only used as part of the sequence names.

    Returns None. One record ">seq<block>_<locus_file>_<haplotype>" followed by
    its sequence line is written per block, for haplotypes 1 and 2.
    """
    # The context manager guarantees the output is flushed and closed, also
    # when reading one of the inputs fails.
    with open(pred_haplotigs, 'w') as pred_haplotigs_file:
        traversals_after = _load_traversals(vg_file)

        for haptype in range(2):
            haplotype_over_bubbles, start_node_to_bubble = _bubble_paths(
                sample_superreads, locus_branch_mapping, traversals_after,
                haptype)

            with stream.open(str(canu_alignments), "rb") as istream:
                for data in istream:
                    g = vg_pb2.Alignment()
                    g.ParseFromString(data)
                    save_nodes = [(m.position.node_id, m.position.is_reverse)
                                  for m in g.path.mapping]

                    contig_nodes = _contig_blocks(save_nodes,
                                                  haplotype_over_bubbles,
                                                  start_node_to_bubble,
                                                  components)

                    # build the contig sequence taking care of reverse
                    # complements for every canu contig
                    for j, contig_blocks in enumerate(contig_nodes):
                        contig_nodes_seq = "".join(
                            reverse_complement(str(node_seq_list[node]))
                            if is_reverse else str(node_seq_list[node])
                            for node, is_reverse in contig_blocks)
                        pred_haplotigs_file.write(">seq" + str(j) + "_" +
                                                  str(locus_file) + "_" +
                                                  str(haptype + 1) + "\n")
                        pred_haplotigs_file.write(contig_nodes_seq + '\n')
import logging
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple
from collections import defaultdict
# NOTE(review): the original note here read "assumption ... all S' and before
# L's" -- presumably the S (segment) records are expected to precede the L
# (link) records of the graph file; confirm against the input format.
#filename = sys.argv[1]
#out = sys.argv[2]

# Neither `d` nor `count` is used in the visible part of this script.
d = {}
count = 1

# Collect one boundary node id per snarl traversal read from the hard-coded
# chrXIII traversal stream.
bubbles_start = set()
with stream.open('assembly_graph.P.int.remn2n.X_100.chrXIII.trans',
                 "rb") as istream:
    #with stream.open('assembly_graph.P.int.remn2n.X_100.trans' ,"rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        # When the snarl's start is flagged backward, the end node id is
        # recorded instead of the start node id.
        if l.snarl.start.backward == True:
            start_node = l.snarl.end.node_id
        else:
            start_node = l.snarl.start.node_id
        bubbles_start.add(start_node)

# List-valued accumulators; nothing is stored in them in the visible part of
# this script.
multiplicity_bubbles = defaultdict(list)
read_details = defaultdict(list)
# A fresh vg_pb2.Alignment message is created for every record of the stream.
with stream.open('../out.new.chrXIII.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
Exemple #29
0
def reverse_map(locus_file):
    """Group the snarl traversals of ``locus_file`` into loci and index them.

    Each record of the stream is parsed as a ``vg_pb2.SnarlTraversal`` and
    converted into a list of ``(from_node, to_node)`` edges.  Consecutive
    traversals whose snarl has the same start/end node ids and orientations
    are collected as the alternative paths of one locus.  A traversal that
    visits a nested snarl is not stored directly: its edge list is stashed
    and later combined with following traversals through ``mergePath``.

    Loci are keyed by a counter that starts at 0 and is decremented every
    time a locus is closed, so the keys are 0, -1, -2, ...

    Args:
        locus_file: Location of the traversal stream; passed through
            ``str`` before being handed to ``stream.open``.

    Returns:
        A 4-tuple ``(reverse_mapping, allele_reverse_mapping,
        alleles_per_pos, locus_branch_mapping)`` where

        * ``reverse_mapping`` maps a node id to the set of locus keys whose
          paths contain that node,
        * ``allele_reverse_mapping`` maps an edge to a list of
          ``[locus key, path index, edges in that path, paths in the locus]``
          entries,
        * ``alleles_per_pos`` maps a locus key to its number of paths,
        * ``locus_branch_mapping`` maps a locus key to its list of paths.
    """
    print('Start to read locus_file')

    locus_branch_mapping = OrderedDict()
    locus_id = 0
    locus_paths = []
    # (start node, start orientation, end node, end orientation) of the snarl
    # read last; these initial values stand for "nothing read yet".
    previous_bounds = (0, -1, 0, -1)
    # 0: no nested traversal pending, 1: one was just stashed,
    # 2: the stash has already been merged once.
    nested_state = 0

    with stream.open(str(locus_file), "rb") as istream:
        for record in istream:
            traversal = vg_pb2.SnarlTraversal()
            traversal.ParseFromString(record)
            snarl = traversal.snarl
            visits = traversal.visits
            start_id = snarl.start.node_id
            end_id = snarl.end.node_id
            bounds = (start_id, snarl.start.backward,
                      end_id, snarl.end.backward)
            same_snarl = bounds == previous_bounds
            has_nested = False

            if len(visits) == 0:
                # No inner visits: a single edge joining the two boundaries.
                if snarl.start.backward:
                    edges = [(end_id, start_id)]
                else:
                    edges = [(start_id, end_id)]
            else:
                # Exactly one boundary flagged backward: walk from end to
                # start instead of from start to end.
                reversed_walk = snarl.start.backward != snarl.end.backward
                local_path_back = -1 if reversed_walk else 1

                node_chain = [start_id]
                node_chain.extend(visit.node_id for visit in visits)
                node_chain.append(end_id)
                if reversed_walk:
                    node_chain.reverse()
                edges = list(zip(node_chain, node_chain[1:]))

                # A visit that carries a snarl marks a nested bubble.
                for visit in visits:
                    if visit.snarl.start.node_id != 0:
                        outer_backward = reversed_walk
                        nested_backward = bool(visit.backward)
                        nested_state = 1
                        has_nested = True

            if has_nested:
                # Keep the edges aside; they are merged into later paths.
                stashed_edges = edges.copy()
                if not same_snarl:
                    locus_branch_mapping[locus_id] = locus_paths
                    locus_id -= 1
                    locus_paths = []
            elif same_snarl:
                if nested_state == 2:
                    locus_paths.append(
                        mergePath(stashed_edges, edges, nested_backward,
                                  outer_backward, local_path_back))
                    nested_state = 0
                    nested_backward = False
                    outer_backward = False
                else:
                    locus_paths.append(edges)
            elif nested_state == 1:
                nested_state = 2
                locus_paths.append(
                    mergePath(stashed_edges, edges, nested_backward,
                              outer_backward, local_path_back))
            else:
                # A different snarl starts: close the current locus.
                locus_branch_mapping[locus_id] = locus_paths
                locus_id -= 1
                locus_paths = [edges]

            previous_bounds = bounds

    # Store the locus that was still open when the stream ended.
    locus_branch_mapping[locus_id] = locus_paths

    alleles_per_pos = {
        key: len(paths) for key, paths in locus_branch_mapping.items()
    }
    het_count = sum(1 for n_paths in alleles_per_pos.values() if n_paths > 1)
    print('The number of hets:', het_count)

    reverse_mapping = defaultdict(set)
    allele_reverse_mapping = defaultdict(list)
    for key, paths in locus_branch_mapping.items():
        if not paths:
            continue
        n_paths = len(paths)
        for path_index, path in enumerate(paths):
            n_edges = len(path)
            for edge in path:
                for node in edge:
                    reverse_mapping[node].add(key)
                allele_reverse_mapping[edge].append(
                    [key, path_index, n_edges, n_paths])
    return reverse_mapping, allele_reverse_mapping, alleles_per_pos, locus_branch_mapping
Exemple #30
0
        for neutron in rootevent.NeulandPrimaryNeutronInteractionPixels:
            n = protoevent.neutrons.add()
            n.x = neutron.GetX()
            n.y = neutron.GetY()
            n.z = neutron.GetZ()
            n.t = neutron.GetT()
        for digi in rootevent.NeulandDigis:
            b = protoevent.digis.add()
            b.id = digi.GetPaddle()
            b.tl = digi.GetTdcL()
            b.tr = digi.GetTdcR()
            b.e = digi.GetE()
        ostream.write(protoevent)


# Script entry: sys.argv[1] is the protobuf stream to write, every further
# argument is a ROOT digi file.  The matching simulation file name is derived
# by replacing '.digi.' with '.sim.' in the digi file name.
print("Writing to protobuf stream %s" % sys.argv[1])
ostream = stream.open(sys.argv[1], 'wb')

try:
    for digifilename in sys.argv[2:]:
        simufilename = digifilename.replace('.digi.', '.sim.')

        print("Reading ROOT file %s" % digifilename)
        tfile = ROOT.TFile.Open(digifilename)
        ttree = tfile.Get("evt")
        print("... and ROOT file %s" % simufilename)
        # Attach the simulation tree as a friend of the digi tree.
        ttree.AddFriend("simtree = evt", simufilename)

        read_and_append(ttree, ostream)
finally:
    # Close the output stream even when one of the input files fails, so the
    # stream is never left open after an error.
    ostream.close()
Exemple #31
0
import sys
import stream
import logging
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple
from collections import defaultdict

# The alignment stream to read comes from the first command-line argument;
# output lines are written to "<input>.gfa".
file_input = sys.argv[1]
#file_out = argv[2]
out = open(file_input + '.gfa', 'w')

# Every node id seen on either side of a pair of consecutive mappings.
nodes_list = set()
with stream.open(str(file_input), "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        # Walk each pair of consecutive mappings along the alignment path.
        for i in range(0, len(g.path.mapping) - 1):
            node1 = g.path.mapping[i].position.node_id
            node2 = g.path.mapping[i + 1].position.node_id
            nodes_list.add(node1)
            nodes_list.add(node2)
            # Both mappings reversed: emit a tab-separated "L" line with '-'
            # after each node id and "0M" as the last field.
            if g.path.mapping[i].position.is_reverse == True and g.path.mapping[
                    i + 1].position.is_reverse == True:
                out.write("L" + "\t" + str(node1) + "\t" + '-' + "\t" +
                          str(node2) + "\t" + '-' + "\t" + "0M" + "\n")
            if g.path.mapping[
                    i].position.is_reverse == False and g.path.mapping[
                        i + 1].position.is_reverse == True:
Exemple #32
0
        'C': 'G',
        'G': 'C',
        'T': 'A',
        'a': 'T',
        'c': 'G',
        'g': 'C',
        't': 'A'
    }
    return "".join([seq_dict[base] for base in reversed(seq)])


# Neither `nodes_list` nor `dummy_list` is used in the visible part of this
# script; both are kept because later statements may still refer to them.
nodes_list = set()
dummy_list = ['0'] * 100

# Group the alignments of the input stream by their name.
orderalignment = defaultdict(list)
with stream.open(str(file_input), "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        #read_info = g.name
        canu_name = g.name

        # Converted here so that a malformed position still fails while the
        # stream is being read.
        canu_chunk_num = int(g.query_position)
        orderalignment[canu_name].append(g)

# Order every group by chunk number.  The previous approach inserted each
# alignment at index `canu_chunk_num` of a list padded with placeholders;
# list.insert() shifts the following entries, so the resulting order was only
# correct for some arrival orders (chunks arriving as 1, 3, 0, 2 came out as
# 0, 2, 1, 3).  sorted() is stable, so alignments with equal chunk numbers
# keep their order of arrival.
new_orderalignment = defaultdict(list)
for k, v in orderalignment.items():
    new_orderalignment[k] = sorted(v, key=lambda aln: int(aln.query_position))

print('hello')
ostream = stream.open(sys.argv[2], 'wb')
Exemple #33
0
import sys
import stream
import vg_pb2
from collections import Counter
from collections import defaultdict
import collections
from collections import OrderedDict, namedtuple

# NOTE(review): the original note here read "assumption ... all S' and before
# L's" -- presumably the S (segment) records are expected to precede the L
# (link) records of the graph file; confirm against the input format.

# The output path comes from the first command-line argument; the file is
# opened for writing here but not written to in the visible part of the
# script.
out = sys.argv[1]
f = open(out, 'w')

# Stays empty in the visible part of the script.
bubble_to_remove = set()
# Start-node id (as a string) of every snarl traversal -> hit counter.
bubbles_dict_trans = defaultdict(int)
with stream.open('component54.trans', "rb") as istream:
    for data in istream:
        l = vg_pb2.SnarlTraversal()
        l.ParseFromString(data)
        tmp = str(l.snarl.start.node_id)
        # Register the start node with a count of zero.
        bubbles_dict_trans[tmp] = 0

# Count, for every registered start node, how many path mappings of the
# alignment stream are positioned on it.
with stream.open('out.new.chrI.gam', "rb") as istream:
    for data in istream:
        g = vg_pb2.Alignment()
        g.ParseFromString(data)
        for i in range(0, len(g.path.mapping)):
            node = str(g.path.mapping[i].position.node_id)
            # Test membership first: indexing the defaultdict directly would
            # create entries for nodes that were never registered above.
            if node in bubbles_dict_trans:
                bubbles_dict_trans[node] += 1