Example #1
def get_intersecting_intervals(args):
    import logging
    from offsetbasedgraph import IntervalCollection
    intervals1 = IntervalCollection.from_file(args.file1, text_file=True, graph=args.graph)
    intervals2 = IntervalCollection.from_file(args.file2, text_file=True, graph=args.graph)

    out = []
    for interval1 in intervals1.intervals:
        for interval2 in intervals2.intervals:
            if interval1.intersects(interval2):
                out.append(interval1)
                logging.info("Found match between %s and %s" % (interval1, interval2))
                break  # stop after the first match so interval1 is only added once

    IntervalCollection(out).to_file(args.out_file_name, text_file=True)
    logging.info("Wrote intersecting intervals to %s" % args.out_file_name)
Example #2
    def _read_alignments(self):
        # Dispatch on the file extension to support several alignment formats
        if self.alignment_file_name.endswith(".json"):
            # vg JSON alignments
            self.alignments = vg_json_file_to_interval_collection(
                self.alignment_file_name).intervals
        elif self.alignment_file_name.endswith(".graphnodes"):
            # One line per read: a read id, then comma-separated node ids
            self.alignments = (Interval(0, 1, [int(n) for n in line.strip().split()[1].split(",")])
                               for line in open(self.alignment_file_name))
        elif self.alignment_file_name.endswith(".graphalignments"):
            # One line per read: a read id, then an interval line ("." means unaligned)
            self.alignments = (Interval.from_file_line(line.strip().split("\t")[1])
                               for line in open(self.alignment_file_name)
                               if line.strip().split("\t")[1] != ".")
        else:
            self.alignments = IntervalCollection.from_file(
                self.alignment_file_name).intervals
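For illustration, a short sketch of how the .graphnodes branch above turns one line into an Interval; the line content is made up, and Interval is assumed to be importable from offsetbasedgraph as in the other examples:

# Hypothetical .graphnodes line: a read id, then comma-separated node ids
from offsetbasedgraph import Interval

line = "read_1\t5,6,8"  # made-up example line
nodes = [int(n) for n in line.strip().split()[1].split(",")]
alignment = Interval(0, 1, nodes)   # same construction as in the snippet above
print(alignment.region_paths)       # [5, 6, 8]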
Example #3
    def from_file(cls, file_name, graph):
        logging.info("Reading from file")
        logging.info("Reading dict structure")
        with open(file_name, "rb") as f:
            node_dict = pickle.load(f)

        logging.info("Reading intervals")
        intervals = IntervalCollection.from_file(file_name + ".intervals",
                                                 graph=graph)
        return cls(node_dict, graph, list(intervals))
Example #4
def run_predict_path(args):
    chromosomes = args.chromosomes.split(",")
    processes = []
    if not os.path.isfile(args.alignments):
        logging.error("Input alignments file %s does not exist" %
                      args.alignments)
        sys.exit()

    for chromosome in chromosomes:
        logging.info("Starting process for chromosome %s " % chromosome)
        process = Process(target=run_predict_path_single_chromosome,
                          args=(args.alignments, chromosome, args.data_dir,
                                args.linear_ref_bonus, args.out_file_name,
                                args.max_nodes_to_traverse))
        process.start()
        processes.append(process)

    for process in processes:
        process.join()

    # Merge all fasta files that were produced
    out_fasta = open(args.out_file_name + ".fa", "w")
    logging.info("Merging fasta files")
    for chromosome in tqdm(chromosomes):
        with open(args.out_file_name + "_" + chromosome + ".fasta") as f:
            out_fasta.write(f.read())
    out_fasta.close()

    logging.info("Wrote resulting linear reference to %s" %
                 (args.out_file_name + ".fa"))

    # Create indexed intervals for each interval file that was produced
    logging.info("Creating indexed interval for all chromosomes")
    for chromosome in chromosomes:
        file_name = args.out_file_name + "_" + chromosome + ".intervalcollection"
        graph = Graph.from_file(args.data_dir + chromosome + ".nobg")
        intervals = IntervalCollection.from_file(file_name,
                                                 text_file=True,
                                                 graph=graph)
        intervals = list(intervals.intervals)
        assert len(intervals) == 1, \
            "Only a single interval in file is supported"
        interval = intervals[0]
        indexed = interval.to_numpy_indexed_interval()
        indexed.to_file(file_name + ".indexed")
        logging.info("Wrote indexed interval to file %s" % file_name +
                     ".indexed")

    if not args.skip_bwa_index:
        logging.info("Running bwa index")
        run_bwa_index(args.out_file_name + ".fa")
    else:
        logging.info("Not creating bwa index")
Example #5
def analyse_pileups_on_peaks(ob_graph, pileups_file_names,
                             peak_intervals_file_name):
    print("Analysing peaks")
    pileups = {
        name: SparsePileup.from_bed_graph(ob_graph, pileup)
        for name, pileup in pileups_file_names.items()
    }
    peaks = IntervalCollection.from_file(peak_intervals_file_name,
                                         text_file=True)

    for peak in peaks:
        print()
        print("Peak %s" % peak)
        for name, pileup in pileups.items():
            pileup_sum = sum(pileup.data[rp].sum() for rp in peak.region_paths)
            print("Pileup %s: %d" % (name, pileup_sum))
Example #6
def make_haplotype_fasta(chromosome, haplotype, data_dir):

    s = SequenceGraph.from_file(data_dir + "giab_chr" + chromosome +
                                ".nobg.sequences")
    print("Getting interval")
    interval = list(
        IntervalCollection.from_file(data_dir + "haplotype_" + chromosome +
                                     "__" + haplotype + ".intervalcollection",
                                     text_file=True).intervals)[0]

    print("Getting sequence")
    sequence = s.get_interval_sequence(interval)
    print("Writing to file")
    f = open(
        data_dir + "giab_chr" + chromosome + "_haplotype" + haplotype +
        ".fasta", "w")
    f.write(">seq\n%s\n" % sequence)
    f.close()
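A small driver sketch that writes a fasta for both haplotypes of one chromosome; the chromosome and data directory are placeholders, and the directory must end with a slash since the function concatenates paths directly:

# Hypothetical driver for make_haplotype_fasta
for haplotype in ["0", "1"]:
    make_haplotype_fasta("21", haplotype, "data/")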
Example #7
def vg_alignments_to_linear():
    ob_graph = obg.GraphWithReversals.from_file("haplo1kg50-mhc.obg")
    vg_graph = pyvg.vg.Graph.create_from_file("haplo1kg50-mhc.json")
    path = create_linear_path(ob_graph, vg_graph)
    analyser = AlignmentsAnalyser(
        vg_graph, "ENCFF001HNI_haplo1kg50-mhc_filtered_q50.gam", ob_graph,
        path)  # sample reads
    #linear = analyser.to_linear_alignments()
    #collection = IntervalCollection(linear)
    #collection.to_file("graph_reads_on_linear2.intervals")

    linear = IntervalCollection.from_file(
        "graph_reads_on_linear2.intervals").intervals
    #linear = IntervalCollection.create_list_from_file("graph_reads_on_linear.intervals")
    f = open("graph_reads_on_linear.bed", "w")
    path = path.to_indexed_interval()
    for read in linear:
        read.graph = ob_graph
        assert np.all(np.array(read.region_paths) > 0) or np.all(
            np.array(read.region_paths) < 0)

        dir = "+"
        if read.region_paths[0] < 0:
            dir = "-"
            read = read.get_reverse()

        graph_start = read.start_position
        graph_end = read.end_position

        linear_start = MHC_REGION.start + path.get_offset_at_position(
            graph_start)
        linear_end = MHC_REGION.start + path.get_offset_at_position(graph_end)

        f.writelines("chr6\t%d\t%d\t.\t0\t%s\n" %
                     (linear_start, linear_end, dir))
    f.close()
Example #8
def make_haplotype_paths(graph_file_name, linear_ref_path_file_name,
                         haplotype0_file_name, haplotype1_file_name,
                         out_base_name, chromosome):
    # Make a linear reference fasta and interval, plus a fasta and interval for each haplotype

    chrom = chromosome
    graph = Graph.from_file(graph_file_name)
    sequence_graph = SequenceGraph.from_file(graph_file_name + ".sequences")

    linear_ref = IntervalCollection.from_file(linear_ref_path_file_name,
                                              text_file=True)
    linear_ref = list(linear_ref.intervals)[0]
    linear_ref_nodes = set(linear_ref.region_paths)

    # Write linear ref fasta to file
    linear_ref_seq = sequence_graph.get_interval_sequence(linear_ref)
    out_file = open("linear_ref_" + chrom + ".fasta", "w")
    out_file.writelines([">%s\n" % chrom])
    out_file.writelines([linear_ref_seq + "\n"])
    out_file.close()
    logging.info("Wrote linear ref sequence. N nodes in linear ref: %d" %
                 len(linear_ref_nodes))

    haplotype_nodes = [set(), set()]  # For haplotype 0 and 1
    for haplotype in [0, 1]:
        haplotype_file_name = haplotype0_file_name
        if haplotype == 1:
            haplotype_file_name = haplotype1_file_name

        intervals = vg_json_file_to_intervals(haplotype_file_name, graph)

        for interval in intervals:
            for node in interval.region_paths:
                haplotype_nodes[haplotype].add(node)

    logging.info("N nodes in haplotype 0: %d" % len(haplotype_nodes[0]))
    logging.info("N nodes in haplotype 0 that are also in linear ref: %d" %
                 len(haplotype_nodes[0].intersection(linear_ref_nodes)))
    logging.info("N nodes in haplotype 1: %d" % len(haplotype_nodes[1]))

    # Traverse graph to get full correct haplotype intervals
    first_nodes = graph.get_first_blocks()
    assert len(first_nodes) == 1
    logging.info("N nodes in graph: %d" % len(graph.blocks))

    for haplotype in [0, 1]:
        logging.info("Traversing haplotype %d" % haplotype)

        nodes = []
        node = first_nodes[0]
        nodes_in_haplotype = haplotype_nodes[haplotype]
        # Note: the line below overrides the per-haplotype set above with every
        # node id that is not on the linear reference
        nodes_in_haplotype = set(range(
            0, max(linear_ref_nodes))).difference(linear_ref_nodes)
        logging.info("There are %d haplotype nodes" % len(nodes_in_haplotype))

        assert len(nodes_in_haplotype) > 0, \
            "There are no haplotype nodes. Check that haplotype json files are not empty"

        n_haplotype_nodes = 0
        i = 0
        while True:

            nodes.append(node)
            if i % 50000 == 0:
                logging.info("#%d nodes traversed. On node %d" % (i, node))
            i += 1

            next_nodes = set(graph.adj_list[node])

            if len(next_nodes) == 0:
                logging.info("Reached end node %d with 0 edges" % node)
                break

            next_on_haplotype = next_nodes.intersection(nodes_in_haplotype)
            if len(next_on_haplotype) == 1:
                n_haplotype_nodes += 1
                next_node = list(next_on_haplotype)[0]
                assert next_node != node
                node = next_node
            elif len(next_on_haplotype) == 0:
                logging.debug(
                    "No new haplotype node from %d. Will follow reference" %
                    node)
                # Choose reference with lowest id to avoid deletion
                node = min(list(next_nodes.intersection(linear_ref_nodes)))
            else:
                # logging.warning("There is a deletion from node %d. Choosing lowest node id as next to avoid deletion." % node)
                # This means more than one next node is on haplotype. Choose the one with lowest id to avoid taking deletion
                node = min(list(next_on_haplotype))

        logging.info("Found %d nodes. %d on haplotype" %
                     (len(nodes), n_haplotype_nodes))
        haplotype_interval = Interval(0, graph.blocks[nodes[-1]].length(),
                                      nodes, graph)
        print("Path length: %d" % haplotype_interval.length())

        file_base_name = out_base_name + "_" + str(haplotype)
        IntervalCollection([haplotype_interval]).to_file(
            file_base_name + ".intervalcollection", text_file=True)

        sequence = sequence_graph.get_interval_sequence(haplotype_interval)
        out_file = open(file_base_name + ".fasta", "w")
        out_file.writelines([">%s\n" % chrom])
        out_file.writelines([sequence + "\n"])
        out_file.close()
        logging.info("Wrote fasta sequence to %s" % file_base_name + ".fasta")