Example #1
0
def test_filter_tiling_paths_by_len():
    """Check filter_tiling_paths_by_len against increasing length cutoffs.

    Loads the 'p_ctg_tiling_path_1' test file, computes the contig lengths
    with calc_tiling_paths_len, and verifies which contig IDs remain for each
    cutoff. Also verifies that an exception is raised when the length map is
    missing an entry for one of the contigs.
    """
    p_ctg_tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_path, _ = mod.load_tiling_paths(p_ctg_tiling_path_file, 'P')
    _, p_ctg_len = mod.calc_tiling_paths_len(p_path)

    # (length cutoff, contig IDs expected to remain after filtering)
    cases = [
        (0, ['000000F', '000001F', '000002F', '000003F', '000004F']),
        (10000, ['000000F', '000001F', '000002F', '000003F']),
        (35000, ['000000F', '000001F']),
        (100000, []),
    ]
    for min_len, expected_ctg_ids in cases:
        p_path_filtered = mod.filter_tiling_paths_by_len(
            p_path, p_ctg_len, min_len)
        assert sorted(p_path_filtered.keys()) == sorted(expected_ctg_ids)

    # Test a degenerate case where there is no length for a particular contig.
    # Materialize the keys as a list so that slicing works whether keys()
    # returns a list or a view; the first key (whichever it is) is dropped.
    kept_keys = list(p_ctg_len.keys())[1:]
    p_ctg_len_degenerate = dict((key, p_ctg_len[key]) for key in kept_keys)
    with pytest.raises(Exception):
        mod.filter_tiling_paths_by_len(p_path, p_ctg_len_degenerate, 0)
Example #2
0
def test_format_gfa_v1_path_line():
    """Check GFAGraph.format_gfa_v1_path_line with and without a seq_len_map.

    Three scenarios are covered using the 'p_ctg_tiling_path_1' test file:
    no sequence length map (every CIGAR field is '*'), a map assigning a
    fixed length of 10000 to every read, and a None path.
    """
    gfa_graph = mod.GFAGraph()

    # Load tiling paths from file.
    tiling_path_file = os.path.join(helpers.get_test_data_dir(),
                                    'p_ctg_tiling_path_1')
    p_paths, _ = gen_gfa_v1.load_tiling_paths(tiling_path_file, 'P')

    # Without a seq_len_map, all CIGAR operations should be '*'.
    expected_without_lengths = {
        '000000F':
        'P\t000000F\t000092122-,000081654-,000034462-,000061403-,000021348-,000062240-,000083779-,000019819+,000063672+,000026565+,000050047-\t*,*,*,*,*,*,*,*,*,*,*',
        '000001F':
        'P\t000001F\t000070651+,000018109+,000068978+,000100559+,000010548-,000006846-,000065052-,000071922+,000076878+,000000861+,000001755-\t*,*,*,*,*,*,*,*,*,*,*',
        '000002F':
        'P\t000002F\t000088930+,000008918+,000100248-,000085315-,000071965+,000082497+\t*,*,*,*,*,*',
        '000003F': 'P\t000003F\t000084518+,000011674+,000057445-\t*,*,*',
        '000004F': 'P\t000004F\t000014727+,000024020+,000060868+\t*,*,*',
    }
    for ctg_id, path in p_paths.iteritems():
        formatted = gfa_graph.format_gfa_v1_path_line(ctg_id, path, None)
        assert formatted == expected_without_lengths[ctg_id]

    # The seq_len_map dict is only used for the first read in the path,
    # because that read needs to be included completely. The remaining CIGAR
    # operations are determined directly from the edges.
    expected_with_lengths = {
        '000000F':
        'P\t000000F\t000092122-,000081654-,000034462-,000061403-,000021348-,000062240-,000083779-,000019819+,000063672+,000026565+,000050047-\t10000M,33726M,10123M,1352M,9924M,5834M,862M,5562M,1384M,473M,2171M',
        '000001F':
        'P\t000001F\t000070651+,000018109+,000068978+,000100559+,000010548-,000006846-,000065052-,000071922+,000076878+,000000861+,000001755-\t10000M,10077M,3766M,2648M,2421M,2089M,18168M,2723M,2451M,666M,15088M',
        '000002F':
        'P\t000002F\t000088930+,000008918+,000100248-,000085315-,000071965+,000082497+\t10000M,15215M,3113M,4851M,1857M,6035M',
        '000003F':
        'P\t000003F\t000084518+,000011674+,000057445-\t10000M,9432M,23096M',
        '000004F':
        'P\t000004F\t000014727+,000024020+,000060868+\t10000M,5238M,3235M',
    }
    for ctg_id, path in p_paths.iteritems():
        # Give every read touched by this path the same fixed length,
        # just to be safe.
        fixed_len_map = {}
        for edge in path:
            for node in (edge[0], edge[1]):
                read_id = node.split(':')[0]
                fixed_len_map[read_id] = 10000
        formatted = gfa_graph.format_gfa_v1_path_line(ctg_id, path,
                                                      fixed_len_map)
        assert formatted == expected_with_lengths[ctg_id]

    # Degenerate case: a None path should produce an empty line.
    formatted = gfa_graph.format_gfa_v1_path_line('', None, None)
    assert formatted == ''
Example #3
0
def test_calc_tiling_paths_len():
    """Compare calc_tiling_paths_len output with the module-level expectations.

    For every contig returned, each computed coordinate entry must also be
    present in expected_coord_map, and the contig length must equal the value
    in expected_contig_len.
    """
    tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_path, _ = mod.load_tiling_paths(tiling_path_file, 'P')
    p_coords, p_ctg_len = mod.calc_tiling_paths_len(p_path)

    for ctg_id in p_coords.keys():
        computed_coords = p_coords[ctg_id]
        # Entries present (with identical values) in both maps.
        matching = set(expected_coord_map[ctg_id].items()).intersection(
            computed_coords.items())
        assert len(matching) == len(computed_coords)
        assert expected_contig_len[ctg_id] == p_ctg_len[ctg_id]
Example #4
0
def test_calc_node_coords():
    """Exercise calc_node_coords on normal, cyclic, unsorted, circular and empty paths."""

    def load_p_paths(file_name):
        # Load the tiling paths of type 'P' from a file in the test data dir.
        tiling_path_file = os.path.join(
            helpers.get_test_data_dir(), file_name)
        paths, _ = mod.load_tiling_paths(tiling_path_file, 'P')
        return paths

    # The p_ctg_tiling_path_1 is a normal tiling path file.
    p_paths = load_p_paths('p_ctg_tiling_path_1')

    ctg_id = '000000F'
    coord_map, contig_len = mod.calc_node_coords(p_paths[ctg_id])
    matching = set(expected_coord_map[ctg_id].items()).intersection(
        coord_map.items())
    assert len(matching) == len(coord_map)
    assert expected_contig_len[ctg_id] == contig_len

    # The p_ctg_tiling_path_2 file holds three special contigs:
    # - 000000F which has an inner cycle
    # - 000001F which has an out-of-order edge
    # - 000002F which is circular (this is a valid case)
    p_paths = load_p_paths('p_ctg_tiling_path_2')

    # Cycles are allowed, but the node's coord gets overwritten.
    coord_map, contig_len = mod.calc_node_coords(p_paths['000000F'])
    assert coord_map['000081654:B'] == 55125

    # Unsorted graphs are not allowed.
    with pytest.raises(Exception):
        mod.calc_node_coords(p_paths['000001F'])

    # Circular graphs are allowed.
    coord_map, contig_len = mod.calc_node_coords(p_paths['000002F'])
    assert contig_len == 18473

    # An empty tiling path gives an empty coord map and zero length.
    coord_map, contig_len = mod.calc_node_coords([])
    assert not coord_map
    assert contig_len == 0
Example #5
0
def test_load_tiling_paths():
    """Check the contig IDs and the edge-to-contig lookup returned by load_tiling_paths."""
    tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_path, p_edge_to_ctg = mod.load_tiling_paths(tiling_path_file, 'P')

    expected_ctg_ids = ['000000F', '000001F', '000002F', '000003F', '000004F']
    assert sorted(p_path.keys()) == sorted(expected_ctg_ids)

    for ctg_id, path in p_path.iteritems():
        for edge in path:
            # Each edge record unpacks into seven fields; only the two end
            # nodes and the last field (the edge type) are checked here.
            v, w, _b, _e, _l, _idt, etype = edge
            edge_key = (v, w)
            assert edge_key in p_edge_to_ctg
            assert p_edge_to_ctg[edge_key] == (ctg_id, etype)
Example #6
0
def test_add_tiling_path():
    """Check that GFAGraph.add_tiling_path stores each path under its contig ID."""
    # Load the tiling path. These methods are tested in test_gen_gfa_v1.py.
    tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_paths, _ = gen_gfa_v1.load_tiling_paths(tiling_path_file, 'P')

    # Fill a fresh GFA graph with all of the loaded tiling paths.
    gfa_graph = mod.GFAGraph()
    for ctg_id, path in p_paths.iteritems():
        gfa_graph.add_tiling_path(path, ctg_id)

    # The graph should hold exactly as many paths as were loaded.
    num_stored = len(gfa_graph.paths.keys())
    num_loaded = len(p_paths.keys())
    assert num_stored == num_loaded

    # Every stored path should be the same as the loaded one.
    for ctg_id, path in p_paths.iteritems():
        assert ctg_id in gfa_graph.paths
        assert gfa_graph.paths[ctg_id] == path
Example #7
0
def test_write_gfa_v1_2():
    """Check that write_gfa_v1 raises when a graph node has no matching pread.

    A node ('12345:B') is added to the graph for which, per the test's
    premise, there is no corresponding pread in the preads4falcon.fasta file.
    """

    def gfa1_file(file_name):
        # Path of a file inside the 'gfa-1' test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', file_name)

    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    # Load the p_ctg tiling paths and add them to the GFA.
    p_paths, _ = gen_gfa_v1.load_tiling_paths(
        gfa1_file('p_ctg_tiling_path'), 'P')
    for ctg_id, path in p_paths.iteritems():
        gfa_graph.add_tiling_path(path, ctg_id)

    # Init paths to other input files.
    preads_file = gfa1_file('preads4falcon.fasta')
    contig_files = [gfa1_file('p_ctg.fa'), gfa1_file('a_ctg.fa')]

    write_reads = False
    write_contigs = False

    fp_out = StringIO()

    # Add a node which does not exist in the preads4falcon.fasta file.
    gfa_graph.add_read_from_node('12345:B')

    # Run the unit under test; it is expected to fail.
    with pytest.raises(Exception):
        gfa_graph.write_gfa_v1(
            fp_out, preads_file, contig_files, write_reads, write_contigs)
Example #8
0
def wrap_write_gfa_v1_test(use_sg, use_nx, use_tp, write_reads, write_contigs, min_p_len, min_a_len, expected_path):
    """Build a GFAGraph from the 'gfa-1' test data, write it, and compare to a file.

    Args:
        use_sg: If true, construct an AsmGraph from the 'sg_edges_list',
            'utg_data' and 'ctg_paths' files and add it to the GFA graph.
        use_nx: If true, read 'sg.gexf' with nx.read_gexf and add it to the
            GFA graph as a string graph.
        use_tp: If true, load the 'p_ctg_tiling_path' and 'a_ctg_tiling_path'
            files and add the tiling paths that pass the length cutoffs.
        write_reads: Forwarded unchanged to GFAGraph.write_gfa_v1.
        write_contigs: Forwarded unchanged to GFAGraph.write_gfa_v1.
        min_p_len: A p_ctg tiling path is added only if its contig length
            (from calc_node_coords) is >= this value.
        min_a_len: Same cutoff, applied to the a_ctg tiling paths.
        expected_path: Path to a text file; its whitespace-stripped lines
            must equal the lines written by write_gfa_v1.

    Raises:
        AssertionError: If the written output differs from the expected file.
    """

    def gfa1_file(file_name):
        # Path of a file inside the 'gfa-1' test data directory.
        return os.path.join(helpers.get_test_data_dir(), 'gfa-1', file_name)

    def add_tiling_paths(paths, min_len):
        # Add every tiling path whose contig length reaches the cutoff.
        for ctg_id, path in paths.iteritems():
            _, contig_len = gen_gfa_v1.calc_node_coords(path)
            if contig_len >= min_len:
                gfa_graph.add_tiling_path(path, ctg_id)

    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    if use_sg:
        # Load the assembly graph and add it to the GFA.
        sg_edges_list = gfa1_file('sg_edges_list')
        utg_data = gfa1_file('utg_data')
        ctg_paths = gfa1_file('ctg_paths')
        asm_graph = AsmGraph(sg_edges_list, utg_data, ctg_paths)
        gfa_graph.add_asm_graph(asm_graph)

    if use_tp:
        # Load the p_ctg tiling paths and add them to the GFA.
        p_paths, _ = gen_gfa_v1.load_tiling_paths(
            gfa1_file('p_ctg_tiling_path'), 'P')
        add_tiling_paths(p_paths, min_p_len)

        # Load the a_ctg tiling paths and add them to the GFA.
        # NOTE(review): the a_ctg paths are also loaded with type 'P';
        # kept as-is because the expected files were produced this way --
        # confirm this is intended.
        a_paths, _ = gen_gfa_v1.load_tiling_paths(
            gfa1_file('a_ctg_tiling_path'), 'P')
        add_tiling_paths(a_paths, min_a_len)

    if use_nx:
        nx_sg = nx.read_gexf(gfa1_file('sg.gexf'))
        gfa_graph.add_nx_string_graph(nx_sg)

    # Init paths to other input files.
    preads_file = gfa1_file('preads4falcon.fasta')
    p_ctg_fasta = gfa1_file('p_ctg.fa')
    a_ctg_fasta = gfa1_file('a_ctg.fa')

    fp_out = StringIO()
    # Run the unit under test.
    gfa_graph.write_gfa_v1(
        fp_out, preads_file, [p_ctg_fasta, a_ctg_fasta],
        write_reads, write_contigs)

    # Compare results. The expected file is opened in a with-block so the
    # handle is closed even when the assertion below fails.
    result = fp_out.getvalue().splitlines()
    with open(expected_path) as fp_expected:
        expected = [line.strip() for line in fp_expected]
    assert result == expected