def test_filter_tiling_paths_by_len():
    """Check which contigs survive filter_tiling_paths_by_len.

    The paths from 'p_ctg_tiling_path_1' are filtered with several values of
    the third argument (a length threshold) and the remaining contig ids are
    compared against the expected ones. A final case verifies that an
    Exception is raised when the length map lacks an entry for a contig.
    """
    p_ctg_tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_path, _ = mod.load_tiling_paths(p_ctg_tiling_path_file, 'P')
    _, p_ctg_len = mod.calc_tiling_paths_len(p_path)

    # (threshold, contig ids expected to remain after filtering)
    cases = [
        (0, ['000000F', '000001F', '000002F', '000003F', '000004F']),
        (10000, ['000000F', '000001F', '000002F', '000003F']),
        (35000, ['000000F', '000001F']),
        (100000, []),
    ]
    for threshold, expected_ctg_ids in cases:
        p_path_filtered = mod.filter_tiling_paths_by_len(
            p_path, p_ctg_len, threshold)
        assert sorted(p_path_filtered.keys()) == sorted(expected_ctg_ids)

    # Test a degenerate case where there is no length for a particular contig.
    # list() makes the keys sliceable on Python 3, where keys() is a view;
    # the first key is the one left out, as in the index-based original.
    keys = list(p_ctg_len.keys())
    p_ctg_len_degenerate = {key: p_ctg_len[key] for key in keys[1:]}
    with pytest.raises(Exception):
        mod.filter_tiling_paths_by_len(p_path, p_ctg_len_degenerate, 0)
def test_format_gfa_v1_path_line():
    """Check the 'P' lines produced by GFAGraph.format_gfa_v1_path_line.

    Uses the tiling paths loaded from 'p_ctg_tiling_path_1' and covers:
      - seq_len_map is None: every CIGAR field is expected to be '*';
      - seq_len_map assigns 10000 to every read: the CIGAR fields are
        expected to hold '<length>M' values;
      - path is None: an empty string is expected.
    """
    gfa_graph = mod.GFAGraph()

    # Load tiling paths from file.
    p_ctg_tiling_path_file = os.path.join(helpers.get_test_data_dir(),
                                          'p_ctg_tiling_path_1')
    p_paths, _ = gen_gfa_v1.load_tiling_paths(p_ctg_tiling_path_file, 'P')

    # If seq_len_map is None, all CIGAR operations should be '*'.
    expected = {
        '000000F': 'P\t000000F\t000092122-,000081654-,000034462-,000061403-,000021348-,000062240-,000083779-,000019819+,000063672+,000026565+,000050047-\t*,*,*,*,*,*,*,*,*,*,*',
        '000001F': 'P\t000001F\t000070651+,000018109+,000068978+,000100559+,000010548-,000006846-,000065052-,000071922+,000076878+,000000861+,000001755-\t*,*,*,*,*,*,*,*,*,*,*',
        '000002F': 'P\t000002F\t000088930+,000008918+,000100248-,000085315-,000071965+,000082497+\t*,*,*,*,*,*',
        '000003F': 'P\t000003F\t000084518+,000011674+,000057445-\t*,*,*',
        '000004F': 'P\t000004F\t000014727+,000024020+,000060868+\t*,*,*',
    }
    seq_len_map = None
    # .items() (rather than the Python 2-only .iteritems()) works on 2 and 3.
    for ctg_id, path in p_paths.items():
        path_line = gfa_graph.format_gfa_v1_path_line(ctg_id, path, seq_len_map)
        assert path_line == expected[ctg_id]

    # The seq_len_map dict is only used for the first read in the path,
    # because it needs to be included completely. The other CIGAR operations
    # are determined directly from the edges.
    expected = {
        '000000F': 'P\t000000F\t000092122-,000081654-,000034462-,000061403-,000021348-,000062240-,000083779-,000019819+,000063672+,000026565+,000050047-\t10000M,33726M,10123M,1352M,9924M,5834M,862M,5562M,1384M,473M,2171M',
        '000001F': 'P\t000001F\t000070651+,000018109+,000068978+,000100559+,000010548-,000006846-,000065052-,000071922+,000076878+,000000861+,000001755-\t10000M,10077M,3766M,2648M,2421M,2089M,18168M,2723M,2451M,666M,15088M',
        '000002F': 'P\t000002F\t000088930+,000008918+,000100248-,000085315-,000071965+,000082497+\t10000M,15215M,3113M,4851M,1857M,6035M',
        '000003F': 'P\t000003F\t000084518+,000011674+,000057445-\t10000M,9432M,23096M',
        '000004F': 'P\t000004F\t000014727+,000024020+,000060868+\t10000M,5238M,3235M',
    }
    for ctg_id, path in p_paths.items():
        # Initialize all reads to a fixed value, just to be safe.
        # The read id is the part of the node name before the ':'.
        seq_len_map = {}
        for edge in path:
            for node in (edge[0], edge[1]):
                seq_len_map[node.split(':')[0]] = 10000
        path_line = gfa_graph.format_gfa_v1_path_line(ctg_id, path, seq_len_map)
        assert path_line == expected[ctg_id]

    # Test a degenerate case where path is None.
    path_line = gfa_graph.format_gfa_v1_path_line('', None, None)
    assert path_line == ''
def test_calc_tiling_paths_len():
    """Compare calc_tiling_paths_len output with the expected reference maps.

    For every contig returned, each computed (node, coordinate) item must
    also be present in expected_coord_map, and the computed contig length
    must equal the entry in expected_contig_len.
    """
    tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_path, p_edge_to_ctg = mod.load_tiling_paths(tiling_path_file, 'P')
    p_coords, p_ctg_len = mod.calc_tiling_paths_len(p_path)

    for ctg_id in p_coords:
        reference_items = set(expected_coord_map[ctg_id].items())
        computed_items = set(p_coords[ctg_id].items())
        # Every computed item has to show up in the reference as well.
        common_items = reference_items.intersection(computed_items)
        assert len(common_items) == len(p_coords[ctg_id])
        assert expected_contig_len[ctg_id] == p_ctg_len[ctg_id]
def test_calc_node_coords():
    """Exercise calc_node_coords on regular, special-case and empty paths."""
    # 'p_ctg_tiling_path_1' is an ordinary tiling path file.
    regular_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_paths, p_edge_to_ctg = mod.load_tiling_paths(regular_file, 'P')
    ctg_id = '000000F'
    coord_map, contig_len = mod.calc_node_coords(p_paths[ctg_id])
    reference_items = set(expected_coord_map[ctg_id].items())
    computed_items = set(coord_map.items())
    # Every computed item has to be present in the reference map.
    assert len(reference_items.intersection(computed_items)) == len(coord_map)
    assert expected_contig_len[ctg_id] == contig_len

    # 'p_ctg_tiling_path_2' holds three special contigs:
    # - 000000F, which has an inner cycle
    # - 000001F, which has an out-of-order edge
    # - 000002F, which is circular (this is a valid case)
    special_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_2')
    p_paths, p_edge_to_ctg = mod.load_tiling_paths(special_file, 'P')

    # Cycles are allowed, but the node's coordinate gets overwritten.
    ctg_id = '000000F'
    coord_map, contig_len = mod.calc_node_coords(p_paths[ctg_id])
    assert coord_map['000081654:B'] == 55125

    # Unsorted graphs are not allowed.
    ctg_id = '000001F'
    with pytest.raises(Exception):
        coord_map, contig_len = mod.calc_node_coords(p_paths[ctg_id])

    # Circular graphs are allowed.
    ctg_id = '000002F'
    coord_map, contig_len = mod.calc_node_coords(p_paths[ctg_id])
    assert contig_len == 18473

    # An empty tiling path yields an empty map and zero length.
    coord_map, contig_len = mod.calc_node_coords([])
    assert not coord_map
    assert contig_len == 0
def test_load_tiling_paths():
    """Check the contig ids and edge lookup returned by load_tiling_paths.

    All five contigs of 'p_ctg_tiling_path_1' must be present, and every
    edge (v, w) of every path must map to (contig id, edge type) in the
    returned edge-to-contig dict.
    """
    p_ctg_tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_path, p_edge_to_ctg = mod.load_tiling_paths(p_ctg_tiling_path_file, 'P')

    assert sorted(p_path.keys()) == sorted(
        ['000000F', '000001F', '000002F', '000003F', '000004F'])

    # .items() (rather than the Python 2-only .iteritems()) works on 2 and 3.
    for ctg_id, path in p_path.items():
        for edge in path:
            # Unpacking also checks that each edge has exactly 7 fields;
            # only the two nodes and the edge type are inspected here.
            v, w, _b, _e, _length, _idt, etype = edge
            assert (v, w) in p_edge_to_ctg
            assert p_edge_to_ctg[(v, w)] == (ctg_id, etype)
def test_add_tiling_path():
    """Check that GFAGraph.add_tiling_path stores the paths unchanged.

    Every path loaded from 'p_ctg_tiling_path_1' is added to a fresh graph;
    afterwards gfa_graph.paths must hold the same number of entries and the
    same path under every contig id.
    """
    # Load the tiling path. These methods are tested in test_gen_gfa_v1.py.
    p_ctg_tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'p_ctg_tiling_path_1')
    p_paths, _ = gen_gfa_v1.load_tiling_paths(p_ctg_tiling_path_file, 'P')

    # Create a new GFA graph.
    gfa_graph = mod.GFAGraph()

    # Add the tiling paths.
    # .items() (rather than the Python 2-only .iteritems()) works on 2 and 3.
    for ctg_id, path in p_paths.items():
        gfa_graph.add_tiling_path(path, ctg_id)

    # Check if we have the correct number of tiling paths.
    assert len(gfa_graph.paths) == len(p_paths)

    # They should be same as loaded.
    for ctg_id, path in p_paths.items():
        assert ctg_id in gfa_graph.paths
        assert gfa_graph.paths[ctg_id] == path
def test_write_gfa_v1_2():
    """Expect write_gfa_v1 to raise when a node has no matching pread.

    Tests a case where a node is added to the graph, but there is no
    corresponding pread in the preads4falcon.fasta file.
    """
    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    # Load the p_ctg tiling paths.
    p_ctg_tiling_path_file = os.path.join(
        helpers.get_test_data_dir(), 'gfa-1', 'p_ctg_tiling_path')
    p_paths, _ = gen_gfa_v1.load_tiling_paths(p_ctg_tiling_path_file, 'P')

    # Add the tiling paths to the GFA.
    # .items() (rather than the Python 2-only .iteritems()) works on 2 and 3.
    for ctg_id, path in p_paths.items():
        gfa_graph.add_tiling_path(path, ctg_id)

    # Init paths to other input files.
    preads_file = os.path.join(
        helpers.get_test_data_dir(), 'gfa-1', 'preads4falcon.fasta')
    p_ctg_fasta = os.path.join(
        helpers.get_test_data_dir(), 'gfa-1', 'p_ctg.fa')
    a_ctg_fasta = os.path.join(
        helpers.get_test_data_dir(), 'gfa-1', 'a_ctg.fa')

    write_reads = False
    write_contigs = False
    fp_out = StringIO()

    # Add a node which does not exist in the preads4falcon.fasta file.
    gfa_graph.add_read_from_node('12345:B')

    # Run the unit under test.
    with pytest.raises(Exception):
        gfa_graph.write_gfa_v1(fp_out, preads_file,
                               [p_ctg_fasta, a_ctg_fasta],
                               write_reads, write_contigs)
def wrap_write_gfa_v1_test(use_sg, use_nx, use_tp, write_reads, write_contigs,
                           min_p_len, min_a_len, expected_path):
    """Build a GFAGraph from the 'gfa-1' test data and check write_gfa_v1.

    Args:
        use_sg: if true, load the assembly graph (sg_edges_list, utg_data,
            ctg_paths) and add it to the GFA graph.
        use_nx: if true, read 'sg.gexf' and add it as a networkx string graph.
        use_tp: if true, add the p_ctg and a_ctg tiling paths.
        write_reads: forwarded to GFAGraph.write_gfa_v1.
        write_contigs: forwarded to GFAGraph.write_gfa_v1.
        min_p_len: a p_ctg tiling path is added only if its contig length,
            as computed by calc_node_coords, is at least this value.
        min_a_len: same threshold for the a_ctg tiling paths.
        expected_path: path of a text file with the expected output lines.

    The written output, split into lines, must equal the stripped lines of
    the expected file.
    """
    data_dir = os.path.join(helpers.get_test_data_dir(), 'gfa-1')

    # Create a GFA graph.
    gfa_graph = mod.GFAGraph()

    if use_sg:
        # Load the assembly graph.
        sg_edges_list = os.path.join(data_dir, 'sg_edges_list')
        utg_data = os.path.join(data_dir, 'utg_data')
        ctg_paths = os.path.join(data_dir, 'ctg_paths')
        asm_graph = AsmGraph(sg_edges_list, utg_data, ctg_paths)
        # Add the string graph to the GFA.
        gfa_graph.add_asm_graph(asm_graph)

    if use_tp:
        # Primary contigs first, then associate contigs. Both files are
        # loaded with the 'P' type argument, exactly as before.
        tiling_path_inputs = (
            ('p_ctg_tiling_path', min_p_len),
            ('a_ctg_tiling_path', min_a_len),
        )
        for file_name, min_len in tiling_path_inputs:
            tiling_path_file = os.path.join(data_dir, file_name)
            paths, _ = gen_gfa_v1.load_tiling_paths(tiling_path_file, 'P')
            # Add the tiling paths to the GFA, skipping the short ones.
            # .items() (rather than the Python 2-only .iteritems()) works
            # on both Python 2 and 3.
            for ctg_id, path in paths.items():
                _, contig_len = gen_gfa_v1.calc_node_coords(path)
                if contig_len >= min_len:
                    gfa_graph.add_tiling_path(path, ctg_id)

    if use_nx:
        gexf_file = os.path.join(data_dir, 'sg.gexf')
        nx_sg = nx.read_gexf(gexf_file)
        gfa_graph.add_nx_string_graph(nx_sg)

    # Init paths to other input files.
    preads_file = os.path.join(data_dir, 'preads4falcon.fasta')
    p_ctg_fasta = os.path.join(data_dir, 'p_ctg.fa')
    a_ctg_fasta = os.path.join(data_dir, 'a_ctg.fa')

    fp_out = StringIO()

    # Run the unit under test.
    gfa_graph.write_gfa_v1(fp_out, preads_file,
                           [p_ctg_fasta, a_ctg_fasta],
                           write_reads, write_contigs)

    # Compare results.
    result = fp_out.getvalue().splitlines()
    # The context manager closes the expected-output file deterministically
    # instead of leaving the handle to the garbage collector.
    with open(expected_path) as fp_expected:
        expected = [line.strip() for line in fp_expected]
    assert result == expected