def main():
    """Command-line entry point: find a path through BLAST alignments.

    Usage::

        prog [-h] [-l OVERLAP_LEN] FASTG_FILE BLAST_RESULT_FILE OUTPUT_FILE

    The assembly graph is built from FASTG_FILE, the alignments read from
    BLAST_RESULT_FILE are filtered down to those that are both valid and
    forward, connected through the graph, sorted by start coordinate, and
    the path computed by ``Alignment.get_path`` is written to OUTPUT_FILE
    through ``Alignment.write_path_to_dot_file``.

    Exits with status 2 (after printing the help message) when the command
    line cannot be parsed or the number of positional arguments is not 3.
    """
    try:
        options, args = getopt.getopt(sys.argv[1:], 'hl:')
    except getopt.GetoptError as err:
        # getopt raises on unknown options, so a fall-through ``else`` in the
        # option loop can never run; the help message has to be shown here.
        print(err, file=sys.stderr)
        print_help_message()
        sys.exit(2)

    overlap_len = None
    for option, value in options:
        if option == '-h':
            print_help_message()
            sys.exit()
        elif option == '-l':
            overlap_len = int(value)

    # Fail with the usage text rather than an unpacking ValueError.
    if len(args) != 3:
        print_help_message()
        sys.exit(2)
    fastg_file_name, blast_result_file, output_file = args

    nodes = fastg_file.build_assembly_graph(fastg_file_name, overlap_len)
    alignments = [
        alignment for alignment in read_file(blast_result_file)
        if alignment.is_valid and alignment.is_forward
    ]
    Alignment.add_connection(alignments, nodes)
    alignments.sort(key=lambda alignment: alignment.start)

    values, actions = Alignment.get_path(alignments)
    Alignment.write_path_to_dot_file(actions, values, output_file)
def _test_read_file():
    """Smoke test for ``read_file``: dump every site id and its children.

    Builds the assembly graph from 'assembly_graph.fastg' with an overlap
    of 127, reads the site graph from 'assembly_graph.siteGraph', and for
    each site prints its id followed by the first two fields of every
    entry in ``site.children``.
    """
    import fastg_file

    graph_path = 'assembly_graph.fastg'
    site_graph_path = 'assembly_graph.siteGraph'

    nodes = fastg_file.build_assembly_graph(graph_path, 127)
    loaded_sites = read_file(site_graph_path, nodes)
    for site in loaded_sites.values():
        print(site.id)
        child_summary = [(child[0], child[1]) for child in site.children]
        print(child_summary)
def main():
    """Command-line entry point: build (and optionally simplify) a site graph.

    Options:
        -i FILE   input FASTG file.
        -o FILE   output site-graph file.
        -l INT    overlap length passed to ``build_assembly_graph``
                  (default ``None``).
        -k INT    shift passed to ``build_site_graph`` (default 0).
        -m INT    max interval length; only echoed into the output header.
        -s        simplify the site graph before writing it.
        -h        print the help message and exit.

    The output file header records the command line, graph sizes and the
    time spent building and simplifying the graph.

    Exits with status 2 (after printing the help message) when the command
    line cannot be parsed.
    """
    try:
        options, _ = getopt.getopt(sys.argv[1:], 'm:k:i:l:o:hs')
    except getopt.GetoptError as err:
        # getopt raises on unknown options, so a fall-through ``else`` in the
        # option loop can never run; the help message has to be shown here.
        print(err, file=sys.stderr)
        printHelpMessage()
        sys.exit(2)

    input_file = ''
    output_file = ''
    overlap_len = None
    # Must be bound even when -k is absent: it is always passed to
    # build_site_graph below.
    shift = 0
    to_simplify = False
    max_interval_len = None
    for option, value in options:
        if option == '-i':
            input_file = value
        elif option == '-o':
            output_file = value
        elif option == '-l':
            overlap_len = int(value)
        elif option == '-k':
            shift = int(value)
        elif option == '-m':
            max_interval_len = int(value)
        elif option == '-s':
            to_simplify = True
        elif option == '-h':
            printHelpMessage()
            sys.exit()

    print('overlap_len:', overlap_len)
    nodes = fastg_file.build_assembly_graph(input_file, overlap=overlap_len)

    tick = time.time()
    sites, site_position_index = build_site_graph(nodes, shift=shift)
    tock = time.time()
    if to_simplify:
        print('Simplifying site graph...')
        sites = site_graph.simplify_site_graph(sites)
    tock2 = time.time()

    # NOTE: the header text (including the 'Numner' spelling) is kept
    # unchanged so files stay comparable with previously written ones.
    header = [
        ' '.join(sys.argv),
        'Number of sites: {}'.format(len(sites)),
        'Number of nodes: {}'.format(len(nodes)),
        'Numner of nodes contain site: {}'.format(len(site_position_index)),
        'Max interval len: {}'.format(max_interval_len),
        'Allowed repeat num: {}'.format(ALLOWED_REPEAT_NUM),
        'Time used to build graph: {} seconds'.format(round(tock - tick)),
        'Time used to simplify graph: {} seconds'.format(round(tock2 - tock)),
    ]
    site_graph.write_file(output_file, sites, header)
def main():
    """Command-line entry point: convert a FASTG file with ``write_last_graph``.

    Usage::

        prog [-h] [-k OVERLAP_SIZE] INPUT_FILE OUTPUT_FILE

    ``-k`` sets the overlap size handed both to ``build_assembly_graph``
    and to ``write_last_graph`` (``None`` when omitted); ``-h`` prints the
    help text and exits.
    """
    overlap_size = None
    options, args = getopt.getopt(sys.argv[1:], 'hk:')
    for flag, argument in options:
        if flag == '-k':
            overlap_size = int(argument)
            continue
        if flag == '-h':
            print_help()
            sys.exit()

    input_file, output_file = args
    graph_nodes = fastg_file.build_assembly_graph(input_file, overlap_size)
    write_last_graph(graph_nodes, output_file, overlap_size)
def _test_build_site_graph():
    """Smoke test for ``build_site_graph`` on 'assembly_graph.fastg'.

    Builds the assembly graph with an overlap of 127, builds the site
    graph, inverts the node -> position -> site index into a
    site -> [(node uid, position), ...] mapping, and prints '!!!' for every
    site whose own id appears among the ids of its children.

    Several summaries (sorted site ids, sorted index items, per-site child
    listings, a histogram of occurrences per site) are computed but never
    reported; they are kept so the same attribute accesses and conversions
    are still exercised.
    """
    graph_path = 'assembly_graph.fastg'
    overlap = 127
    nodes = fastg_file.build_assembly_graph(graph_path, overlap)
    sites, site_position_index = build_site_graph(nodes)

    num_position_in_index = sum(
        len(positions) for positions in site_position_index.values())
    site_ids = sorted(
        int(site.id) for site in sites.values()
        if not site.id.endswith('r'))
    site_positions = sorted(
        site_position_index.items(),
        key=lambda item: int(item[0].uid.rstrip('r')))

    # Invert the index: site -> every (node uid, position) it occurs at.
    site_index = {}
    for node, positions in site_position_index.items():
        for position, site in positions.items():
            site_index.setdefault(site, []).append((node.uid, position))
    for occurrences in site_index.values():
        # Order occurrences by numeric node id, ignoring the 'r' suffix.
        occurrences.sort(key=lambda occurrence: int(occurrence[0].rstrip('r')))

    site_index_items = sorted(site_index.items(), key=lambda item: item[1][0])

    # count[k] tallies sites occurring at k positions (k must stay below 10).
    count = [0] * 10
    num_site_contain_self_as_child = 0
    for site, positions in site_index_items:
        if site.children:
            child_sites, intervals, node_paths = list(zip(*site.children))
        else:
            child_sites, intervals, node_paths = [], [], []
        child_site_ids = [child.id for child in child_sites]
        node_ids = [
            [path_node.uid for path_node in path] for path in node_paths
        ]
        child_site_id_intervals = list(zip(child_site_ids, intervals))
        child_site_id_intervals_ = sorted(child_site_id_intervals)
        # NOTE(review): '!!!' is assumed to be printed only for self-child
        # sites, and the histogram update to run once per site - confirm.
        if site.id in child_site_ids:
            num_site_contain_self_as_child += 1
            print('!!!')
        count[len(positions)] += 1
def main():
    """Command-line entry point: rewrite gap endpoints via ``transform_position``.

    Usage::

        prog [-h] [-p PATHS_FILE] [-g FASTG_FILE] GAP_INFO_FILE OUTPUT_FILE

    Reads the paths file, the gap info file and the assembly graph (built
    with the module-level ``OVERLAP``), builds the site position index with
    ``build_site_graph(nodes, mode=1)``, replaces the start and end
    (node id, site index, site position) of every gap with the values
    returned by ``transform_position``, and writes the gaps to OUTPUT_FILE
    with the command line as a comment.

    Gaps whose start node id begins with 'NODE_2_' are traced with 'DEBUG:'
    prints before and after the transformation.
    """
    paths_file_name = ''
    fastg_file_name = ''
    options, args = getopt.getopt(sys.argv[1:], 'hp:g:')
    for flag, argument in options:
        if flag == '-p':
            paths_file_name = argument
        elif flag == '-g':
            fastg_file_name = argument
        elif flag == '-h':
            print_help()
            sys.exit()
    gap_info_file_name, output_file_name = args

    paths = paths_file.read_file(paths_file_name)
    gaps = gap_info_file.read_file(gap_info_file_name)
    nodes = fastg_file.build_assembly_graph(fastg_file_name, overlap=OVERLAP)
    _, site_position_indexs = sitegraph_builder.build_site_graph(nodes, mode=1)

    # Transform gaps.
    for gap in gaps:
        # Decided once, on the untransformed start node id.
        debug = 1 if gap.start_node_id.startswith('NODE_2_') else 0
        if debug == 1:
            print('DEBUG:', gap.start_node_id, gap.start_site_index)

        new_start = transform_position(
            gap.start_node_id, gap.start_site_position, nodes, paths,
            site_position_indexs, OVERLAP, debug=debug)
        (gap.start_node_id,
         gap.start_site_index,
         gap.start_site_position) = new_start

        new_end = transform_position(
            gap.end_node_id, gap.end_site_position, nodes, paths,
            site_position_indexs, OVERLAP, debug=debug)
        (gap.end_node_id,
         gap.end_site_index,
         gap.end_site_position) = new_end

        if debug == 1:
            print('DEBUG:', gap.start_node_id, gap.start_site_index)

    # Write gaps.
    command_line = ' '.join(sys.argv)
    gap_info_file.write_file(gaps, output_file_name, comments=[command_line])
def main():
    """Command-line entry point: launch the path-finding script for every gap.

    Usage::

        prog -f FASTG [-o OVERLAP] [-s SITE_GRAPH] [-x SCRIPT] [-n THREADS]
             [-m FLAG] [-h] GAP_INFO_FILE [[WORK_DIR] TASK_NAME]

    Options:
        -f FILE   assembly graph in FASTG format (required).
        -o INT    overlap length passed to ``build_assembly_graph``
                  (default ``None``).
        -s FILE   read the site graph from FILE instead of building it; when
                  omitted the graph is built, simplified and written next to
                  the FASTG file with a '.sitegraph' extension.
        -x FILE   path-finding script (default 'find_path_dp.py').
        -n INT    number of worker processes (default 1).
        -m INT    non-zero: node ids in the gap file are used as they are;
                  zero (default): they go through
                  ``get_node_id_from_long_name`` first.
        -h        print the help message and exit.

    Positional arguments: with one argument the working directory defaults
    to '.' and the task name to the FASTG base name without its extension;
    with two, the second is the task name; with three, they are the gap
    file, the working directory and the task name.

    Side effects: sets the module globals ``find_path_script``, ``log_dir``,
    ``seq_fa_file`` and ``graph_pickle_file``; recreates the log directory;
    pickles the nodes and sites to ``graph_pickle_file``.
    """
    global find_path_script, log_dir, seq_fa_file, graph_pickle_file
    import os

    fastg_file_name = None
    site_graph_file_name = None
    find_path_script = 'find_path_dp.py'
    work_dir = None
    task_name = None
    n_thread = 1
    is_node_id_processed = 0
    # Bound up front: it is always passed to build_assembly_graph, even when
    # -o is not given.
    overlap_len = None

    options, args = getopt.getopt(sys.argv[1:], 'hf:o:n:x:s:m:')
    for option, value in options:
        if option == '-h':
            print_help()
            sys.exit()
        elif option == '-f':
            fastg_file_name = value
        elif option == '-s':
            site_graph_file_name = value
        elif option == '-x':
            find_path_script = value
        elif option == '-n':
            n_thread = int(value)
        elif option == '-o':
            overlap_len = int(value)
        elif option == '-m':
            is_node_id_processed = int(value)

    # Check the positional count before indexing so that an empty command
    # line shows the help text, and so that the one-argument form is
    # accepted rather than falling into the "print help" branch.
    if not 1 <= len(args) <= 3:
        print_help()
        sys.exit()
    if fastg_file_name is None:
        # Every later step needs the FASTG file.
        print_help()
        sys.exit(2)

    gap_info_file_name = args[0]
    if len(args) == 2:
        task_name = args[1]
    elif len(args) == 3:
        work_dir, task_name = args[1:3]
    if work_dir is None:
        work_dir = '.'
    if task_name is None:
        # Take the string before the last dot, not the list rsplit returns.
        task_name = os.path.basename(fastg_file_name).rsplit('.', 1)[0]

    log_dir = work_dir + '/' + task_name + '_log.dir'
    seq_fa_file = work_dir + '/' + task_name + '.fa'
    graph_pickle_file = work_dir + '/' + task_name + '_graph.pickle'
    # Start from an empty log directory.
    subprocess.run(('rm', '-rf', log_dir))
    subprocess.run(('mkdir', log_dir))

    # Read assembly graph.
    nodes = fastg_file.build_assembly_graph(fastg_file_name, overlap_len)
    if site_graph_file_name:
        # Read the site graph from file, then point the freshly built
        # position index at the loaded site objects (matched by site id).
        sites = site_graph.read_file(site_graph_file_name, nodes)
        _, site_position_index = sitegraph_builder.build_site_graph(
            nodes, mode=1)
        for position_index in site_position_index.values():
            for position in position_index:
                original_site = position_index[position]
                position_index[position] = sites[original_site.id]
    else:
        # Build site graph from assembly graph, and write site graph.
        sites, site_position_index = \
            sitegraph_builder.build_site_graph(nodes)
        sites = site_graph.simplify_site_graph(sites)
        # Strip only the last extension; splitting on every dot would
        # truncate names such as './a.b.fastg'.
        site_graph_file_name = fastg_file_name.rsplit('.', 1)[0] + \
            '.sitegraph'
        site_graph.write_file(site_graph_file_name, sites)

    # Write nodes and sites to pickle file.
    with open(graph_pickle_file, 'wb') as fout:
        pickle.dump(nodes, fout, -1)
        pickle.dump(sites, fout, -1)

    # Find start site and end site of every gap and build its command.
    gaps = gap_info_file.read_file(gap_info_file_name)
    exe_tuples = []
    for gap in gaps:
        if is_node_id_processed:
            start_node_id = gap.start_node_id
            end_node_id = gap.end_node_id
        else:
            start_node_id = get_node_id_from_long_name(gap.start_node_id)
            end_node_id = get_node_id_from_long_name(gap.end_node_id)
        start_site_position, start_site = get_site_by_index(
            gap.start_site_index, site_position_index[nodes[start_node_id]])
        end_site_position, end_site = get_site_by_index(
            gap.end_site_index, site_position_index[nodes[end_node_id]])
        print(start_site.id, end_site.id)
        gap_seq_id = '-'.join((start_node_id, end_node_id))
        exe_tuples.append(
            (gap_seq_id,
             construct_cmd(start_site.id, end_site.id, start_site_position,
                           end_site_position, graph_pickle_file,
                           gap.intervals, gap_seq_id)))

    with Pool(n_thread) as p:
        p.map(find_path_dp_process, exe_tuples)