def split(replicon_path, chunk=None, outdir='.'):
    """
    Split the replicon file in *chunk* chunks and write them in files.

    The name of a chunk file is the input filename with suffix '_chunk_i'
    (where i is the chunk number) if the chunk contains several sequences,
    or the id of the sequence if there is only one sequence in the chunk.
    There is also a mechanism that prevents overwriting an existing file by
    appending '_chunk_<n>' (with an increasing n) to the file name, for
    instance ESCO001.B.00018.P002_chunk_1.fst.

    :param str replicon_path: The path to the replicon file.
    :param int chunk: The number of chunks desired (chunk > 0).
    :param str outdir: The path of a directory where to write chunk files.
                       The directory must exist.
    :return: The names of all chunks created.
    :rtype: list of str
    """
    def grouper(sequences_db, chunk_size):
        """
        Group the sequences in tuples of *chunk_size* elements; the last
        tuple is padded with ``None`` by :func:`itertools.zip_longest`.

        :param sequences_db: The sequences to group.
        :type sequences_db: a :class:`integron_finder.utils.FastaIterator` object.
        :param int chunk_size: The number of sequences by chunk file.
        :return: a chunk of sequences.
        :rtype: an iterator of tuples.
        """
        args = [iter(sequences_db)] * chunk_size
        return zip_longest(*args)

    with utils.FastaIterator(replicon_path) as sequences_db:
        sequences_db_len = len(sequences_db)
        if not chunk:
            chunk_size = 1
        else:
            chunk_size = math.ceil(sequences_db_len / chunk)
        chunks = grouper(sequences_db, chunk_size)
        all_chunk_name = []
        for chunk_no, chunk_in in enumerate(chunks, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            chunk_out = []
            for rep_no, replicon in enumerate(chunk_in, 1):
                if replicon is not None:
                    replicon_name = replicon.id
                    chunk_out.append(replicon)
                else:
                    # distinguish a genuinely skipped replicon from the None
                    # padding zip_longest adds to fill the last chunk
                    rep_no_in_db = (chunk_no - 1) * chunk_size + rep_no
                    if rep_no_in_db <= sequences_db_len:
                        _log.warning("Skipping replicon {}/{} in chunk {}".format(
                            rep_no_in_db, sequences_db_len, chunk_no))
            if chunk_out:
                if chunk_size == 1:
                    chunk_name = "{}.fst".format(replicon_name)
                else:
                    replicon_name = utils.get_name_from_path(replicon_path)
                    chunk_name = "{}_chunk_{}.fst".format(replicon_name, chunk_no)
                chunk_name = os.path.join(outdir, chunk_name)
                i = 0
                while os.path.exists(chunk_name):
                    root, ext = os.path.splitext(chunk_name)
                    i += 1
                    # BUGFIX: the pattern must be a raw string; "\d" in a
                    # plain literal is an invalid escape sequence
                    # (DeprecationWarning today, an error in future Python).
                    match = re.search(r"_chunk_\d+$", root)
                    if match:
                        # strip the previous _chunk_<n> suffix before
                        # appending the next candidate number
                        root = root[:match.start()]
                    chunk_name = "{}_chunk_{}{}".format(root, i, ext)
                _log.info("writing chunk '{}'".format(chunk_name))
                SeqIO.write(chunk_out, chunk_name, "fasta")
                all_chunk_name.append(chunk_name)
        return all_chunk_name
def test_get_name_from_path(self):
    """get_name_from_path strips any directory part and the last extension."""
    cases = [
        ('/foo/bar.baz', 'bar'),
        ('bar.baz', 'bar'),
        ('../foo/bar.baz', 'bar'),
        ('../foo/bar', 'bar'),
    ]
    for path, expected in cases:
        self.assertEqual(utils.get_name_from_path(path), expected)
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    :param str args: the arguments passed on the command line
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among
                    'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    """
    global _log
    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################
    # need to create directory before to init logger
    # as we write log in integron_finder.out in this dir
    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    else:
        if not os.path.isdir(config.outdir):
            msg = "outdir '{}' already exists and is not a directory".format(config.outdir)
            # we cannot log it because loggers are not initialized yet.
            raise IsADirectoryError(msg)

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    else:
        if not os.path.isdir(config.result_dir):
            # BUGFIX: the message formatted config.outdir instead of the
            # offending config.result_dir
            msg = "result dir '{}' already exists and is not a directory".format(config.result_dir)
            # we cannot log it because loggers are not initialized yet.
            raise IsADirectoryError(msg)
        elif not os.access(config.result_dir, os.W_OK):
            # BUGFIX: same wrong variable here (config.outdir -> config.result_dir)
            msg = "result dir '{}' already exists and is not writable".format(config.result_dir)
            # we cannot log it because loggers are not initialized yet.
            raise PermissionError(msg)

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file, out=not config.mute)
    _log = colorlog.getLogger('integron_finder')
    if not loglevel:
        # logs are specified from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    if config.cmsearch is None:
        msg = """cannot find 'cmsearch' in PATH.
Please install infernal package or setup 'cmsearch' binary path with --cmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)
    if config.hmmsearch is None:
        msg = """cannot find 'hmmsearch' in PATH.
Please install hmmer package or setup 'hmmsearch' binary path with --hmmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)
    if config.prodigal is None:
        msg = """cannot find 'prodigal' in PATH.
Please install prodigal package or setup 'prodigal' binary path with --prodigal option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    ################
    # print Header #
    ################
    log_header = colorlog.getLogger('integron_finder.header')
    logging = colorlog.logging.logging
    handlers = []
    header_log_file = logging.FileHandler(log_file)
    handlers.append(header_log_file)
    if not config.mute:
        header_stream = colorlog.StreamHandler(sys.stdout)
        handlers.append(header_stream)
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(colorlog.logging.logging.INFO)
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(config.input_seq_path,
                             dist_threshold=config.distance_threshold) as sequences_db:
        ################
        # set topology #
        ################
        default_topology = 'circ' if len(sequences_db) == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology, topology_file=config.topology_file)
        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        sequences_db_len = len(sequences_db)
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is not None:
                _log.info("############ Processing replicon {} ({}/{}) ############\n".format(
                    replicon.id, rep_no, sequences_db_len))
                integron_res, summary = find_integron_in_one_replicon(replicon, config)
                if integron_res:
                    all_integrons.append(integron_res)
                if summary:
                    all_summaries.append(summary)
            else:
                _log.warning("############ Skipping replicon {}/{} ############".format(
                    rep_no, sequences_db_len))

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(config.result_dir,
                                         utils.get_name_from_path(config.input_seq_path))
        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file, sep="\t", index=False, na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
        merged_summary_file = outfile_base_name + ".summary"
        # NOTE(review): this guard tests agg_integrons, not agg_summary —
        # presumably a summary is only meaningful when integrons were found;
        # confirm this is intentional.
        if not agg_integrons.empty:
            agg_summary.to_csv(merged_summary_file, sep="\t", index=False, na_rep="NA",
                               columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])
        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name
                # that a replicon result file
                os.unlink(_file)
def split(replicon_path, chunk=None, outdir='.'):
    """
    Split the replicon file in *chunk* chunks and write them in files.

    The name of a chunk file is the input filename with suffix '_chunk_i'
    (where i is the chunk number) if the chunk contains several sequences,
    or the id of the sequence if there is only one sequence in the chunk.
    There is also a mechanism that prevents overwriting an existing file by
    appending '_chunk_<n>' (with an increasing n) to the file name, for
    instance ESCO001.B.00018.P002_chunk_1.fst.

    :param str replicon_path: The path to the replicon file.
    :param int chunk: The number of chunks desired (chunk > 0).
    :param str outdir: The path of a directory where to write chunk files.
                       The directory must exist.
    :return: The names of all chunks created.
    :rtype: list of str
    """
    def grouper(sequences_db, chunk_size):
        """
        Group the sequences in tuples of *chunk_size* elements; the last
        tuple is padded with ``None`` by :func:`itertools.zip_longest`.

        :param sequences_db: The sequences to group.
        :type sequences_db: a :class:`integron_finder.utils.FastaIterator` object.
        :param int chunk_size: The number of sequences by chunk file.
        :return: a chunk of sequences.
        :rtype: an iterator of tuples.
        """
        args = [iter(sequences_db)] * chunk_size
        return zip_longest(*args)

    with utils.FastaIterator(replicon_path) as sequences_db:
        sequences_db_len = len(sequences_db)
        if not chunk:
            chunk_size = 1
        else:
            chunk_size = math.ceil(sequences_db_len / chunk)
        chunks = grouper(sequences_db, chunk_size)
        all_chunk_name = []
        for chunk_no, chunk_in in enumerate(chunks, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            chunk_out = []
            for rep_no, replicon in enumerate(chunk_in, 1):
                if replicon is not None:
                    replicon_name = replicon.id
                    chunk_out.append(replicon)
                else:
                    # distinguish a genuinely skipped replicon from the None
                    # padding zip_longest adds to fill the last chunk
                    rep_no_in_db = (chunk_no - 1) * chunk_size + rep_no
                    if rep_no_in_db <= sequences_db_len:
                        _log.warning("Skipping replicon {}/{} in chunk {}".format(
                            rep_no_in_db, sequences_db_len, chunk_no))
            if chunk_out:
                if chunk_size == 1:
                    chunk_name = "{}.fst".format(replicon_name)
                else:
                    replicon_name = utils.get_name_from_path(replicon_path)
                    chunk_name = "{}_chunk_{}.fst".format(replicon_name, chunk_no)
                chunk_name = os.path.join(outdir, chunk_name)
                i = 0
                while os.path.exists(chunk_name):
                    root, ext = os.path.splitext(chunk_name)
                    i += 1
                    # BUGFIX: the pattern must be a raw string; "\d" in a
                    # plain literal is an invalid escape sequence
                    # (DeprecationWarning today, an error in future Python).
                    match = re.search(r"_chunk_\d+$", root)
                    if match:
                        # strip the previous _chunk_<n> suffix before
                        # appending the next candidate number
                        root = root[:match.start()]
                    chunk_name = "{}_chunk_{}{}".format(root, i, ext)
                _log.info("writing chunk '{}'".format(chunk_name))
                SeqIO.write(chunk_out, chunk_name, "fasta")
                all_chunk_name.append(chunk_name)
        return all_chunk_name
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    :param str args: the arguments passed on the command line
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among
                    'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    """
    global _log
    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################
    # need to create directory before to init logger
    # as we write log in integron_finder.out in this dir
    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    else:
        if not os.path.isdir(config.outdir):
            msg = "outdir '{}' already exists and is not a directory".format(config.outdir)
            # we cannot log it because loggers are not initialized yet.
            raise IsADirectoryError(msg)

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    else:
        if not os.path.isdir(config.result_dir):
            # BUGFIX: the message formatted config.outdir instead of the
            # offending config.result_dir
            msg = "result dir '{}' already exists and is not a directory".format(config.result_dir)
            # we cannot log it because loggers are not initialized yet.
            raise IsADirectoryError(msg)
        elif not os.access(config.result_dir, os.W_OK):
            # BUGFIX: same wrong variable here (config.outdir -> config.result_dir)
            msg = "result dir '{}' already exists and is not writable".format(config.result_dir)
            # we cannot log it because loggers are not initialized yet.
            raise PermissionError(msg)

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file, out=not config.mute)
    _log = colorlog.getLogger('integron_finder')
    if not loglevel:
        # logs are specified from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    if config.cmsearch is None:
        msg = """cannot find 'cmsearch' in PATH.
Please install infernal package or setup 'cmsearch' binary path with --cmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)
    if config.hmmsearch is None:
        msg = """cannot find 'hmmsearch' in PATH.
Please install hmmer package or setup 'hmmsearch' binary path with --hmmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)
    if config.prodigal is None:
        msg = """cannot find 'prodigal' in PATH.
Please install prodigal package or setup 'prodigal' binary path with --prodigal option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    ################
    # print Header #
    ################
    log_header = colorlog.getLogger('integron_finder.header')
    logging = colorlog.logging.logging
    handlers = []
    header_log_file = logging.FileHandler(log_file)
    handlers.append(header_log_file)
    if not config.mute:
        header_stream = colorlog.StreamHandler(sys.stdout)
        handlers.append(header_stream)
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(colorlog.logging.logging.INFO)
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(config.input_seq_path,
                             dist_threshold=config.distance_threshold) as sequences_db:
        ################
        # set topology #
        ################
        default_topology = 'circ' if len(sequences_db) == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology, topology_file=config.topology_file)
        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        sequences_db_len = len(sequences_db)
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is not None:
                _log.info("############ Processing replicon {} ({}/{}) ############\n".format(
                    replicon.id, rep_no, sequences_db_len))
                integron_res, summary = find_integron_in_one_replicon(replicon, config)
                if integron_res:
                    all_integrons.append(integron_res)
                if summary:
                    all_summaries.append(summary)
            else:
                _log.warning("############ Skipping replicon {}/{} ############".format(
                    rep_no, sequences_db_len))

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(config.result_dir,
                                         utils.get_name_from_path(config.input_seq_path))
        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file, sep="\t", index=False, na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
        merged_summary_file = outfile_base_name + ".summary"
        # NOTE(review): this guard tests agg_integrons, not agg_summary —
        # presumably a summary is only meaningful when integrons were found;
        # confirm this is intentional.
        if not agg_integrons.empty:
            agg_summary.to_csv(merged_summary_file, sep="\t", index=False, na_rep="NA",
                               columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])
        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name
                # that a replicon result file
                os.unlink(_file)