Example #1
0
def split(replicon_path, chunk=None, outdir='.'):
    """
    Split the replicon file in *chunk* chunks and write each chunk in a fasta file.

    Naming of the chunk files:

    * when a chunk holds a single sequence (chunk size of 1, which is also the
      default when *chunk* is not set), the file is named after the sequence
      id: ``<sequence id>.fst``
    * otherwise the file is named after the input file:
      ``<input name>_chunk_<chunk number>.fst``

    An existing file is never overwritten: if the target name already exists,
    a trailing ``_chunk_<n>`` is removed from the name (if present) and
    ``_chunk_<i>`` is appended, with *i* = 1, 2, ... until a free name is found.

    Sequences yielded as ``None`` by the FastaIterator are skipped
    (a warning is logged) and a chunk with no valid sequence produces no file.

    :param str replicon_path: The path to the replicon file.
    :param int chunk: The number of chunks desired (chunk > 0).
                      If not set, one file per sequence is written.
    :param str outdir: The path of a directory where to write chunk files.
                       The directory must exist.
    :return: The name of all chunks created.
    :rtype: List of strings.
    """
    def grouper(sequences_db, chunk_size):
        """
        Group the sequences by tuples of *chunk_size* elements.
        The last tuple is padded with None if there are not enough sequences.

        :param sequences_db: The sequences to group
        :type sequences_db: A :class:`integron_finder.utils.FastaIterator` object.
        :param int chunk_size: The number of sequence by Chunk file.
        :return: a chunk of sequences.
        :rtype: An iterator of tuples.
        """
        # the SAME iterator is repeated chunk_size times, so zip_longest
        # consumes chunk_size consecutive sequences to build each tuple.
        args = [iter(sequences_db)] * chunk_size
        return zip_longest(*args)

    with utils.FastaIterator(replicon_path) as sequences_db:
        sequences_db_len = len(sequences_db)
        if not chunk:
            chunk_size = 1
        else:
            chunk_size = math.ceil(sequences_db_len / chunk)

        chunks = grouper(sequences_db, chunk_size)
        all_chunk_name = []
        for chunk_no, chunk_in in enumerate(chunks, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            chunk_out = []
            for rep_no, replicon in enumerate(chunk_in, 1):
                if replicon is not None:
                    replicon_name = replicon.id
                    chunk_out.append(replicon)
                else:
                    # None is either a rejected replicon or the padding added
                    # by zip_longest at the end of the last chunk.
                    # Only the former has a rank inside the db and is reported.
                    rep_no_in_db = (chunk_no - 1) * chunk_size + rep_no
                    if rep_no_in_db <= sequences_db_len:
                        _log.warning("Skipping replicon {}/{} in chunk {}".format(rep_no_in_db,
                                                                                  sequences_db_len,
                                                                                  chunk_no))
            if chunk_out:
                if chunk_size == 1:
                    # chunk_out holds exactly one replicon, replicon_name is its id
                    chunk_name = "{}.fst".format(replicon_name)
                else:
                    replicon_name = utils.get_name_from_path(replicon_path)
                    chunk_name = "{}_chunk_{}.fst".format(replicon_name, chunk_no)
                chunk_name = os.path.join(outdir, chunk_name)
                i = 0
                # do not overwrite an existing file: look for the first free name
                while os.path.exists(chunk_name):
                    root, ext = os.path.splitext(chunk_name)
                    i += 1
                    # raw string: '\d' in a regular string is an invalid escape sequence
                    match = re.search(r"_chunk_\d+$", root)
                    if match:
                        root = root[:match.start()]
                    chunk_name = "{}_chunk_{}{}".format(root, i, ext)

                _log.info("writing chunk '{}'".format(chunk_name))
                SeqIO.write(chunk_out, chunk_name, "fasta")
                all_chunk_name.append(chunk_name)
    return all_chunk_name
 def test_get_name_from_path(self):
     """Check that get_name_from_path returns the file name without directory nor extension."""
     # (path given to get_name_from_path, expected name)
     expectations = (
         ('/foo/bar.baz', 'bar'),
         ('bar.baz', 'bar'),
         ('../foo/bar.baz', 'bar'),
         ('../foo/bar', 'bar'),
     )
     for path, expected_name in expectations:
         self.assertEqual(utils.get_name_from_path(path), expected_name)
Example #3
0
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    The steps are:

    1. parse the command line
    2. create the output directory and the result directory if they do not exist
    3. initialize the loggers (the log file is written in the result directory)
    4. check that cmsearch, hmmsearch and prodigal have been found
    5. log the header
    6. search integrons in each replicon of the input sequence file
    7. unless ``config.split_results`` is set, merge the results of all replicons
       in one *.integrons* and one *.summary* file and remove the per replicon files.

    :param args: the arguments passed on the command line.
                 If None ``sys.argv[1:]`` is used.
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raise IsADirectoryError: if outdir or result dir exists and is not a directory.
    :raise PermissionError: if result dir exists and is not writable.
    :raise RuntimeError: if cmsearch, hmmsearch or prodigal is not set in the config.
    """
    global _log

    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################

    # need to create directory before to init logger
    # as we write log in integron_finder.out in this dir

    # NOTE(review): IsADirectoryError is raised when the path is NOT a directory.
    # NotADirectoryError would be more accurate, but the exception type is kept
    # as is because callers may rely on it.
    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    else:
        if not os.path.isdir(config.outdir):
            msg = "outdir '{}' already exists and is not a directory".format(config.outdir)
            # _log.critical(msg)
            # we can not log it because logger are not initialized yet.
            raise IsADirectoryError(msg)

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    else:
        if not os.path.isdir(config.result_dir):
            # report the path which is actually checked: result_dir (not outdir)
            msg = "result dir '{}' already exists and is not a directory".format(config.result_dir)
            # _log.critical(msg)
            # we can not log it because logger are not initialized yet.
            raise IsADirectoryError(msg)
        elif not os.access(config.result_dir, os.W_OK):
            msg = "result dir '{}' already exists and is not writable".format(config.result_dir)
            # _log.critical(msg)
            # we can not log it because logger are not initialized yet.
            raise PermissionError(msg)

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file,
                                out=not config.mute)

    _log = colorlog.getLogger('integron_finder')

    if not loglevel:
        # logs are specify from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    if config.cmsearch is None:
        msg = """cannot find 'cmsearch' in PATH.
Please install infernal package or setup 'cmsearch' binary path with --cmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    if config.hmmsearch is None:
        msg = """cannot find 'hmmsearch' in PATH.
Please install hmmer package or setup 'hmmsearch' binary path with --hmmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    if config.prodigal is None:
        msg = """cannot find 'prodigal' in PATH.
Please install prodigal package or setup 'prodigal' binary path with --prodigal option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    ################
    # print Header #
    ################
    # the header uses a dedicated logger with a bare "%(message)s" format.
    # It does not propagate, so the header is not emitted a second time
    # by the handlers of the parent loggers.
    log_header = colorlog.getLogger('integron_finder.header')
    logging = colorlog.logging.logging
    handlers = []
    header_log_file = logging.FileHandler(log_file)
    handlers.append(header_log_file)
    if not config.mute:
        header_stream = colorlog.StreamHandler(sys.stdout)
        handlers.append(header_stream)
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(colorlog.logging.logging.INFO)
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(config.input_seq_path, dist_threshold=config.distance_threshold) as sequences_db:
        ################
        # set topology #
        ################
        # by default a file with only one sequence is considered as circular
        # and a file with several sequences as linear
        default_topology = 'circ' if len(sequences_db) == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology, topology_file=config.topology_file)

        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        sequences_db_len = len(sequences_db)
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is not None:
                _log.info("############ Processing replicon {} ({}/{}) ############\n".format(replicon.id,
                                                                                              rep_no,
                                                                                              sequences_db_len))
                integron_res, summary = find_integron_in_one_replicon(replicon, config)
                if integron_res:
                    all_integrons.append(integron_res)
                if summary:
                    all_summaries.append(summary)
            else:
                _log.warning("############ Skipping replicon {}/{} ############".format(rep_no,
                                                                                        sequences_db_len))

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        # presumably merge_results returns pandas DataFrames
        # (.empty and .to_csv are used below) -- TODO confirm
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(config.result_dir, utils.get_name_from_path(config.input_seq_path))
        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file, sep="\t", index=False, na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
        merged_summary_file = outfile_base_name + ".summary"
        # NOTE(review): the summary is written only if some integrons were found
        # (test on agg_integrons, not on agg_summary) -- confirm it is intended.
        if not agg_integrons.empty:
            agg_summary.to_csv(merged_summary_file, sep="\t", index=False, na_rep="NA",
                               columns=['ID_replicon', 'ID_integron', 'complete', 'In0', 'CALIN'])

        # the per replicon result files are removed once they have been merged
        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name that a replicon result file
                os.unlink(_file)
Example #4
0
def split(replicon_path, chunk=None, outdir='.'):
    """
    Split the replicon file in *chunk* chunks and write each chunk in a fasta file.

    Naming of the chunk files:

    * when a chunk holds a single sequence (chunk size of 1, which is also the
      default when *chunk* is not set), the file is named after the sequence
      id: ``<sequence id>.fst``
    * otherwise the file is named after the input file:
      ``<input name>_chunk_<chunk number>.fst``

    An existing file is never overwritten: if the target name already exists,
    a trailing ``_chunk_<n>`` is removed from the name (if present) and
    ``_chunk_<i>`` is appended, with *i* = 1, 2, ... until a free name is found.

    Sequences yielded as ``None`` by the FastaIterator are skipped
    (a warning is logged) and a chunk with no valid sequence produces no file.

    :param str replicon_path: The path to the replicon file.
    :param int chunk: The number of chunks desired (chunk > 0).
                      If not set, one file per sequence is written.
    :param str outdir: The path of a directory where to write chunk files.
                       The directory must exist.
    :return: The name of all chunks created.
    :rtype: List of strings.
    """
    def grouper(sequences_db, chunk_size):
        """
        Group the sequences by tuples of *chunk_size* elements.
        The last tuple is padded with None if there are not enough sequences.

        :param sequences_db: The sequences to group
        :type sequences_db: A :class:`integron_finder.utils.FastaIterator` object.
        :param int chunk_size: The number of sequence by Chunk file.
        :return: a chunk of sequences.
        :rtype: An iterator of tuples.
        """
        # the SAME iterator is repeated chunk_size times, so zip_longest
        # consumes chunk_size consecutive sequences to build each tuple.
        args = [iter(sequences_db)] * chunk_size
        return zip_longest(*args)

    with utils.FastaIterator(replicon_path) as sequences_db:
        sequences_db_len = len(sequences_db)
        if not chunk:
            chunk_size = 1
        else:
            chunk_size = math.ceil(sequences_db_len / chunk)

        chunks = grouper(sequences_db, chunk_size)
        all_chunk_name = []
        for chunk_no, chunk_in in enumerate(chunks, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            chunk_out = []
            for rep_no, replicon in enumerate(chunk_in, 1):
                if replicon is not None:
                    replicon_name = replicon.id
                    chunk_out.append(replicon)
                else:
                    # None is either a rejected replicon or the padding added
                    # by zip_longest at the end of the last chunk.
                    # Only the former has a rank inside the db and is reported.
                    rep_no_in_db = (chunk_no - 1) * chunk_size + rep_no
                    if rep_no_in_db <= sequences_db_len:
                        _log.warning(
                            "Skipping replicon {}/{} in chunk {}".format(
                                rep_no_in_db, sequences_db_len, chunk_no))
            if chunk_out:
                if chunk_size == 1:
                    # chunk_out holds exactly one replicon, replicon_name is its id
                    chunk_name = "{}.fst".format(replicon_name)
                else:
                    replicon_name = utils.get_name_from_path(replicon_path)
                    chunk_name = "{}_chunk_{}.fst".format(
                        replicon_name, chunk_no)
                chunk_name = os.path.join(outdir, chunk_name)
                i = 0
                # do not overwrite an existing file: look for the first free name
                while os.path.exists(chunk_name):
                    root, ext = os.path.splitext(chunk_name)
                    i += 1
                    # raw string: '\d' in a regular string is an invalid escape sequence
                    match = re.search(r"_chunk_\d+$", root)
                    if match:
                        root = root[:match.start()]
                    chunk_name = "{}_chunk_{}{}".format(root, i, ext)

                _log.info("writing chunk '{}'".format(chunk_name))
                SeqIO.write(chunk_out, chunk_name, "fasta")
                all_chunk_name.append(chunk_name)
    return all_chunk_name
Example #5
0
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    The steps are:

    1. parse the command line
    2. create the output directory and the result directory if they do not exist
    3. initialize the loggers (the log file is written in the result directory)
    4. check that cmsearch, hmmsearch and prodigal have been found
    5. log the header
    6. search integrons in each replicon of the input sequence file
    7. unless ``config.split_results`` is set, merge the results of all replicons
       in one *.integrons* and one *.summary* file and remove the per replicon files.

    :param args: the arguments passed on the command line.
                 If None ``sys.argv[1:]`` is used.
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raise IsADirectoryError: if outdir or result dir exists and is not a directory.
    :raise PermissionError: if result dir exists and is not writable.
    :raise RuntimeError: if cmsearch, hmmsearch or prodigal is not set in the config.
    """
    global _log

    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################

    # need to create directory before to init logger
    # as we write log in integron_finder.out in this dir

    # NOTE(review): IsADirectoryError is raised when the path is NOT a directory.
    # NotADirectoryError would be more accurate, but the exception type is kept
    # as is because callers may rely on it.
    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    else:
        if not os.path.isdir(config.outdir):
            msg = "outdir '{}' already exists and is not a directory".format(
                config.outdir)
            # _log.critical(msg)
            # we can not log it because logger are not initialized yet.
            raise IsADirectoryError(msg)

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    else:
        if not os.path.isdir(config.result_dir):
            # report the path which is actually checked: result_dir (not outdir)
            msg = "result dir '{}' already exists and is not a directory".format(
                config.result_dir)
            # _log.critical(msg)
            # we can not log it because logger are not initialized yet.
            raise IsADirectoryError(msg)
        elif not os.access(config.result_dir, os.W_OK):
            msg = "result dir '{}' already exists and is not writable".format(
                config.result_dir)
            # _log.critical(msg)
            # we can not log it because logger are not initialized yet.
            raise PermissionError(msg)

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file, out=not config.mute)

    _log = colorlog.getLogger('integron_finder')

    if not loglevel:
        # logs are specify from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    if config.cmsearch is None:
        msg = """cannot find 'cmsearch' in PATH.
Please install infernal package or setup 'cmsearch' binary path with --cmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    if config.hmmsearch is None:
        msg = """cannot find 'hmmsearch' in PATH.
Please install hmmer package or setup 'hmmsearch' binary path with --hmmsearch option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    if config.prodigal is None:
        msg = """cannot find 'prodigal' in PATH.
Please install prodigal package or setup 'prodigal' binary path with --prodigal option"""
        _log.critical(msg)
        raise RuntimeError(msg)

    ################
    # print Header #
    ################
    # the header uses a dedicated logger with a bare "%(message)s" format.
    # It does not propagate, so the header is not emitted a second time
    # by the handlers of the parent loggers.
    log_header = colorlog.getLogger('integron_finder.header')
    logging = colorlog.logging.logging
    handlers = []
    header_log_file = logging.FileHandler(log_file)
    handlers.append(header_log_file)
    if not config.mute:
        header_stream = colorlog.StreamHandler(sys.stdout)
        handlers.append(header_stream)
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(colorlog.logging.logging.INFO)
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(
            config.input_seq_path,
            dist_threshold=config.distance_threshold) as sequences_db:
        ################
        # set topology #
        ################
        # by default a file with only one sequence is considered as circular
        # and a file with several sequences as linear
        default_topology = 'circ' if len(sequences_db) == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology,
                              topology_file=config.topology_file)

        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        sequences_db_len = len(sequences_db)
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is not None:
                _log.info(
                    "############ Processing replicon {} ({}/{}) ############\n"
                    .format(replicon.id, rep_no, sequences_db_len))
                integron_res, summary = find_integron_in_one_replicon(
                    replicon, config)
                if integron_res:
                    all_integrons.append(integron_res)
                if summary:
                    all_summaries.append(summary)
            else:
                _log.warning(
                    "############ Skipping replicon {}/{} ############".format(
                        rep_no, sequences_db_len))

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        # presumably merge_results returns pandas DataFrames
        # (.empty and .to_csv are used below) -- TODO confirm
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(
            config.result_dir, utils.get_name_from_path(config.input_seq_path))
        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file,
                                 sep="\t",
                                 index=False,
                                 na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
        merged_summary_file = outfile_base_name + ".summary"
        # NOTE(review): the summary is written only if some integrons were found
        # (test on agg_integrons, not on agg_summary) -- confirm it is intended.
        if not agg_integrons.empty:
            agg_summary.to_csv(merged_summary_file,
                               sep="\t",
                               index=False,
                               na_rep="NA",
                               columns=[
                                   'ID_replicon', 'ID_integron', 'complete',
                                   'In0', 'CALIN'
                               ])

        # the per replicon result files are removed once they have been merged
        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name that a replicon result file
                os.unlink(_file)
 def test_get_name_from_path(self):
     """Check that get_name_from_path returns the file name without directory nor extension."""
     # (path given to get_name_from_path, expected name)
     expectations = (
         ('/foo/bar.baz', 'bar'),
         ('bar.baz', 'bar'),
         ('../foo/bar.baz', 'bar'),
         ('../foo/bar', 'bar'),
     )
     for path, expected_name in expectations:
         self.assertEqual(utils.get_name_from_path(path), expected_name)