Exemple #1
0
def split(replicon_path, chunk=None, outdir='.'):
    """
    Split the replicon file in *chunk* chunks and write each chunk in a fasta file.

    Naming of the chunk files:

    * when each chunk holds a single sequence (*chunk* is not set, or the
      computed chunk size is 1) the file is named ``<sequence id>.fst``;
    * otherwise the file is named ``<input file name>_chunk_<chunk number>.fst``.

    An existing file is never overwritten: while the target path exists,
    a trailing ``_chunk_<digits>`` is stripped from the name (if present) and
    ``_chunk_<i>`` is appended, with *i* = 1, 2, ... until a free name is found.

    Sequences yielded as ``None`` by the iterator are skipped (a warning is
    logged) and a chunk with no valid sequence produces no file.

    :param str replicon_path: The path to the replicon file.
    :param int chunk: The number of chunk desire (chunk > 0).
                      If it is not set, one file per sequence is written.
    :param str outdir: The path of a directory where to write chunk files.
                       The directory must exists.
    :return: The name of all chunks created.
    :rtype: List of strings.
    """
    def grouper(sequences_db, chunk_size):
        """
        Group the sequences by batch of *chunk_size*.

        :param sequences_db: The sequences to group
        :type sequences_db: A :class:`integron_finder.utils.FastaIterator` object.
        :param int chunk_size: The number of sequence by Chunk file.
        :return: a chunk of sequences.
        :rtype: An iterator of tuples.
        """
        # the *same* iterator is repeated chunk_size times, so zip_longest
        # consumes chunk_size consecutive items per tuple;
        # the last tuple is padded with None.
        args = [iter(sequences_db)] * chunk_size
        return zip_longest(*args)

    # raw string: '\d' in a regular string literal is an invalid escape sequence
    chunk_suffix = re.compile(r"_chunk_\d+$")

    with utils.FastaIterator(replicon_path) as sequences_db:
        sequences_db_len = len(sequences_db)
        if not chunk:
            chunk_size = 1
        else:
            chunk_size = math.ceil(sequences_db_len / chunk)

        chunks = grouper(sequences_db, chunk_size)
        all_chunk_name = []
        for chunk_no, chunk_in in enumerate(chunks, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            chunk_out = []
            for rep_no, replicon in enumerate(chunk_in, 1):
                if replicon is not None:
                    replicon_name = replicon.id
                    chunk_out.append(replicon)
                else:
                    rep_no_in_db = (chunk_no - 1) * chunk_size + rep_no
                    # a None beyond the end of the db is only zip_longest padding,
                    # not a rejected sequence, so it is not reported
                    if rep_no_in_db <= sequences_db_len:
                        _log.warning(
                            "Skipping replicon {}/{} in chunk {}".format(
                                rep_no_in_db, sequences_db_len, chunk_no))
            if chunk_out:
                if chunk_size == 1:
                    # only one sequence in the chunk: replicon_name is its id
                    chunk_name = "{}.fst".format(replicon_name)
                else:
                    replicon_name = utils.get_name_from_path(replicon_path)
                    chunk_name = "{}_chunk_{}.fst".format(
                        replicon_name, chunk_no)
                chunk_name = os.path.join(outdir, chunk_name)
                i = 0
                # do not overwrite an existing file: look for a free name
                while os.path.exists(chunk_name):
                    root, ext = os.path.splitext(chunk_name)
                    i += 1
                    match = chunk_suffix.search(root)
                    if match:
                        root = root[:match.start()]
                    chunk_name = "{}_chunk_{}{}".format(root, i, ext)

                _log.info("writing chunk '{}'".format(chunk_name))
                SeqIO.write(chunk_out, chunk_name, "fasta")
                all_chunk_name.append(chunk_name)
    return all_chunk_name
    def test_FastaIterator(self):
        file_name = 'multi_fasta'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        topologies = Topology('lin')
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_id = sorted([seq.id for seq in seq_db])

        expected_seq_id = sorted(
            ['ACBA.007.P01_13', 'LIAN.001.C02_10', 'PSSU.001.C01_13'])
        self.assertListEqual(expected_seq_id, received_seq_id)
        self.assertEqual(len(seq_db), 3)

        expected_seq_name = expected_seq_id
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_name = sorted([seq.name for seq in seq_db])
        self.assertListEqual(expected_seq_name, received_seq_name)

        replicon_name = 'foo'
        with utils.FastaIterator(replicon_path,
                                 replicon_name=replicon_name) as seq_db:
            seq_db.topologies = topologies
            received_seq_id = set([seq.name for seq in seq_db])
        expected_seq_name = set([replicon_name])
        self.assertSetEqual(expected_seq_name, received_seq_id)

        with utils.FastaIterator(replicon_path) as seq_db:
            received_seq_top = [seq.topology for seq in seq_db]
        expected_seq_top = ['lin', 'lin', 'lin']
        self.assertListEqual(expected_seq_top, received_seq_top)

        topologies_data = {
            'ACBA.007.P01_13': 'lin',
            'LIAN.001.C02_10': 'circ',
            'PSSU.001.C01_13': 'lin',
        }
        with tempfile.NamedTemporaryFile(mode='w') as topology_file:
            for rep, topo in topologies_data.items():
                topology_file.write("{} {}\n".format(rep, topo))
            topology_file.flush()
            topologies = Topology('lin', topology_file=topology_file.name)
            with utils.FastaIterator(replicon_path) as seq_db:
                seq_db.topologies = topologies
                received_seq_top = {seq.id: seq.topology for seq in seq_db}
            self.assertDictEqual(topologies_data, received_seq_top)

        file_name = 'acba_short'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        topologies = Topology('circ')
        with utils.FastaIterator(replicon_path) as seq_db:
            seq_db.topologies = topologies
            received_seq_top = [seq.topology for seq in seq_db]
        expected_seq_top = ['lin']
        self.assertListEqual(expected_seq_top, received_seq_top)

        file_name = 'replicon_ambiguous_char'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        with utils.FastaIterator(replicon_path) as seq_db:
            received_seq_id = sorted([seq.id for seq in seq_db if seq])
        expected_seq_id = sorted(['seq_1', 'seq_2', 'seq_3', 'seq_4'])
        self.assertListEqual(expected_seq_id, received_seq_id)

        file_name = 'replicon_bad_char'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        expected_warning = """sequence seq_(3|4) contains invalid characters, the sequence is skipped.
sequence seq_(3|4) contains invalid characters, the sequence is skipped."""
        with utils.FastaIterator(replicon_path) as seq_db:
            # 2 sequences are rejected so 2 message is produced (for seq 3 and seq 4)
            with self.catch_log() as log:
                received_seq_id = sorted([seq.id for seq in seq_db if seq])
                got_warning = log.get_value().strip()
        self.assertRegex(got_warning, expected_warning)
        expected_seq_id = sorted(['seq_1', 'seq_2'])
        self.assertListEqual(expected_seq_id, received_seq_id)

        file_name = 'replicon_too_short'
        replicon_path = self.find_data(
            os.path.join('Replicons', file_name + '.fst'))
        expected_warning = """sequence seq_(4|2) is too short \(32 bp\), the sequence is skipped \(must be > 50bp\).
sequence seq_(4|2) is too short \(32 bp\), the sequence is skipped \(must be > 50bp\)."""
        with utils.FastaIterator(replicon_path) as seq_db:
            # 2 sequences are rejected so 2 messages are produced (for seq 2 & 4)
            with self.catch_log() as log:
                received_seq_id = sorted([seq.id for seq in seq_db if seq])
                got_warning = log.get_value().strip()

        self.assertRegex(got_warning, expected_warning)
        expected_seq_id = sorted(['seq_1', 'seq_3'])
        self.assertListEqual(expected_seq_id, received_seq_id)
Exemple #3
0
def main(args=None, loglevel=None):
    """
    main entry point to integron_finder

    Parse the command line, create the output directories, initialize the
    loggers, check that the external programs are available, search integrons
    in each replicon of the input file and, unless ``split_results`` is set,
    merge the per replicon results in one ``.integrons`` and one ``.summary`` file.

    :param str args: the arguments passed on the command line
                     (``sys.argv[1:]`` is used when it is None)
    :param loglevel: the output verbosity
    :type loglevel: a positive int or a string among 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'
    :raise IsADirectoryError: if outdir or result dir exists but is not a directory
    :raise PermissionError: if result dir exists but is not writable
    :raise RuntimeError: if cmsearch, hmmsearch or prodigal cannot be found
    """
    global _log

    args = sys.argv[1:] if args is None else args
    config = parse_args(args)

    ###################################
    # Prepare directories for results #
    ###################################

    # need to create directory before to init logger
    # as we write log in integron_finder.out in this dir
    # for the same reason errors raised here cannot be logged

    if not os.path.exists(config.outdir):
        os.mkdir(config.outdir)
    elif not os.path.isdir(config.outdir):
        # IsADirectoryError is kept (although the path is *not* a directory)
        # as callers may rely on this exception type
        msg = "outdir '{}' already exists and is not a directory".format(
            config.outdir)
        raise IsADirectoryError(msg)

    if not os.path.exists(config.result_dir):
        os.mkdir(config.result_dir)
    elif not os.path.isdir(config.result_dir):
        # report the path that was actually checked (result_dir, not outdir)
        msg = "result dir '{}' already exists and is not a directory".format(
            config.result_dir)
        raise IsADirectoryError(msg)
    elif not os.access(config.result_dir, os.W_OK):
        msg = "result dir '{}' already exists and is not writable".format(
            config.result_dir)
        raise PermissionError(msg)

    ####################
    # init the loggers #
    ####################
    log_file = os.path.join(config.result_dir, 'integron_finder.out')
    integron_finder.init_logger(log_file=log_file, out=not config.mute)

    _log = colorlog.getLogger('integron_finder')

    if not loglevel:
        # logs are specify from args options
        logger_set_level(config.log_level)
    else:
        # used by unit tests to mute or unmute logs
        logger_set_level(loglevel)

    #######################################
    # do last config check before running #
    #######################################
    # (name of the binary / config attribute / option, package providing it)
    required_binaries = (
        ('cmsearch', 'infernal'),
        ('hmmsearch', 'hmmer'),
        ('prodigal', 'prodigal'),
    )
    for binary, package in required_binaries:
        if getattr(config, binary) is None:
            msg = ("cannot find '{0}' in PATH.\n"
                   "Please install {1} package or setup '{0}' binary path "
                   "with --{0} option").format(binary, package)
            _log.critical(msg)
            raise RuntimeError(msg)

    ################
    # print Header #
    ################
    # the header uses a dedicated logger with a bare "%(message)s" format;
    # it is written in the log file and, unless muted, on stdout
    log_header = colorlog.getLogger('integron_finder.header')
    logging = colorlog.logging.logging
    handlers = []
    header_log_file = logging.FileHandler(log_file)
    handlers.append(header_log_file)
    if not config.mute:
        header_stream = colorlog.StreamHandler(sys.stdout)
        handlers.append(header_stream)
    formatter = colorlog.ColoredFormatter("%(message)s")
    for h in handlers:
        h.setFormatter(formatter)
        log_header.addHandler(h)
    log_header.setLevel(logging.INFO)
    # do not forward the header to the parent logger handlers
    log_header.propagate = False
    log_header.info(header(args))

    with utils.FastaIterator(
            config.input_seq_path,
            dist_threshold=config.distance_threshold) as sequences_db:
        sequences_db_len = len(sequences_db)

        ################
        # set topology #
        ################
        # by default a single replicon is considered as circular,
        # several replicons as linear
        default_topology = 'circ' if sequences_db_len == 1 else 'lin'
        if config.linear:
            default_topology = 'lin'
        elif config.circular:
            default_topology = 'circ'
        # the both options are mutually exclusive
        topologies = Topology(default_topology,
                              topology_file=config.topology_file)

        # allow sequences_db to inject topology information
        # in seq.topology attribute
        sequences_db.topologies = topologies

        ##############
        # do the job #
        ##############
        all_integrons = []
        all_summaries = []
        for rep_no, replicon in enumerate(sequences_db, 1):
            # if replicon contains illegal characters
            # or replicon is too short < 50 bp
            # then replicon is None
            if replicon is not None:
                _log.info(
                    "############ Processing replicon {} ({}/{}) ############\n"
                    .format(replicon.id, rep_no, sequences_db_len))
                integron_res, summary = find_integron_in_one_replicon(
                    replicon, config)
                if integron_res:
                    all_integrons.append(integron_res)
                if summary:
                    all_summaries.append(summary)
            else:
                _log.warning(
                    "############ Skipping replicon {}/{} ############".format(
                        rep_no, sequences_db_len))

    if not config.split_results:
        _log.info("Merging integrons results.\n")
        agg_integrons = results.merge_results(*all_integrons)
        agg_summary = results.merge_results(*all_summaries)
        outfile_base_name = os.path.join(
            config.result_dir, utils.get_name_from_path(config.input_seq_path))
        merged_integron_file = outfile_base_name + ".integrons"
        if not agg_integrons.empty:
            agg_integrons.to_csv(merged_integron_file,
                                 sep="\t",
                                 index=False,
                                 na_rep="NA")
        else:
            with open(merged_integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
        merged_summary_file = outfile_base_name + ".summary"
        # NOTE(review): the summary is written only when *agg_integrons* is not
        # empty (not agg_summary); looks intentional but confirm.
        if not agg_integrons.empty:
            agg_summary.to_csv(merged_summary_file,
                               sep="\t",
                               index=False,
                               na_rep="NA",
                               columns=[
                                   'ID_replicon', 'ID_integron', 'complete',
                                   'In0', 'CALIN'
                               ])

        # the per replicon result files are merged, remove them
        for _file in all_integrons + all_summaries:
            if _file != merged_integron_file and _file != merged_summary_file:
                # in special case where the merged file has the same name that a replicon result file
                os.unlink(_file)