Ejemplo n.º 1
0
 def get_description(self, gene_id):
     """
     Look up the description of a protein in ``self._info``.

     The identifier is split on ``.`` into four fields. The first letter of
     the last field (b/i) is dropped and replaced by an optional word
     character in the search pattern, then the pattern is searched in the
     ``seq_id`` column. If several rows match, the first one is used.

     :param str gene_id: a protein/gene identifier
     :return: The description of the protein corresponding to the gene_id
     :rtype: :class:`SeqDesc` namedtuple object
     :raise IntegronError: when gene_id is not a valid Gembase gene identifier
     :raise KeyError: if gene_id is not found in GembaseDB instance
     """
     # Local import so the block does not depend on a module-level import.
     import re

     try:
         specie, date, strain, contig_gene = gene_id.split('.')
     except ValueError:
         raise IntegronError(
             "'{}' is not a valid Gembase protein identifier.".format(
                 gene_id))
     contig_gene = contig_gene[1:]  # remove the first letter b/i

     # Escape every field so that characters such as '*', '+' or '(' coming
     # from the identifier are matched literally and cannot alter the regex.
     # The template is a raw string: '\.' and '\w' are regex escapes, not
     # Python string escapes.
     pattern = r'{}\.{}\.{}\.\w?{}'.format(
         re.escape(specie),
         re.escape(date),
         re.escape(strain),
         re.escape(contig_gene))
     matches = self._info['seq_id'].str.contains(pattern, regex=True)
     seq_info = self._info.loc[matches]
     if seq_info.empty:
         raise KeyError(gene_id)
     return SeqDesc(
         seq_info.seq_id.values[0],
         # strand is stored as a letter; "D" maps to 1, anything else to -1
         1 if seq_info.strand.values[0] == "D" else -1,
         seq_info.start.values[0],
         seq_info.end.values[0],
     )
Ejemplo n.º 2
0
    def set_log_level(cls, level):
        """
        Translate *level* into a numeric logging level and pass it to
        ``logger_set_level``.

        :param level: either one of the level names 'NOTSET', 'DEBUG', 'INFO',
                      'WARNING', 'ERROR', 'CRITICAL', or an integer greater
                      than or equal to 0.
        :type level: str or int
        :raise IntegronError: if level is neither a known level name
                              nor a non-negative integer.
        """
        levels = {
            'NOTSET': colorlog.logging.logging.NOTSET,
            'DEBUG': colorlog.logging.logging.DEBUG,
            'INFO': colorlog.logging.logging.INFO,
            'WARNING': colorlog.logging.logging.WARNING,
            'ERROR': colorlog.logging.logging.ERROR,
            'CRITICAL': colorlog.logging.logging.CRITICAL,
        }
        if level in levels:
            level = levels[level]
        elif not isinstance(level, int) or level < 0:
            # The placeholder must be filled in, otherwise the user sees
            # a literal "{}" in the error message.
            raise IntegronError(
                "Level must be {} or a positive integer".format(
                    ', '.join(levels)))

        logger_set_level(level)
Ejemplo n.º 3
0
 def get_description(self, gene_id):
     """
     Build the description of a protein from its sequence header.

     The sequence is fetched with ``self[gene_id]`` and its ``description``
     is split on ``" # "``. The first four fields are read as the
     identifier, start, stop and strand; any further fields are ignored.

     :param str gene_id: a protein/gene identifier
     :returns: The description of the protein corresponding to the gene_id
     :rtype: :class:`SeqDesc` namedtuple object
     :raise IntegronError: when the description does not hold at least four
                           fields or when start, stop or strand is not
                           an integer
     :raise KeyError: if gene_id is not found in ProdigalDB instance
     """
     seq = self[gene_id]
     try:
         id_, start, stop, strand, *_ = seq.description.split(" # ")
         # Convert inside the try block: a non numeric field is a malformed
         # description as well and must be reported as an IntegronError,
         # not leak out as a bare ValueError.
         start = int(start)
         stop = int(stop)
         strand = int(strand)
     except ValueError:
         raise IntegronError(
             "'{}' is not a valid Prodigal protein identifier.".format(
                 gene_id))
     return SeqDesc(id_, strand, start, stop)
Ejemplo n.º 4
0
def merge_integrons(out_file, *in_dirs):
    """
    Merge every ``*.integrons`` file found in the given directories into
    one tab separated file.

    :param str out_file: The path to the merged file
    :param in_dirs: The path of the source directories
    :type in_dirs: list of str
    :return: The path to the merged file
    :raise IntegronError: when no ``.integrons`` file is found in in_dirs
    """
    integrons_files = [
        path
        for directory in in_dirs
        for path in glob.glob(os.path.join(directory, '*.integrons'))
    ]
    if not integrons_files:
        msg = "No integrons file to merge"
        _log.critical(msg)
        raise IntegronError(msg)

    merged = results.merge_results(*integrons_files)
    merged.to_csv(out_file, index=False, sep="\t", na_rep="NA")
    return out_file
Ejemplo n.º 5
0
def find_integron_in_one_replicon(replicon, config):
    """
    scan replicon for integron.

      * presence of integrase
      * presence of attC sites
      * presence of promoters and attI sites

    depending on the configuration

     * perform functional annotation

    produce a file containing presence of putative integrons

    depending on configuration

        * produce genbank file with replicon and annotations with integrons
        * produce schema of replicon with integrons (in pdf)

    Intermediate results are written in a temporary directory
    (``config.tmp_dir(replicon.id)``) which is removed at the end
    unless ``config.keep_tmp`` is set.

    :param replicon: the replicon to analyse.
    :type replicon: a :class:`Bio.SeqRecord` object.
    :param config: The configuration
    :type config: a :class:`integron_finder.config.Config` object.
    :returns: the path to the integron file (<replicon_id>.integrons)
              and the summary file (<replicon_id.summary>).
              if there is no integron the summary file is None.
              if the replicon is skipped (an EmptyFileError occurred)
              both values are empty strings.
    :rtype: tuple (str integron_file, str summary_file) or (str integron_file, None)
    :raise IntegronError: when functional annotation is requested but no
                          directory of hmm profiles can be located.
    """
    result_tmp_dir = config.tmp_dir(replicon.id)
    try:
        os.mkdir(result_tmp_dir)
    except FileExistsError:
        # The directory is reused when it already exists. Any other
        # failure (permission, missing parent, ...) is a real error and
        # must not be silently ignored.
        pass
    tmp_replicon_path = os.path.join(result_tmp_dir, replicon.id + '.fst')
    SeqIO.write(replicon, tmp_replicon_path, "fasta")
    # create attr path
    # used to generate protein file with prodigal
    replicon.path = tmp_replicon_path

    # func_annot_path is the canonical path for Functional_annotation
    # path_func_annot is the path provide on the command line
    if config.func_annot and not config.no_proteins and not config.path_func_annot:
        if os.path.exists('bank_hmm'):
            fa_hmm = scan_hmm_bank('bank_hmm')
        elif os.path.exists(config.func_annot_path):
            fa_hmm = scan_hmm_bank(config.func_annot_path)
        else:
            raise IntegronError(
                "the dir '{}' neither 'bank_hmm' exists, specify the location of hmm "
                "profile with --path-func-annot option".format(
                    config.func_annot_path))
        is_func_annot = True

    elif config.path_func_annot and config.no_proteins is False:
        fa_hmm = scan_hmm_bank(config.path_func_annot)
        is_func_annot = True
    else:
        is_func_annot = False

    # fa_hmm is only bound when is_func_annot is True,
    # so is_func_annot must always be tested first.
    if is_func_annot and not fa_hmm:
        _log.warning(
            "No hmm profiles for functional annotation detected, skip functional annotation step."
        )

    if config.gembase_path:
        protein_db = GembaseDB(replicon,
                               config,
                               gembase_path=config.gembase_path)
    elif config.gembase:
        protein_db = GembaseDB(replicon, config)
    else:
        protein_db = ProdigalDB(replicon, config)

    ##################
    # Default search #
    ##################
    intI_file = os.path.join(result_tmp_dir, replicon.id + "_intI.res")
    phageI_file = os.path.join(result_tmp_dir, replicon.id + "_phage_int.res")
    attC_default_file = os.path.join(result_tmp_dir,
                                     replicon.id + "_attc_table.res")

    try:
        if not config.no_proteins:
            # skip the integrase search when both result files
            # are already present in the temporary directory
            if not os.path.isfile(intI_file) or not os.path.isfile(
                    phageI_file):
                find_integrase(replicon.id, protein_db.protfile,
                               result_tmp_dir, config)
        _log.info("Starting Default search ... :")
        if not os.path.isfile(attC_default_file):
            # find attc with cmsearch
            find_attc(tmp_replicon_path,
                      replicon.name,
                      config.cmsearch,
                      result_tmp_dir,
                      config.model_attc_path,
                      incE=config.evalue_attc,
                      cpu=config.cpu)

        _log.info("Default search done... : ")
        integrons = find_integron(replicon, protein_db, attC_default_file,
                                  intI_file, phageI_file, config)

        #########################
        # Search with local_max #
        #########################
        if config.local_max:
            _log.info("Starting search with local_max...:")
            integron_max_path = os.path.join(result_tmp_dir,
                                             "integron_max.pickle")
            if not os.path.isfile(integron_max_path):
                circular = replicon.topology == 'circ'
                integron_max = find_attc_max(
                    integrons,
                    replicon,
                    config.distance_threshold,
                    config.model_attc_path,
                    max_attc_size=config.max_attc_size,
                    min_attc_size=config.min_attc_size,
                    circular=circular,
                    out_dir=result_tmp_dir,
                    cpu=config.cpu,
                    evalue_attc=config.evalue_attc)
                integron_max.to_pickle(integron_max_path)
                _log.info("Search with local_max done... :")

            else:
                # NOTE(review): unpickling executes arbitrary code, this is
                # only safe as long as the temporary directory is trusted.
                integron_max = pd.read_pickle(integron_max_path)
                # apply the current thresholds to the results read back
                attc_size = abs(integron_max.pos_end - integron_max.pos_beg)
                integron_max = integron_max[
                    (integron_max.evalue < config.evalue_attc)
                    & (attc_size < config.max_attc_size)
                    & (config.min_attc_size < attc_size)]
                _log.info(
                    "Search with local_max was already done, continue... :")

            integrons = find_integron(replicon, protein_db, integron_max,
                                      intI_file, phageI_file, config)

        ##########################
        # Add promoters and attI #
        ##########################
        for integron in integrons:
            integron_type = integron.type()
            # complete & CALIN
            if integron_type != "In0" and not config.no_proteins:
                _log.info("Adding proteins ... :")
                integron.add_proteins(protein_db)

            if config.promoter_attI:
                _log.info("Adding promoters and attI ... :")
                if integron_type == "complete":
                    integron.add_promoter()
                    integron.add_attI()
                elif integron_type == "In0":
                    integron.add_attI()
                    integron.add_promoter()
        #########################
        # Functional annotation #
        #########################
        if is_func_annot and fa_hmm:
            _log.info("Starting functional annotation ...:")
            func_annot(integrons, replicon, protein_db, fa_hmm, config,
                       result_tmp_dir)

        #######################
        # Writing out results #
        #######################
        _log.info("Writing out results for replicon {}".format(replicon.id))

        if config.pdf:
            # one pdf per complete integron, numbered from 1
            # according to its position in integrons
            for j, integron in enumerate(integrons, 1):
                if integron.type() == "complete":
                    integron.draw_integron(file=os.path.join(
                        config.result_dir, "{}_{}.pdf".format(replicon.id, j)))

        base_outfile = os.path.join(config.result_dir, replicon.id)
        integron_file = base_outfile + ".integrons"
        _log.debug("Writing integron_file {}".format(integron_file))
        if integrons:
            integrons_report = results.integrons_report(integrons)
            integrons_report.to_csv(integron_file,
                                    sep="\t",
                                    index=False,
                                    na_rep="NA")

            summary = results.summary(integrons_report)
            summary_file = base_outfile + ".summary"
            summary.to_csv(summary_file,
                           sep="\t",
                           na_rep="NA",
                           index=False,
                           columns=[
                               'ID_replicon', 'ID_integron', 'complete', 'In0',
                               'CALIN'
                           ])
            if config.gbk:
                add_feature(replicon, integrons_report, protein_db,
                            config.distance_threshold)
                SeqIO.write(
                    replicon,
                    os.path.join(config.result_dir, replicon.id + ".gbk"),
                    "genbank")
        else:
            with open(integron_file, "w") as out_f:
                out_f.write("# No Integron found\n")
            summary_file = None
    except integron_finder.EmptyFileError:
        # the replicon is skipped, empty paths are returned to the caller
        _log.warning('############ Skip replicon {} ############'.format(
            replicon.name))
        integron_file = ''
        summary_file = ''
    #########################
    # clean temporary files #
    #########################

    if not config.keep_tmp:
        try:
            shutil.rmtree(result_tmp_dir)
        except Exception as err:
            # best effort: a failure to clean up must not hide the results
            _log.warning("Cannot remove temporary results : '{} : {}'".format(
                result_tmp_dir, str(err)))

    return integron_file, summary_file