Beispiel #1
0
class Bioconvert(object):
    """Universal converter used by the standalone

    ::

        from bioconvert import Bioconvert
        c = Bioconvert("test.fastq", "test.fasta", threads=4, force=True)

    The constructor only selects and instantiates the converter; the
    conversion itself is triggered by calling the instance.

    """
    def __init__(self, infile, outfile, force=False, threads=None, extra=None):
        """.. rubric:: constructor

        :param infile: The path of the input file (str) or a list of paths.
        :param outfile: The path of the output file (str) or a list of paths.
        :param bool force: overwrite output file if it exists already
            otherwise raises an error
        :param threads: if not None, assigned to the ``threads`` attribute
            of the selected converter.
        :param extra: if truthy, assigned to the ``_extra_arguments``
            attribute of the selected converter.
        :raises ValueError: if an output file exists and *force* is False.
        :raises IOError: if an output file ends with ``.dsrc`` but not with
            ``.fastq.dsrc``.
        :raises Exception: if neither a direct converter nor a chain of
            converters is found for the requested formats.

        """
        # The input file is deliberately not checked for existence: there
        # are cases where the input parameter is just a prefix.

        # Work on lists internally so that single-file and multi-file calls
        # share the same code path.
        if isinstance(outfile, str):
            outfile = [outfile]

        if isinstance(infile, str):
            infile = [infile]

        # some checking on the output files (existence, special case of dsrc)
        for filename in outfile:
            if os.path.exists(filename):
                msg = "output file {} exists already.".format(filename)
                if force is False:
                    _log.critical(
                        "output file exists. If you are using bioconvert, use --force "
                    )
                    raise ValueError(msg)
                else:
                    _log.warning(msg + " --force used so will be over written")

            # Only fastq files can be compressed with dsrc: the dsrc
            # executable accepts only the .fastq file extension.
            if filename.endswith(".dsrc") and not filename.endswith(".fastq.dsrc"):
                msg = ("When compressing with .dsrc extension, "
                       "only files ending with .fastq extension are "
                       "accepted. This is due to the way dsrc executable "
                       "is implemented.")
                _log.critical(msg)
                # Attach the explanation to the exception so that callers
                # who do not see the log still know what went wrong.
                raise IOError(msg)

        Lin = len(infile)
        Lout = len(outfile)

        # example: fastq.gz to fasta.bz2
        # Here, we want to decompress, convert, compress, so the extensions
        # are first computed with the compression suffix removed.
        self.inext = [getext(filename, remove_compression=True)
                      for filename in infile]
        self.outext = [getext(filename, remove_compression=True)
                       for filename in outfile]

        # special case one to one for compression/decompression
        # e.g. fastq.gz to fastq.bz2: data is not changed, just the type of
        # compression, so the original extensions are kept.
        if Lin == Lout == 1 and self.inext == self.outext:
            _log.info("decompression/compression mode")
            self.inext = [getext(infile[0])]
            self.outext = [getext(outfile[0])]

        self.mapper = Registry()

        # If a converter name was given on the command line it takes
        # precedence over the file extensions. The candidates are collected
        # once, in command-line order, so that the selection is
        # deterministic when several names are present.
        converter_names = set(self.mapper.get_converters_names())
        requested = [arg for arg in sys.argv if arg in converter_names]

        if requested:
            in_fmt, out_fmt = ConvMeta.split_converter_to_format(requested[0])
        else:
            # get format from extensions
            in_fmt = [get_format_from_extension(x) for x in self.inext]
            out_fmt = [get_format_from_extension(x) for x in self.outext]

        # The formats are stored as tuples, exactly as returned above (no
        # case normalisation), and used as the key for the registry lookup.
        self.in_fmt = tuple(in_fmt)
        self.out_fmt = tuple(out_fmt)

        _log.info("Input: {}".format(self.in_fmt))
        _log.info("Output: {}".format(self.out_fmt))

        try:
            class_converter = self.mapper[(self.in_fmt, self.out_fmt)]
            # NOTE: ``name`` is only set when a direct converter exists.
            self.name = class_converter.__name__

        except KeyError:
            # No direct converter registered for this pair of formats:
            # try to find a path of converters instead.
            conv_path = self.mapper.conversion_path(self.in_fmt, self.out_fmt)
            _log.debug("path: {}".format(conv_path))
            if conv_path:
                _log.info("Direct conversion not implemented. "
                          "Chaining converters.")
                class_converter = make_chain([(pair, self.mapper[pair])
                                              for pair in conv_path])
            else:
                msg = "Requested input format ('{}') to output format ('{}') is not available in bioconvert".format(
                    self.in_fmt,
                    self.out_fmt,
                )
                _log.critical(msg)
                _log.critical(
                    "Use --formats to know the available formats and --help for examples"
                )
                raise Exception(msg)

        # FIXME: hack for the compression/decompression decorators:
        # a single file is passed as a plain value rather than a list.
        if Lin == 1:
            infile = infile[0]

        if Lout == 1:
            outfile = outfile[0]

        self.converter = class_converter(infile, outfile)

        # If threads were provided, we update the threads attribute
        if threads is not None:
            self.converter.threads = threads
        if extra:
            self.converter._extra_arguments = extra

        _log.info("Using {} class (with {} threads if needed)".format(
            self.converter.name, self.converter.threads))

    def __call__(self, *args, **kwargs):
        """Forward the call and all its arguments to the selected converter."""
        self.converter(*args, **kwargs)

    def boxplot_benchmark(self, *args, **kwargs):
        """Forward to the ``boxplot_benchmark`` method of the selected converter."""
        self.converter.boxplot_benchmark(*args, **kwargs)
Beispiel #2
0
class Bioconvert(object):
    """Universal converter used by the standalone

    ::

        from bioconvert import Bioconvert
        c = Bioconvert("test.fastq", "test.fasta")

    The constructor only selects and instantiates the converter; the
    conversion itself is triggered by calling the instance.

    """
    def __init__(self, infile, outfile, in_fmt=None, out_fmt=None, force=False):
        """.. rubric:: constructor

        :param str infile: The path of the input file.
        :param str outfile: The path of the output file.
        :param str in_fmt: the format for the input file; when None it is
            derived from the extension of *infile*.
        :param str out_fmt: the format for the output; when None it is
            derived from the extension of *outfile*.
        :param bool force: overwrite output file if it exists already
            otherwise raises an error
        :raises ValueError: if the output file exists and *force* is False.
        :raises IOError: if *outfile* ends with ``.dsrc`` but not with
            ``.fastq.dsrc``.
        :raises Exception: if neither a direct converter nor a chain of
            converters is found for the requested formats.

        """
        # The input file is deliberately not checked for existence: there
        # are cases where the input parameter is just a prefix.

        # check existence of output file. If it exists,
        # fails except if force argument is set to True
        if os.path.exists(outfile):
            msg = "output file {} exists already".format(outfile)
            _log.warning("output file exists already")
            if force is False:
                _log.critical("output file exists. If you are using bioconvert, use --force ")
                raise ValueError(msg)
            else:
                _log.warning("output file will be overwritten")

        # Only fastq files can be compressed with dsrc: the dsrc executable
        # accepts only the .fastq file extension.
        if outfile.endswith(".dsrc") and not outfile.endswith(".fastq.dsrc"):
            msg = ("When compressing with .dsrc extension, "
                   "only files ending with .fastq extension are "
                   "accepted. This is due to the way dsrc executable "
                   "is implemented.")
            _log.critical(msg)
            # Attach the explanation to the exception so that callers who
            # do not see the log still know what went wrong.
            raise IOError(msg)

        # Case 1: fastq.gz to fasta.bz2
        # Here, we want to decompress, convert, compress, so the extensions
        # are first computed with the compression suffix removed.
        self.inext = getext(infile, remove_compression=True)
        self.outext = getext(outfile, remove_compression=True)

        # Case 2: fastq.gz to fastq.bz2
        # data is not changed, just the type of compression, so the
        # original extensions are kept.
        if self.inext == self.outext:
            _log.info("decompression/compression mode")
            self.inext = getext(infile)
            self.outext = getext(outfile)

        self.mapper = Registry()

        # Resolve the formats before the guarded lookup, so that both
        # attributes always exist when the fallback branch reads them.
        if in_fmt is None:
            in_fmt = get_format_from_extension(self.inext)
        if out_fmt is None:
            out_fmt = get_format_from_extension(self.outext)
        self.in_fmt = in_fmt.upper()
        self.out_fmt = out_fmt.upper()
        _log.info("Input: %s", self.in_fmt)
        _log.info("Output: %s", self.out_fmt)

        # Only the registry lookup is guarded: a KeyError here means that
        # no direct converter is registered for this pair of formats.
        try:
            class_converter = self.mapper[(self.in_fmt, self.out_fmt)]
            # NOTE: ``name`` is only set when a direct converter exists.
            self.name = class_converter.__name__
        except KeyError:
            # Try to find a path of converters instead.
            conv_path = self.mapper.conversion_path(self.in_fmt, self.out_fmt)
            _log.debug("path: {}".format(conv_path))
            if conv_path:
                _log.info("Direct conversion not implemented. "
                          "Chaining converters.")
                class_converter = make_chain([
                    (pair, self.mapper[pair]) for pair in conv_path])
            else:
                msg = "Requested input format ('%s') to output format ('%s') is not available in bioconvert" %(
                    self.in_fmt,
                    self.out_fmt,
                )
                _log.critical(msg)
                _log.critical("Use --formats to know the available formats and --help for examples")
                raise Exception(msg)

        self.converter = class_converter(infile, outfile)
        _log.info("Using {} class".format(self.converter.name))

    def __call__(self, *args, **kwargs):
        """Forward the call and all its arguments to the selected converter."""
        self.converter(*args, **kwargs)

    def boxplot_benchmark(self, *args, **kwargs):
        """Forward to the ``boxplot_benchmark`` method of the selected converter."""
        self.converter.boxplot_benchmark(*args, **kwargs)