Beispiel #1
0
    def __init__(self, args):
        """Keep a copy of *args* and decide which parsing mode applies.

        The mode is ``"implicit"`` when the first argument is not the name
        of a registered converter and contains a dot; in every other case
        it is ``"explicit"``. Problems are reported through ``error``.

        :param args: the command-line arguments (sliced with ``args[:]``
            to store a copy in ``self.args``).
        """
        registry = Registry()
        self.args = args[:]
        if not self.args:
            error("Please provide at least some arguments. See --help")

        first = args[0]
        lowered = first.lower()
        known_converters = list(registry.get_converters_names())

        if lowered in known_converters or "." not in first:
            self.mode = "explicit"
        else:
            self.mode = "implicit"
            # implicit mode needs at least two entries: the input and the
            # output (1 to 1 or many to 1)
            if len(args) < 2:
                error("In implicit mode, you must define your input and output file (only 1 provided)")
        _log.debug("parsing mode {}".format(self.mode))
Beispiel #2
0
def get_output_format(input_value):
    """Build the dropdown options for the output formats of one input format.

    Every converter name returned by the registry is split on its first
    ``'2'`` (same rule as :func:`get_input_format`); the output part is kept
    when the input part is exactly ``input_value``. An exact comparison is
    used on purpose: a prefix test would also select converters of longer
    format names starting with the same letters (``bed`` / ``bedgraph``).

    :param input_value: name of the input format to look up. Values that
        match no converter (including ``None``) give an empty result.
    :return: one ``{'label': fmt, 'value': fmt}`` dictionary per distinct
        output format, sorted by format name.
    :rtype: list
    """
    try:
        registry = Registry()
        output_formats = set()
        for converter in registry.get_converters_names():
            # partition never raises on a name without '2'; such names have
            # an empty separator and are skipped.
            input_format, separator, output_format = converter.partition('2')
            if separator and input_format == input_value:
                output_formats.add(output_format)
        return [{'label': name, 'value': name}
                for name in sorted(output_formats)]
    except TypeError:
        # kept from the historical contract: a TypeError while collecting
        # the formats yields an empty option list instead of propagating
        return []
Beispiel #3
0
def get_input_format():
    """Build the options used to fill the input dropdown.

    The input format of a converter is the part of its name located before
    the first ``'2'``.

    :return: one ``{'label': fmt, 'value': fmt}`` dictionary per distinct
        input format found in the registry, sorted by format name (label
        and value are identical).
    :rtype: list
    """
    registry = Registry()
    converter_names = list(registry.get_converters_names())
    # the two-name unpacking is deliberate: a converter name without any '2'
    # raises ValueError here
    input_formats = {
        source
        for source, _ in (name.split('2', 1) for name in converter_names)
    }
    # the dropdown expects a list of dictionaries, sorted for display
    return [{'label': name, 'value': name} for name in sorted(input_formats)]
Beispiel #4
0
class Bioconvert(object):
    """Universal converter used by the standalone

    ::

        from bioconvert import Bioconvert
        c = Bioconvert("test.fastq", "test.fasta", threads=4, force=True)


    """
    def __init__(self, infile, outfile, force=False, threads=None, extra=None):
        """.. rubric:: constructor

        :param infile: The path of the input file, or a list of paths.
        :param outfile: The path of the output file, or a list of paths.
        :param bool force: overwrite output file if it exists already
            otherwise raises an error
        :param threads: when not None, stored in the ``threads`` attribute
            of the selected converter.
        :param extra: when truthy, stored in the ``_extra_arguments``
            attribute of the selected converter.
        :raises ValueError: an output file exists and *force* is False.
        :raises IOError: an output file ends with ``.dsrc`` but not with
            ``.fastq.dsrc``.
        :raises Exception: neither a direct converter nor a chain of
            converters is registered for the requested formats.
        """
        # The input path is deliberately not checked for existence: there
        # are cases where the input parameter is just a prefix.

        # Accept a single path as well as a list of paths.
        if isinstance(outfile, str):
            outfile = [outfile]

        if isinstance(infile, str):
            infile = [infile]

        # some checking on the output files (existence, special case of dsrc)
        for filename in outfile:
            if os.path.exists(filename):
                msg = "output file {} exists already.".format(filename)
                if force is False:
                    _log.critical(
                        "output file exists. If you are using bioconvert, use --force "
                    )
                    raise ValueError(msg)
                else:
                    _log.warning(msg + " --force used so will be over written")

            # Only fastq files can be compressed with dsrc: the dsrc
            # executable accepts only the .fastq file extension
            if filename.endswith(".dsrc") and \
                    not filename.endswith(".fastq.dsrc"):
                msg = ("When compressing with .dsrc extension, "
                       "only files ending with .fastq extension are "
                       "accepted. This is due to the way dsrc executable "
                       "is implemented.")
                _log.critical(msg)
                # carry the explanation in the exception as well as the log
                raise IOError(msg)

        Lin = len(infile)
        Lout = len(outfile)

        # Extensions without the compression suffix. Example: fastq.gz to
        # fasta.bz2 means decompress, convert, compress, so the conversion
        # itself must be selected from the extensions stripped of
        # .gz / .bz2.
        self.inext = [getext(filename, remove_compression=True)
                      for filename in infile]
        self.outext = [getext(filename, remove_compression=True)
                       for filename in outfile]

        # special case one to one for compression/decompression
        # e.g. fastq.gz to fastq.bz2: data is not changed, just the type of
        # compression, so we keep the original extensions (here inext and
        # outext will contain .gz and .bz2)
        if Lin == Lout == 1 and self.inext == self.outext:
            _log.info("decompression/compression mode")
            self.inext = [getext(infile[0])]
            self.outext = [getext(outfile[0])]

        self.mapper = Registry()

        # A converter name given on the command line takes precedence over
        # the file extensions.
        requested = list(
            set(self.mapper.get_converters_names()).intersection(sys.argv))
        if not requested:
            # get format from extensions
            in_fmt = [get_format_from_extension(x) for x in self.inext]
            out_fmt = [get_format_from_extension(x) for x in self.outext]
        else:
            in_fmt, out_fmt = ConvMeta.split_converter_to_format(requested[0])

        # Format names are stored exactly as returned above (no case
        # normalisation); the registry is queried with these tuples.
        self.in_fmt = tuple(in_fmt)
        self.out_fmt = tuple(out_fmt)

        _log.info("Input: {}".format(self.in_fmt))
        _log.info("Output: {}".format(self.out_fmt))

        try:
            class_converter = self.mapper[(self.in_fmt, self.out_fmt)]
            self.name = class_converter.__name__

        except KeyError:
            # No direct converter registered: try to find a path of
            # converters instead
            conv_path = self.mapper.conversion_path(self.in_fmt, self.out_fmt)
            _log.debug("path: {}".format(conv_path))
            if conv_path:
                _log.info("Direct conversion not implemented. "
                          "Chaining converters.")
                # implemented in bioconvert/core/base.py
                # using temporary files
                class_converter = make_chain([(pair, self.mapper[pair])
                                              for pair in conv_path])
            else:
                msg = "Requested input format ('{}') to output format ('{}') is not available in bioconvert".format(
                    self.in_fmt,
                    self.out_fmt,
                )
                _log.critical(msg)
                _log.critical(
                    "Use --formats to know the available formats and --help for examples"
                )
                raise Exception(msg)

        # FIXME: hack for the compression/decompression decorators:
        # single-file cases are handed over as plain paths, not lists
        if Lin == 1:
            infile = infile[0]

        if Lout == 1:
            outfile = outfile[0]

        self.converter = class_converter(infile, outfile)
        # If threads is provided, we update the threads attribute
        if threads is not None:
            self.converter.threads = threads
        if extra:
            self.converter._extra_arguments = extra

        _log.info("Using {} class (with {} threads if needed)".format(
            self.converter.name, self.converter.threads))

    def __call__(self, *args, **kwargs):
        """Call the selected converter with the given arguments.

        The value returned by the converter is not propagated.
        """
        self.converter(*args, **kwargs)

    def boxplot_benchmark(self, *args, **kwargs):
        """Forward to ``boxplot_benchmark`` of the selected converter.

        The value returned by the converter is not propagated.
        """
        self.converter.boxplot_benchmark(*args, **kwargs)
Beispiel #5
0
def main(args=None):
    """Entry point of the ``bioconvert`` command line.

    Steps, in order:

    1. implicit mode: when the first argument is not a registered converter
       and contains a dot, the converter is deduced from the extensions of
       the first two arguments and inserted in front of the arguments;
    2. the logging level and the indirect-conversion switch are read from
       the raw arguments before argparse runs;
    3. one sub-parser per conversion is registered and the arguments are
       parsed; on a parsing failure, close converter names are suggested;
    4. informative options (version, dependency report, conversion graph)
       are served, otherwise ``analysis`` is run on every input file.

    :param args: list of command-line arguments without the program name.
        Defaults to ``sys.argv[1:]``. The list given by the caller is not
        modified.
    :raises SystemExit: on usage errors, after the informative options, and
        when a conversion fails without ``raise_exception`` being set.
    """
    registry = Registry()

    if args is None:
        args = sys.argv[1:]
    else:
        # work on a copy: implicit mode inserts the converter name below
        args = list(args)

    # Test the arguments actually in use (not sys.argv) so that calls made
    # with an explicit list behave like the command line.
    if args:

        # check that the first argument is not a converter in the registry
        if args[0].lower() not in list(registry.get_converters_names()) \
                and "." in args[0]:

            # implicit mode needs the input and the output file
            if len(args) < 2:
                _log.error("In implicit mode, you must define your input and output file (only 1 provided)")
                sys.exit(1)

            in_ext = utils.get_extension(args[0], remove_compression=True)
            out_ext = utils.get_extension(args[1], remove_compression=True)

            # Check that the input file exists
            # Fixes https://github.com/bioconvert/bioconvert/issues/204
            if os.path.exists(args[0]) is False:
                _log.error("Input file {} does not exist".format(args[0]))
                sys.exit(1)

            # converter(s) registered for the pair (in_ext, out_ext)
            try:
                converter = registry.get_ext((in_ext, out_ext))
            except KeyError:
                converter = []

            # if no converter is found
            if not converter:
                _log.error(
                    '\n Bioconvert does not support conversion {} -> {}. \n'
                    'Please specify the converter'
                    '\n Usage : \n\n'
                    '\t bioconvert converter input_file output_file \n '
                    '\n To see all the converter : '
                    '\n \t bioconvert --help '.format(in_ext, out_ext))

                sys.exit(1)
            # if the ext_pair matches a single converter
            elif len(converter) == 1:
                args.insert(0, converter[0].__name__.lower())
            # if the ext_pair matches multiple converters
            else:

                _log.error("Ambiguous extension.\n"
                           "You must specify the right conversion  Please "
                           "choose a conversion from: \n\n"
                           "{}".format("\n".join(
                               [c.__name__.lower() for c in converter])))
                sys.exit(1)

    # Set the default level
    bioconvert.logger.level = "ERROR"

    # Changing the log level before argparse is run. Flags are scanned in
    # this order, so the last one present wins.
    for flag in ("-l", "--level", "-v", "--verbosity"):
        try:
            requested_level = args[args.index(flag) + 1]
        except (ValueError, IndexError):
            # flag absent, or given as last argument without a value
            continue
        try:
            bioconvert.logger.level = requested_level
        except Exception:
            # best effort only: a value rejected by the logger keeps the
            # previous level; argparse validates the option below
            pass

    allow_indirect_conversion = (
        "--allow-indirect-conversion" in args or "-a" in args)

    arg_parser = argparse.ArgumentParser(
        prog="bioconvert",
        description="""Convertor infer the
                                         formats from the first command. We do
                                         not scan the input file. Therefore
                                         users must ensure that their input
                                         format files are properly
                                         formatted.""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Bioconvert contains tens of converters whose list is available as follows:

    bioconvert --help

Each conversion has its own sub-command and dedicated help. For instance:

    bioconvert fastq2fasta --help

Because the subcommand contains the format, extensions are not important
for the conversion itself. This would convert the test.txt file (fastq
format) into a fasta file:

    bioconvert fastq2fasta test.txt test.fasta

Users must ensure that their input format files are properly formatted.

If there is a conversion from A to B and another for B to C, you can also
perform indirect conversion using -a argument (experimental). This command
shows all possible indirect conversions:

    bioconvert --help -a

Please visit http://bioconvert.readthedocs.org for more information about the
project or formats available.

Bioconvert is an open source collaborative project. Please feel free to 
join us at https://github.com/bioconvert/bioconvert
""")

    subparsers = arg_parser.add_subparsers(
        help='sub-command help',
        dest='converter',
    )
    # width of the left column of the sub-command help
    max_converter_width = 2 + max(
        [len(in_fmt) for in_fmt, _, _, _ in registry.iter_converters()])

    # one sub-parser per possible conversion
    for in_fmt, out_fmt, converter, path in \
            sorted(registry.iter_converters(allow_indirect_conversion)):

        sub_parser_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())

        if converter:
            # direct conversion
            link_char = '-'
            method_count = len(converter.available_methods)
            if method_count >= 1:
                help_details = " (%i methods)" % method_count
            elif converter._library_to_install is None:
                help_details = " (no available methods please see the doc" \
                               " for install the necessary libraries) "
            else:
                help_details = " (no available methods please install {} \n" \
                               "see the doc for more details) ".format(converter._library_to_install)
        else:
            # indirect conversion: path holds the chain of formats
            link_char = '~'
            if len(path) == 3:
                help_details = " (w/ 1 intermediate)"
            else:
                help_details = " (w/ %i intermediates)" % (len(path) - 2)

        help_text = '{}to{}> {}{}'.format(
            (in_fmt + ' ').ljust(max_converter_width, link_char),
            link_char,
            out_fmt,
            help_details,
        )
        sub_parser = subparsers.add_parser(
            sub_parser_name,
            help=help_text,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            epilog="""Bioconvert is an open source collaborative project. 
Please feel free to join us at https://github.com/bioconvert/bioconvert
""",
        )

        if converter:
            converter.add_argument_to_parser(sub_parser=sub_parser)
        elif path:
            for a in ConvBase.get_common_arguments():
                a.add_to_sub_parser(sub_parser)

    arg_parser.add_argument(
        "-v",
        "--verbosity",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )

    arg_parser.add_argument(
        "--dependency-report",
        action="store_true",
        default=False,
        help="Output all bioconvert dependencies in json and exit")

    arg_parser.add_argument("-a",
                            "--allow-indirect-conversion",
                            action="store_true",
                            help="Show all possible indirect conversions "
                            "(labelled as intermediate) (EXPERIMENTAL)")

    arg_parser.add_argument("--version",
                            action="store_true",
                            default=False,
                            help="Show version")

    arg_parser.add_argument(
        "--conversion-graph",
        nargs="?",
        default=None,
        choices=[
            "cytoscape",
            "cytoscape-all",
        ],
    )

    try:
        # from here on, args is the parsed namespace (on failure it is
        # still the raw list, which the handler below relies on)
        args = arg_parser.parse_args(args)
    except SystemExit as e:
        # parsing ask to stop, maybe a normal exit
        if e.code == 0:
            raise
        # Parsing failed, trying to guess converter
        from bioconvert.core.levenshtein import wf_levenshtein as lev

        # The sub-command is the first argument that is neither an option
        # nor the value of an option taking one. '--verbosity' is the
        # actual option name; '--verbose' is still tolerated.
        options_with_value = ('-v', '--verbosity', '--verbose',
                              '--conversion-graph')
        sub_command = None
        for position, candidate in enumerate(args):
            if candidate.startswith('-'):
                continue
            if position > 0 and args[position - 1] in options_with_value:
                continue
            sub_command = candidate
            break

        if sub_command is None:
            # No sub_command found, so letting the initial exception be risen
            raise

        conversions = []
        for in_fmt, out_fmt, converter, path in registry.iter_converters(
                allow_indirect_conversion):
            conversion_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())
            conversions.append((lev(conversion_name,
                                    sub_command), conversion_name))
        matches = sorted(conversions)[:5]
        if matches[0][0] == 0:
            # sub_command was ok, problem comes from elsewhere
            raise
        arg_parser.exit(
            e.code, '\n\nYour converter {}() was not found. \n'
            'Here is a list of possible matches: {} ... '
            '\nYou may also add the -a argument to enfore a '
            'transitive conversion. The whole list is available using\n\n'
            '    bioconvert --help -a \n'.format(
                sub_command, ', '.join([v for _, v in matches])))

    if args.version:
        print("{}".format(bioconvert.version))
        sys.exit(0)

    if args.dependency_report:
        print(
            json.dumps(
                get_known_dependencies_with_availability(as_dict=True),
                sort_keys=True,
                indent=4,
            ))
        sys.exit(0)

    if args.conversion_graph:
        if args.conversion_graph.startswith("cytoscape"):
            all_converter = args.conversion_graph == "cytoscape-all"
            print(
                json.dumps(
                    graph.create_graph_for_cytoscape(
                        all_converter=all_converter),
                    indent=4,
                ))
        sys.exit(0)

    if args.converter is None:
        msg = 'No converter specified. You can list converter by doing bioconvert --help'
        arg_parser.error(msg)

    if not (getattr(args, "show_methods", False) or args.input_file):
        arg_parser.error('Either specify an input_file (<INPUT_FILE>) or '
                         'ask for available methods (--show-method)')

    if not args.allow_indirect_conversion and \
        ConvMeta.split_converter_to_format(args.converter) not in registry:

        arg_parser.error('The conversion {} is not available directly, '
                         'you have to accept that we chain converter to do'
                         ' so (--allow-indirect-conversion or -a)'.format(
                             args.converter))

    # DEBUG verbosity implies that exceptions are re-raised
    args.raise_exception = args.raise_exception or args.verbosity == "DEBUG"

    # Set the logging level
    bioconvert.logger.level = args.verbosity

    # Figure out whether we have several input files or not
    # Are we in batch mode ?
    import glob
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    for filename in filenames:
        args.input_file = filename
        try:
            analysis(args)
        except Exception as e:
            if args.raise_exception:
                raise
            bioconvert.logger.error(e)
            sys.exit(1)