Beispiel #1
0
def main(args=None):
    """Command-line entry point of bioconvert.

    The function works in three stages:

    1. *Implicit mode*: when the first argument is not a registered converter
       name and looks like a filename (it contains a dot), the converter is
       inferred from the extensions of the first two arguments and inserted
       in front of the argument list.
    2. The argument parser is built with one sub-command per conversion known
       to the registry (indirect conversions included when ``-a`` or
       ``--allow-indirect-conversion`` is present) and the arguments are
       parsed. If parsing fails, close matches for the requested sub-command
       are suggested.
    3. Informational options (``--version``, ``--dependency-report``,
       ``--conversion-graph``) are served, otherwise :func:`analysis` is
       called once per input file (several files in batch mode).

    :param list args: command-line arguments without the program name.
        Defaults to ``sys.argv[1:]``.
    :raises SystemExit: on any usage error, after informational options, or
        when a conversion fails and exceptions are not re-raised.
    """
    registry = Registry()

    if args is None:
        args = sys.argv[1:]

    # Implicit mode. The test is made on the effective argument list (not on
    # sys.argv) so that programmatic calls such as main(["a.fastq", "b.fasta"])
    # behave like the command line and an empty list is never indexed.
    if args:

        # check that the first argument is not a converter in the registry
        if args[0].lower() not in list(registry.get_converters_names()) \
                and "." in args[0]:

            # Both an input and an output file are required to infer the
            # converter; fail cleanly instead of raising an IndexError.
            if len(args) < 2:
                _log.error(
                    "An input file and an output file are required to "
                    "infer the converter from the extensions")
                sys.exit(1)

            in_ext = utils.get_extension(args[0], remove_compression=True)
            out_ext = utils.get_extension(args[1], remove_compression=True)

            # Check that the input file exists
            # Fixes https://github.com/bioconvert/bioconvert/issues/204
            if os.path.exists(args[0]) is False:
                _log.error("Input file {} does not exist".format(args[0]))
                sys.exit(1)

            # converter(s) registered for the pair (in_ext, out_ext)
            try:
                converter = registry.get_ext((in_ext, out_ext))
            except KeyError:
                converter = []

            # if no converter is found
            if not converter:
                _log.error(
                    '\n Bioconvert does not support conversion {} -> {}. \n'
                    'Please specify the converter'
                    '\n Usage : \n\n'
                    '\t bioconvert converter input_file output_file \n '
                    '\n To see all the converter : '
                    '\n \t bioconvert --help '.format(in_ext, out_ext))

                sys.exit(1)
            # if the ext_pair matches a single converter
            elif len(converter) == 1:
                args.insert(0, converter[0].__name__.lower())
            # if the ext_pair matches multiple converters
            else:

                _log.error("Ambiguous extension.\n"
                           "You must specify the right conversion  Please "
                           "choose a conversion from: \n\n"
                           "{}".format("\n".join(
                               [c.__name__.lower() for c in converter])))
                sys.exit(1)

    # Set the default level
    bioconvert.logger.level = "ERROR"

    # Changing the log level before argparse is run. Flags are scanned in
    # this order so that a later flag overrides an earlier one.
    for flag in ("-l", "--level", "-v", "--verbosity"):
        if flag not in args:
            continue
        value_index = args.index(flag) + 1
        if value_index >= len(args):
            # flag given without a value: argparse reports it later on
            continue
        try:
            bioconvert.logger.level = args[value_index]
        except Exception:
            # Best effort only: an invalid level name is reported by
            # argparse once the arguments are actually parsed.
            pass

    allow_indirect_conversion = (
        "--allow-indirect-conversion" in args or "-a" in args)

    arg_parser = argparse.ArgumentParser(
        prog="bioconvert",
        description="""Convertor infer the
                                         formats from the first command. We do
                                         not scan the input file. Therefore
                                         users must ensure that their input
                                         format files are properly
                                         formatted.""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Bioconvert contains tens of converters whose list is available as follows:

    bioconvert --help

Each conversion has its own sub-command and dedicated help. For instance:

    bioconvert fastq2fasta --help

Because the subcommand contains the format, extensions are not important
for the conversion itself. This would convert the test.txt file (fastq
format) into a fasta file:

    bioconvert fastq2fasta test.txt test.fasta

Users must ensure that their input format files are properly formatted.

If there is a conversion from A to B and another for B to C, you can also
perform indirect conversion using -a argument (experimental). This command
shows all possible indirect conversions:

    bioconvert --help -a

Please visit http://bioconvert.readthedocs.org for more information about the
project or formats available.

Bioconvert is an open source collaborative project. Please feel free to 
join us at https://github.com/bioconvert/bioconvert
""")

    subparsers = arg_parser.add_subparsers(
        help='sub-command help',
        dest='converter',
    )
    # width used to align the "<input> --to-> <output>" help entries
    max_converter_width = 2 + max(
        [len(in_fmt) for in_fmt, _, _, _ in registry.iter_converters()])

    # show all possible conversion
    for in_fmt, out_fmt, converter, path in \
            sorted(registry.iter_converters(allow_indirect_conversion)):

        sub_parser_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())

        if converter:
            # direct conversion
            link_char = '-'
            n_methods = len(converter.available_methods)
            if n_methods >= 1:
                help_details = " (%i methods)" % n_methods
            elif converter._library_to_install is None:
                help_details = " (no available methods please see the doc" \
                               " for install the necessary libraries) "
            else:
                help_details = " (no available methods please install {} \n" \
                               "see the doc for more details) ".format(converter._library_to_install)
        else:  #if path:
            # indirect conversion going through intermediate formats
            link_char = '~'
            if len(path) == 3:
                help_details = " (w/ 1 intermediate)"
            else:
                help_details = " (w/ %i intermediates)" % (len(path) - 2)

        help_text = '{}to{}> {}{}'.format(
            (in_fmt + ' ').ljust(max_converter_width, link_char),
            link_char,
            out_fmt,
            help_details,
        )
        sub_parser = subparsers.add_parser(
            sub_parser_name,
            help=help_text,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            # aliases=["{}_to_{}".format(in_fmt.lower(), out_fmt.lower()), ],
            epilog="""Bioconvert is an open source collaborative project. 
Please feel free to join us at https://github.com/bioconvert/bioconvert
""",
        )

        if converter:
            converter.add_argument_to_parser(sub_parser=sub_parser)
        elif path:
            for a in ConvBase.get_common_arguments():
                a.add_to_sub_parser(sub_parser)

    arg_parser.add_argument(
        "-v",
        "--verbosity",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )

    arg_parser.add_argument(
        "--dependency-report",
        action="store_true",
        default=False,
        help="Output all bioconvert dependencies in json and exit")

    arg_parser.add_argument("-a",
                            "--allow-indirect-conversion",
                            action="store_true",
                            help="Show all possible indirect conversions "
                            "(labelled as intermediate) (EXPERIMENTAL)")

    arg_parser.add_argument("--version",
                            action="store_true",
                            default=False,
                            help="Show version")

    arg_parser.add_argument(
        "--conversion-graph",
        nargs="?",
        default=None,
        choices=[
            "cytoscape",
            "cytoscape-all",
        ],
    )

    try:
        args = arg_parser.parse_args(args)
    except SystemExit as e:
        # parsing ask to stop, maybe a normal exit
        if e.code == 0:
            raise
        # Parsing failed, trying to guess converter. Note that "args" is
        # still the raw list here since parse_args did not return.
        from bioconvert.core.levenshtein import wf_levenshtein as lev

        # Options followed by a value: that value must not be mistaken for
        # the sub-command.
        value_options = ('-v', '--verbosity', '-l', '--level',
                         '--conversion-graph')
        sub_command = None
        for position, token in enumerate(args):
            if token.startswith('-'):
                continue
            if position == 0 or args[position - 1] not in value_options:
                sub_command = token
                break

        if sub_command is None:
            # No sub_command found, so letting the initial exception be risen
            raise

        # rank every known conversion by edit distance to the request
        conversions = []
        for in_fmt, out_fmt, converter, path in registry.iter_converters(
                allow_indirect_conversion):
            conversion_name = "{}2{}".format(in_fmt.lower(), out_fmt.lower())
            conversions.append((lev(conversion_name,
                                    sub_command), conversion_name))
        matches = sorted(conversions)[:5]
        if matches and matches[0][0] == 0:
            # sub_command was ok, problem comes from elswhere
            raise
        arg_parser.exit(
            e.code, '\n\nYour converter {}() was not found. \n'
            'Here is a list of possible matches: {} ... '
            '\nYou may also add the -a argument to enfore a '
            'transitive conversion. The whole list is available using\n\n'
            '    bioconvert --help -a \n'.format(
                sub_command, ', '.join([v for _, v in matches])))

    if args.version:
        print("{}".format(bioconvert.version))
        sys.exit(0)

    if args.dependency_report:
        print(
            json.dumps(
                get_known_dependencies_with_availability(as_dict=True),
                sort_keys=True,
                indent=4,
            ))
        sys.exit(0)

    if args.conversion_graph:
        if args.conversion_graph.startswith("cytoscape"):
            all_converter = args.conversion_graph == "cytoscape-all"
            print(
                json.dumps(
                    graph.create_graph_for_cytoscape(
                        all_converter=all_converter),
                    indent=4,
                ))
        sys.exit(0)

    if args.converter is None:
        msg = 'No converter specified. You can list converter by doing bioconvert --help'
        arg_parser.error(msg)

    if not (getattr(args, "show_methods", False) or args.input_file):
        arg_parser.error('Either specify an input_file (<INPUT_FILE>) or '
                         'ask for available methods (--show-method)')

    if not args.allow_indirect_conversion and \
        ConvMeta.split_converter_to_format(args.converter) not in registry:

        arg_parser.error('The conversion {} is not available directly, '
                         'you have to accept that we chain converter to do'
                         ' so (--allow-indirect-conversion or -a)'.format(
                             args.converter))

    # In DEBUG mode, errors are re-raised so that the traceback is visible
    args.raise_exception = args.raise_exception or args.verbosity == "DEBUG"

    # Set the logging level
    bioconvert.logger.level = args.verbosity

    # Figure out whether we have several input files or not
    # Are we in batch mode ?
    import glob
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    for filename in filenames:
        args.input_file = filename
        try:
            analysis(args)
        except Exception as e:
            if args.raise_exception:
                raise e
            else:
                bioconvert.logger.error(e)
            sys.exit(1)
Beispiel #2
0
    def __init__(self, infile, outfile, in_fmt=None, out_fmt=None, force=False):
        """.. rubric:: constructor

        :param str infile: The path of the input file.
        :param str outfile: The path of The output file
        :param str in_fmt: the format for the input file. If not provided,
            it is inferred from the extension of *infile*.
        :param str out_fmt: the format for the output. If not provided,
            it is inferred from the extension of *outfile*.
        :param bool force: overwrite output file if it exists already
            otherwise raises an error
        :raises ValueError: if *outfile* exists and *force* is False
        :raises IOError: if *outfile* ends with ``.dsrc`` but not with
            ``.fastq.dsrc``
        :raises Exception: if neither a direct converter nor a chain of
            converters is available for the requested formats

        """
        # The input file is deliberately not checked here: for some
        # converters the input parameter is just a prefix.

        # check existence of output file. If it exists,
        # fails except if force argument is set to True
        if os.path.exists(outfile):
            msg = "output file {} exists already".format(outfile)
            _log.warning("output file exists already")
            if force is False:
                _log.critical("output file exists. If you are using bioconvert, use --force ")
                raise ValueError(msg)
            else:
                _log.warning("output file will be overwritten")

        # Only fastq files can be compressed with dsrc:
        # dsrc accepts only the .fastq file extension
        if outfile.endswith(".dsrc") and not outfile.endswith(".fastq.dsrc"):
            msg = ("When compressing with .dsrc extension, "
                   "only files ending with .fastq extension are "
                   "accepted. This is due to the way dsrc executable "
                   "is implemented.")
            _log.critical(msg)
            raise IOError(msg)

        # Case1: fastq.gz to fasta.bz2
        # Here, we want to decompress, convert, compress.
        # so we need the extension without .gz or .bz2
        # We should have inext set to fastq and outext
        # set to fasta.bz2
        self.inext = getext(infile, remove_compression=True)
        self.outext = getext(outfile, remove_compression=True)

        # Case 2, fastq.gz to fastq.bz2
        # data is not changed, just the type of compression, so we want
        # to keep the original extensions, here inext and outext  will contain
        # .gz and .bz2
        if self.inext == self.outext:
            _log.info("decompression/compression mode")
            self.inext = getext(infile)
            self.outext = getext(outfile)

        self.mapper = Registry()

        # Resolve the formats before the registry lookup so that the
        # attributes are always set when the KeyError handler below uses them.
        if in_fmt is None:
            in_fmt = get_format_from_extension(self.inext)
        if out_fmt is None:
            out_fmt = get_format_from_extension(self.outext)
        self.in_fmt = in_fmt.upper()
        self.out_fmt = out_fmt.upper()
        _log.info("Input: %s", self.in_fmt)
        _log.info("Output: %s", self.out_fmt)

        # From the input and output formats, we get the converter class
        try:
            class_converter = self.mapper[(self.in_fmt, self.out_fmt)]
        except KeyError:
            # No direct converter registered for this pair of formats.
            # Try to find path of converters
            conv_path = self.mapper.conversion_path(self.in_fmt, self.out_fmt)
            _log.debug("path: {}".format(conv_path))
            if conv_path:
                _log.info("Direct conversion not implemented. "
                          "Chaining converters.")
                # implemented in bioconvert/core/base.py
                # using temporary files
                class_converter = make_chain([
                    (pair, self.mapper[pair]) for pair in conv_path])
            else:
                msg = "Requested input format ('%s') to output format ('%s') is not available in bioconvert" %(
                    self.in_fmt,
                    self.out_fmt,
                )
                _log.critical(msg)
                _log.critical("Use --formats to know the available formats and --help for examples")
                raise Exception(msg)
        else:
            # the name attribute is only set for direct conversions
            self.name = class_converter.__name__

        self.converter = class_converter(infile, outfile)
        _log.info("Using {} class".format(self.converter.name))
Beispiel #3
0
def analysis(args):
    """Run the conversion (or the benchmark) requested on the command line.

    :param args: the namespace of parsed arguments. The attributes read here
        are ``converter``, ``input_file``, ``output_file``, ``force``,
        ``raise_exception``, ``benchmark``, ``benchmark_N`` and, optionally,
        ``show_methods``. All of them are also forwarded as keyword arguments
        to the converter when a conversion is performed.
    :raises RuntimeError: if the input or output format of the converter
        could not be determined.
    :raises SystemExit: after printing the available methods (unless
        ``raise_exception`` is set), or if the input file does not exist.
    """
    in_fmt, out_fmt = ConvMeta.split_converter_to_format(args.converter)

    # do we want to know the available methods ? If so, print info and quit
    if getattr(args, "show_methods", False):
        class_converter = Registry()[(in_fmt, out_fmt)]
        print("Methods available: {}".format(
            class_converter.available_methods))
        print("\nPlease see http://bioconvert.readthedocs.io/en/master/"
              "references.html#{} for details ".format(
                  str(class_converter).split("'")[1]))
        if args.raise_exception:
            return
        sys.exit(0)

    # Input and output filename
    infile = args.input_file

    # Check that the input file exists
    # Fixes https://github.com/bioconvert/bioconvert/issues/204
    if os.path.exists(infile) is False:

        # Some convertors uses prefix instead of filename. We could have
        # ambiguities: if we use a prefix without extension,
        # we could be confused with the convertor name. This is true
        # for the plink families
        if "plink" not in args.converter:
            _log.error(
                "Input file {} does not exist (analysis)".format(infile))
            sys.exit(1)

    if args.output_file is None and infile:
        # Default output name: the input name with its last extension
        # replaced by the output format.
        outfile = infile.rsplit(".", 1)[0] + "." + out_fmt.lower()
    else:
        outfile = args.output_file

    # Call a generic wrapper of all available conversion
    conv = Bioconvert(
        infile,
        outfile,
        in_fmt=in_fmt,
        out_fmt=out_fmt,
        force=args.force,
    )

    if not conv.in_fmt:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the input file name or use"
                           " --input-format option.")

    if not conv.out_fmt:
        raise RuntimeError("convert infer the format from the extension name."
                           " So add extension to the output file name or use"
                           " --output-format option.")

    bioconvert.logger.info("Converting from {} to {}".format(
        conv.in_fmt, conv.out_fmt))

    if args.benchmark:
        conv.boxplot_benchmark(N=args.benchmark_N)
        import pylab

        # The wrapper only has a name for direct conversions; otherwise
        # fall back on the name of the underlying converter.
        try:
            benchmark_name = conv.name
        except AttributeError:
            benchmark_name = conv.converter.name
        outpng = "benchmark_{}.png".format(benchmark_name)
        pylab.savefig(outpng, dpi=200)
        bioconvert.logger.info("File {} created".format(outpng))
    else:
        # every parsed argument is forwarded to the converter
        conv(**vars(args))
Beispiel #4
0
# along with this program (COPYING file).                                 #
# If not, see <http://www.gnu.org/licenses/>.                             #
###########################################################################
"""
Available methods per converter
=====================================

Plot number of implemented methods per converter.


"""
#################################################
#
from bioconvert.core.registry import Registry

# Collect the per-converter information from the registry
r = Registry()
info = r.get_info()

# The available unique converters, as (key, value) pairs
converters = list(info.items())

# the number of methods per converter (the values of the mapping)
data = list(info.values())

print("Number of converters: {}".format(len(converters)))
print("Number of methods : {}".format(sum(data)))

#####################################################
from pylab import hist, clf, xlabel, grid

# clear the current figure before plotting
clf()
def test_rgistry():
    """Smoke test: build a registry, call ``info`` and print the instance."""
    registry = Registry()
    registry.info()
    print(registry)
Beispiel #6
0
def create_graph(filename, layout="dot", use_singularity=False, color_for_disabled_converter='red'):
    """Draw the graph of all conversions known to the registry.

    :param filename: should end in .png or .svg or .dot
    :param layout: layout name given to pygraphviz
    :param use_singularity: if True, skip pygraphviz and call the ``dot``
        executable through a downloaded singularity image
    :param color_for_disabled_converter: edge colour used (pygraphviz only)
        for conversions whose status flag is false
    :raises ValueError: if the ``dot`` executable is needed and *filename*
        has no extension from which the output format can be derived

    If extension is .dot, only the dot file is created.
    This is useful if you have issues installing graphviz.
    If so, under Linux you could use our singularity container
    see github.com/cokelaer/graphviz4all

    pygraphviz is tried first (unless a .dot file or singularity is
    requested). If it fails for any reason, the error is logged and the
    function falls back on writing a dot file and calling the ``dot``
    executable.
    """
    from bioconvert.core.registry import Registry
    rr = Registry()

    if not (filename.endswith(".dot") or use_singularity is True):
        try:
            from pygraphviz import AGraph
            dg = AGraph(directed=True)

            for a, b, s in rr.get_all_conversions():
                dg.add_edge(a, b, color='black' if s else color_for_disabled_converter)

            dg.layout(layout)
            dg.draw(filename)
            return
        except Exception as e:
            # pygraphviz missing or failing: log and use the dot executable
            _log.error(e)

    # The output format given to "dot -T" is the extension of the filename
    ext = filename.rsplit(".", 1)[-1] if "." in filename else ""
    if not ext:
        raise ValueError(
            "filename must have an extension such as .png, .svg or .dot "
            "(got {})".format(filename))

    # Build the dot description by hand
    conversions = list(rr.get_all_conversions())
    nodes = {source for source, _, _ in conversions}

    chunks = ['\nstrict digraph{\n    node [label="\\N"];\n\n    ']
    chunks.extend("\"{}\";\n".format(node) for node in nodes)
    chunks.extend("\"{}\" -> \"{}\";\n".format(a, b) for a, b, _ in conversions)
    chunks.append("}\n")
    dot = "".join(chunks)

    from easydev import TempFile
    from bioconvert import shell

    # The context manager removes the temporary dot file once it is rendered
    with TempFile(suffix=".dot") as dotfile:
        with open(dotfile.name, "w") as fout:
            fout.write(dot)

        dotpath = ""
        if use_singularity:
            from bioconvert.core.downloader import download_singularity_image
            singfile = download_singularity_image(
                "graphviz.simg",
                "shub://cokelaer/graphviz4all:v1",
                "4288088d91c848e5e3a327282a1ab3d1")

            dotpath = "singularity run {} ".format(singfile)
            # On ReadTheDocs, the plain dot executable is used instead
            on_rtd = environ.get('READTHEDOCS', None) == 'True'
            if on_rtd:
                dotpath = ""

        cmd = "{}dot -T{} {} -o {}".format(dotpath, ext, dotfile.name, filename)
        try:
            shell(cmd)
        except Exception:
            # last resort: run the command through the system shell
            import os
            os.system(cmd)
Beispiel #7
0
def main(args=None):

    # used later on
    registry = Registry()

    if args is None:
        args = sys.argv[1:]

    # convenient variable to check implicit/explicit mode and
    # get information about the arguments.
    ph = ParserHelper(args)

    if not len(sys.argv) == 1:

        if ph.mode == "implicit":

            # Check that the input file exists
            # Fixes https://github.com/bioconvert/bioconvert/issues/204
            if os.path.exists(args[0]) is False:
                _log.error("First input file {} does not exist".format(
                    args[0]))
                sys.exit(1)

            # list of filenames from which we get the extensions
            filenames = ph.get_filelist()
            exts = [
                utils.get_extension(x, remove_compression=True)
                for x in filenames
            ]

            # We need to get the corresponding converter if any.

            # We assume that the input formats are ordered alphabetically
            # (bioconvert API).
            # For instance fasta,qual to fastq can be
            # found but qual,fasta to fastq cannot. Indeed, in more complex
            # cases such as a,b -> c,d we cannot know whether there are 1 or 3
            # inputs. This would require extra code here below
            try:
                L = len(exts)
                converter = []
                # if input is a,b,c,d we want to try a->(b,c,d) and
                # (a,b)->(c,d) and (a,b,c)-> c so L-1 case
                for i in range(1, L):
                    in_ext = tuple(exts[0:i])
                    out_ext = tuple(exts[i:])
                    try:
                        converter.extend(registry.get_ext((in_ext, out_ext)))
                    except KeyError:
                        pass
            except KeyError:
                converter = []

            # For 1-to-1, if the extensions are identical but different
            # compression, this means we just want to decompress and
            # re-compress in another format.
            if not converter and (exts[0] == exts[1]):
                exts_with_comp = [
                    utils.get_extension(x, remove_compression=False)
                    for x in filenames
                ]
                in_ext, out_ext = exts_with_comp[0], exts_with_comp[1]
                comps = ['gz', 'dsrc', 'bz2']
                if in_ext in comps and out_ext in comps:
                    converter.extend(
                        registry.get_ext(((in_ext, ), (out_ext, ))))

            # if no converter is found, print information
            if not converter:
                msg = '\nBioconvert does not support conversion {} -> {}. \n\n'
                msg = msg.format(in_ext, out_ext)

                # maybe it is an indirect conversion ? let us look at the
                # digraph
                try:
                    _path = registry._path_dict_ext[in_ext][out_ext]
                    #Here, we have a transitive list of tuples to go from A to C
                    # example from fq to clustal returns:
                    # [('fq',), ('fa',), ('clustal',)]
                    # If we naively build the converter from those names
                    # (fq2clustal), this is a non official converter name. The
                    # official one is fastq2clustal, so we need some hack here:
                    in_name, int_name, out_name = _path
                    a = registry._ext_registry[
                        in_name, int_name][0].__name__.split("2")[0]
                    b = registry._ext_registry[
                        int_name, out_name][0].__name__.split("2")[1]

                    convname = "2".join([a, b]).lower()

                    msg += "\n".join(
                        textwrap.wrap(
                            "Note, however, that an indirect conversion through"
                            " an intermediate format is possible for your input and "
                            " output format. To do so, you need to use the -a option "
                            " and be explicit about the type of conversion. To get "
                            " the list of possible direct and indirect conversion, "
                            " please use:\n\n"))
                    msg += "\n\n    bioconvert --help -a\n\n"
                    msg += "For help and with your input/output most probably"
                    msg += "the command should be: \n\n    bioconvert {} {} -a\n\n ".format(
                        convname, " ".join(ph.get_filelist()))
                except KeyError:
                    pass  # not converter found in the path
                error(msg)

            # if the ext_pair matches a single converter
            elif len(converter) == 1:
                args.insert(0, converter[0].__name__.lower())
            # if the ext_pair matches multiple converters
            else:
                _log.error("Ambiguous extension.\n"
                           "You must specify the right conversion  Please "
                           "choose a conversion from: \n\n"
                           "{}".format("\n".join(
                               [c.__name__.lower() for c in converter])))
                sys.exit(1)

    # Set the default level
    bioconvert.logger.level = "ERROR"

    # Changing the log level before argparse is run
    try:
        bioconvert.logger.level = args[args.index("-l") + 1]
    except:
        pass
    try:
        bioconvert.logger.level = args[args.index("--level") + 1]
    except:
        pass
    try:
        bioconvert.logger.level = args[args.index("-v") + 1]
    except:
        pass
    try:
        bioconvert.logger.level = args[args.index("--verbosity") + 1]
    except:
        pass

    # if there is the ability to convert from A to B to C, we must set
    # the option -a (--allow_indirect_conversion)
    allow_indirect_conversion = False

    try:
        args.index("--allow-indirect-conversion")
        allow_indirect_conversion = True
    except:
        pass

    try:
        args.index("-a")
        allow_indirect_conversion = True
    except:
        pass

    # Now, the instanciation of the main bioconvert user interface
    arg_parser = argparse.ArgumentParser(
        prog="bioconvert",
        description="",
        #""Convertor infer the
        #formats from the first command. We do
        #not scan the input file. Therefore
        #users must ensure that their input
        #format files are properly
        #formatted.""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Bioconvert contains tens of converters whose list is available as follows:

    bioconvert --help

Each conversion has its own sub-command and dedicated help. For instance:

    bioconvert fastq2fasta --help

Because the subcommand contains the format, extensions are not important
for the conversion itself. This would convert the test.txt file (fastq
format) into a fasta file:

    bioconvert fastq2fasta test.txt test.fasta

If you use known extensions, the converter may be omitted::

    bioconvert test.fastq test.fasta

Users must ensure that their input format files are properly formatted.

If there is a conversion from A to B and another for B to C, you can also
perform indirect conversion using -a argument (experimental). This command
shows all possible indirect conversions:

    bioconvert --help -a

Please visit http://bioconvert.readthedocs.org for more information about the
project or formats available. Would you wish to help, please join our open 
source collaborative project at https://github/bioconvert/bioconvert
""")

    subparsers = arg_parser.add_subparsers(
        help='sub-command help',
        dest='converter',
    )

    max_converter_width = 2 + max(
        [len(in_fmt) for in_fmt, _, _, _ in registry.iter_converters()])

    def sorting_tuple_string(item):
        if type(item) is tuple:
            return item[0][0]
        if type(item) is str:
            return item[0]

    # show all possible conversion including indirect conversion
    for in_fmt, out_fmt, converter, path in \
            sorted(registry.iter_converters(allow_indirect_conversion), key=sorting_tuple_string):
        in_fmt = ConvBase.lower_tuple(in_fmt)
        in_fmt = ["_".join(in_fmt)]

        out_fmt = ConvBase.lower_tuple(out_fmt)
        out_fmt = ["_".join(out_fmt)]

        sub_parser_name = "{}2{}".format("_".join(in_fmt), "_".join(out_fmt))

        if converter:
            link_char = '-'
            if len(converter.available_methods) < 1:
                help_details = " (no available methods please see the doc" \
                               " for install the necessary libraries) "
            else:
                help_details = " (%i methods)" % len(
                    converter.available_methods)
        else:  #if path:
            link_char = '~'
            if len(path) == 3:
                help_details = " (w/ 1 intermediate)"
            else:
                help_details = " (w/ %i intermediates)" % (len(path) - 2)

        help_text = '{}to{}> {}{}'.format(
            ("_".join(in_fmt) + ' ').ljust(max_converter_width, link_char),
            link_char,
            ("_".join(out_fmt)),
            help_details,
        )
        sub_parser = subparsers.add_parser(
            sub_parser_name,
            help=help_text,
            formatter_class=argparse.ArgumentDefaultsHelpFormatter,
            # aliases=["{}_to_{}".format(in_fmt.lower(), out_fmt.lower()), ],
            epilog="""Bioconvert is an open source collaborative project. 
Please feel free to join us at https://github/biokit/bioconvert
""",
        )
        if converter:
            converter.add_argument_to_parser(sub_parser=sub_parser)
        elif path:
            for a in ConvBase.get_IO_arguments():
                a.add_to_sub_parser(sub_parser)
            for a in ConvBase.get_common_arguments():
                a.add_to_sub_parser(sub_parser)

    # arguments when no explicit conversion provided.

    arg_parser.add_argument(
        "-v",
        "--verbosity",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity.",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )

    arg_parser.add_argument(
        "-l",
        "--level",
        default=bioconvert.logger.level,
        help="Set the outpout verbosity. Same as --verbosity",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    )

    arg_parser.add_argument(
        "--dependency-report",
        action="store_true",
        default=False,
        help="Output all bioconvert dependencies in json and exit")

    arg_parser.add_argument("-a",
                            "--allow-indirect-conversion",
                            action="store_true",
                            help="Show all possible indirect conversions "
                            "(labelled as intermediate)")

    arg_parser.add_argument("--version",
                            action="store_true",
                            default=False,
                            help="Show version")

    arg_parser.add_argument(
        "--conversion-graph",
        nargs="?",
        default=None,
        choices=[
            "cytoscape",
            "cytoscape-all",
        ],
    )

    try:
        args = arg_parser.parse_args(args)
    except SystemExit as e:
        # parsing ask to stop, maybe a normal exit
        if e.code == 0:
            raise e

        # Parsing failed, trying to guess converter
        from bioconvert.core.levenshtein import wf_levenshtein as lev

        sub_command = None
        args_i = 0
        while sub_command is None and args_i < len(args):
            if args[args_i][0] != '-' and (
                    args_i == 0 or args[args_i - 1] != '-v'
                    and args[args_i - 1] != '--verbose'
                    and args[args_i - 1] != '--conversion-graph'):
                sub_command = args[args_i]
            args_i += 1

        if sub_command is None:
            # No sub_command found, so letting the initial exception be risen
            raise e

        conversions = []
        for in_fmt, out_fmt, converter, path in registry.iter_converters(
                allow_indirect_conversion):
            in_fmt = ConvBase.lower_tuple(in_fmt)
            in_fmt = ["_".join(in_fmt)]
            out_fmt = ConvBase.lower_tuple(out_fmt)
            out_fmt = ["_".join(out_fmt)]
            conversion_name = "{}2{}".format("_".join(in_fmt),
                                             "_".join(out_fmt))
            conversions.append((lev(conversion_name,
                                    sub_command), conversion_name))
        matches = sorted(conversions)[:5]
        if matches[0][0] == 0:
            # sub_command was ok, problem comes from elswhere
            raise e
        arg_parser.exit(
            e.code, '\n\nYour converter {}() was not found. \n'
            'Here is a list of possible matches: {} ... '
            '\nYou may also add the -a argument to enfore a '
            'transitive conversion. The whole list is available using\n\n'
            '    bioconvert --help -a \n'.format(
                sub_command, ', '.join([v for _, v in matches])))

    if args.version:
        print("{}".format(bioconvert.version))
        sys.exit(0)

    if args.dependency_report:
        print(
            json.dumps(
                get_known_dependencies_with_availability(as_dict=True),
                sort_keys=True,
                indent=4,
            ))
        sys.exit(0)

    if args.conversion_graph:
        if args.conversion_graph.startswith("cytoscape"):
            all_converter = args.conversion_graph == "cytoscape-all"
            print(
                json.dumps(
                    graph.create_graph_for_cytoscape(
                        all_converter=all_converter),
                    indent=4,
                ))
        sys.exit(0)

    if args.converter is None:
        msg = "No converter specified. "
        msg += "You can list all converters by using:\n\n\tbioconvert --help"
        arg_parser.error(msg)

    if not (getattr(args, "show_methods", False) or args.input_file):
        arg_parser.error('Either specify an input_file (<INPUT_FILE>) or '
                         'ask for available methods (--show-method)')

    if not args.allow_indirect_conversion and \
        ConvMeta.split_converter_to_format(args.converter) not in registry:

        arg_parser.error('The conversion {} is not available directly, '
                         'you have to accept that we chain converter to do'
                         ' so (--allow-indirect-conversion or -a)'.format(
                             args.converter))

    args.raise_exception = args.raise_exception or args.verbosity == "DEBUG"

    # Set the logging level
    bioconvert.logger.level = args.verbosity

    # Figure out whether we have several input files or not
    # Are we in batch mode ?
    if args.batch:
        filenames = glob.glob(args.input_file)
    else:
        filenames = [args.input_file]

    N = len(filenames)
    for i, filename in enumerate(filenames):
        if N > 1:
            _log.info("Converting {} ({}/{})".format(filename, i + 1, N))
        args.input_file = filename
        try:
            analysis(args)
        except Exception as e:
            if args.raise_exception:
                raise e
            else:
                bioconvert.logger.error(e)
            sys.exit(1)
Beispiel #8
0
def analysis(args):
    """Run the conversion (or benchmark) described by parsed CLI options.

    :param args: namespace of parsed command-line options. The attributes
        read here are ``converter``, ``input_file``, ``output_file``,
        ``force``, ``raise_exception`` and ``benchmark`` (plus
        ``benchmark_N`` and ``benchmark_methods`` when benchmarking).
        ``show_methods``, ``threads`` and ``extra_arguments`` are optional.
        The whole namespace is also forwarded to the converter through
        ``vars(args)``.

    The process exits with status 1 when an input file given as a tuple is
    missing (except for plink converters), and with status 0 after listing
    the available methods unless ``args.raise_exception`` is set.
    """
    formats = ConvMeta.split_converter_to_format(args.converter)
    in_fmt, out_fmt = formats

    # do we want to know the available methods ? If so, print info and quit
    if getattr(args, "show_methods", False):
        class_converter = Registry()[(in_fmt, out_fmt)]
        print("Methods available: {}".format(
            class_converter.available_methods))
        print("\nPlease see http://bioconvert.readthedocs.io/en/master/"
              "references.html#{} for details ".format(
                  str(class_converter).split("'")[1]))
        if args.raise_exception:
            return
        sys.exit(0)

    # Input and output filename
    infile = args.input_file

    # Check that the input files exist
    # Fixes https://github.com/bioconvert/bioconvert/issues/204
    # Some converters use a prefix instead of a filename. A prefix without
    # extension could be confused with the converter name; this is true for
    # the plink families, so no check is made for them.
    if isinstance(infile, tuple) and "plink" not in args.converter:
        for path in infile:
            if not os.path.exists(path):
                _log.error(
                    "Input file {} does not exist (analysis)".format(path))
                sys.exit(1)

    if args.output_file is None and infile:
        # Build the output name from the input name: drop a trailing
        # compression extension, then the format extension, and append the
        # first output format. splitext only looks at the last path
        # component, so dots in directory names (e.g. "./data/x.fastq.gz")
        # no longer truncate the path.
        stem, last_ext = os.path.splitext(infile)
        if last_ext in (".gz", ".dsrc", ".bz2"):
            stem = os.path.splitext(stem)[0]
        outfile = stem + "." + out_fmt[0].lower()

        print(formats, outfile)
    else:
        outfile = args.output_file

    # Optional options: not every sub-parser defines them
    threads = getattr(args, "threads", None)
    # default is "" so that the call below never hits an unbound name
    extra_arguments = getattr(args, "extra_arguments", "")

    # Call a generic wrapper of all available conversion
    conv = Bioconvert(
        infile,
        outfile,
        force=args.force,
        threads=threads,
        extra=extra_arguments)

    if args.benchmark:
        conv.boxplot_benchmark(N=args.benchmark_N,
                               to_include=args.benchmark_methods)

        print(args.benchmark_methods)
        import pylab

        try:
            outpng = "benchmark_{}.png".format(conv.name)
            pylab.savefig(outpng, dpi=200)
        except Exception:
            # fall back on the name carried by the wrapped converter
            outpng = "benchmark_{}.png".format(conv.converter.name)
            pylab.savefig(outpng, dpi=200)
        bioconvert.logger.info("File {} created".format(outpng))
    else:
        conv(**vars(args))
Beispiel #9
0
    def __init__(self, infile, outfile, force=False):
        """.. rubric:: constructor

        :param str infile: The path of the input file.
        :param str outfile: The path of The output file
        :param bool force: overwrite output file if it exists already
            otherwise raises an error
        :raises ValueError: if *infile* does not exist, or if *outfile*
            exists and *force* is False
        :raises IOError: if *outfile* ends with ``.dsrc`` but not with
            ``.fastq.dsrc``

        The process exits with status 1 when no converter is registered for
        the (input, output) extension pair.
        """
        if not os.path.exists(infile):
            msg = "Incorrect input file: %s" % infile
            _log.error(msg)
            raise ValueError(msg)

        # check existence of output file. If it exists,
        # fails except if force argument is set to True
        if os.path.exists(outfile):
            _log.warning("output file exists already")
            if force is False:
                _log.critical(
                    "output file exists. If you are using bioconvert, use --force "
                )
                raise ValueError(
                    "output file {} exists already".format(outfile))
            _log.warning("output file will be overwritten")

        # Only fastq files can be compressed with dsrc:
        # dsrc accepts only the .fastq file extension
        if outfile.endswith(".dsrc") and not outfile.endswith(".fastq.dsrc"):
            msg = ("When compressing with .dsrc extension, "
                   "only files ending with .fastq extension are "
                   "accepted. This is due to the way dsrc executable "
                   "is implemented.")
            _log.critical(msg)
            raise IOError(msg)

        # case1: fastq.gz to fasta.bz2
        # Here, we want to decompress, convert, compress.
        # so we need the extension without .gz or .bz2
        # We should have inext set to fastq and outext
        # set to fasta.bz2
        self.inext = getext(infile, remove_compression=True)
        self.outext = getext(outfile, remove_compression=True)

        # Case 2, fastq.gz to fastq.bz2
        # data is not changed, just the type of compression, so we want
        # to keep the original extensions, here inext and outext  will contain
        # .gz and .bz2
        if self.inext == self.outext:
            _log.info("decompression/compression mode")
            self.inext = getext(infile)
            self.outext = getext(outfile)

        self.mapper = Registry()

        # From the input parameters 1 and 2, we get the module name
        _log.info("Input: {}".format(self.inext))
        _log.info("Output: {}".format(self.outext))
        try:
            class_converter = self.mapper[(self.inext, self.outext)]
        except KeyError:
            # This module name was not found
            msg = "Requested input format ({}) to output format ({}) is not available in bioconvert"
            _log.critical(msg.format(self.inext, self.outext))
            _log.critical(
                "Use --formats to know the available formats and --help for examples"
            )
            sys.exit(1)
        self.name = class_converter.__name__

        self.converter = class_converter(infile, outfile)
        _log.info("Using {} class".format(self.converter.name))
Beispiel #10
0
def create_graph(filename,
                 layout="dot",
                 use_singularity=False,
                 color_for_disabled_converter='red'):
    """Draw the graph of all registered conversions into *filename*.

    :param filename: should end in .png or .svg or .dot
    :param layout: layout name handed to pygraphviz (unused by the
        fallback, which always runs the ``dot`` executable)
    :param use_singularity: skip pygraphviz and run ``dot`` through the
        downloaded graphviz singularity image
    :param color_for_disabled_converter: edge colour used by the pygraphviz
        rendering when the third item of a conversion is false

    If extension is .dot, only the dot file is created.
    This is useful if you have issues installing graphviz.
    If so, under Linux you could use our singularity container
    see github.com/cokelaer/graphviz4all

    pygraphviz is tried first unless *filename* ends in .dot or
    *use_singularity* is True; if it fails, the error is logged and the
    ``dot`` executable is used instead.
    """
    from bioconvert.core.registry import Registry
    rr = Registry()

    # A .dot target or the singularity option bypasses pygraphviz on purpose,
    # so there is nothing to report as an error in that case.
    if not (filename.endswith(".dot") or use_singularity is True):
        try:
            _create_graph_with_pygraphviz(rr, filename, layout,
                                          color_for_disabled_converter)
            return
        except Exception as e:
            # pygraphviz missing or failing: report it, then fall back
            _log.error(e)

    _create_graph_with_dot(rr, filename, use_singularity)


def _create_graph_with_pygraphviz(registry, filename, layout, disabled_color):
    """Render the conversion graph to *filename* through pygraphviz."""
    from pygraphviz import AGraph
    dg = AGraph(directed=True)

    url = "https://bioconvert.readthedocs.io/en/master/formats.html#{}"

    # junction nodes created for conversions involving several formats
    and_nodes = set()

    for a, b, s in registry.get_all_conversions():
        edge_color = "black" if s else disabled_color
        if len(a) == 1 and len(b) == 1:
            for fmt in (a[0], b[0]):
                dg.add_node(fmt,
                            shape="rectangle",
                            style="filled",
                            url=url.format(fmt.upper()))
            dg.add_edge(a[0], b[0], color=edge_color)
        else:
            and_node = "_".join(a) + "_and_" + "_".join(b)
            and_nodes.add(and_node)

            # "style" (not "styled") is the attribute that makes graphviz
            # honour fillcolor
            dg.add_node(and_node,
                        label="",
                        fillcolor="black",
                        width=.1,
                        height=.1,
                        style="filled",
                        fixedsize=True,
                        shape="circle")

            for this in a:
                dg.add_edge(this, and_node, color=edge_color)

            for this in b:
                dg.add_edge(and_node, this, color=edge_color)

    # Colour format nodes by their degree; junction nodes keep the black
    # fill they were created with.
    for name in dg.nodes():
        if str(name) in and_nodes:
            continue
        degree = dg.degree(name)
        if degree < 5:
            fill = "white"
        elif degree < 10:
            fill = "yellow"
        elif degree < 20:
            fill = "orange"
        else:
            fill = "red"
        dg.get_node(name).attr["fillcolor"] = fill

    dg.layout(layout)
    dg.draw(filename)
    dg.write("conversion.dot")


def _create_graph_with_dot(registry, filename, use_singularity):
    """Write the graph as dot text and render it with the dot executable."""

    def label(fmt):
        # formats come as sequences of names; join them rather than
        # printing the repr of a tuple
        return fmt if isinstance(fmt, str) else "_".join(fmt)

    conversions = list(registry.get_all_conversions())

    dot = """
strict digraph{
    node [label="\\N"];

    """
    pieces = [dot]
    for node in sorted({label(a) for a, _, _ in conversions}):
        pieces.append("\"{}\";\n".format(node))
    for a, b, _ in conversions:
        pieces.append("\"{}\" -> \"{}\";\n".format(label(a), label(b)))
    pieces.append("}\n")

    from easydev import TempFile
    from bioconvert import shell
    dotfile = TempFile(suffix=".dot")
    with open(dotfile.name, "w") as fout:
        fout.write("".join(pieces))

    dotpath = ""
    if use_singularity:
        from bioconvert.core.downloader import download_singularity_image
        singfile = download_singularity_image(
            "graphviz.simg", "shub://cokelaer/graphviz4all:v1",
            "4288088d91c848e5e3a327282a1ab3d1")

        dotpath = "singularity run {} ".format(singfile)
        # on ReadTheDocs the plain dot executable is used instead
        on_rtd = environ.get('READTHEDOCS', None) == 'True'
        if on_rtd:
            dotpath = ""

    ext = filename.rsplit(".", 1)[1]
    cmd = "{}dot -T{} {} -o {}".format(dotpath, ext, dotfile.name,
                                       filename)
    print(dotfile.name)
    try:
        shell(cmd)
    except Exception:
        # last resort: run the command without the bioconvert wrapper
        import os
        os.system(cmd)