Example #1
0
    def test_pandas_rst_right(self):
        """
        Renders a two-row frame with ``align="r"`` and then ``align="c"``
        and compares each result with the expected grid table.
        """
        records = [
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ]
        frame = pandas.DataFrame(records)

        # The expected tables are indented in the source; ``replace``
        # removes that indentation before the comparison.
        right_aligned = df2rst(frame, align="r")
        expected_right = """+---+---------+-----+
                 | A |      AA | AAA |
                 +===+=========+=====+
                 | x |      xx | xxx |
                 +---+---------+-----+
                 |   | xxxxxxx | xxx |
                 +---+---------+-----+
                 """.replace("                 ", "")
        self.assertEqual(right_aligned, expected_right)

        centered = df2rst(frame, align="c")
        expected_centered = """+---+---------+-----+
                 | A |   AA    | AAA |
                 +===+=========+=====+
                 | x |   xx    | xxx |
                 +---+---------+-----+
                 |   | xxxxxxx | xxx |
                 +---+---------+-----+
                 """.replace("                 ", "")
        self.assertEqual(centered, expected_centered)
Example #2
0
 def simple():
     """
     Calls ``df2rst`` 100 times on the same small dataframe
     (99 discarded calls plus the returned one).

     @return     result of the last ``df2rst`` call
     """
     frame = pandas.DataFrame([
         dict(A="x", AA="xx", AAA="xxx"),
         dict(AA="xxxxxxx", AAA="xxx"),
     ])
     for _ in range(99):
         df2rst(frame)
     return df2rst(frame)
Example #3
0
    def test_pandas_rst_right_format_number(self):
        """
        Checks right and center alignment with ``number_format=4`` on a
        text-only frame, then the default alignment on a frame that also
        holds a float column ``N``.
        """
        text_only = pandas.DataFrame([
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ])

        got = df2rst(text_only, align="r", number_format=4)
        want = """+---+---------+-----+
                 | A |      AA | AAA |
                 +===+=========+=====+
                 | x |      xx | xxx |
                 +---+---------+-----+
                 |   | xxxxxxx | xxx |
                 +---+---------+-----+
                 """.replace("                 ", "")
        self.assertEqual(got, want)

        got = df2rst(text_only, align="c", number_format=4)
        want = """+---+---------+-----+
                 | A |   AA    | AAA |
                 +===+=========+=====+
                 | x |   xx    | xxx |
                 +---+---------+-----+
                 |   | xxxxxxx | xxx |
                 +---+---------+-----+
                 """.replace("                 ", "")
        self.assertEqual(got, want)

        # Second frame: the first record carries a float in column N.
        with_number = pandas.DataFrame([
            dict(A="x", AA="xx", AAA="xxx", N=0.5123456),
            dict(AA="xxxxxxx", AAA="xxx"),
        ])
        got = df2rst(with_number, number_format=4)
        want = """+---+---------+-----+--------+
                 | A | AA      | AAA | N      |
                 +===+=========+=====+========+
                 | x | xx      | xxx | 0.5123 |
                 +---+---------+-----+--------+
                 |   | xxxxxxx | xxx |        |
                 +---+---------+-----+--------+
                 """.replace("                 ", "")
        self.assertEqual(got, want)
 def test_df2rst_split_col(self):
     """
     Column splitting on the marathon data: a first call is expected to
     raise ``ValueError``, a second one must produce a row holding the
     *seconds* value.
     """
     here = os.path.abspath(os.path.dirname(__file__))
     mara = os.path.join(here, "data", "marathon.txt")
     header = ["city", "year", "time", "seconds"]
     df = pandas.read_csv(mara, names=header, sep="\t")
     pieces = [['time'], ['seconds']]

     def split_with_time_in_common():
         return df2rst(df,
                       split_col_common=["city", "time"],
                       split_col_subsets=pieces)

     # NOTE(review): ``assertRaise`` is not a unittest.TestCase method;
     # presumably supplied by the enclosing test class - confirm.
     self.assertRaise(split_with_time_in_common, ValueError)

     conv = df2rst(df,
                   split_col_common=["city", "year"],
                   split_col_subsets=pieces)
     self.assertIn("| CHICAGO   | 2005 | 7622    |", conv)
    def test_df2rst_split_row_label(self):
        """
        Splits the marathon table by rows, first on *city*, then on
        *year* with a ``label_pattern``, and looks for expected
        fragments in both outputs.
        """
        here = os.path.abspath(os.path.dirname(__file__))
        mara = os.path.join(here, "data", "marathon.txt")
        header = ["city", "year", "time", "seconds"]
        df = pandas.read_csv(mara, names=header, sep="\t")

        by_city = df2rst(df, split_row="city")
        for fragment in ("+++++++++",
                         "| city      | year | time     | seconds |",
                         "| PARIS | 2011 | 02:06:29 | 7589    |"):
            self.assertIn(fragment, by_city)

        by_year = df2rst(df, split_row="year",
                         label_pattern=".. _lpy-{section}:")
        for fragment in ("++++",
                         "| city      | year | time     | seconds |",
                         "| FUKUOKA   | 1976 | 02:12:35 | 7955    |",
                         ".. _lpy-1949:"):
            self.assertIn(fragment, by_year)
Example #6
0
    def test_rt_OneVsRestClassifier_python(self):
        """
        Validates ``OneVsRestClassifier`` for opsets 9 to 11 with the
        python runtime, summarizes the rows and renders the summary
        with ``df2rst`` using row and column splitting.
        """
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        getLogger('skl2onnx').disabled = True
        verbose = int(__name__ == "__main__")

        debug = False
        buffer = []

        def myprint(*args, **kwargs):
            # Collects what would be logged instead of printing it.
            buffer.append(" ".join(str(a) for a in args))

        def keep_all(m, p):
            # The right operand is never evaluated: every pair is kept.
            return True or 'm-cl' in p

        rows = list(enumerate_validated_operator_opsets(
            verbose, models={"OneVsRestClassifier"}, opset_min=9,
            opset_max=11, fLOG=myprint, benchmark=True,
            runtime='python', debug=debug,
            filter_exp=keep_all))
        self.assertGreater(len(rows), 1)
        self.assertIn('skl_nop', rows[0])
        self.assertIn('onx_size', rows[-1])

        piv = summary_report(DataFrame(rows))
        n_rows = piv.shape[0]
        self.assertGreater(n_rows, 1)
        self.assertGreater(n_rows, 2)

        def section_of(index, dp=piv):
            return build_key_split(dp.loc[index, "name"], index)

        common, subsets = split_columns_subsets(piv)
        rst = df2rst(piv, number_format=2,
                     replacements={'nan': '', 'ERR: 4convert': ''},
                     split_row=section_of,
                     split_col_common=common,
                     split_col_subsets=subsets,
                     filter_rows=filter_rows,
                     column_size={'problem': 25},
                     label_pattern=".. _lpy-{section}:")
        self.assertIn("opset9 | RT/SKL-N=1", rst)
    def test_rt_KMeans_python(self):
        """
        Validates ``KMeans`` for opset 11 with the python runtime,
        summarizes the rows and renders the summary with column
        splitting. The test returns silently when ``df2rst`` does not
        accept ``split_col_common``.
        """
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        getLogger('skl2onnx').disabled = True
        verbose = 2 if __name__ == "__main__" else 0

        debug = False
        buffer = []

        def myprint(*args, **kwargs):
            # Collects what would be logged instead of printing it.
            buffer.append(" ".join(str(a) for a in args))

        rows = list(enumerate_validated_operator_opsets(
            verbose, models={"KMeans"}, opset_min=11,
            opset_max=11, fLOG=myprint,
            runtime='python', debug=debug))
        self.assertGreater(len(rows), 1)
        self.assertIn('skl_nop', rows[-1])
        keys = {key for row in rows for key in row}
        self.assertIn('onx_size', keys)

        piv = summary_report(DataFrame(rows))
        opset = [c for c in piv.columns if 'opset' in c]
        self.assertTrue(any(name in opset for name in ('opset11', 'opset10')))
        minimum_logged = 1 if debug else 0
        self.assertGreater(len(buffer), minimum_logged)

        common, subsets = split_columns_subsets(piv)
        unsupported = "got an unexpected keyword argument 'split_col_common'"
        try:
            conv = df2rst(piv, split_col_common=common,  # pylint: disable=E1123
                          split_col_subsets=subsets)
            self.assertIn('| KMeans |', conv)
        except TypeError as exc:
            # Only the "unknown keyword" error is tolerated.
            if unsupported not in str(exc):
                raise
def rst_table_modules(classifier=False):
    """
    Builds a table from the modules returned by
    ``pymyinstall.packaged.small_set`` and renders it with ``df2rst``.

    @param      classifier  keep the *classifier* column?
    @return                 string
    """
    try:
        from pymyinstall.packaged import small_set, classifiers2string
    except KeyError:
        # Apply pyquickhelper's ``fix_pip_902`` and retry the import.
        from pyquickhelper.pycode.pip_helper import fix_pip_902
        fix_pip_902()
        from pymyinstall.packaged import small_set, classifiers2string

    modules = small_set()
    modules.sort()
    table = pandas.DataFrame(m.as_dict(rst_link=True) for m in modules)

    selected = ["usage", "rst_link", "kind", "version", "license", "purpose"]
    if classifier:
        selected.append("classifier")
    table = table[selected]
    if classifier:
        table["classifier"] = table.apply(
            lambda row: classifiers2string(row["classifier"]), axis=1)
    # Same columns, with "rst_link" exposed as "name".
    table.columns = ["name" if c == "rst_link" else c for c in selected]

    # Case-insensitive sort on the name through a temporary column.
    table["lname"] = table["name"].apply(lambda s: s.lower())
    table = table.sort_values("lname")
    table = table.drop("lname", axis=1)
    table = table.reset_index(drop=True)
    table = table.reset_index(drop=False)
    return df2rst(table)
    def test_df2rst_split_col_row_ref2_func2(self):
        """
        Uses a callable ``split_row`` which extracts the text between the
        first backquote and ``<`` of the *name* column.
        """
        def build_key_split(key, index):
            after_quote = str(key).split('`')[1]
            return after_quote.split('<')[0].strip()

        names = [':ref:`A <A>`', ':ref:`A <A2>`', ':ref:`B <B>`',
                 ':ref:`B <B2>`', ':ref:`A <A3>`']
        df = pandas.DataFrame([
            {'name': name, 'value': value}
            for value, name in enumerate(names, start=1)
        ])

        def section_of(index):
            return build_key_split(df.loc[index, "name"], index)

        conv = df2rst(df, split_row=section_of)
        self.assertIn("| :ref:`B <B>`  | 3     |", conv)
Example #10
0
    def test_all_module_summary(self):
        """
        Builds the summary table of ``ensae_fullset()``, logs how many
        modules have or lack a license and checks the RST rendering is
        longer than 1000 characters.
        """
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        mod = ensae_fullset()
        mod.sort()
        df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
        df = df[[
            "usage", "rst_link", "kind", "version", "installed", "license",
            "purpose", "classifier"
        ]]
        df.columns = [
            "usage", "name", "kind", "version", "installed", "license",
            "purpose", "classifier"
        ]
        lic = df[~df.license.isnull()]
        fLOG("license", lic.shape)
        nolic = df[df.license.isnull()]
        fLOG("no license", nolic.shape)
        fLOG(nolic[["name", "license"]])
        # TestCase assertions are used instead of bare ``assert`` so the
        # checks still run when Python is started with -O.
        self.assertGreater(lic.shape[0], 0)

        rst = df2rst(df)
        self.assertGreater(len(rst), 1000)
Example #11
0
def rst_table_modules(classifier=False):
    """
    Builds a table from the modules returned by ``ensae_fullset()``
    and renders it with ``df2rst``.

    @param      classifier  keep the *classifier* column?
    @return                 string
    """
    modules = ensae_fullset()
    modules.sort()
    table = pandas.DataFrame(m.as_dict(rst_link=True) for m in modules)

    selected = ["usage", "rst_link", "kind", "version", "license", "purpose"]
    if classifier:
        selected.append("classifier")
    table = table[selected]
    if classifier:
        table["classifier"] = table.apply(
            lambda row: classifiers2string(row["classifier"]), axis=1)
    # Same columns, with "rst_link" exposed as "name".
    table.columns = ["name" if c == "rst_link" else c for c in selected]

    # Case-insensitive sort on the name through a temporary column.
    table["lname"] = table["name"].apply(lambda s: s.lower())
    table = table.sort_values("lname")
    table = table.drop("lname", axis=1)
    table = table.reset_index(drop=True)
    table = table.reset_index(drop=False)
    return df2rst(table)
Example #12
0
    def test_pandas_rst_size_table_title(self):
        """
        Renders a ``list-table`` directive with explicit widths and a
        title, and compares it with the expected text (surrounding
        newlines and spaces are ignored).
        """
        frame = pandas.DataFrame([
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ])
        options = dict(column_size=[1, 1, 2], list_table=True, title="title__")
        rendered = df2rst(frame, **options)
        expected = """
                    .. list-table:: title__
                        :widths: 1 1 2
                        :header-rows: 1

                        * - A
                          - AA
                          - AAA
                        * - x
                          - xx
                          - xxx
                        * -
                          - xxxxxxx
                          - xxx
                    """.replace("                    ", "")
        trimmed = "\n "
        self.assertEqual(rendered.strip(trimmed), expected.strip(trimmed))
Example #13
0
    def test_documentation(self):
        """
        Renders the name/description table of ``name_sets_dataframe()``
        with ``df2rst`` and checks that every listed set is not empty.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        r = name_sets_dataframe()
        for m in r:
            fLOG("**", m)
        # TestCase assertions are used instead of bare ``assert`` so the
        # checks still run when Python is started with -O.
        self.assertGreaterEqual(len(r), 6)
        df = pandas.DataFrame(r)
        df = df[["name", "description"]]
        rst = df2rst(df)
        fLOG(rst)
        self.assertGreater(len(rst), 0)

        if sys.version_info[0] == 2:
            # less tests on Python 2.7
            return

        nb = 0
        for mod in r:
            lp = get_package_set(mod["name"])
            self.assertGreater(len(lp()), 0)
            nb += 1
        self.assertGreater(nb, 0)
Example #14
0
    def test_documentation(self):
        """
        Renders the name/description table of ``name_sets_dataframe()``
        with ``df2rst`` and checks that every listed set except
        ``pywin32`` is not empty.
        """
        fLOG(__file__,
             self._testMethodName,
             OutputPrint=__name__ == "__main__")

        r = name_sets_dataframe()
        for m in r:
            fLOG("**", m)
        # TestCase assertions are used instead of bare ``assert`` so the
        # checks still run when Python is started with -O.
        self.assertGreaterEqual(len(r), 6)
        df = pandas.DataFrame(r)
        df = df[["name", "description"]]
        rst = df2rst(df)
        fLOG(rst)
        self.assertGreater(len(rst), 0)

        if sys.version_info[0] == 2:
            # less tests on Python 2.7
            return

        nb = 0
        for mod in r:
            lp = get_package_set(mod["name"])
            # An empty set is only tolerated for pywin32.
            if len(lp()) == 0 and mod["name"] != "pywin32":
                raise Exception("issue with module '{0}'".format(mod["name"]))
            nb += 1
        self.assertGreater(nb, 0)
    def test_all_module_summary(self):
        """
        Builds the summary table of ``ensae_fullset()``, logs how many
        modules have or lack a license and checks the RST rendering is
        longer than 1000 characters.
        """
        fLOG(
            __file__,
            self._testMethodName,
            OutputPrint=__name__ == "__main__")

        mod = ensae_fullset()
        mod.sort()
        df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
        df = df[["usage", "rst_link", "kind", "version", "installed",
                 "license", "purpose", "classifier"]]
        df.columns = ["usage", "name", "kind", "version", "installed",
                      "license", "purpose", "classifier"]
        lic = df[~df.license.isnull()]
        fLOG("license", lic.shape)
        nolic = df[df.license.isnull()]
        fLOG("no license", nolic.shape)
        fLOG(nolic[["name", "license"]])
        # TestCase assertions are used instead of bare ``assert`` so the
        # checks still run when Python is started with -O.
        self.assertGreater(lic.shape[0], 0)

        rst = df2rst(df)
        self.assertGreater(len(rst), 1000)
Example #16
0
    def test_pandas_rst_size_table_auto_replace(self):
        """
        Renders a ``list-table`` with a ``replacements`` mapping and
        checks that ``xxxxxxx`` shows up as ``gg`` in the output.
        """
        frame = pandas.DataFrame([
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ])
        substitutions = {'xxxxxxx': 'gg'}
        rendered = df2rst(frame, list_table=True, replacements=substitutions)
        expected = """
                    .. list-table::
                        :widths: auto
                        :header-rows: 1

                        * - A
                          - AA
                          - AAA
                        * - x
                          - xx
                          - xxx
                        * -
                          - gg
                          - xxx
                    """.replace("                    ", "")
        trimmed = "\n "
        self.assertEqual(rendered.strip(trimmed), expected.strip(trimmed))
 def test_df2rst(self):
     """
     Renders the marathon data with default options and looks for the
     header row and one data row.
     """
     here = os.path.abspath(os.path.dirname(__file__))
     mara = os.path.join(here, "data", "marathon.txt")
     header = ["city", "year", "time", "seconds"]
     conv = df2rst(pandas.read_csv(mara, names=header, sep="\t"))
     for fragment in ("| city      | year | time     | seconds |",
                      "| PARIS     | 2011 | 02:06:29 | 7589    |"):
         self.assertIn(fragment, conv)
Example #18
0
 def simple2():
     """
     Renders a small two-row dataframe with ``df2rst``.

     @return     result of ``df2rst``
     """
     records = [
         dict(A="x", AA="xx", AAA="xxx"),
         dict(AA="xxxxxxx", AAA="xxx"),
     ]
     return df2rst(pandas.DataFrame(records))
    def test_df2rst_split_row(self):
        """
        Splits the marathon table by rows on *city*, on *year* and on
        both columns, and looks for expected fragments in each output.
        """
        here = os.path.abspath(os.path.dirname(__file__))
        mara = os.path.join(here, "data", "marathon.txt")
        header = ["city", "year", "time", "seconds"]
        df = pandas.read_csv(mara, names=header, sep="\t")
        title_row = "| city      | year | time     | seconds |"

        by_city = df2rst(df, split_row="city")
        self.assertIn("+++++++++", by_city)
        self.assertIn(title_row, by_city)
        self.assertIn("| PARIS | 2011 | 02:06:29 | 7589    |", by_city)

        by_year = df2rst(df, split_row="year")
        self.assertIn("++++", by_year)
        self.assertIn(title_row, by_year)
        self.assertIn("| FUKUOKA   | 1976 | 02:12:35 | 7955    |", by_year)

        by_both = df2rst(df, split_row=["city", "year"])
        self.assertIn("'AMSTERDAM', 1975", by_both)
        self.assertIn(title_row, by_both)
 def test_df2rst_split_col_row(self):
     """
     Combines row splitting on *city* with column splitting and looks
     for a section underline and one data row.
     """
     here = os.path.abspath(os.path.dirname(__file__))
     mara = os.path.join(here, "data", "marathon.txt")
     header = ["city", "year", "time", "seconds"]
     df = pandas.read_csv(mara, names=header, sep="\t")
     options = dict(split_row="city",
                    split_col_common=["city", "year"],
                    split_col_subsets=[['time'], ['seconds']])
     conv = df2rst(df, **options)
     for fragment in ("+++++++++", "| STOCKOLM | 2007 | 8456    |"):
         self.assertIn(fragment, conv)
 def test_df2rst_split_col_row_ref2(self):
     """
     Same as the row/column split on the marathon data, with every city
     wrapped into a ``:ref:`` role beforehand.
     """
     here = os.path.abspath(os.path.dirname(__file__))
     mara = os.path.join(here, "data", "marathon.txt")
     header = ["city", "year", "time", "seconds"]
     df = pandas.read_csv(mara, names=header, sep="\t")

     def as_ref(v):
         return ':ref:`{0}`'.format(v)

     df['city'] = df.city.apply(as_ref)
     options = dict(split_row="city",
                    split_col_common=["city", "year"],
                    split_col_subsets=[['time'], ['seconds']])
     conv = df2rst(df, **options)
     for fragment in ("+++++++++", "| :ref:`AMSTERDAM` | 1982 | 02:12:15 |"):
         self.assertIn(fragment, conv)
 def test_df2rst_column_size_i(self):
     """
     Passes ``column_size={0: 40}`` and checks that the first column of
     the header row and of one data row is padded accordingly.
     """
     here = os.path.abspath(os.path.dirname(__file__))
     mara = os.path.join(here, "data", "marathon.txt")
     header = ["city", "year", "time", "seconds"]
     conv = df2rst(pandas.read_csv(mara, names=header, sep="\t"),
                   column_size={0: 40})
     wide_rows = (
         "| city                                     | year | time     | seconds |",
         "| PARIS                                    | 2006 | 02:08:03 | 7683    |",
     )
     for fragment in wide_rows:
         self.assertIn(fragment, conv)
Example #23
0
 def test_pandas_rst_size(self):
     """
     Checks the grid table produced by ``df2rst`` with
     ``column_size=[1, 1, 2]`` against the expected text.
     """
     df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                            {"AA": "xxxxxxx", "AAA": "xxx"}])
     rst = df2rst(df, column_size=[1, 1, 2])
     # The expected table is assembled line by line so that its value
     # does not depend on how this method is indented in the source.
     exp = "\n".join([
         "+-----+---------+--------+",
         "| A   | AA      | AAA    |",
         "+=====+=========+========+",
         "| x   | xx      | xxx    |",
         "+-----+---------+--------+",
         "|     | xxxxxxx | xxx    |",
         "+-----+---------+--------+",
     ]) + "\n"
     self.assertEqual(rst, exp)
Example #24
0
def df2rsthtml(df, format="html", fillna=""):
    """
    Writes a table into RST or HTML format.

    @param      df          dataframe
    @param      format      output format, ``"html"`` or ``"rst"``
    @param      fillna      value substituted for missing values
    @return                 string
    @raise      ValueError  if *format* is neither ``"html"`` nor ``"rst"``
    """
    # Fail explicitly rather than returning None for an unknown format.
    if format not in ("html", "rst"):
        raise ValueError(
            "Unsupported format '{0}', expected 'html' or 'rst'.".format(format))
    df = df.fillna(fillna)
    if format == "html":
        return df2html(df)
    return df2rst(df)
Example #25
0
    def test_pandas_rst_right(self):
        """
        Renders a two-row frame with ``align="r"`` and then ``align="c"``
        and compares each result with the expected grid table.
        """
        frame = pandas.DataFrame([
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ])

        right_aligned = df2rst(frame, align="r")
        expected_right = """+-----+---------+-----+
                 |   A |      AA | AAA |
                 +=====+=========+=====+
                 |   x |      xx | xxx |
                 +-----+---------+-----+
                 |     | xxxxxxx | xxx |
                 +-----+---------+-----+
                 """.replace("                 ", "")
        self.assertEqual(right_aligned, expected_right)

        centered = df2rst(frame, align="c")
        expected_centered = """+-----+---------+-----+
                 |  A  |   AA    | AAA |
                 +=====+=========+=====+
                 |  x  |   xx    | xxx |
                 +-----+---------+-----+
                 |     | xxxxxxx | xxx |
                 +-----+---------+-----+
                 """.replace("                 ", "")
        self.assertEqual(centered, expected_centered)
    def test_documentation(self):
        """
        Builds the module table of ``all_set()``, converts the
        *classifier* column with ``classifiers2string`` and logs the
        RST rendering.
        """
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        modules = all_set()
        modules.sort()
        selected = ["usage", "rst_link", "kind", "version",
                    "license", "purpose", "classifier"]
        table = pandas.DataFrame(m.as_dict(rst_link=True) for m in modules)
        table = table[selected]
        table["classifier"] = table.apply(
            lambda row: classifiers2string(row["classifier"]), axis=1)
        # Same columns, with "rst_link" exposed as "name".
        table.columns = ["name" if c == "rst_link" else c for c in selected]
        fLOG(df2rst(table))
Example #27
0
    def test_pandas_rst_size_table_noheader(self):
        """
        Renders a ``list-table`` with ``header=False`` and compares it
        with the expected text (surrounding newlines and spaces are
        ignored).
        """
        records = [
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ]
        rendered = df2rst(pandas.DataFrame(records),
                          list_table=True, header=False)
        expected = """
                    .. list-table::
                        :widths: auto

                        * - x
                          - xx
                          - xxx
                        * -
                          - xxxxxxx
                          - xxx
                    """.replace("                    ", "")
        trimmed = "\n "
        self.assertEqual(rendered.strip(trimmed), expected.strip(trimmed))
Example #28
0
    def test_pandas_rst_size_table_number_format(self):
        """
        Renders floats in a headerless ``list-table`` with
        ``number_format=3``; the missing value is expected as ``nan``.
        """
        records = [
            dict(A=2.12345678, AA=3.12345678, AAA=4.12345678),
            dict(AA=2.12345678e10, AAA=2.12345678e-10),
        ]
        rendered = df2rst(pandas.DataFrame(records),
                          list_table=True, header=False, number_format=3)
        expected = """
                    .. list-table::
                        :widths: auto

                        * - 2.12
                          - 3.12
                          - 4.12
                        * - nan
                          - 2.12e+10
                          - 2.12e-10
                    """.replace("                    ", "")
        trimmed = "\n "
        self.assertEqual(rendered.strip(trimmed), expected.strip(trimmed))
Example #29
0
def rst_table_modules():
    """
    Builds a table from the modules returned by ``ensae_fullset()``,
    sorted by lower-cased name, and renders it with ``df2rst``.

    @return         string
    """
    modules = ensae_fullset()
    modules.sort()
    selected = ["usage", "rst_link", "kind", "version",
                "license", "purpose", "classifier"]
    table = pandas.DataFrame(m.as_dict(rst_link=True) for m in modules)
    table = table[selected]
    table["classifier"] = table.apply(
        lambda row: classifiers2string(row["classifier"]), axis=1)
    # Same columns, with "rst_link" exposed as "name".
    table.columns = ["name" if c == "rst_link" else c for c in selected]

    # Case-insensitive sort on the name through a temporary column.
    table["lname"] = table["name"].apply(lambda s: s.lower())
    table = table.sort_values("lname")
    table = table.drop("lname", axis=1)
    return df2rst(table)
    def test_documentation(self):
        """
        Builds the module table of ``all_set()``, converts the
        *classifier* column with ``classifiers2string`` and logs the
        RST rendering.
        """
        fLOG(__file__, self._testMethodName,
             OutputPrint=__name__ == "__main__")

        modules = all_set()
        modules.sort()
        selected = ["usage", "rst_link", "kind", "version",
                    "license", "purpose", "classifier"]
        table = pandas.DataFrame(m.as_dict(rst_link=True) for m in modules)
        table = table[selected]
        table["classifier"] = table.apply(
            lambda row: classifiers2string(row["classifier"]), axis=1)
        # Same columns, with "rst_link" exposed as "name".
        table.columns = ["name" if c == "rst_link" else c for c in selected]
        fLOG(df2rst(table))
Example #31
0
 def test_pandas_rst_size_replace(self):
     """
     Checks the grid table produced by ``df2rst`` with
     ``column_size=[1, 1, 2]`` and a replacement of ``xxx`` by ``rrrr``.
     """
     df = pandas.DataFrame([{
         "A": "x",
         "AA": "xx",
         "AAA": "xxx"
     }, {
         "AA": "xxxxxxx",
         "AAA": "xxx"
     }])
     rst = df2rst(df, column_size=[1, 1, 2], replacements={'xxx': 'rrrr'})
     # The expected table is assembled line by line so that its value
     # does not depend on how this method is indented in the source.
     exp = "\n".join([
         "+---+---------+--------+",
         "| A | AA      | AAA    |",
         "+===+=========+========+",
         "| x | xx      | rrrr   |",
         "+---+---------+--------+",
         "|   | xxxxxxx | rrrr   |",
         "+---+---------+--------+",
     ]) + "\n"
     self.assertEqual(rst, exp)
Example #32
0
    def test_pandas_rst_size_table(self):
        """
        Renders a ``list-table`` directive with explicit widths and
        compares it with the expected text (surrounding newlines and
        spaces are ignored).
        """
        records = [
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ]
        rendered = df2rst(pandas.DataFrame(records),
                          column_size=[1, 1, 2], list_table=True)
        expected = """
                    .. list-table::
                        :widths: 1 1 2
                        :header-rows: 1

                        * - A
                          - AA
                          - AAA
                        * - x
                          - xx
                          - xxx
                        * -
                          - xxxxxxx
                          - xxx
                    """.replace("                    ", "")
        trimmed = "\n "
        self.assertEqual(rendered.strip(trimmed), expected.strip(trimmed))
    def test_df2rst_split_col_complex(self):
        """
        Renders ``data/unittst.csv`` with row splitting driven by a
        callable, column splitting into three subsets and a row filter,
        then checks one row and the number of header separators.
        """
        here = os.path.abspath(os.path.dirname(__file__))
        df = pandas.read_csv(os.path.join(here, "data", "unittst.csv"))

        common = ['name', 'problem', 'scenario']
        opset_cols = ['opset11', 'opset10', 'opset9']
        error_cols = ['ERROR-msg']
        bench_cols = [
            'RT/SKL-N=1', 'N=10', 'N=100', 'N=1000', 'N=10000',
            'N=100000', 'RT/SKL-N=1-min', 'RT/SKL-N=1-max',
            'N=10-min', 'N=10-max', 'N=100-min', 'N=100-max',
            'N=1000-min', 'N=1000-max', 'N=10000-min',
            'N=10000-max', 'N=100000-min', 'N=100000-max'
        ]
        subsets = [opset_cols, error_cols, bench_cols]

        def build_key_split(key, index):
            # Text between the first backquote and the following '<'.
            after_quote = str(key).split('`')[1]
            return after_quote.split('<')[0].strip()

        def filter_rows(df):
            def is_filled(x):
                return pandas.notnull(x) and x not in (None, '', 'nan')

            # Only the first of these columns found in the frame is used.
            present = [c for c in ['ERROR-msg', 'RT/SKL-N=1']
                       if c in df.columns]
            if not present:
                return df
            return df[df[present[0]].apply(is_filled)]

        def section_of(index, dp=df):
            return build_key_split(dp.loc[index, "name"], index)

        conv = df2rst(df,
                      number_format=2,
                      replacements={'nan': '', 'ERR: 4convert': ''},
                      split_row=section_of,
                      split_col_common=common,
                      split_col_subsets=subsets,
                      filter_rows=filter_rows)
        self.assertIn(
            "| :ref:`ARDRegression <l-ARDRegression-b-reg-default>`     | b-reg     | default  "
            "|               | ?       | ?      |", conv)
        pieces = conv.split("+=============================")
        self.assertEqual(len(pieces), 7)
Example #34
0
    def test_pandas_rst_size_table_number_format(self):
        """
        Renders floats in a headerless ``list-table`` with
        ``number_format=3``; the missing value is expected as an empty
        cell.
        """
        records = [
            dict(A=2.12345678, AA=3.12345678, AAA=4.12345678),
            dict(AA=2.12345678e10, AAA=2.12345678e-10),
        ]
        rendered = df2rst(pandas.DataFrame(records),
                          list_table=True, header=False, number_format=3)
        expected = """
                    .. list-table::
                        :widths: auto

                        * - 2.12
                          - 3.12
                          - 4.12
                        * -
                          - 2.12e+10
                          - 2.12e-10
                    """.replace("                    ", "")
        trimmed = "\n "
        self.assertEqual(rendered.strip(trimmed), expected.strip(trimmed))
Example #35
0
    def test_pandas_rst_size_table_noheader(self):
        """
        Renders a ``list-table`` with ``header=False`` and compares it
        with the expected text (surrounding newlines and spaces are
        ignored).
        """
        records = [
            dict(A="x", AA="xx", AAA="xxx"),
            dict(AA="xxxxxxx", AAA="xxx"),
        ]
        rendered = df2rst(pandas.DataFrame(records),
                          list_table=True, header=False)
        expected = """
                    .. list-table::
                        :widths: auto

                        * - x
                          - xx
                          - xxx
                        * -
                          - xxxxxxx
                          - xxx
                    """.replace("                    ", "")
        trimmed = "\n "
        self.assertEqual(rendered.strip(trimmed), expected.strip(trimmed))
 def simple():
     """
     Renders a small two-row dataframe with ``df2rst``.

     @return     result of ``df2rst``
     """
     records = [
         dict(A="x", AA="xx", AAA="xxx"),
         dict(AA="xxxxxxx", AAA="xxx"),
     ]
     return df2rst(pandas.DataFrame(records))
Example #37
0
def write_page_onnxrt_benches(app, runtime, skip=None, white_list=None):
    """
    Runs the benchmark for one runtime, dumps the summary and the raw
    results into two Excel files and writes the corresponding *rst* page.

    :param app: object exposing ``builder.srcdir`` or None,
        in which case ``".."`` is used as the source folder
    :param runtime: ``'python'``, ``'python_compiled'``,
        ``'onnxruntime1'`` or ``'onnxruntime2'``
    :param skip: forwarded to ``run_benchmark``
    :param white_list: forwarded to ``run_benchmark``
    :raises RuntimeError: if *runtime* is none of the values above
    """
    # NOTE(review): the imported name is not used below; the import is kept
    # because its side effects cannot be assessed from this function alone.
    from mlprodict.onnxrt.validate.validate import enumerate_validated_operator_opsets
    logger = getLogger('mlprodict')
    srcdir = ".." if app is None else app.builder.srcdir

    # Every supported runtime maps to one page in folder skl_converters.
    if runtime in ('python', 'python_compiled'):
        page_name = "bench_python.rst"
    elif runtime == 'onnxruntime2':
        page_name = "bench_onnxrt2.rst"
    elif runtime == 'onnxruntime1':
        page_name = "bench_onnxrt1.rst"
    else:
        raise RuntimeError("Unsupported runtime '{}'.".format(runtime))
    whe = os.path.join(os.path.abspath(srcdir), "skl_converters", page_name)

    logger.info("[mlprodict] create page '{}'.".format(whe))
    print("[mlprodict-sphinx] create page runtime '{}' - '{}'.".format(runtime, whe))

    filenames = run_benchmark(runtime, srcdir, logger, skip,
                              white_list=white_list)

    # Each entry holds the raw file first and the summary file second,
    # files which do not exist are ignored.
    raw_frames = []
    for names in filenames:
        if os.path.exists(names[0]):
            raw_frames.append(read_csv(names[0]))
    summary_frames = []
    for names in filenames:
        if os.path.exists(names[1]):
            summary_frames.append(read_csv(names[1]))
    df_raw = concat(raw_frames, sort=False)
    piv = concat(summary_frames, sort=False)

    # Columns 'opset<N>' are ordered by decreasing N.
    numbered = sorted(
        ((int(col.replace("opset", "")), col)
         for col in piv.columns if 'opset' in col),
        reverse=True)
    opset_cols = [pair[1] for pair in numbered]

    # Column order: first opset column, description columns, benchmark
    # ratios, remaining opset columns, min/max ratios, skl_* and onx_*.
    bench_cols = ["RT/SKL-N=1", "N=10", "N=100",
                  "N=1000", "N=10000", "N=100000"]
    new_cols = opset_cols[:1]
    new_cols += ["ERROR-msg", "name", "problem", "scenario", 'optim']
    new_cols += bench_cols
    new_cols += opset_cols[1:]
    new_cols += [col + suffix
                 for col in bench_cols for suffix in ('-min', '-max')]
    new_cols += [col for col in piv.columns
                 if col.startswith("skl_") or col.startswith("onx_")]
    # Only the columns which really exist are kept.
    piv = piv[[col for col in new_cols if col in piv.columns]]

    # The summary is saved first, the raw data second.
    for pattern, frame in (("bench_sum_%s.xlsx", piv),
                           ("bench_raw_%s.xlsx", df_raw)):
        target = os.path.join(srcdir, pattern % runtime)
        frame.to_excel(target, index=False)
        logger.info("[mlprodict] wrote '{}'.".format(target))
        print("[mlprodict-sphinx] wrote '{}'".format(target))

    logger.info("[mlprodict] shape '{}'.".format(piv.shape))
    print("[mlprodict-sphinx] shape '{}'".format(piv.shape))

    def make_link(row):
        # Replaces the model name by a reference built from the row.
        optim_text = str(row.get('optim', '')).replace("nan", "")
        return ":ref:`{name} <l-{name}-{problem}-{scenario}-{optim}-{opset}>`".format(
            name=row['name'], problem=row['problem'],
            scenario=row['scenario'],
            optim=_clean_values_optim(optim_text),
            opset=_make_opset(row))

    piv['name'] = piv.apply(make_link, axis=1)
    piv.reset_index(drop=True, inplace=True)

    if "ERROR-msg" in piv.columns:
        def shorten(text):
            # Error messages are cut after 75 characters.
            as_text = str(text)
            return as_text[:75] + "..." if len(as_text) > 75 else as_text

        piv["ERROR-msg"] = piv["ERROR-msg"].apply(shorten)

    logger.info("[mlprodict] write '{}'.".format(whe))
    print("[mlprodict-sphinx] write '{}'".format(whe))

    with open(whe, 'w', encoding='utf-8') as f:
        title = "Available of scikit-learn model for runtime {0}".format(
            runtime)
        underline = "=" * len(title)
        # Page header: label, title, explanations and a download link.
        header = dedent('''
        .. _l-onnx-bench-{0}:

        {1}
        {2}

        The following metrics measure the ratio between the prediction time
        for the runtime compare to :epkg:`scikit-learn`.
        It gives an order of magnitude. They are done by setting
        ``assume_finite=True`` (see `config_context
        <https://scikit-learn.org/stable/modules/generated/sklearn.config_context.html>`_).
        The computed ratio is:

        .. math::

            \\frac{{\\textit{{execution when predicting with a custom ONNX runtime}}}}
            {{\\textit{{execution when predicting with scikit-learn (assume\\_finite=True)}}}}

        Due to float32 conversion, it may happen than the highest difference
        is quite high. The proposition :math:`a < b \\Rightarrow [a] < [b]`
        is usually true and but not true all the time. It is the same after number
        where rounded to float32, that's why the result considers the
        fourth highest difference and not the first three.

        Some figures are missing when the number of observations is high.
        That means the prediction is slow for one of the runtime
        (ONNX, scikit-learn) and it would take too long to go further.
        The list of problems can be found in the documentation of
        function :func:`find_suitable_problem
        <mlprodict.onnxrt.validate.validate_problems.find_suitable_problem>`.
        Default values are usually used to create models but other
        scenarios are defined by :func:`build_custom_scenarios
        <mlprodict.onnxrt.validate.validate_scenarios.build_custom_scenarios>`
        and :func:`build_custom_scenarios (2)
        <from mlprodict.onnxrt.validate.validate_scenarios.build_custom_scenarios>`.
        The benchmark can be generated with a command line:

        ::

            python -m mlprodict validate_runtime --verbose=1 --out_raw=data.csv --out_summary=summary.xlsx --benchmark=1 --dump_folder=. --runtime={0}

        The option ``-se 1`` may be used if the process crashes. The command line
        can also be extended to test only one model or to skip another one. The whole
        batch takes between 5 and 15 minutes depending on the machine.

        Full data: :download:`{3} <../{3}>`

        .. contents::
            :local:

        '''.format(runtime, title, underline,
                   "bench_sum_%s.xlsx" % runtime))
        f.write(header)
        common, subsets = split_columns_subsets(piv)

        def split_row(index, dp=piv):
            # The splitting key is derived from the name stored in the row.
            return build_key_split(dp.loc[index, "name"], index)

        tables = df2rst(piv, number_format=2,
                        replacements={'nan': '', 'ERR: 4convert': ''},
                        split_row=split_row,
                        split_col_common=common,
                        split_col_subsets=subsets,
                        filter_rows=filter_rows,
                        column_size={'problem': 25},
                        label_pattern=".. _lpy-{section}:")
        f.write(tables)
    logger.info(
        "[mlprodict] done page '{}'.".format(whe))
    print("[mlprodict-sphinx] done page runtime '{}' - '{}'.".format(runtime, whe))
 def test_module_list(self):
     # Converts the output of modules_list() into a DataFrame, renders it
     # with df2rst and checks the text mentions a few package names.
     res = df2rst(DataFrame(modules_list()))
     self.assertIn('sklearn', res)
     self.assertIn('numpy', res)
     self.assertIn('skl2onnx', res)