def test_pandas_rst_right(self):
    """df2rst: 'r' and 'c' alignments are expected to yield the same table
    for these short string-only columns."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, align="r")
    # NOTE(review): spaces are stripped from the expected value only;
    # presumably df2rst output matches after this normalization -- confirm
    exp = """+---+---------+-----+
| A | AA | AAA |
+===+=========+=====+
| x | xx | xxx |
+---+---------+-----+
| | xxxxxxx | xxx |
+---+---------+-----+
""".replace(" ", "")
    self.assertEqual(rst, exp)
    rst = df2rst(df, align="c")
    exp = """+---+---------+-----+
| A | AA | AAA |
+===+=========+=====+
| x | xx | xxx |
+---+---------+-----+
| | xxxxxxx | xxx |
+---+---------+-----+
""".replace(" ", "")
    self.assertEqual(rst, exp)
def simple():
    """Convert the same two-row DataFrame to RST 100 times and return
    the last conversion (benchmark-style helper)."""
    frame = pandas.DataFrame([
        {"A": "x", "AA": "xx", "AAA": "xxx"},
        {"AA": "xxxxxxx", "AAA": "xxx"},
    ])
    # 99 warm-up conversions, identical to the original range(0, 99) loop
    for _ in range(99):
        df2rst(frame)
    return df2rst(frame)
def test_pandas_rst_right_format_number(self):
    """number_format controls how many characters a float keeps in the
    rendered table (0.5123456 -> 0.5123 with number_format=4)."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, align="r", number_format=4)
    # expected value is compared with all spaces removed
    exp = """+---+---------+-----+
| A | AA | AAA |
+===+=========+=====+
| x | xx | xxx |
+---+---------+-----+
| | xxxxxxx | xxx |
+---+---------+-----+
""".replace(" ", "")
    self.assertEqual(rst, exp)
    rst = df2rst(df, align="c", number_format=4)
    exp = """+---+---------+-----+
| A | AA | AAA |
+===+=========+=====+
| x | xx | xxx |
+---+---------+-----+
| | xxxxxxx | xxx |
+---+---------+-----+
""".replace(" ", "")
    self.assertEqual(rst, exp)
    # a float column is truncated by number_format; the missing value in
    # row 2 renders as an empty cell
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx",
                            'N': 0.5123456},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, number_format=4)
    exp = """+---+---------+-----+--------+
| A | AA | AAA | N |
+===+=========+=====+========+
| x | xx | xxx | 0.5123 |
+---+---------+-----+--------+
| | xxxxxxx | xxx | |
+---+---------+-----+--------+
""".replace(" ", "")
    self.assertEqual(rst, exp)
def test_df2rst_split_col(self):
    """A column present both in split_col_common and in a subset must
    raise ValueError; a valid split renders the common + subset columns."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "marathon.txt")
    df = pandas.read_csv(mara, names=["city", "year", "time", "seconds"],
                         sep="\t")
    # 'time' appears in both the common columns and a subset -> ValueError
    self.assertRaise(
        lambda: df2rst(df, split_col_common=["city", "time"],
                       split_col_subsets=[['time'], ['seconds']]),
        ValueError)
    conv = df2rst(df, split_col_common=["city", "year"],
                  split_col_subsets=[['time'], ['seconds']])
    self.assertIn("| CHICAGO | 2005 | 7622 |", conv)
def test_df2rst_split_row_label(self):
    """split_row sections the table by a column; label_pattern inserts a
    label for each section ({section} is replaced by the section value)."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "marathon.txt")
    df = pandas.read_csv(mara, names=["city", "year", "time", "seconds"],
                         sep="\t")
    conv = df2rst(df, split_row="city")
    # '+' runs underline the per-section titles
    self.assertIn("+++++++++", conv)
    self.assertIn("| city | year | time | seconds |", conv)
    self.assertIn("| PARIS | 2011 | 02:06:29 | 7589 |", conv)
    conv = df2rst(df, split_row="year", label_pattern=".. _lpy-{section}:")
    self.assertIn("++++", conv)
    self.assertIn("| city | year | time | seconds |", conv)
    self.assertIn("| FUKUOKA | 1976 | 02:12:35 | 7955 |", conv)
    # the label pattern is instantiated once per section (year 1949 here)
    self.assertIn(".. _lpy-1949:", conv)
def test_rt_OneVsRestClassifier_python(self):
    """End-to-end check: validate OneVsRestClassifier conversions over
    opsets 9-11, summarize the rows, and render the report with df2rst."""
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    logger = getLogger('skl2onnx')
    logger.disabled = True  # silence skl2onnx during the validation loop
    verbose = 1 if __name__ == "__main__" else 0
    debug = False
    buffer = []

    def myprint(*args, **kwargs):
        # collect log lines instead of printing them
        buffer.append(" ".join(map(str, args)))

    # NOTE(review): filter_exp is 'True or ...', i.e. no filtering --
    # presumably a leftover debugging switch; confirm before changing
    rows = list(enumerate_validated_operator_opsets(
        verbose, models={"OneVsRestClassifier"}, opset_min=9, opset_max=11,
        fLOG=myprint, benchmark=True, runtime='python', debug=debug,
        filter_exp=lambda m, p: True or 'm-cl' in p))
    self.assertGreater(len(rows), 1)
    self.assertIn('skl_nop', rows[0])
    self.assertIn('onx_size', rows[-1])
    piv = summary_report(DataFrame(rows))
    self.assertGreater(piv.shape[0], 1)
    self.assertGreater(piv.shape[0], 2)
    common, subsets = split_columns_subsets(piv)
    rst = df2rst(piv, number_format=2,
                 replacements={'nan': '', 'ERR: 4convert': ''},
                 split_row=lambda index, dp=piv: build_key_split(
                     dp.loc[index, "name"], index),
                 split_col_common=common,
                 split_col_subsets=subsets,
                 filter_rows=filter_rows,
                 column_size={'problem': 25},
                 label_pattern=".. _lpy-{section}:")
    self.assertIn("opset9 | RT/SKL-N=1", rst)
def test_rt_KMeans_python(self):
    """Validates KMeans conversion at opset 11 and renders the summary
    with df2rst; tolerates older pyquickhelper without split_col_common."""
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    logger = getLogger('skl2onnx')
    logger.disabled = True  # silence skl2onnx during the validation loop
    verbose = 2 if __name__ == "__main__" else 0
    debug = False
    buffer = []

    def myprint(*args, **kwargs):
        # collect log lines instead of printing them
        buffer.append(" ".join(map(str, args)))

    rows = list(enumerate_validated_operator_opsets(
        verbose, models={"KMeans"}, opset_min=11, opset_max=11,
        fLOG=myprint, runtime='python', debug=debug))
    self.assertGreater(len(rows), 1)
    self.assertIn('skl_nop', rows[-1])
    keys = set()
    for row in rows:
        keys.update(set(row))
    self.assertIn('onx_size', keys)
    piv = summary_report(DataFrame(rows))
    opset = [c for c in piv.columns if 'opset' in c]
    self.assertTrue('opset11' in opset or 'opset10' in opset)
    self.assertGreater(len(buffer), 1 if debug else 0)
    common, subsets = split_columns_subsets(piv)
    try:
        conv = df2rst(piv, split_col_common=common,  # pylint: disable=E1123
                      split_col_subsets=subsets)
        self.assertIn('| KMeans |', conv)
    except TypeError as e:
        # installed pyquickhelper may predate the split_col_common
        # parameter; skip the check in that case
        if "got an unexpected keyword argument 'split_col_common'" in str(e):
            return
        raise e
def rst_table_modules(classifier=False):
    """
    Produces a table with some modules useful to do machine learning.

    @param      classifier      keep classifiers?
    @return                     string
    """
    try:
        from pymyinstall.packaged import small_set, classifiers2string
    except KeyError:
        # NOTE(review): pip 9.0.2 could raise KeyError at import time;
        # fix_pip_902 presumably patches pip before retrying -- confirm
        from pyquickhelper.pycode.pip_helper import fix_pip_902
        fix_pip_902()
        from pymyinstall.packaged import small_set, classifiers2string
    mod = small_set()
    mod.sort()
    df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
    if classifier:
        df = df[["usage", "rst_link", "kind", "version", "license",
                 "purpose", "classifier"]]
        df["classifier"] = df.apply(
            lambda row: classifiers2string(row["classifier"]), axis=1)
        # rename rst_link -> name for the final table
        df.columns = ["usage", "name", "kind", "version", "license",
                      "purpose", "classifier"]
    else:
        df = df[["usage", "rst_link", "kind", "version", "license",
                 "purpose"]]
        df.columns = ["usage", "name", "kind", "version", "license",
                      "purpose"]
    # case-insensitive sort on the module name
    df["lname"] = df["name"].apply(lambda s: s.lower())
    df = df.sort_values("lname").drop("lname", axis=1)
    df = df.reset_index(drop=True).reset_index(drop=False)
    return df2rst(df)
def test_df2rst_split_col_row_ref2_func2(self):
    """df2rst with a callable split_row that derives the section name
    from the ':ref:' label of each row."""

    def section_of(key, index):
        # label between the first backtick and '<' in ':ref:`name <target>`'
        return str(key).split('`')[1].split('<')[0].strip()

    records = [(':ref:`A <A>`', 1), (':ref:`A <A2>`', 2),
               (':ref:`B <B>`', 3), (':ref:`B <B2>`', 4),
               (':ref:`A <A3>`', 5)]
    df = pandas.DataFrame(
        [{'name': label, 'value': val} for label, val in records])
    conv = df2rst(
        df, split_row=lambda index: section_of(df.loc[index, "name"], index))
    self.assertIn("| :ref:`B <B>` | 3 |", conv)
def test_all_module_summary(self):
    """Renders the full module set as RST and checks some modules carry
    a license."""
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    mod = ensae_fullset()
    mod.sort()
    df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
    df = df[["usage", "rst_link", "kind", "version", "installed",
             "license", "purpose", "classifier"]]
    # rename rst_link -> name for the final table
    df.columns = ["usage", "name", "kind", "version", "installed",
                  "license", "purpose", "classifier"]
    lic = df[~df.license.isnull()]
    # fLOG(lic[["name","license"]])
    fLOG("license", lic.shape)
    nolic = df[df.license.isnull()]
    fLOG("no license", nolic.shape)
    fLOG(nolic[["name", "license"]])
    assert lic.shape[0] > 0
    rst = df2rst(df)
    # fLOG(rst)
    assert len(rst) > 1000
def rst_table_modules(classifier=False):
    """
    produces a table with all modules recommended to do machine learning

    @param      classifier      keep classifiers?
    @return                     string
    """
    mod = ensae_fullset()
    mod.sort()
    df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
    if classifier:
        df = df[["usage", "rst_link", "kind", "version", "license",
                 "purpose", "classifier"]]
        df["classifier"] = df.apply(
            lambda row: classifiers2string(row["classifier"]), axis=1)
        # rename rst_link -> name for the final table
        df.columns = ["usage", "name", "kind", "version", "license",
                      "purpose", "classifier"]
    else:
        df = df[["usage", "rst_link", "kind", "version", "license",
                 "purpose"]]
        df.columns = ["usage", "name", "kind", "version", "license",
                      "purpose"]
    # case-insensitive sort on the module name
    df["lname"] = df["name"].apply(lambda s: s.lower())
    df = df.sort_values("lname").drop("lname", axis=1)
    df = df.reset_index(drop=True).reset_index(drop=False)
    return df2rst(df)
def test_pandas_rst_size_table_title(self):
    """list_table=True with explicit widths and a title renders a
    '.. list-table::' directive."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, column_size=[1, 1, 2], list_table=True,
                 title="title__")
    # compared with spaces removed and outer newlines stripped
    exp = """
.. list-table:: title__
:widths: 1 1 2
:header-rows: 1
* - A
- AA
- AAA
* - x
- xx
- xxx
* -
- xxxxxxx
- xxx
""".replace(" ", "")
    self.assertEqual(rst.strip("\n "), exp.strip("\n "))
def test_documentation(self):
    """Every documented package set renders with df2rst and can be
    loaded (skipped on Python 2.7)."""
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")
    r = name_sets_dataframe()
    for m in r:
        fLOG("**", m)
    assert len(r) >= 6
    df = pandas.DataFrame(r)
    df = df[["name", "description"]]
    rst = df2rst(df)
    fLOG(rst)
    assert len(rst) > 0
    if sys.version_info[0] == 2:
        # less tests on Python 2.7
        return
    nb = 0
    for mod in r:
        # each named set resolves to a non-empty list of packages
        lp = get_package_set(mod["name"])
        assert len(lp()) > 0
        nb += 1
    assert nb > 0
def test_documentation(self):
    """Every documented package set renders with df2rst and can be
    loaded; 'pywin32' is tolerated empty (skipped on Python 2.7)."""
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    r = name_sets_dataframe()
    for m in r:
        fLOG("**", m)
    assert len(r) >= 6
    df = pandas.DataFrame(r)
    df = df[["name", "description"]]
    rst = df2rst(df)
    fLOG(rst)
    assert len(rst) > 0
    if sys.version_info[0] == 2:
        # less tests on Python 2.7
        return
    nb = 0
    for mod in r:
        lp = get_package_set(mod["name"])
        # pywin32 may legitimately resolve to an empty set (non-Windows)
        if len(lp()) == 0 and mod["name"] != "pywin32":
            raise Exception("issue with module '{0}'".format(mod["name"]))
        nb += 1
    assert nb > 0
def test_all_module_summary(self):
    """Renders the full module set as RST and checks some modules carry
    a license."""
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")
    mod = ensae_fullset()
    mod.sort()
    df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
    df = df[["usage", "rst_link", "kind", "version", "installed",
             "license", "purpose", "classifier"]]
    # rename rst_link -> name for the final table
    df.columns = ["usage", "name", "kind", "version", "installed",
                  "license", "purpose", "classifier"]
    lic = df[~df.license.isnull()]
    # fLOG(lic[["name","license"]])
    fLOG("license", lic.shape)
    nolic = df[df.license.isnull()]
    fLOG("no license", nolic.shape)
    fLOG(nolic[["name", "license"]])
    assert lic.shape[0] > 0
    rst = df2rst(df)
    # fLOG(rst)
    assert len(rst) > 1000
def test_pandas_rst_size_table_auto_replace(self):
    """replacements substitutes cell values ('xxxxxxx' -> 'gg') in the
    generated list-table."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, list_table=True, replacements={'xxxxxxx': 'gg'})
    # compared with spaces removed and outer newlines stripped
    exp = """
.. list-table::
:widths: auto
:header-rows: 1
* - A
- AA
- AAA
* - x
- xx
- xxx
* -
- gg
- xxx
""".replace(" ", "")
    self.assertEqual(rst.strip("\n "), exp.strip("\n "))
def test_df2rst(self):
    """Basic df2rst conversion of the marathon dataset: header row and
    one data row must appear in the output."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "marathon.txt")
    df = pandas.read_csv(mara, names=["city", "year", "time", "seconds"],
                         sep="\t")
    conv = df2rst(df)
    self.assertIn("| city | year | time | seconds |", conv)
    self.assertIn("| PARIS | 2011 | 02:06:29 | 7589 |", conv)
def simple2():
    """Build a fixed two-row DataFrame and return its RST rendering."""
    records = [
        {"A": "x", "AA": "xx", "AAA": "xxx"},
        {"AA": "xxxxxxx", "AAA": "xxx"},
    ]
    return df2rst(pandas.DataFrame(records))
def test_df2rst_split_row(self):
    """split_row accepts a single column name or a list of columns."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "marathon.txt")
    df = pandas.read_csv(mara, names=["city", "year", "time", "seconds"],
                         sep="\t")
    conv = df2rst(df, split_row="city")
    # '+' runs underline the per-section titles
    self.assertIn("+++++++++", conv)
    self.assertIn("| city | year | time | seconds |", conv)
    self.assertIn("| PARIS | 2011 | 02:06:29 | 7589 |", conv)
    conv = df2rst(df, split_row="year")
    self.assertIn("++++", conv)
    self.assertIn("| city | year | time | seconds |", conv)
    self.assertIn("| FUKUOKA | 1976 | 02:12:35 | 7955 |", conv)
    conv = df2rst(df, split_row=["city", "year"])
    # with a list of columns the section title shows the tuple of values
    self.assertIn("'AMSTERDAM', 1975", conv)
    self.assertIn("| city | year | time | seconds |", conv)
def test_df2rst_split_col_row(self):
    """split_row combined with split_col_common/split_col_subsets: each
    section is rendered once per column subset."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "marathon.txt")
    df = pandas.read_csv(mara, names=["city", "year", "time", "seconds"],
                         sep="\t")
    conv = df2rst(df, split_row="city",
                  split_col_common=["city", "year"],
                  split_col_subsets=[['time'], ['seconds']])
    self.assertIn("+++++++++", conv)
    # a row from the 'seconds' subset: common columns + subset column
    self.assertIn("| STOCKOLM | 2007 | 8456 |", conv)
def test_df2rst_split_col_row_ref2(self):
    """Same as the split_col/split_row test but with ':ref:' labels in
    the split column."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "marathon.txt")
    df = pandas.read_csv(mara, names=["city", "year", "time", "seconds"],
                         sep="\t")
    # wrap each city in an RST reference role
    df['city'] = df.city.apply(lambda v: ':ref:`{0}`'.format(v))  # pylint: disable=W0108
    conv = df2rst(df, split_row="city",
                  split_col_common=["city", "year"],
                  split_col_subsets=[['time'], ['seconds']])
    self.assertIn("+++++++++", conv)
    self.assertIn("| :ref:`AMSTERDAM` | 1982 | 02:12:15 |", conv)
def test_df2rst_column_size_i(self):
    """column_size as a dict keyed by column index ({0: 40}) widens the
    first column."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "marathon.txt")
    df = pandas.read_csv(mara, names=["city", "year", "time", "seconds"],
                         sep="\t")
    conv = df2rst(df, column_size={0: 40})
    self.assertIn(
        "| city | year | time | seconds |",
        conv)
    self.assertIn(
        "| PARIS | 2006 | 02:08:03 | 7683 |",
        conv)
def test_pandas_rst_size(self):
    """column_size=[1, 1, 2] makes the grid-table columns proportionally
    wider than the content requires."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, column_size=[1, 1, 2])
    # expected value is compared with all spaces removed
    exp = """+-----+---------+--------+
| A | AA | AAA |
+=====+=========+========+
| x | xx | xxx |
+-----+---------+--------+
| | xxxxxxx | xxx |
+-----+---------+--------+
""".replace(" ", "")
    self.assertEqual(rst, exp)
def df2rsthtml(df, format="html", fillna=""):
    """
    Writes a table into RST or HTML format.

    @param      df      dataframe
    @param      format  output format, ``'html'`` or ``'rst'``
    @param      fillna  value used to fill empty cells before rendering
    @return             string
    @raise      ValueError if *format* is neither ``'html'`` nor ``'rst'``
    """
    df = df.fillna(fillna)
    if format == "html":
        return df2html(df)
    if format == "rst":
        return df2rst(df)
    # previously an unknown format fell through and silently returned None
    raise ValueError(
        "Unexpected format %r, expected 'html' or 'rst'." % format)
def test_pandas_rst_right(self):
    """df2rst: 'r' and 'c' alignments are expected to yield the same
    table (columns padded to width 5/9/5 here)."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, align="r")
    # expected value is compared with all spaces removed
    exp = """+-----+---------+-----+
| A | AA | AAA |
+=====+=========+=====+
| x | xx | xxx |
+-----+---------+-----+
| | xxxxxxx | xxx |
+-----+---------+-----+
""".replace(" ", "")
    self.assertEqual(rst, exp)
    rst = df2rst(df, align="c")
    exp = """+-----+---------+-----+
| A | AA | AAA |
+=====+=========+=====+
| x | xx | xxx |
+-----+---------+-----+
| | xxxxxxx | xxx |
+-----+---------+-----+
""".replace(" ", "")
    self.assertEqual(rst, exp)
def test_documentation(self):
    """Renders the complete module set (classifiers included) as RST;
    output is only logged, no assertion on the content."""
    fLOG(
        __file__,
        self._testMethodName,
        OutputPrint=__name__ == "__main__")
    mod = all_set()
    mod.sort()
    df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
    df = df[["usage", "rst_link", "kind", "version", "license",
             "purpose", "classifier"]]
    df["classifier"] = df.apply(
        lambda row: classifiers2string(row["classifier"]), axis=1)
    # rename rst_link -> name for the final table
    df.columns = ["usage", "name", "kind", "version", "license",
                  "purpose", "classifier"]
    fLOG(df2rst(df))
def test_pandas_rst_size_table_noheader(self):
    """header=False drops the ':header-rows:' option and the header row
    from the list-table."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, list_table=True, header=False)
    # compared with spaces removed and outer newlines stripped
    exp = """
.. list-table::
:widths: auto
* - x
- xx
- xxx
* -
- xxxxxxx
- xxx
""".replace(" ", "")
    self.assertEqual(rst.strip("\n "), exp.strip("\n "))
def test_pandas_rst_size_table_number_format(self):
    """number_format=3 shortens floats (2.12345678 -> 2.12, scientific
    notation preserved); the missing value renders as 'nan' here."""
    df = pandas.DataFrame([{"A": 2.12345678, "AA": 3.12345678,
                            "AAA": 4.12345678},
                           {"AA": 2.12345678e10, "AAA": 2.12345678e-10}])
    rst = df2rst(df, list_table=True, header=False, number_format=3)
    # compared with spaces removed and outer newlines stripped
    exp = """
.. list-table::
:widths: auto
* - 2.12
- 3.12
- 4.12
* - nan
- 2.12e+10
- 2.12e-10
""".replace(" ", "")
    self.assertEqual(rst.strip("\n "), exp.strip("\n "))
def rst_table_modules():
    """
    produces a table with all modules recommended to do machine learning

    @return     string
    """
    mod = ensae_fullset()
    mod.sort()
    df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
    df = df[["usage", "rst_link", "kind", "version", "license",
             "purpose", "classifier"]]
    df["classifier"] = df.apply(
        lambda row: classifiers2string(row["classifier"]), axis=1)
    # rename rst_link -> name for the final table
    df.columns = ["usage", "name", "kind", "version", "license",
                  "purpose", "classifier"]
    # case-insensitive sort on the module name
    df["lname"] = df["name"].apply(lambda s: s.lower())
    df = df.sort_values("lname").drop("lname", axis=1)
    return df2rst(df)
def test_documentation(self):
    """Renders the complete module set (classifiers included) as RST;
    output is only logged, no assertion on the content."""
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    mod = all_set()
    mod.sort()
    df = pandas.DataFrame(_.as_dict(rst_link=True) for _ in mod)
    df = df[["usage", "rst_link", "kind", "version", "license",
             "purpose", "classifier"]]
    df["classifier"] = df.apply(
        lambda row: classifiers2string(row["classifier"]), axis=1)
    # rename rst_link -> name for the final table
    df.columns = ["usage", "name", "kind", "version", "license",
                  "purpose", "classifier"]
    fLOG(df2rst(df))
def test_pandas_rst_size_replace(self):
    """replacements substitutes cell values ('xxx' -> 'rrrr') and the
    grid widths account for the replaced text."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, column_size=[1, 1, 2], replacements={'xxx': 'rrrr'})
    # expected value is compared with all spaces removed
    exp = """+---+---------+--------+
| A | AA | AAA |
+===+=========+========+
| x | xx | rrrr |
+---+---------+--------+
| | xxxxxxx | rrrr |
+---+---------+--------+
""".replace(" ", "")
    self.assertEqual(rst, exp)
def test_pandas_rst_size_table(self):
    """column_size=[1, 1, 2] with list_table=True becomes the
    ':widths: 1 1 2' option of the directive."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, column_size=[1, 1, 2], list_table=True)
    # compared with spaces removed and outer newlines stripped
    exp = """
.. list-table::
:widths: 1 1 2
:header-rows: 1
* - A
- AA
- AAA
* - x
- xx
- xxx
* -
- xxxxxxx
- xxx
""".replace(" ", "")
    self.assertEqual(rst.strip("\n "), exp.strip("\n "))
def test_df2rst_split_col_complex(self):
    """Combined use of number_format, replacements, a callable
    split_row, column subsets and row filtering on a real report."""
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)), "data")
    mara = os.path.join(data, "unittst.csv")
    df = pandas.read_csv(mara)
    common = ['name', 'problem', 'scenario']
    subsets = [['opset11', 'opset10', 'opset9'],
               ['ERROR-msg'],
               ['RT/SKL-N=1', 'N=10', 'N=100', 'N=1000', 'N=10000',
                'N=100000', 'RT/SKL-N=1-min', 'RT/SKL-N=1-max',
                'N=10-min', 'N=10-max', 'N=100-min', 'N=100-max',
                'N=1000-min', 'N=1000-max', 'N=10000-min', 'N=10000-max',
                'N=100000-min', 'N=100000-max']]

    def build_key_split(key, index):
        # section name: text between the backtick and '<' in
        # ':ref:`name <target>`'
        new_key = str(key).split('`')[1].split('<')[0].strip()
        return new_key

    def filter_rows(df):
        # keep only rows with a meaningful value in the first of these
        # columns that exists in the subset being rendered
        for c in ['ERROR-msg', 'RT/SKL-N=1']:
            if c in df.columns:
                return df[df[c].apply(
                    lambda x: pandas.notnull(x) and
                    x not in (None, '', 'nan'))]
        return df

    conv = df2rst(df, number_format=2,
                  replacements={'nan': '', 'ERR: 4convert': ''},
                  split_row=lambda index, dp=df: build_key_split(
                      dp.loc[index, "name"], index),
                  split_col_common=common,
                  split_col_subsets=subsets,
                  filter_rows=filter_rows)
    self.assertIn(
        "| :ref:`ARDRegression <l-ARDRegression-b-reg-default>` | b-reg "
        "| default | | ? | ? |", conv)
    # one header separator per rendered sub-table
    spl = conv.split("+=============================")
    self.assertEqual(len(spl), 7)
def test_pandas_rst_size_table_number_format(self):
    """number_format=3 shortens floats; the missing value renders as an
    empty cell in this variant."""
    df = pandas.DataFrame([{"A": 2.12345678, "AA": 3.12345678,
                            "AAA": 4.12345678},
                           {"AA": 2.12345678e10, "AAA": 2.12345678e-10}])
    rst = df2rst(df, list_table=True, header=False, number_format=3)
    # compared with spaces removed and outer newlines stripped
    exp = """
.. list-table::
:widths: auto
* - 2.12
- 3.12
- 4.12
* -
- 2.12e+10
- 2.12e-10
""".replace(" ", "")
    self.assertEqual(rst.strip("\n "), exp.strip("\n "))
def test_pandas_rst_size_table_noheader(self):
    """header=False drops the ':header-rows:' option and the header row
    from the list-table."""
    df = pandas.DataFrame([{"A": "x", "AA": "xx", "AAA": "xxx"},
                           {"AA": "xxxxxxx", "AAA": "xxx"}])
    rst = df2rst(df, list_table=True, header=False)
    # compared with spaces removed and outer newlines stripped
    exp = """
.. list-table::
:widths: auto
* - x
- xx
- xxx
* -
- xxxxxxx
- xxx
""".replace(" ", "")
    self.assertEqual(rst.strip("\n "), exp.strip("\n "))
def simple():
    """Return the RST rendering of a fixed two-row DataFrame."""
    frame = pandas.DataFrame([
        {"A": "x", "AA": "xx", "AAA": "xxx"},
        {"AA": "xxxxxxx", "AAA": "xxx"},
    ])
    return df2rst(frame)
def write_page_onnxrt_benches(app, runtime, skip=None, white_list=None):
    """
    Builds the RST benchmark page for one ONNX *runtime*: runs the
    benchmark, merges the CSV results, writes two xlsx dumps and the
    final ``.rst`` page rendered with :func:`df2rst`.

    @param      app         Sphinx application (or None, srcdir then
                            defaults to ``'..'``)
    @param      runtime     ``'python'``, ``'python_compiled'``,
                            ``'onnxruntime1'`` or ``'onnxruntime2'``
    @param      skip        forwarded to ``run_benchmark``
    @param      white_list  restrict the benchmark to these models
    @raise      RuntimeError for an unsupported runtime
    """
    from mlprodict.onnxrt.validate.validate import enumerate_validated_operator_opsets
    logger = getLogger('mlprodict')
    srcdir = app.builder.srcdir if app is not None else ".."
    # one output page per runtime
    if runtime in ('python', 'python_compiled'):
        whe = os.path.join(os.path.abspath(srcdir),
                           "skl_converters", "bench_python.rst")
    elif runtime == 'onnxruntime2':
        whe = os.path.join(os.path.abspath(srcdir),
                           "skl_converters", "bench_onnxrt2.rst")
    elif runtime == 'onnxruntime1':
        whe = os.path.join(os.path.abspath(srcdir),
                           "skl_converters", "bench_onnxrt1.rst")
    else:
        raise RuntimeError("Unsupported runtime '{}'.".format(runtime))
    logger.info("[mlprodict] create page '{}'.".format(whe))
    print("[mlprodict-sphinx] create page runtime '{}' - '{}'.".format(
        runtime, whe))
    # run the benchmark; each entry yields a (raw_csv, summary_csv) pair
    filenames = run_benchmark(runtime, srcdir, logger, skip,
                              white_list=white_list)
    dfs_raw = [read_csv(name[0])
               for name in filenames if os.path.exists(name[0])]
    dfs_sum = [read_csv(name[1])
               for name in filenames if os.path.exists(name[1])]
    df_raw = concat(dfs_raw, sort=False)
    piv = concat(dfs_sum, sort=False)
    # order opset columns from the most recent to the oldest
    opset_cols = [(int(oc.replace("opset", "")), oc)
                  for oc in piv.columns if 'opset' in oc]
    opset_cols.sort(reverse=True)
    opset_cols = [oc[1] for oc in opset_cols]
    # build the final column order: latest opset, metadata, benchmark
    # ratios, remaining opsets, min/max columns, then skl_/onx_ metrics
    new_cols = opset_cols[:1]
    bench_cols = ["RT/SKL-N=1", "N=10", "N=100",
                  "N=1000", "N=10000", "N=100000"]
    new_cols.extend(["ERROR-msg", "name", "problem", "scenario", 'optim'])
    new_cols.extend(bench_cols)
    new_cols.extend(opset_cols[1:])
    for c in bench_cols:
        new_cols.append(c + '-min')
        new_cols.append(c + '-max')
    for c in piv.columns:
        if c.startswith("skl_") or c.startswith("onx_"):
            new_cols.append(c)
    # keep only columns that actually exist in the summary
    new_cols = [_ for _ in new_cols if _ in piv.columns]
    piv = piv[new_cols]
    out_sum = os.path.join(srcdir, "bench_sum_%s.xlsx" % runtime)
    piv.to_excel(out_sum, index=False)
    logger.info("[mlprodict] wrote '{}'.".format(out_sum))
    print("[mlprodict-sphinx] wrote '{}'".format(out_sum))
    out_raw = os.path.join(srcdir, "bench_raw_%s.xlsx" % runtime)
    df_raw.to_excel(out_raw, index=False)
    logger.info("[mlprodict] wrote '{}'.".format(out_raw))
    print("[mlprodict-sphinx] wrote '{}'".format(out_raw))
    logger.info("[mlprodict] shape '{}'.".format(piv.shape))
    print("[mlprodict-sphinx] shape '{}'".format(piv.shape))

    def make_link(row):
        # builds the ':ref:' link used as the row name in the final table
        link = ":ref:`{name} <l-{name}-{problem}-{scenario}-{optim}-{opset}>`"
        name = row['name']
        problem = row['problem']
        scenario = row['scenario']
        optim = _clean_values_optim(
            str(row.get('optim', '')).replace("nan", ""))
        opset = _make_opset(row)
        return link.format(name=name, problem=problem, scenario=scenario,
                           optim=optim, opset=opset)

    piv['name'] = piv.apply(lambda row: make_link(row), axis=1)
    piv.reset_index(drop=True, inplace=True)

    if "ERROR-msg" in piv.columns:
        def shorten(text):
            # keep error messages readable inside the table cells
            text = str(text)
            if len(text) > 75:
                text = text[:75] + "..."
            return text

        piv["ERROR-msg"] = piv["ERROR-msg"].apply(shorten)

    logger.info("[mlprodict] write '{}'.".format(whe))
    print("[mlprodict-sphinx] write '{}'".format(whe))
    with open(whe, 'w', encoding='utf-8') as f:
        title = "Available of scikit-learn model for runtime {0}".format(
            runtime)
        # page preamble: label, title and explanations about the metrics
        f.write(dedent('''
        .. _l-onnx-bench-{0}:

        {1}
        {2}

        The following metrics measure the ratio between the prediction time
        for the runtime compare to :epkg:`scikit-learn`.
        It gives an order of magnitude. They are done by setting
        ``assume_finite=True`` (see `config_context
        <https://scikit-learn.org/stable/modules/generated/sklearn.config_context.html>`_).
        The computed ratio is:

        .. math::

            \\frac{{\\textit{{execution when predicting with a custom ONNX runtime}}}}
            {{\\textit{{execution when predicting with scikit-learn (assume\\_finite=True)}}}}

        Due to float32 conversion, it may happen than the highest difference
        is quite high. The proposition :math:`a < b \\Rightarrow [a] < [b]`
        is usually true and but not true all the time.
        It is the same after number where rounded to float32, that's why the
        result considers the fourth highest difference and not the first three.
        Some figures are missing when the number of observations is high.
        That means the prediction is slow for one of the runtime
        (ONNX, scikit-learn) and it would take too long to go further.
        The list of problems can be found in the documentation of function
        :func:`find_suitable_problem
        <mlprodict.onnxrt.validate.validate_problems.find_suitable_problem>`.
        Default values are usually used to create models but other scenarios
        are defined by :func:`build_custom_scenarios
        <mlprodict.onnxrt.validate.validate_scenarios.build_custom_scenarios>`
        and :func:`build_custom_scenarios (2)
        <from mlprodict.onnxrt.validate.validate_scenarios.build_custom_scenarios>`.
        The benchmark can be generated with a command line:

        ::

            python -m mlprodict validate_runtime --verbose=1 --out_raw=data.csv
                   --out_summary=summary.xlsx --benchmark=1 --dump_folder=.
                   --runtime={0}

        The option ``-se 1`` may be used if the process crashes. The command
        line can also be extended to test only one model or to skip another
        one. The whole batch takes between 5 and 15 minutes depending on
        the machine.

        Full data: :download:`{3} <../{3}>`

        .. contents::
            :local:

        '''.format(runtime, title, "=" * len(title),
                   "bench_sum_%s.xlsx" % runtime)))
        common, subsets = split_columns_subsets(piv)
        f.write(df2rst(piv, number_format=2,
                       replacements={'nan': '', 'ERR: 4convert': ''},
                       split_row=lambda index, dp=piv: build_key_split(
                           dp.loc[index, "name"], index),
                       split_col_common=common,
                       split_col_subsets=subsets,
                       filter_rows=filter_rows,
                       column_size={'problem': 25},
                       label_pattern=".. _lpy-{section}:"))
    logger.info(
        "[mlprodict] done page '{}'.".format(whe))
    print("[mlprodict-sphinx] done page runtime '{}' - '{}'.".format(
        runtime, whe))
def test_module_list(self):
    """The RST table built from the installed-modules list mentions a
    few well-known packages."""
    rendered = df2rst(DataFrame(modules_list()))
    for expected in ('sklearn', 'numpy', 'skl2onnx'):
        self.assertIn(expected, rendered)