def test_benchmark():
    """Time a single ``differ(..., sort=False)`` call on two .xlsx fixtures.

    Nothing is asserted; the elapsed wall-clock time is printed only.
    """
    before_rows = readrow(tdir + "diff2.xlsx")
    after_rows = readrow(tdir + "diff3.xlsx")

    started = dt.now()
    differ(before_rows, after_rows, sort=False)
    elapsed = dt.now() - started

    print("test_nosort_differ: time {}".format(elapsed))
def profiler(
    path_or_buffer,
    header=None,
    top=10,
    na_val=[None, "", "N/A", "NULL", "null", "none", "na"],
    headerout=True,
):
    """Profile the columns of *path_or_buffer* and return the result as rows.

    The input is first read target-by-target with ``grouprow``; each output
    row is then ``[target, column, *stats]``.  If ``guesstype`` reports one
    of ppt/doc/csv/txt/html/pickle, or if anything on the grouped path
    raises ``ValueError``, the input is read with ``readrow`` instead and
    each output row is ``[column, *stats]``.

    Args:
        path_or_buffer: Source handed to ``guesstype``/``grouprow``/``readrow``.
        header: Forwarded to ``profile_data``.
        top: Forwarded to ``profile_data``.
        na_val: Forwarded to ``profile_data``.  The default list is shared
            between calls; this function only forwards it.
        headerout: When true, a header row built from ``ret._fields`` is
            placed first in the result.

    Returns:
        A list of lists: the optional header row followed by one row per
        profiled column.
    """
    options = {"header": header, "top": top, "na_val": na_val}
    ungrouped_types = {"ppt", "doc", "csv", "txt", "html", "pickle"}

    try:
        if guesstype(path_or_buffer) in ungrouped_types:
            # Jump straight to the flat (single table) handling below.
            raise ValueError

        groups = grouprow(path_or_buffer)
        table = [["targetname", "columns", *ret._fields]] if headerout else []
        for _, target, values in groups:
            for column, stats in profile_data(values, **options).items():
                table.append([target, column, *stats])
        return table

    except ValueError:
        values = (record.value for record in readrow(path_or_buffer))
        table = [["columns", *ret._fields]] if headerout else []
        for column, stats in profile_data(values, **options).items():
            table.append([column, *stats])
        return table
def render(path_or_buffer=None,
           template=join(dirname(util.__file__), "libs/inet-henge.tar.xz"),
           outpath=join(gettempdir(), "temp-inet.html"),
           width=1200,
           height=900,
           updatejs=False):
    """Render rows into an HTML file and, on Windows, open it.

    Args:
        path_or_buffer: Source passed to ``readrow``.  When falsy, rows are
            taken from ``readrow.clipboard("csv")`` instead.
        template: Path of the ``.tar.xz`` archive that is extracted next to
            the output file.
        outpath: Path of the HTML file to write (UTF-8).
        width: Passed to the HTML template after the parsed rows.
        height: Passed to the HTML template after ``width``.
        updatejs: Force re-extraction of the archive even if
            ``inet-henge.min.js`` already exists beside ``outpath``.

    Raises:
        RuntimeError: On Windows, when the written file cannot be opened
            with its associated application.
    """
    if path_or_buffer:
        rows = readrow(path_or_buffer)
    else:
        rows = readrow.clipboard("csv")

    wdir = dirname(outpath)
    js = join(wdir, "inet-henge.min.js")

    # Unpack the bundled assets only when missing or explicitly requested.
    if updatejs or not os.path.exists(js):
        with tarfile.open(template, 'r:xz') as fp:
            fp.extractall(wdir)

    html = htmltmpl.format(*parse(rows), width, height)

    with open(outpath, "w", encoding="utf-8") as fp:
        fp.write(html)

    if os.name == "nt":
        # os.startfile behaves like the shell's ``start`` command but takes
        # the path as-is, so paths containing spaces or shell metacharacters
        # (common under the user temp directory) no longer break the launch.
        try:
            os.startfile(outpath)
        except OSError as err:
            # Keep the exception type callers already expect.
            raise RuntimeError(str(err)) from err

    print("output: " + outpath)
def test_differcsv():
    """Diff two CSV fixtures and check every row not tagged ``equal``."""
    before = (record.value for record in readrow(tdir + "diff1.csv"))
    after = (record.value for record in readrow(tdir + "diff2.csv"))

    changed = []
    for row in differ(before, after):
        if row[0] != "equal":
            changed.append(row)

    expected = [
        ('replace', 1, 1, 'b ---> 10', '8', '307', '130', '3504', '12', '70', '1',
         'chevrolet chevelle malibue'),
        ('insert', None, 15, '22', '6', '198', '95', '2833', '15.5', '70', '1',
         'plymouth duster'),
        ('insert', None, 23, '26', '4', '121', '113', '2234', '12.5', '70', '2',
         'bmw 2002'),
        ('replace', 37, 39, '14', '9 ---> 8', '351', '153', '4154', '13.5', '71', '1',
         'ford galaxie 500'),
        ('replace', 46, 48, '23', '4', '122', '86', '2220', '14', '71', '1',
         'mercury capri 2001 ---> mercury capri 2000'),
        ('delete', 55, None, '25', '4', '97.5', '80', '2126', '17', '72', '1',
         'dodge colt hardtop'),
    ]
    assert changed == expected
def main():
    """Command-line entry point: diff two files and write the result.

    Parses the ``differ`` command-line options, reads ``file1`` and
    ``file2``, compares them with ``differ`` and writes the rows either to
    standard output (no ``--outfile``) or to a file whose writer is chosen
    from the output file extension (``.xls*`` -> ``to_excel``, ``.tsv*`` ->
    ``to_tsv``, anything else -> ``to_csv``).

    When neither input is one of the types listed in ``notarget`` the files
    are read target-by-target with ``grouprow`` and each output row is
    prefixed with the target name; otherwise both files are read with
    ``readrow`` and compared as single tables.

    Returns:
        The return value of ``to_csv`` when writing to standard output;
        otherwise ``None``.
    """
    import os
    from argparse import ArgumentParser
    from operator import attrgetter

    from util.io import readrow, grouprow, to_csv, to_tsv, unicode_escape
    from util.filetype import guesstype

    ps = ArgumentParser(prog="differ", description="2 file diff compare program\n")
    padd = ps.add_argument  # short alias, used for every option below

    padd('-V', '--version', action='version', version='%(prog)s ' + __version__)
    # nargs=1 makes argparse store a one-element list; it is unwrapped below.
    padd('file1', nargs=1, help='diff before file')
    padd('file2', nargs=1, help='diff after file')
    padd('-a', '--all', action='store_true', default=False, help='All Line print(default False)')
    padd('-o', '--outfile', type=str, default=None, help='output filepath (default `stdout`)')
    padd('-e', '--encoding', type=str, default="cp932", help='output fileencoding (default `cp932`)')
    padd('-r', '--rate', type=float, default=0.6, help='matched score rate (default `0.6`)')
    padd('-s', '--sep', type=unicode_escape, default="\t", help='output separater (default `\\t`)')
    padd('-l', '--lineterminator', type=unicode_escape, default="\r\n", help='output llineterminator (default `\\r\\n`)')
    padd('-n', '--noheader', action="store_true", default=False, help='file no header (default `False`)')
    padd('-N', '--navalue', type=str, default="-", help='output index N/A value (default `-`)')
    padd('-C', '--condition_value', type=str, default=" ---> ", help='Delimiter String of Replace Value (default ` ---> `)')
    # --target applies to both files; --target1/--target2 take precedence
    # for their own file (see the `or` fallbacks below).
    padd('-t', '--target', type=selector, default=None, help='target table names or sheetname (ex. Sheet1, Sheet3)')
    padd(
        '-t1',
        '--target1',
        type=selector,
        default=None,
        help='target table names or sheetname of filename1 (ex. Sheet1, Sheet3)'
    )
    padd(
        '-t2',
        '--target2',
        type=selector,
        default=None,
        help='target table names or sheetname of filename2 (ex. Sheet1, Sheet3)'
    )
    args = ps.parse_args()

    p1 = args.file1[0]
    p2 = args.file2[0]
    na_value = args.navalue
    diffonly = not args.all  # --all turns the diff-only filter off
    outputfile = args.outfile
    header = not args.noheader
    encoding = args.encoding
    sep = args.sep
    lineterminator = args.lineterminator
    rep_rate = args.rate
    conditional_value = args.condition_value

    # File types that are always compared as one flat table.
    notarget = ["ppt", "doc", "csv", "txt", "html", "pickle"]

    if guesstype(p1) not in notarget and guesstype(p2) not in notarget:
        # TODO: every target of both files is loaded into a dict here, which
        # is not memory-safe for very large data.
        tar1select = args.target1 or args.target
        tar2select = args.target2 or args.target
        # Map target name -> rows, optionally filtered by the selector.
        a = {
            x.target: x.value
            for x in (tar1select(grouprow(p1)) if tar1select else grouprow(p1))
        }
        b = {
            x.target: x.value
            for x in (tar2select(grouprow(p2)) if tar2select else grouprow(p2))
        }

        # Options shared by every per-target differ() call.
        kw = dict(header=header, diffonly=diffonly, rep_rate=rep_rate, na_val=na_value, startidx=1, conditional_value=conditional_value)

        def _simtar(tag, tar, a, b, i):
            """Return the output rows for one target, prefixed with its name."""
            # Only the first target (i == 0) may keep the header flag; every
            # later call forces it to False in the shared kw dict.
            kw["header"] = i == 0 and kw["header"]
            if tag == "equal":
                row = ([tar, *r] for r in differ(a[tar], b[tar], **kw))
            elif tag == "replace":
                # For "replace" the name holds both target names joined by
                # conditional_value; split it to look each side up.
                ta, tb = tar.split(conditional_value)
                row = ([tar, *r] for r in differ(a[ta], b[tb], **kw))
            elif tag == "delete":
                # Target only in the first file: emit its rows with a 1-based
                # index on the "before" side and na_value on the "after" side.
                row = ([tar, tag, j, na_value, *r] for j, r in enumerate(a[tar], 1))
            elif tag == "insert":
                # Target only in the second file: mirror image of "delete".
                row = ([tar, tag, na_value, j, *r] for j, r in enumerate(b[tar], 1))
            else:
                return []
            return row

        # differ(a, b) iterates the two dicts, i.e. their target names; the
        # first element of each result is used as the tag and the last as the
        # target name.  Rows whose second element is "tag" (presumably the
        # header row produced by differ -- confirm) get "targetname" as their
        # first column instead of the target name.
        it = (["targetname", *r[1:]] if r[1] == "tag" else r
              for i, (tag, *_, tar) in enumerate(differ(a, b))
              for r in _simtar(tag, tar, a, b, i))

    else:
        # Flat comparison: take the .value of every record read from each file.
        a = map(attrgetter("value"), readrow(p1))
        b = map(attrgetter("value"), readrow(p2))
        it = differ(a, b, header=header, diffonly=diffonly, rep_rate=rep_rate, na_val=na_value, startidx=1, conditional_value=conditional_value)

    if outputfile is None:
        # No output file: write to stdout using --sep (lineterminator is not
        # passed on this path).
        return to_csv(it, sys.stdout, encoding=encoding, sep=sep)

    # Bound startswith of the extension, so ".xls" also matches ".xlsx" etc.
    ext = os.path.splitext(outputfile)[-1].startswith
    if ext(".xls"):
        to_excel(it, outputfile, header=True, conditional_value=conditional_value)
    elif ext(".tsv"):
        to_tsv(it, outputfile, encoding=encoding, lineterminator=lineterminator)
    else:
        # Note: --sep is not forwarded when writing to a file.
        to_csv(it, outputfile, encoding=encoding, lineterminator=lineterminator)
def test_guess_key():
    """``guess_key(profile, 10)`` yields 10 entries for the diff1.xlsx profile."""
    values = (record.value for record in readrow(tdir + "diff1.xlsx"))
    profile = profile_data(values, header=0)
    keys = guess_key(profile, 10)
    assert len(keys) == 10