Beispiel #1
0
    def test_benchmark():
        """Time a single unsorted ``differ`` run over two xlsx files.

        Reads ``diff2.xlsx`` and ``diff3.xlsx`` from ``tdir`` with
        ``readrow`` and prints the wall-clock duration of the
        ``differ(..., sort=False)`` call. Nothing is asserted.
        """
        left = readrow(tdir + "diff2.xlsx")
        right = readrow(tdir + "diff3.xlsx")

        # Only the differ() call itself is inside the timed window.
        started = dt.now()
        differ(left, right, sort=False)
        elapsed = dt.now() - started

        print("test_nosort_differ: time {}".format(elapsed))
Beispiel #2
0
def profiler(
    path_or_buffer,
    header=None,
    top=10,
    na_val=None,
    headerout=True,
):
    """Profile the columns of a data source and return the result as rows.

    Two paths exist:

    * Grouped path: when ``guesstype(path_or_buffer)`` is not one of the
      flat types listed below, the source is read with ``grouprow`` and
      ``profile_data`` is run once per yielded group. Every output row is
      prefixed with the group's name (the second element of each item
      yielded by ``grouprow``).
    * Flat path: for the flat types, or whenever a ``ValueError`` is raised
      on the grouped path, the source is read with ``readrow`` and profiled
      as one table.

    Args:
        path_or_buffer: Source handed to ``guesstype``, ``grouprow`` and
            ``readrow``.
        header: Forwarded unchanged to ``profile_data``.
        top: Forwarded unchanged to ``profile_data``.
        na_val: Values forwarded to ``profile_data`` as ``na_val``. ``None``
            (the default) selects
            ``[None, "", "N/A", "NULL", "null", "none", "na"]``.
        headerout: When true, a header row is prepended to the result.

    Returns:
        list: One list per profiled column, optionally preceded by a header
        row.
    """
    if na_val is None:
        # Build the default per call: a list literal in the signature would
        # be one shared object across every call and every callee.
        na_val = [None, "", "N/A", "NULL", "null", "none", "na"]

    kw = dict(
        header=header,
        top=top,
        na_val=na_val,
    )

    head = []

    try:
        if guesstype(path_or_buffer) in {
                "ppt", "doc", "csv", "txt", "html", "pickle"
        }:
            # Flat types skip the grouped path; the handler below is the
            # shared flat implementation.
            raise ValueError

        rows = grouprow(path_or_buffer)

        if headerout:
            # NOTE(review): `ret` is not defined in this function; presumably
            # a module-level namedtuple describing one profile record.
            head = [["targetname", "columns", *ret._fields]]
        return head + [[pk, ck, *cv] for _, pk, row in rows
                       for ck, cv in profile_data(row, **kw).items()]
    except ValueError:
        # Also reached when grouprow()/profile_data() raise ValueError on the
        # grouped path: fall back to treating the source as a single table.
        rows = (x.value for x in readrow(path_or_buffer))
        if headerout:
            head = [["columns", *ret._fields]]
        return head + [[k, *v] for k, v in profile_data(rows, **kw).items()]
Beispiel #3
0
def render(path_or_buffer=None,
           template=join(dirname(util.__file__), "libs/inet-henge.tar.xz"),
           outpath=join(gettempdir(), "temp-inet.html"),
           width=1200,
           height=900,
           updatejs=False):
    """Write an HTML page built from ``htmltmpl`` and open it on Windows.

    Rows are read with ``readrow(path_or_buffer)``, or from the clipboard
    via ``readrow.clipboard("csv")`` when ``path_or_buffer`` is falsy. The
    values returned by ``parse(rows)`` followed by ``width`` and ``height``
    are substituted positionally into ``htmltmpl``.

    Args:
        path_or_buffer: Source handed to ``readrow``; falsy means clipboard.
        template: Path of an xz-compressed tar archive that is extracted
            into the directory of ``outpath``.
        outpath: Destination HTML file, written as UTF-8.
        width: Second-to-last positional value formatted into ``htmltmpl``.
        height: Last positional value formatted into ``htmltmpl``.
        updatejs: Extract ``template`` even when ``inet-henge.min.js``
            already exists next to ``outpath``.

    Raises:
        RuntimeError: On Windows, when the ``start`` command exits non-zero;
            the message is the command's output.
    """
    if path_or_buffer:
        rows = readrow(path_or_buffer)
    else:
        rows = readrow.clipboard("csv")

    # The archive is unpacked beside the HTML file, and only when the script
    # is missing or a refresh was requested.
    wdir = dirname(outpath)
    js = join(wdir, "inet-henge.min.js")
    if updatejs or not os.path.exists(js):
        with tarfile.open(template, 'r:xz') as fp:
            fp.extractall(wdir)

    html = htmltmpl.format(*parse(rows), width, height)

    with open(outpath, "w", encoding="utf-8") as fp:
        fp.write(html)

    if os.name == "nt":
        # Quote the path so directories containing spaces survive the shell.
        # The empty "" is the window title: `start` treats its first quoted
        # argument as the title, not as the target to open.
        code, dat = getstatusoutput('start "" "{}"'.format(outpath))
        if code != 0:
            raise RuntimeError(dat)

    print("output: " + outpath)
Beispiel #4
0
 def test_differcsv():
     """Check the non-"equal" rows ``differ`` reports for two CSV files.

     Both files are read with ``readrow`` and reduced to each record's
     ``value`` before being compared.
     """
     before = (rec.value for rec in readrow(tdir + "diff1.csv"))
     after = (rec.value for rec in readrow(tdir + "diff2.csv"))

     # Drop unchanged rows; only replace / insert / delete entries remain.
     changed = [entry for entry in differ(before, after)
                if entry[0] != "equal"]

     expected = [
         ('replace', 1, 1, 'b ---> 10', '8', '307', '130', '3504', '12',
          '70', '1', 'chevrolet chevelle malibue'),
         ('insert', None, 15, '22', '6', '198', '95', '2833', '15.5', '70',
          '1', 'plymouth duster'),
         ('insert', None, 23, '26', '4', '121', '113', '2234', '12.5', '70',
          '2', 'bmw 2002'),
         ('replace', 37, 39, '14', '9 ---> 8', '351', '153', '4154', '13.5',
          '71', '1', 'ford galaxie 500'),
         ('replace', 46, 48, '23', '4', '122', '86', '2220', '14', '71',
          '1', 'mercury capri 2001 ---> mercury capri 2000'),
         ('delete', 55, None, '25', '4', '97.5', '80', '2126', '17', '72',
          '1', 'dodge colt hardtop'),
     ]
     assert changed == expected
Beispiel #5
0
def main():
    """Command-line entry point: diff two files and write the result.

    Arguments are parsed with ``argparse``. Two comparison modes exist:

    * Grouped mode: when neither file's guessed type is in ``notarget``,
      both files are read with ``grouprow`` into ``{target: value}``
      dictionaries. The targets themselves are diffed first, then the rows
      of each matched target pair; every output row is prefixed with the
      target name.
    * Flat mode: otherwise both files are read with ``readrow`` and diffed
      as a single table.

    The result goes to stdout as CSV when ``--outfile`` is not given.
    Otherwise the output file's extension selects ``to_excel`` (``.xls*``),
    ``to_tsv`` (``.tsv*``) or ``to_csv`` (anything else).
    """
    import os
    from argparse import ArgumentParser
    from operator import attrgetter
    from util.io import readrow, grouprow, to_csv, to_tsv, unicode_escape
    from util.filetype import guesstype

    # NOTE(review): `sys`, `differ`, `selector`, `to_excel` and `__version__`
    # are not imported here; they are expected to come from module scope.

    ps = ArgumentParser(prog="differ",
                        description="2 file diff compare program\n")
    padd = ps.add_argument

    padd('-V',
         '--version',
         action='version',
         version='%(prog)s ' + __version__)
    padd('file1', nargs=1, help='diff before file')
    padd('file2', nargs=1, help='diff after file')

    padd('-a',
         '--all',
         action='store_true',
         default=False,
         help='All Line print(default False)')
    padd('-o',
         '--outfile',
         type=str,
         default=None,
         help='output filepath (default `stdout`)')
    padd('-e',
         '--encoding',
         type=str,
         default="cp932",
         help='output fileencoding (default `cp932`)')
    padd('-r',
         '--rate',
         type=float,
         default=0.6,
         help='matched score rate (default `0.6`)')
    padd('-s',
         '--sep',
         type=unicode_escape,
         default="\t",
         help='output separater (default `\\t`)')
    padd('-l',
         '--lineterminator',
         type=unicode_escape,
         default="\r\n",
         help='output llineterminator (default `\\r\\n`)')
    padd('-n',
         '--noheader',
         action="store_true",
         default=False,
         help='file no header (default `False`)')
    padd('-N',
         '--navalue',
         type=str,
         default="-",
         help='output index N/A value (default `-`)')
    padd('-C',
         '--condition_value',
         type=str,
         default=" ---> ",
         help='Delimiter String of Replace Value (default ` ---> `)')

    padd('-t',
         '--target',
         type=selector,
         default=None,
         help='target table names or sheetname (ex. Sheet1, Sheet3)')
    padd(
        '-t1',
        '--target1',
        type=selector,
        default=None,
        help='target table names or sheetname of filename1 (ex. Sheet1, Sheet3)'
    )
    padd(
        '-t2',
        '--target2',
        type=selector,
        default=None,
        help='target table names or sheetname of filename2 (ex. Sheet1, Sheet3)'
    )

    args = ps.parse_args()

    # nargs=1 positionals arrive as one-element lists.
    p1 = args.file1[0]
    p2 = args.file2[0]

    na_value = args.navalue
    diffonly = not args.all
    outputfile = args.outfile
    header = not args.noheader
    encoding = args.encoding
    sep = args.sep
    lineterminator = args.lineterminator
    rep_rate = args.rate

    conditional_value = args.condition_value

    notarget = ["ppt", "doc", "csv", "txt", "html", "pickle"]
    if guesstype(p1) not in notarget and guesstype(p2) not in notarget:

        # TODO: both files are fully materialised as dictionaries here, which
        # does not scale to very large inputs.
        # A per-file selector (-t1 / -t2) wins over the shared -t selector.
        tar1select = args.target1 or args.target
        tar2select = args.target2 or args.target
        a = {
            x.target: x.value
            for x in (tar1select(grouprow(p1)) if tar1select else grouprow(p1))
        }
        b = {
            x.target: x.value
            for x in (tar2select(grouprow(p2)) if tar2select else grouprow(p2))
        }

        # Options shared by every per-target row diff.
        kw = dict(header=header,
                  diffonly=diffonly,
                  rep_rate=rep_rate,
                  na_val=na_value,
                  startidx=1,
                  conditional_value=conditional_value)

        def _simtar(tag, tar, a, b, i):
            """Return the output rows for one target-level diff entry.

            ``tag`` is the target-level diff tag, ``tar`` the target name
            (two names joined by ``conditional_value`` for "replace") and
            ``i`` the position of the entry in the target-level diff.
            """
            # Only the first target may request a header from differ(); once
            # this turns false it stays false for all later targets.
            kw["header"] = i == 0 and kw["header"]

            if tag == "equal":
                row = ([tar, *r] for r in differ(a[tar], b[tar], **kw))
            elif tag == "replace":
                ta, tb = tar.split(conditional_value)
                row = ([tar, *r] for r in differ(a[ta], b[tb], **kw))
            elif tag == "delete":
                row = ([tar, tag, j, na_value, *r]
                       for j, r in enumerate(a[tar], 1))
            elif tag == "insert":
                row = ([tar, tag, na_value, j, *r]
                       for j, r in enumerate(b[tar], 1))
            else:
                return []

            return row

        # The target-level diff must join replaced names with the same
        # delimiter that _simtar() splits on; without it a non-default -C
        # value makes the `ta, tb = ...` unpacking fail.
        # Rows whose second field is "tag" are header rows: their first field
        # is relabelled "targetname".
        it = (["targetname", *r[1:]] if r[1] == "tag" else r
              for i, (tag, *_, tar) in enumerate(
                  differ(a, b, conditional_value=conditional_value))
              for r in _simtar(tag, tar, a, b, i))

    else:
        a = map(attrgetter("value"), readrow(p1))
        b = map(attrgetter("value"), readrow(p2))

        it = differ(a,
                    b,
                    header=header,
                    diffonly=diffonly,
                    rep_rate=rep_rate,
                    na_val=na_value,
                    startidx=1,
                    conditional_value=conditional_value)

    if outputfile is None:
        return to_csv(it, sys.stdout, encoding=encoding, sep=sep)

    # Match on the lower-cased extension so that e.g. `.XLSX` or `.TSV` are
    # routed like their lower-case spellings instead of falling through to CSV.
    ext = os.path.splitext(outputfile)[-1].lower().startswith

    if ext(".xls"):
        to_excel(it,
                 outputfile,
                 header=True,
                 conditional_value=conditional_value)
    elif ext(".tsv"):
        to_tsv(it,
               outputfile,
               encoding=encoding,
               lineterminator=lineterminator)
    else:
        to_csv(it,
               outputfile,
               encoding=encoding,
               lineterminator=lineterminator)
Beispiel #6
0
 def test_guess_key():
     """Check that ``guess_key`` returns 10 items for the ``diff1.xlsx`` profile."""
     values = (rec.value for rec in readrow(tdir + "diff1.xlsx"))
     profiled = profile_data(values, header=0)
     keys = guess_key(profiled, 10)
     assert len(keys) == 10