def check_one(params,
              line_sep,
              sep,
              data,
              want_res=None,
              prefix="",
              quotes=False,
              leave_bad=False):
    # Write a small test file with the given separator, quoting and line
    # separator, then csvimport it and verify the resulting labels and data.
    sep_c = chr(sep)
    # Can't have separator character in unquoted values
    if not quotes and not leave_bad:
        data = [[el.replace(sep_c, "") for el in line] for line in data]
    if not want_res:
        want_res = [
            tuple(s.encode("ascii") for s in line) for line in data[1:]
        ]
    filename = "%s_csv.%d.%r.txt" % (prefix, sep, line_sep)
    with open(filename, "w") as fh:
        for line in data:
            if quotes:
                line = [
                    quotes + el.replace(quotes, quotes + quotes) + quotes
                    for el in line
                ]
            fh.write(sep_c.join(line))
            fh.write(line_sep)
    try:
        jid = subjobs.build("csvimport",
                            options=dict(
                                filename=resolve_jobid_filename(
                                    params.jobid, filename),
                                separator=sep_c,
                                quote_support=bool(quotes),
                            ))
    except JobError as e:
        raise CSVImportException(
            "Failed to csvimport for separator %d with line separator %r, csvimport error was:\n%s"
            % (sep, line_sep, e.format_msg()))
    ds = Dataset(jid)
    labels = sorted(ds.columns)
    if labels != data[0]:
        raise WrongLabelsException(
            "csvimport gave wrong labels for separator %d with line separator %r: %r (expected %r)"
            % (
                sep,
                line_sep,
                labels,
                data[0],
            ))
    res = list(ds.iterate(None, data[0]))
    if res != want_res:
        raise WrongDataException(
            "csvimport gave wrong data for separator %d with line separator %r: %r (expected %r)"
            % (
                sep,
                line_sep,
                res,
                want_res,
            ))
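A minimal usage sketch (hypothetical values; params is the running method's parameter object, as elsewhere in these examples): separator 44 is ",", the line separator is "\n", and the first row of data holds the labels.

# Hypothetical calls: unquoted comma-separated data, then the same helper with
# '"' quoting so values may contain the separator character.
check_one(params, "\n", 44, [["a", "b"], ["1", "2"], ["3", "4"]])
check_one(params, "\n", 44, [["a", "b"], ["1,x", "2"]], prefix="q", quotes='"')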
Example 2
def verify(zipname, inside_filenames, want_ds, **kw):
    opts = dict(
        filename=resolve_jobid_filename(g.jobid, zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (
            jid, dsn, zipname, want_data, got_data)
Example 3
def _printlist(self, returndict):
    # print returndict (method -> job info) in a neat format
    for method, item in sorted(returndict.items(),
                               key=lambda x: x[1].link):
        if item.make == True:
            make_msg = 'MAKE'
        else:
            make_msg = item.make or 'link'
        print('        -  %44s' % method.ljust(44), end=' ')
        print(' %s' % (make_msg, ), end=' ')
        if self.print_full_jobpath:
            print(' %s' % resolve_jobid_filename(item.link, ''), end=' ')
        else:
            print(' %s' % item.link, end=' ')
        if item.make != True:
            print(' %s' % fmttime(item.total_time), end=' ')
        print()
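A minimal sketch of the input _printlist expects (hypothetical item type; the real entries come from the build system): each value needs link, make and total_time attributes.

# Hypothetical stand-in for the real job entries.
from collections import namedtuple
Item = namedtuple('Item', 'link make total_time')
returndict = {
    'example_perf_gendata': Item(link='TEST-42', make=True, total_time=1.23),
    'csvexport': Item(link='TEST-43', make=False, total_time=4.56),
}
# self._printlist(returndict) prints one line per method, sorted by job link.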
Example 4
def check_no_separator(params):
    # Import files written without a separator, for several newline and quote
    # characters, and verify that every written line comes back unchanged.
    def write(data):
        fh.write(data + nl_b)
        wrote_c[data] += 1
        if q_b:
            data = q_b + data + q_b
            fh.write(q_b + data.replace(q_b, q_b + q_b) + q_b + nl_b)
            wrote_c[data] += 1

    for nl in (10, 0, 255):
        for q in (None, 0, 34, 13, 10, 228):
            if nl == q:
                continue
            filename = "no separator.%r.%r.txt" % (
                nl,
                q,
            )
            nl_b = bytechr(nl)
            q_b = bytechr(q) if q else b''
            wrote_c = Counter()
            with openx(filename) as fh:
                for splitpoint in range(256):
                    write(byteline(0, splitpoint, nl, q))
                    write(byteline(splitpoint, 256, nl, q))
            try:
                jid = subjobs.build("csvimport",
                                    options=dict(
                                        filename=resolve_jobid_filename(
                                            params.jobid, filename),
                                        quotes=q_b.decode("iso-8859-1"),
                                        newline=nl_b.decode("iso-8859-1"),
                                        separator='',
                                        labelsonfirstline=False,
                                        labels=["data"],
                                    ))
            except JobError:
                raise Exception("Importing %r failed" % (filename, ))
            got_c = Counter(Dataset(jid).iterate(None, "data"))
            assert got_c == wrote_c, "Importing %r (%s) gave wrong contents" % (
                filename,
                jid,
            )
Example 5
def check_array(params, lines, filename, bad_lines=(), **options):
    d = {}
    with openx(filename) as fh:
        for ix, data in enumerate(bad_lines, 1):
            ix = str(ix).encode("ascii")
            fh.write(ix + b"," + data + b"," + data + b"\n")
        for ix, data in enumerate(lines, len(bad_lines) + 1):
            if isinstance(data, tuple):
                data, d[ix] = data
            else:
                d[ix] = data
            ix = str(ix).encode("ascii")
            fh.write(ix + b"," + data + b"," + data + b"\n")
    options.update(
        filename=resolve_jobid_filename(params.jobid, filename),
        allow_bad=bool(bad_lines),
        labelsonfirstline=False,
        labels=["ix", "0", "1"],
    )
    verify_ds(options, d, filename)
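A minimal usage sketch (hypothetical values): each entry becomes one "ix,data,data" line in the file, and check_array collects the expected contents in d before calling verify_ds.

# Hypothetical call: three good lines and no bad lines, so allow_bad is False.
check_array(params, [b"foo", b"bar", b"baz"], "array_plain.txt")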
Example 6
def tmpfn():
    cnt = 0
    while True:
        cnt += 1
        yield resolve_jobid_filename(params.jobid, str(cnt))
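A minimal usage sketch: tmpfn is a generator, so each next() call yields a fresh filename ("1", "2", ...) resolved inside the current job's directory.

# Hypothetical usage of the generator defined above.
names = tmpfn()
first = next(names)   # path to "1" in the directory of params.jobid
second = next(names)  # path to "2"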
Example 7
def check_good_file(params, name, data, d, d_bad={}, d_skipped={}, **options):
    filename = name + ".txt"
    with openx(filename) as fh:
        fh.write(data)
    options.update(filename=resolve_jobid_filename(params.jobid, filename))
    verify_ds(options, d, d_bad, d_skipped, filename)
Example 8
def check_bad_file(params, name, data):
    filename = name + ".txt"
    with openx(filename) as fh:
        fh.write(data)
    options = dict(filename=resolve_jobid_filename(params.jobid, filename))
    require_failure(name, options)
Example 9
def main(urd):
    resetlocale()

    if False:
        # One BILLION rows
        # This takes about half an hour on a fast machine
        num_rows = int(1e7)
        num_datasets = 100
    else:
        # One MILLION rows
        num_rows = int(1e6)
        num_datasets = 10

    # Create datasets
    print("\x1b[1m(1) Create chain of datasets.\x1b[m")
    jid = None
    for _ in range(num_datasets):
        jid = urd.build('example_perf_gendata',
                        options=dict(num_rows=num_rows),
                        datasets=dict(previous=jid))

    # Export chain of datasets to CSV-file.
    print("\x1b[1m(2) Export dataset chain to CSV file.\x1b[m")
    jid = urd.build('csvexport',
                    datasets=dict(source=jid),
                    options=dict(filename='out.csv.gz', chain_source=True))

    filename = resolve_jobid_filename(jid, 'out.csv.gz')
    print('Exported file stored in "%s"' % (filename, ))

    # Import and type previously exported CSV-file.
    print("\x1b[1m(3) Import dataset from CVS file.\x1b[m")
    jid = urd.build('csvimport', options=dict(filename=filename))
    opts = dict(
        column2type={
            'a string': 'ascii',
            'large number': 'number',
            'small number': 'number',
            'small integer': 'int32_10',  # you must specify base for integers
            'gauss number': 'number',
            'gauss float': 'float64',
        }, )
    print("\x1b[1m(4) Type imported dataset.\x1b[m")
    jid = urd.build('dataset_type', datasets=dict(source=jid), options=opts)

    # Sum all values in a column.  Repeat for a set of columns with different types.
    print("\x1b[1m(5) Run some methods on the typed dataset.\x1b[m")
    jid_single = jid
    source = jid_single
    for colname in ('small number', 'small integer', 'large number',
                    'gauss number', 'gauss float'):
        print(colname)
        jid = urd.build('example_perf_sum',
                        datasets=dict(source=source),
                        options=dict(colname=colname),
                        name='sum ' + colname)
        jid = urd.build('example_perf_sum_positive',
                        datasets=dict(source=source),
                        options=dict(colname=colname),
                        name='sum positive ' + colname)

    # Compute histograms of a column
    print('histogram')
    jid = urd.build('example_perf_histogram',
                    datasets=dict(source=source),
                    options=dict(colname='gauss number'),
                    name='histogram_number')
    jid = urd.build('example_perf_histogram',
                    datasets=dict(source=source),
                    options=dict(colname='gauss float'),
                    name='histogram_float')

    # Find string
    print('find string')
    jid = urd.build('example_perf_find_string',
                    datasets=dict(source=source),
                    options=dict(colname='a string', text='ExAx'),
                    name='find_string')
    print(
        "Number of lines containing string \"%s\" is %d." %
        (job_params(jid).options['text'], blob.load(jobid=jid)), )

    # Print resulting profiling information
    from automata_common import profile_jobs
    print()

    def pl(text, time):
        print("%-30s %10.3f %14s" % (
            text,
            time,
            '{0:n}'.format(round(num_rows * num_datasets / time)),
        ))

    print()
    print('-' * 56)
    print("operation                       exec time         rows/s")
    print()
    pl('csvexport', profile_jobs(urd.joblist.find('csvexport')))
    print()
    pl(
        'reimport total',
        profile_jobs(
            urd.joblist.find('csvimport') + urd.joblist.find('dataset_type')))
    pl("   csvimport         ", profile_jobs(urd.joblist.find('csvimport')))
    pl("   type              ", profile_jobs(urd.joblist.find('dataset_type')))
    print()
    print("sum")
    pl("  small number       ",
       profile_jobs(urd.joblist.find('sum small number')))
    pl("  small integer      ",
       profile_jobs(urd.joblist.find('sum small integer')))
    pl("  large number       ",
       profile_jobs(urd.joblist.find('sum large number')))
    pl("  gauss number       ",
       profile_jobs(urd.joblist.find('sum gauss number')))
    pl("  gauss float        ",
       profile_jobs(urd.joblist.find('sum gauss float')))
    print()
    print("sum positive")
    pl("  small number       ",
       profile_jobs(urd.joblist.find('sum positive small number')))
    pl("  small integer      ",
       profile_jobs(urd.joblist.find('sum positive small integer')))
    pl("  large number       ",
       profile_jobs(urd.joblist.find('sum positive large number')))
    pl("  gauss number       ",
       profile_jobs(urd.joblist.find('sum positive gauss number')))
    pl("  gauss float        ",
       profile_jobs(urd.joblist.find('sum positive gauss float')))
    print()
    print("histogram")
    pl("  number             ",
       profile_jobs(urd.joblist.find('histogram_number')))
    pl("  float              ",
       profile_jobs(urd.joblist.find('histogram_float')))
    print()
    pl("find string          ", profile_jobs(urd.joblist.find('find_string')))
    print()
    print("Total test time                %10.3f" %
          (profile_jobs(urd.joblist), ))
    print()
    print('Example size is %s lines.' %
          ('{0:n}'.format(num_datasets * num_rows), ))
    print('Number of slices is %d.' % (urd.info.slices, ))
    print('-' * 56)