def test_scidblib_unparse_schema():
    """Unit test for the Python schema un-parser.

    Parses a literal schema with SS.parse, regenerates schema text from the
    result with SS.unparse, parses that text again, and asserts that the two
    parses agree on every attribute (name, type, nullable, default) and on
    every dimension (name, lo, hi, chunk, overlap).
    """
    # Single-argument print(...) behaves the same as the print statement in
    # Python 2 and is also valid Python 3.
    print('*** testing scidblib.scidb_schema.unparse...')
    # NOTE(review): "z5:char char('x')" carries no DEFAULT keyword, and the
    # names z4 and z5 each appear twice -- confirm both are intentional.
    schema1 = "<z1:string DEFAULT 'aa aa',z2:int64 NULL DEFAULT -2," + \
        "z3:int32 NULL,z4:float DEFAULT -0.5,z5:char char('x')," + \
        "z4:datetime DEFAULT datetime(\'25Nov2009:16:11:19\')," + \
        "z5:datetimetz DEFAULT datetimetz(\'10/13/2008 15:10:20 +9:00\')>" + \
        "[dim1=-77:*,23,0,dim2=0:99,?,1,dim3=-100:7,?,1]"
    attrs1, dims1 = SS.parse(schema1)
    schema2 = SS.unparse(attrs1, dims1)

    attrs2, dims2 = SS.parse(schema2)

    # zip() stops at the shorter input, so compare the counts first;
    # otherwise an attribute or dimension lost in the round trip would
    # go unnoticed.
    assert len(attrs1) == len(attrs2), \
        "attribute count: %d != %d" % (len(attrs1), len(attrs2))
    assert len(dims1) == len(dims2), \
        "dimension count: %d != %d" % (len(dims1), len(dims2))

    # Check attributes:
    print('checking attributes...')
    for attr1, attr2 in zip(attrs1, attrs2):
        assert attr1.name == attr2.name
        assert attr1.type == attr2.type
        assert attr1.nullable == attr2.nullable
        assert attr1.default == attr2.default

    # Check dimensions:
    print('checking dimensions...')
    for dim1, dim2 in zip(dims1, dims2):
        assert dim1.name == dim2.name
        assert dim1.lo == dim2.lo
        assert dim1.hi == dim2.hi
        assert dim1.chunk == dim2.chunk
        assert dim1.overlap == dim2.overlap
Beispiel #2
0
def test_scidblib_unparse_schema():
    """Unit test for the Python schema un-parser.

    Round-trips a literal schema through SS.parse -> SS.unparse -> SS.parse
    and asserts that the first and second parse results describe the same
    attributes and dimensions, field by field.
    """
    # print(...) with one argument works identically under Python 2 and 3.
    print('*** testing scidblib.scidb_schema.unparse...')
    # NOTE(review): "z5:char char('x')" has no DEFAULT keyword and the names
    # z4/z5 are each used twice -- verify that this input is as intended.
    schema1 = "<z1:string DEFAULT 'aa aa',z2:int64 NULL DEFAULT -2," + \
        "z3:int32 NULL,z4:float DEFAULT -0.5,z5:char char('x')," + \
        "z4:datetime DEFAULT datetime(\'25Nov2009:16:11:19\')," + \
        "z5:datetimetz DEFAULT datetimetz(\'10/13/2008 15:10:20 +9:00\')>" + \
        "[dim1=-77:*,23,0,dim2=0:99,?,1,dim3=-100:7,?,1]"
    attrs1, dims1 = SS.parse(schema1)
    schema2 = SS.unparse(attrs1, dims1)

    attrs2, dims2 = SS.parse(schema2)

    # Check attributes:
    print('checking attributes...')
    # zip() silently truncates to the shorter sequence, so a dropped
    # attribute must be caught by an explicit count comparison.
    assert len(attrs1) == len(attrs2), \
        "attribute count: %d != %d" % (len(attrs1), len(attrs2))
    for i, (attr1, attr2) in enumerate(zip(attrs1, attrs2)):
        for field in ('name', 'type', 'nullable', 'default'):
            before, after = getattr(attr1, field), getattr(attr2, field)
            assert before == after, \
                "attribute %d %s: %r != %r" % (i, field, before, after)

    # Check dimensions:
    print('checking dimensions...')
    # Same reasoning as for the attributes: compare counts before zipping.
    assert len(dims1) == len(dims2), \
        "dimension count: %d != %d" % (len(dims1), len(dims2))
    for i, (dim1, dim2) in enumerate(zip(dims1, dims2)):
        for field in ('name', 'lo', 'hi', 'chunk', 'overlap'):
            before, after = getattr(dim1, field), getattr(dim2, field)
            assert before == after, \
                "dimension %d %s: %r != %r" % (i, field, before, after)
Beispiel #3
0
def reparse(attrs, dims):
    """Rebuild a schema from attrs and dims... it should match!

    Generates schema text with SS.unparse(attrs, dims), parses it back with
    SS.parse, and asserts that the re-parsed attributes and dimensions are
    equal, element by element, to the ones passed in.

    @param attrs  sequence of attributes to round-trip.
    @param dims   sequence of dimensions to round-trip.
    @exception AssertionError if the counts or any element differ.
    """
    schema = SS.unparse(attrs, dims)
    aa, dd = SS.parse(schema)

    # zip() stops at the shorter sequence, so a missing or extra element
    # would otherwise be ignored; check the counts explicitly.
    assert len(attrs) == len(aa), \
        "Reparse attribute count mismatch: %d != %d" % (len(attrs), len(aa))
    assert len(dims) == len(dd), \
        "Reparse dimension count mismatch: %d != %d" % (len(dims), len(dd))

    for x, y in zip(attrs, aa):
        assert x == y, "Reparse attribute mismatch: '%s' != '%s'" % (x, y)
    for x, y in zip(dims, dd):
        assert x == y, "Reparse dimension mismatch: '%s' != '%s'" % (x, y)
Beispiel #4
0
def reparse(attrs, dims):
    """Rebuild a schema from attrs and dims... it should match!

    Unparses (attrs, dims) to schema text, parses that text again, and
    asserts that the result mirrors the inputs element by element.

    @param attrs  sequence of attributes to round-trip.
    @param dims   sequence of dimensions to round-trip.
    @exception AssertionError if the counts or any element differ.
    """
    # Don't use old-style dimension syntax for this "mirroring".  The
    # old syntax won't "mirror" None values (they must become '*').
    schema = SS.unparse(attrs, dims, compat=False)
    aa, dd = SS.parse(schema)

    # Compare each original sequence with its re-parsed counterpart.  The
    # count check comes first because zip() truncates to the shorter input
    # and would hide a dropped or added element.
    for kind, before, after in (('attribute', attrs, aa),
                                ('dimension', dims, dd)):
        assert len(before) == len(after), \
            "Reparse %s count mismatch: %d != %d" % (
                kind, len(before), len(after))
        for x, y in zip(before, after):
            assert x == y, "Reparse %s mismatch: '%s' != '%s'" % (kind, x, y)
Beispiel #5
0
def test_scidblib_unparse_schema():
    """Unit test for the Python schema un-parser.

    Round-trips a literal schema through SS.parse -> SS.unparse -> SS.parse
    (all with default_nullable=False) and asserts that both parses agree on
    every attribute (name, type, nullable, default, compression, reserve)
    and on every dimension (name, lo, hi, chunk, overlap).
    """
    # Single-argument print(...) is valid in both Python 2 and Python 3.
    print('*** testing scidblib.scidb_schema.unparse...')
    # NOTE(review): the pieces below are joined with no comma between
    # " compression 'gzip'" and "z5:datetimetz ...", so the second z4 and
    # second z5 definitions run together in the schema text -- confirm
    # whether a separating comma was intended.
    schema1 = ''.join(
        ("<z1:string DEFAULT 'aa aa',z2:int64 NULL DEFAULT -2,",
         "z3:int32 NULL,z4:float DEFAULT -0.5,z5:char dEfAuLt char('x'),",
         "z4:datetime DEFAULT datetime(\'25Nov2009:16:11:19\')",
         " compression 'gzip'",
         "z5:datetimetz DEFAULT datetimetz(\'10/13/2008 15:10:20 +9:00\')",
         " reserve 32", ">[dim1=-77:*,23,0,dim2=0:99,?,1,dim3=-100:7,?,1]"))

    # TODO: Fix for default_nullable=True, see SDB-5138.
    attrs1, dims1 = SS.parse(schema1, default_nullable=False)
    schema2 = SS.unparse(attrs1, dims1, default_nullable=False)
    attrs2, dims2 = SS.parse(schema2, default_nullable=False)

    # Check attributes:
    print('checking attributes...')
    # zip() stops at the shorter sequence; without this count check an
    # attribute lost in the round trip would not fail the test.
    assert len(attrs1) == len(attrs2), "attribute count: %d != %d" % (
        len(attrs1), len(attrs2))
    for i, (attr1, attr2) in enumerate(zip(attrs1, attrs2)):
        assert attr1.name == attr2.name, "%d: %s != %s" % (i, attr1.name,
                                                           attr2.name)
        assert attr1.type == attr2.type, "%d: %s != %s" % (i, attr1.type,
                                                           attr2.type)
        assert attr1.nullable == attr2.nullable, "%d: %s != %s" % (
            i, attr1.nullable, attr2.nullable)
        assert attr1.default == attr2.default, "%d: %s != %s" % (
            i, attr1.default, attr2.default)
        assert attr1.compression == attr2.compression, "%d: %s != %s" % (
            i, attr1.compression, attr2.compression)
        assert attr1.reserve == attr2.reserve, "%d: %s != %s" % (
            i, attr1.reserve, attr2.reserve)

    # Check dimensions:
    print('checking dimensions...')
    # Same count check as for the attributes, and the same "index: a != b"
    # failure messages so a mismatch identifies the offending dimension.
    assert len(dims1) == len(dims2), "dimension count: %d != %d" % (
        len(dims1), len(dims2))
    for i, (dim1, dim2) in enumerate(zip(dims1, dims2)):
        assert dim1.name == dim2.name, "%d: %s != %s" % (i, dim1.name,
                                                         dim2.name)
        assert dim1.lo == dim2.lo, "%d: %s != %s" % (i, dim1.lo, dim2.lo)
        assert dim1.hi == dim2.hi, "%d: %s != %s" % (i, dim1.hi, dim2.hi)
        assert dim1.chunk == dim2.chunk, "%d: %s != %s" % (i, dim1.chunk,
                                                           dim2.chunk)
        assert dim1.overlap == dim2.overlap, "%d: %s != %s" % (
            i, dim1.overlap, dim2.overlap)
Beispiel #6
0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    The work is reported through a scidb_progress.ProgressTracker in three
    steps:
      'min_max_dc'  query load_array for the minimum, maximum and distinct
                    count of every dimension named in args.raw_dims;
      'overall_dc'  query load_array for the approximate distinct count of
                    the combined dimension values, capped at the cell count;
      'calculate'   fill in every '?' field and print the resulting
                    dimension specification via scidb_schema.unparse.

    Fields read from args: load_array, raw_dims, verbose,
    desired_values_per_chunk, keep_shape, grid_threshold, grid_base10
    (args is also handed to scidb_afl.get_iquery_cmd).

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array

    calculated_dims = parse_dimensions(args.raw_dims)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose  # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            # The name is a dimension of load_array: group by it first, then
            # take min/max/count over the grouped coordinate values.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            # The name is not a dimension of load_array: aggregate it
            # directly, using approxdc for the distinct count.
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
        except ValueError:
            # %-formatting (rather than '+') so that building the message
            # cannot itself fail if a query result is not a string.
            raise scidblib.AppError(
                'Error: I cannot proceed because for %s in array %s, '
                'not all of min_coord (=%s), max_coord (=%s), '
                'and distinct_count (=%s) are integers.' % (
                    the_dim.dim_name, load_array, min_coord, max_coord,
                    distinct_count))
        if args.verbose:
            # Single-argument print(...) is valid in Python 2 and Python 3.
            print('For ' + the_dim.dim_name +
                  ', min_coord=' + str(min_coord_int) +
                  ', max_coord=' + str(max_coord_int) +
                  ', distinct_count=' + str(distinct_count_int))
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = ['string(' + the_dim.dim_name + ')'
                     for the_dim in calculated_dims]
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    # The approximate distinct count is capped at the exact cell count.
    if overall_distinct_count > overall_count:
        overall_distinct_count = overall_count
    if args.verbose:
        print('overall_distinct_count=' + str(overall_distinct_count))
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if not N:
        # Close the step on this path too, exactly as the normal path does
        # before printing; previously the step was started but never ended.
        progress_tracker.end_step('calculate')
        print(scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims]))
        return 0

    # Set num_chunks_from_n: start from the total number of chunks wanted,
    # then divide out the chunk counts of the already-specified dimensions.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # Both quantities below are the same for every dimension in N, so they
    # are computed once here instead of once per loop iteration.
    even_chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
    geomean = None
    if not args.keep_shape:
        # calculate geomean of the distinct counts of the dims in N
        product = 1.0
        for k in N:
            product *= calculated_dims[k].distinct_count
        geomean = math.pow(product, 1.0 / len(N))

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = even_chunk_count
        if geomean is not None:
            # Scale by this dimension's distinct count relative to the
            # geometric mean over all dimensions in N.
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        # chunk_count is a float here, so this is a true division.
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print(scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims]))
    return 0