def test_scidblib_unparse_schema(): """Unit test for the Python schema un-parser.""" print '*** testing scidblib.scidb_schema.unparse...' schema1 = "<z1:string DEFAULT 'aa aa',z2:int64 NULL DEFAULT -2," + \ "z3:int32 NULL,z4:float DEFAULT -0.5,z5:char char('x')," + \ "z4:datetime DEFAULT datetime(\'25Nov2009:16:11:19\')," + \ "z5:datetimetz DEFAULT datetimetz(\'10/13/2008 15:10:20 +9:00\')>" + \ "[dim1=-77:*,23,0,dim2=0:99,?,1,dim3=-100:7,?,1]" attrs1,dims1 = SS.parse(schema1) schema2 = SS.unparse(attrs1,dims1) attrs2,dims2 = SS.parse(schema2) # Check attributes: print 'checking attributes...' for attr1,attr2 in zip(attrs1,attrs2): assert attr1.name == attr2.name assert attr1.type == attr2.type assert attr1.nullable == attr2.nullable assert attr1.default == attr2.default # Check dimensions: print 'checking dimensions...' for dim1,dim2 in zip(dims1,dims2): assert dim1.name == dim2.name assert dim1.lo == dim2.lo assert dim1.hi == dim2.hi assert dim1.chunk == dim2.chunk assert dim1.overlap == dim2.overlap
def test_scidblib_unparse_schema(): """Unit test for the Python schema un-parser.""" print '*** testing scidblib.scidb_schema.unparse...' schema1 = "<z1:string DEFAULT 'aa aa',z2:int64 NULL DEFAULT -2," + \ "z3:int32 NULL,z4:float DEFAULT -0.5,z5:char char('x')," + \ "z4:datetime DEFAULT datetime(\'25Nov2009:16:11:19\')," + \ "z5:datetimetz DEFAULT datetimetz(\'10/13/2008 15:10:20 +9:00\')>" + \ "[dim1=-77:*,23,0,dim2=0:99,?,1,dim3=-100:7,?,1]" attrs1, dims1 = SS.parse(schema1) schema2 = SS.unparse(attrs1, dims1) attrs2, dims2 = SS.parse(schema2) # Check attributes: print 'checking attributes...' for attr1, attr2 in zip(attrs1, attrs2): assert attr1.name == attr2.name assert attr1.type == attr2.type assert attr1.nullable == attr2.nullable assert attr1.default == attr2.default # Check dimensions: print 'checking dimensions...' for dim1, dim2 in zip(dims1, dims2): assert dim1.name == dim2.name assert dim1.lo == dim2.lo assert dim1.hi == dim2.hi assert dim1.chunk == dim2.chunk assert dim1.overlap == dim2.overlap
def reparse(attrs, dims):
    """Rebuild a schema from attrs and dims... it should match!"""
    got_attrs, got_dims = SS.parse(SS.unparse(attrs, dims))
    for expected, actual in zip(attrs, got_attrs):
        assert expected == actual, \
            "Reparse attribute mismatch: '%s' != '%s'" % (expected, actual)
    for expected, actual in zip(dims, got_dims):
        assert expected == actual, \
            "Reparse dimension mismatch: '%s' != '%s'" % (expected, actual)
def reparse(attrs, dims):
    """Rebuild a schema from attrs and dims... it should match!"""
    # Don't use old-style dimension syntax for this "mirroring". The
    # old syntax won't "mirror" None values (they must become '*').
    rebuilt = SS.unparse(attrs, dims, compat=False)
    got_attrs, got_dims = SS.parse(rebuilt)
    for expected, actual in zip(attrs, got_attrs):
        assert expected == actual, \
            "Reparse attribute mismatch: '%s' != '%s'" % (expected, actual)
    for expected, actual in zip(dims, got_dims):
        assert expected == actual, \
            "Reparse dimension mismatch: '%s' != '%s'" % (expected, actual)
def test_scidblib_unparse_schema(): """Unit test for the Python schema un-parser.""" print '*** testing scidblib.scidb_schema.unparse...' schema1 = ''.join( ("<z1:string DEFAULT 'aa aa',z2:int64 NULL DEFAULT -2,", "z3:int32 NULL,z4:float DEFAULT -0.5,z5:char dEfAuLt char('x'),", "z4:datetime DEFAULT datetime(\'25Nov2009:16:11:19\')", " compression 'gzip'", "z5:datetimetz DEFAULT datetimetz(\'10/13/2008 15:10:20 +9:00\')", " reserve 32", ">[dim1=-77:*,23,0,dim2=0:99,?,1,dim3=-100:7,?,1]")) # TODO: Fix for default_nullable=True, see SDB-5138. attrs1, dims1 = SS.parse(schema1, default_nullable=False) schema2 = SS.unparse(attrs1, dims1, default_nullable=False) attrs2, dims2 = SS.parse(schema2, default_nullable=False) # Check attributes: print 'checking attributes...' for i, (attr1, attr2) in enumerate(zip(attrs1, attrs2)): assert attr1.name == attr2.name, "%d: %s != %s" % (i, attr1.name, attr2.name) assert attr1.type == attr2.type, "%d: %s != %s" % (i, attr1.type, attr2.type) assert attr1.nullable == attr2.nullable, "%d: %s != %s" % ( i, attr1.nullable, attr2.nullable) assert attr1.default == attr2.default, "%d: %s != %s" % ( i, attr1.default, attr2.default) assert attr1.compression == attr2.compression, "%d: %s != %s" % ( i, attr1.compression, attr2.compression) assert attr1.reserve == attr2.reserve, "%d: %s != %s" % ( i, attr1.reserve, attr2.reserve) # Check dimensions: print 'checking dimensions...' for dim1, dim2 in zip(dims1, dims2): assert dim1.name == dim2.name assert dim1.lo == dim2.lo assert dim1.hi == dim2.hi assert dim1.chunk == dim2.chunk assert dim1.overlap == dim2.overlap
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    @param args the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims
    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])
    # Initialize the progress tracker; each flag echoes args.verbose so
    # progress is only printed in verbose mode.
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose   # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')
    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    # Both hold indexes into calculated_dims.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)
    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)
    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]
        if the_name_in_load_array.is_dim:
            # The name is a dimension of load_array: group by it (inner
            # aggregate), expose the group key as a temp attribute, then
            # take min/max of the key and count the groups.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            # The name is an attribute: min/max/approxdc can be computed
            # directly over the array.
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))
        try:
            # single_cell_afl returns strings; all three must be integers.
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
            if args.verbose:
                print 'For ' + the_dim.dim_name + ', min_coord=' + str(min_coord_int) +\
                    ', max_coord=' + str(max_coord_int) +\
                    ', distinct_count=' + str(distinct_count_int)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' +
                                    the_dim.dim_name + ' in array ' + load_array +
                                    ', not all of min_coord (=' + min_coord +
                                    '), max_coord (=' + max_coord +
                                    '), and distinct_count (=' + distinct_count +
                                    ') are integers.')
        the_dim.set_min_max_dc(min_coord_int, max_coord_int, distinct_count_int)
    progress_tracker.end_step('min_max_dc')
    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0
    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = []
    for i, the_dim in enumerate(calculated_dims):
        string_values.append('string(' + the_dim.dim_name + ')')
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)
    # Calculate overall_distinct_count: approxdc over the '|'-joined
    # string of all dimension values per cell.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
        # Clamp the approximate distinct count so it never exceeds the
        # exact cell count.
        if overall_distinct_count > overall_count:
            overall_distinct_count = overall_count
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    if args.verbose:
        print 'overall_distinct_count=' + str(overall_distinct_count)
    progress_tracker.end_step('overall_dc')
    progress_tracker.start_step('calculate')
    # Shortcut: if |N| == 0, we are done.
    if len(N) == 0:
        print scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims])
        return 0
    # Set num_chunks_from_n: start from the desired number of chunks
    # overall, then divide out the chunk counts already fixed by the
    # dimensions in S.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1
    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        # Default: split the remaining chunk budget evenly (len(N)-th root).
        chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
        if not args.keep_shape:
            # calculate geomean of distinct counts over N, and scale this
            # dimension's share by distinct_count / geomean.
            product = 1.0
            for k in N:
                product *= calculated_dims[k].distinct_count
            geomean = math.pow(product, 1.0 / len(N))
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            # Round the computed length to a "nice" grid value.
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length, args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')
    # Print result.
    print scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims])
    return 0