def test_scidb_math_module():
    """Exercise the scidblib.scidb_math functions called below.

    Each check calls one scidb_math function with fixed arguments, asserts the
    expected result, and then prints the call next to the value it returned.
    The first failing check raises AssertionError and stops the test.

    Output is produced with single-argument print(...) so the text is the
    same whether print is a statement or a function.
    """
    print('*** testing scidblib.scidb_math...')

    a = scidb_math.comma_separated_number(1234.1234)
    assert a == '1,234.1234'
    # The label names the function that was actually called.
    print('comma_separated_number(1234.1234) = %s' % (a,))

    a = scidb_math.fraction_if_less_than_one(0.125)
    assert a == '1/8'
    print('fraction_if_less_than_one(0.125) = %s' % (a,))

    a = scidb_math.ceil_of_division(8, 3)
    assert a == 3
    print('ceil_of_division(8, 3) = %s' % (a,))

    a = scidb_math.round_up(3248, 2)
    assert a == 3300
    print('round_up(3248, 2) = %s' % (a,))

    a = scidb_math.round_down(3248, 2)
    assert a == 3200
    print('round_down(3248, 2) = %s' % (a,))

    a = scidb_math.snap_to_grid(3161, 0.01, use_binary=False)
    assert a == 3160
    print('snap_to_grid(3161, 0.01, use_binary=False) = %s' % (a,))

    a = scidb_math.snap_to_grid(3161, 0.1, use_binary=False)
    assert a == 3000
    print('snap_to_grid(3161, 0.1, use_binary=False) = %s' % (a,))

    a = scidb_math.snap_to_grid(1021, 0.01, use_binary=True)
    assert a == 1024
    print('snap_to_grid(1021, 0.01, use_binary=True) = %s' % (a,))

    a = scidb_math.geomean([3, 3, 4, 8])
    # Compare after rounding: the geometric mean is a float.
    assert round(a, 10) == 4.1195342878
    print('geomean([3, 3, 4, 8]) = %s' % (a,))

    # Blank separator line after this module's checks.
    print('')
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    Steps, in order:
      1. Parse args.raw_dims into dimension objects.
      2. For every dimension, query load_array for its min coordinate, max
         coordinate and distinct count.
      3. Replace '?' in dim_low / dim_high / chunk_overlap with min_coord /
         max_coord / 0.
      4. Query the overall distinct count of the combined dimension values,
         capped at the overall cell count.
      5. For each dimension whose chunk_length is '?', derive a chunk length
         and snap it to a grid.
      6. Print the resulting schema.

    NOTE(review): this file contains another definition named
    calculate_chunk_length further down; if both live in the same module the
    later one replaces this one. Confirm which is intended.

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if a query result that must be an integer is not one.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose   # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            # The name is a dimension of load_array: aggregate along it, copy
            # the coordinate into a temporary attribute, and take min/max/count.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            # The name is an attribute: aggregate it directly.
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')
        dbg("Cmd:", cmd)

        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))

        # Only the conversions can raise ValueError, so only they are guarded.
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
        except ValueError:
            raise scidblib.AppError(
                'Error: I cannot proceed because for ' + the_dim.dim_name +
                ' in array ' + load_array + ', not all of min_coord (=' +
                min_coord + '), max_coord (=' + max_coord +
                '), and distinct_count (=' + distinct_count +
                ') are integers.')

        if args.verbose:
            print('For ' + the_dim.dim_name +
                  ', min_coord=' + str(min_coord_int) +
                  ', max_coord=' + str(max_coord_int) +
                  ', distinct_count=' + str(distinct_count_int))
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = ['string(' + the_dim.dim_name + ')'
                     for the_dim in calculated_dims]
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    # The distinct count is capped at the total cell count.
    if overall_distinct_count > overall_count:
        overall_distinct_count = overall_count
    if args.verbose:
        print('overall_distinct_count=' + str(overall_distinct_count))
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if not N:
        # Close the 'calculate' step on this early exit too, exactly as the
        # normal path does before printing the result.
        progress_tracker.end_step('calculate')
        print(scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims]))
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # Values shared by every dimension in N; computed once instead of once
    # per dimension because nothing they depend on changes inside the loop.
    even_chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
    geomean = None
    if not args.keep_shape:
        # Geometric mean of the distinct counts of the dimensions in N.
        product = 1.0
        for k in N:
            product *= calculated_dims[k].distinct_count
        geomean = math.pow(product, 1.0 / len(N))

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = even_chunk_count
        if geomean is not None:
            # Scale by this dimension's distinct count relative to the geomean.
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        # chunk_count is always a float here, so this is a true division.
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print(scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims]))

    return 0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    Steps, in order:
      1. Build a Dimensions object from args.raw_dims.
      2. For every dimension, query load_array for its min coordinate, max
         coordinate and distinct count.
      3. Replace '?' in dim_low / dim_high / chunk_overlap with min_coord /
         max_coord / 0.
      4. Query the overall distinct count of the combined dimension values,
         capped at the overall cell count.
      5. For each dimension whose chunk_length is '?', derive a chunk length
         and snap it to a grid.
      6. Print the string form of the Dimensions object.

    NOTE(review): this file contains another definition named
    calculate_chunk_length earlier on; if both live in the same module this
    later one replaces the earlier one. Confirm which is intended.

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if a query result that must be an integer is not one.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = Dimensions(raw_dims_str)

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose   # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims.list):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims.list:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            # The name is a dimension of load_array: aggregate along it, copy
            # the coordinate into a temporary attribute, and take min/max/count.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            # The name is an attribute: aggregate it directly.
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')

        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)

        # Only the conversions can raise ValueError, so only they are guarded.
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
        except ValueError:
            raise scidblib.AppError(
                'Error: I cannot proceed because for ' + the_dim.dim_name +
                ' in array ' + load_array + ', not all of min_coord (=' +
                min_coord + '), max_coord (=' + max_coord +
                '), and distinct_count (=' + distinct_count +
                ') are integers.')

        if args.verbose:
            print('For ' + the_dim.dim_name +
                  ', min_coord=' + str(min_coord_int) +
                  ', max_coord=' + str(max_coord_int) +
                  ', distinct_count=' + str(distinct_count_int))
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims.list:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = ['string(' + the_dim.dim_name + ')'
                     for the_dim in calculated_dims.list]
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.')
    # The distinct count is capped at the total cell count.
    if overall_distinct_count > overall_count:
        overall_distinct_count = overall_count
    if args.verbose:
        print('overall_distinct_count=' + str(overall_distinct_count))
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if not N:
        # Close the 'calculate' step on this early exit too, exactly as the
        # normal path does before printing the result.
        progress_tracker.end_step('calculate')
        print(str(calculated_dims))
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims.list[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # Values shared by every dimension in N; computed once instead of once
    # per dimension because nothing they depend on changes inside the loop.
    even_chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
    geomean = None
    if not args.keep_shape:
        # Geometric mean of the distinct counts of the dimensions in N.
        product = 1.0
        for k in N:
            product *= calculated_dims.list[k].distinct_count
        geomean = math.pow(product, 1.0 / len(N))

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims.list[i]
        chunk_count = even_chunk_count
        if geomean is not None:
            # Scale by this dimension's distinct count relative to the geomean.
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        # chunk_count is always a float here, so this is a true division.
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print(str(calculated_dims))

    return 0