Example #1
0
def test_scidb_math_module():
    """Testing all public methods in scidblib.scidb_math.

    Every helper is called with a fixed input, the result is asserted against
    the expected value, and the result is then echoed to stdout next to a
    label naming the call that produced it.

    @exception AssertionError if any helper returns an unexpected value.
    """
    print('*** testing scidblib.scidb_math...')

    result = scidb_math.comma_separated_number(1234.1234)
    assert result == '1,234.1234'
    # The label spells the function name exactly as it is called above.
    print('comma_separated_number(1234.1234) = %s' % (result,))

    result = scidb_math.fraction_if_less_than_one(0.125)
    assert result == '1/8'
    print('fraction_if_less_than_one(0.125) = %s' % (result,))

    result = scidb_math.ceil_of_division(8, 3)
    assert result == 3
    print('ceil_of_division(8, 3) = %s' % (result,))

    result = scidb_math.round_up(3248, 2)
    assert result == 3300
    print('round_up(3248, 2) = %s' % (result,))

    result = scidb_math.round_down(3248, 2)
    assert result == 3200
    print('round_down(3248, 2) = %s' % (result,))

    result = scidb_math.snap_to_grid(3161, 0.01, use_binary=False)
    assert result == 3160
    print('snap_to_grid(3161, 0.01, use_binary=False) = %s' % (result,))

    result = scidb_math.snap_to_grid(3161, 0.1, use_binary=False)
    assert result == 3000
    print('snap_to_grid(3161, 0.1, use_binary=False) = %s' % (result,))

    result = scidb_math.snap_to_grid(1021, 0.01, use_binary=True)
    assert result == 1024
    print('snap_to_grid(1021, 0.01, use_binary=True) = %s' % (result,))

    result = scidb_math.geomean([3, 3, 4, 8])
    # Compare after rounding: the geometric mean is a float.
    assert round(result, 10) == 4.1195342878
    print('geomean([3, 3, 4, 8]) = %s' % (result,))
    print('')
Example #2
0
def test_scidb_math_module():
    """Testing all public methods in scidblib.scidb_math.

    Every helper is called with a fixed input, the result is asserted against
    the expected value, and the result is then echoed to stdout next to a
    label naming the call that produced it.

    @exception AssertionError if any helper returns an unexpected value.
    """
    print('*** testing scidblib.scidb_math...')

    result = scidb_math.comma_separated_number(1234.1234)
    assert result == '1,234.1234'
    # The label spells the function name exactly as it is called above.
    print('comma_separated_number(1234.1234) = %s' % (result,))

    result = scidb_math.fraction_if_less_than_one(0.125)
    assert result == '1/8'
    print('fraction_if_less_than_one(0.125) = %s' % (result,))

    result = scidb_math.ceil_of_division(8, 3)
    assert result == 3
    print('ceil_of_division(8, 3) = %s' % (result,))

    result = scidb_math.round_up(3248, 2)
    assert result == 3300
    print('round_up(3248, 2) = %s' % (result,))

    result = scidb_math.round_down(3248, 2)
    assert result == 3200
    print('round_down(3248, 2) = %s' % (result,))

    result = scidb_math.snap_to_grid(3161, 0.01, use_binary=False)
    assert result == 3160
    print('snap_to_grid(3161, 0.01, use_binary=False) = %s' % (result,))

    result = scidb_math.snap_to_grid(3161, 0.1, use_binary=False)
    assert result == 3000
    print('snap_to_grid(3161, 0.1, use_binary=False) = %s' % (result,))

    result = scidb_math.snap_to_grid(1021, 0.01, use_binary=True)
    assert result == 1024
    print('snap_to_grid(1021, 0.01, use_binary=True) = %s' % (result,))

    result = scidb_math.geomean([3, 3, 4, 8])
    # Compare after rounding: the geometric mean is a float.
    assert round(result, 10) == 4.1195342878
    print('geomean([3, 3, 4, 8]) = %s' % (result,))
    print('')
Example #3
0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    For every dimension parsed from args.raw_dims, the minimum coordinate, the
    maximum coordinate and a distinct count are queried from args.load_array.
    Any '?' in dim_low, dim_high or chunk_overlap is then filled in, and a
    chunk_length is computed for every dimension whose chunk_length was '?'.
    The resulting schema is printed to stdout.

    @param args  the result of argparse.ArgumentParser.parse_args().  Fields
                 read directly here: load_array, raw_dims, verbose,
                 desired_values_per_chunk, keep_shape, grid_threshold and
                 grid_base10.  scidb_afl.get_iquery_cmd() receives the whole
                 object as well.
    @return 0
    @exception AppError if a query result that must be an integer is not one.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose  # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        dim_name = the_dim.dim_name
        index = names_in_load_array.find_index(dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            # The name is a dimension of load_array: group by it, copy the
            # coordinate into a temporary attribute, then aggregate that.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(%s, count(*), %s), %s, %s), '
                   'min(%s), max(%s), count(*))'
                   % (load_array, dim_name, tmp, dim_name, tmp, tmp))
        else:
            # The name is an attribute of load_array: aggregate it directly.
            cmd = ('aggregate(%s, min(%s), max(%s), approxdc(%s))'
                   % (load_array, dim_name, dim_name, dim_name))
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))

        # Only the conversions can raise ValueError, so only they are guarded.
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
        except ValueError:
            # %s formatting keeps the message buildable whatever type the
            # query results have.
            raise scidblib.AppError(
                'Error: I cannot proceed because for %s in array %s'
                ', not all of min_coord (=%s), max_coord (=%s)'
                ', and distinct_count (=%s) are integers.'
                % (dim_name, load_array, min_coord, max_coord, distinct_count))
        if args.verbose:
            print('For %s, min_coord=%s, max_coord=%s, distinct_count=%s'
                  % (dim_name, min_coord_int, max_coord_int,
                     distinct_count_int))
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_concat_of_dim_values = " + '|' + ".join(
        ['string(' + d.dim_name + ')' for d in calculated_dims])

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(%s, %s, %s), approxdc(%s))'
           % (load_array, tmp, string_concat_of_dim_values, tmp))
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    # Cap the approxdc() result at the total cell count of load_array.
    if overall_distinct_count > overall_count:
        overall_distinct_count = overall_count
    if args.verbose:
        print('overall_distinct_count=%s' % (overall_distinct_count,))
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if not N:
        # End the step before returning so that start_step/end_step stay
        # paired, exactly as on the regular path below.
        progress_tracker.end_step('calculate')
        print(scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims]))
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # Values shared by every dimension in N; computed once, outside the loop.
    num_unspecified = len(N)
    even_chunk_count = math.pow(num_chunks_from_n, 1.0 / num_unspecified)
    keep_shape = args.keep_shape
    if not keep_shape:
        # Geometric mean of the distinct counts of the dimensions in N.
        product = 1.0
        for k in N:
            product *= calculated_dims[k].distinct_count
        geomean = math.pow(product, 1.0 / num_unspecified)

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = even_chunk_count
        if not keep_shape:
            # Scale by how this dimension's distinct count compares with the
            # geometric mean over N.
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        # chunk_count is a float, so this is a true division.
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print(scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims]))
    return 0
Example #4
0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    For every dimension parsed from args.raw_dims, the minimum coordinate, the
    maximum coordinate and a distinct count are queried from args.load_array.
    Any '?' in dim_low, dim_high or chunk_overlap is then filled in, and a
    chunk_length is computed for every dimension whose chunk_length was '?'.
    The resulting schema is printed to stdout.

    @param args  the result of argparse.ArgumentParser.parse_args().  Fields
                 read directly here: load_array, raw_dims, verbose,
                 desired_values_per_chunk, keep_shape, grid_threshold and
                 grid_base10.  scidb_afl.get_iquery_cmd() receives the whole
                 object as well.
    @return 0
    @exception AppError if a query result that must be an integer is not one.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = Dimensions(raw_dims_str)

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose  # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims.list):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims.list:
        dim_name = the_dim.dim_name
        index = names_in_load_array.find_index(dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            # The name is a dimension of load_array: group by it, copy the
            # coordinate into a temporary attribute, then aggregate that.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(%s, count(*), %s), %s, %s), '
                   'min(%s), max(%s), count(*))'
                   % (load_array, dim_name, tmp, dim_name, tmp, tmp))
        else:
            # The name is an attribute of load_array: aggregate it directly.
            cmd = ('aggregate(%s, min(%s), max(%s), approxdc(%s))'
                   % (load_array, dim_name, dim_name, dim_name))
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)

        # Only the conversions can raise ValueError, so only they are guarded.
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
        except ValueError:
            # %s formatting keeps the message buildable whatever type the
            # query results have.
            raise scidblib.AppError(
                'Error: I cannot proceed because for %s in array %s'
                ', not all of min_coord (=%s), max_coord (=%s)'
                ', and distinct_count (=%s) are integers.'
                % (dim_name, load_array, min_coord, max_coord, distinct_count))
        if args.verbose:
            print('For %s, min_coord=%s, max_coord=%s, distinct_count=%s'
                  % (dim_name, min_coord_int, max_coord_int,
                     distinct_count_int))
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims.list:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_concat_of_dim_values = " + '|' + ".join(
        ['string(' + d.dim_name + ')' for d in calculated_dims.list])

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(%s, %s, %s), approxdc(%s))'
           % (load_array, tmp, string_concat_of_dim_values, tmp))
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    # Cap the approxdc() result at the total cell count of load_array.
    if overall_distinct_count > overall_count:
        overall_distinct_count = overall_count
    if args.verbose:
        print('overall_distinct_count=%s' % (overall_distinct_count,))
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if not N:
        # End the step before returning so that start_step/end_step stay
        # paired, exactly as on the regular path below.
        progress_tracker.end_step('calculate')
        print(calculated_dims.__str__())
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims.list[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # Values shared by every dimension in N; computed once, outside the loop.
    num_unspecified = len(N)
    even_chunk_count = math.pow(num_chunks_from_n, 1.0 / num_unspecified)
    keep_shape = args.keep_shape
    if not keep_shape:
        # Geometric mean of the distinct counts of the dimensions in N.
        product = 1.0
        for k in N:
            product *= calculated_dims.list[k].distinct_count
        geomean = math.pow(product, 1.0 / num_unspecified)

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims.list[i]
        chunk_count = even_chunk_count
        if not keep_shape:
            # Scale by how this dimension's distinct count compares with the
            # geometric mean over N.
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        # chunk_count is a float, so this is a true division.
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print(calculated_dims.__str__())

    return 0