def test_scidb_afl_module():
    """Testing all public methods in scidblib.scidb_afl."""
    print '*** testing scidblib.scidb_afl...'

    class TmpArgs:
        def __init__(self):
            self.host = ''
            self.port = ''

    args = TmpArgs()
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    scidb_afl.execute_it_return_out_err('ls')
    scidb_afl.afl(iquery_cmd, 'list()')

    print 'time_afl(..., \'list()\') =', scidb_afl.time_afl(
        iquery_cmd, 'list()')

    print 'single_cell_afl(..., \'build(<v:int64>[i=0:0,1,0], 5)\', 1) =', \
        scidb_afl.single_cell_afl(iquery_cmd, 'build(<v:int64>[i=0:0,1,0], 5)', 1)

    print 'single_cell_afl(..., \'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)\', 2) =', \
        scidb_afl.single_cell_afl(iquery_cmd, 'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)', 2)

    print 'get_num_instances(...) =', scidb_afl.get_num_instances(iquery_cmd)
    print 'get_array_names(...) =', scidb_afl.get_array_names(iquery_cmd)
    print
Esempio n. 2
0
def test_scidb_afl_module():
    """Testing all public methods in scidblib.scidb_afl."""
    print '*** testing scidblib.scidb_afl...'
    class TmpArgs:
        def __init__(self):
            self.host = ''
            self.port = ''

    args = TmpArgs()
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    scidb_afl.execute_it_return_out_err('ls')
    scidb_afl.afl(iquery_cmd, 'list()')

    print 'time_afl(..., \'list()\') =', scidb_afl.time_afl(iquery_cmd, 'list()')

    print 'single_cell_afl(..., \'build(<v:int64>[i=0:0,1,0], 5)\', 1) =', \
        scidb_afl.single_cell_afl(iquery_cmd, 'build(<v:int64>[i=0:0,1,0], 5)', 1)

    print 'single_cell_afl(..., \'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)\', 2) =', \
        scidb_afl.single_cell_afl(iquery_cmd, 'apply(build(<v:int64>[i=0:0,1,0], 5), v2, 6)', 2)

    print 'get_num_instances(...) =', scidb_afl.get_num_instances(iquery_cmd)
    print 'get_array_names(...) =', scidb_afl.get_array_names(iquery_cmd)
    print
    def __init__(self, iquery_cmd, load_array):
        """Populate this object from the schema reported by show(load_array).

        The schema string is fetched with a single-cell AFL query and split into
        its <attributes> and [dimensions] parts. self.list receives one
        NameInLoadArray per attribute, followed by one per dimension; the parsed
        parts themselves are kept in self.attrs and self.dims.

        @param iquery_cmd  the iquery command.
        @param load_array  the name of the load array.
        @exception AppError if the output of show() does not match the expected
                            schema pattern.
        """
        self.list = []

        show_query = 'show(' + load_array + ')'
        schema_str = scidb_afl.single_cell_afl(iquery_cmd, show_query, 1)

        # Leading text (the array name), then <attributes>, optional
        # whitespace, then [dimensions] up to the end of the line.
        schema_pattern = r'^.*\<(.*)\>\s*\[(.*)\]$'
        parsed = re.match(schema_pattern, schema_str, re.M | re.I)
        if parsed is None:
            raise scidblib.AppError(
                'System Error! I failed to parse the schema of the load_array.')
        str_attrs, str_dims = parsed.groups()

        # Attributes come first; local_index is the position among attributes.
        self.attrs = Attributes(str_attrs)
        for position, attr in enumerate(self.attrs.list):
            self.list.append(
                NameInLoadArray(attr.attr_name,
                                is_dim=False,
                                is_int64=(attr.attr_type == 'int64'),
                                local_index=position))

        # Dimensions follow; they are always flagged as int64, and local_index
        # restarts at 0.
        self.dims = Dimensions(str_dims)
        for position, dim in enumerate(self.dims.list):
            self.list.append(
                NameInLoadArray(dim.dim_name,
                                is_dim=True,
                                is_int64=True,
                                local_index=position))
Esempio n. 4
0
    def __init__(self, iquery_cmd, load_array):
        """Query the schema of load_array with show() and record its names.

        After construction:
          - self.attrs holds the parsed attributes and self.dims the parsed dimensions;
          - self.list holds one NameInLoadArray per attribute, then one per dimension.

        @param iquery_cmd  the iquery command.
        @param load_array  the name of the load array.
        @exception AppError if the string returned by show() cannot be matched
                            against the schema pattern.
        """
        self.list = []

        schema_str = scidb_afl.single_cell_afl(
            iquery_cmd, 'show(' + load_array + ')', 1)

        schema_regex = re.compile(
            r'^.*'              # array_name
            r'\<(.*)\>\s*'      # <attributes>
            r'\[(.*)\]$',       # [dimensions]
            re.M | re.I)
        found = schema_regex.match(schema_str)
        if found is None:
            raise scidblib.AppError('System Error! I failed to parse the schema of the load_array.')
        str_attrs = found.group(1)
        str_dims = found.group(2)

        # attributes: is_int64 reflects the declared attribute type
        self.attrs = Attributes(str_attrs)
        self.list.extend(
            NameInLoadArray(attr.attr_name,
                            is_dim=False,
                            is_int64=(attr.attr_type == 'int64'),
                            local_index=idx)
            for idx, attr in enumerate(self.attrs.list))

        # dimensions: always recorded as int64
        self.dims = Dimensions(str_dims)
        self.list.extend(
            NameInLoadArray(dim.dim_name,
                            is_dim=True,
                            is_int64=True,
                            local_index=idx)
            for idx, dim in enumerate(self.dims.list))
Esempio n. 5
0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose  # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
            if args.verbose:
                print 'For ' + the_dim.dim_name + ', min_coord=' + str(min_coord_int) +\
                    ', max_coord=' + str(max_coord_int) +\
                    ', distinct_count=' + str(distinct_count_int)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' +
                                    the_dim.dim_name + ' in array ' +
                                    load_array + ', not all of min_coord (=' +
                                    min_coord + '), max_coord (=' + max_coord +
                                    '), and distinct_count (=' +
                                    distinct_count + ') are integers.')
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = []
    for i, the_dim in enumerate(calculated_dims):
        string_values.append('string(' + the_dim.dim_name + ')')
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
        if overall_distinct_count > overall_count:
            overall_distinct_count = overall_count
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    if args.verbose:
        print 'overall_distinct_count=' + str(overall_distinct_count)
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if len(N) == 0:
        print scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims])
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
        if not args.keep_shape:
            # calculate geomean
            product = 1.0
            for k in N:
                product *= calculated_dims[k].distinct_count
            geomean = math.pow(product, 1.0 / len(N))
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims])
    return 0
Esempio n. 6
0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = Dimensions(raw_dims_str)

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(sys.stdout,
                                      '',
                                      args.verbose,     # if_print_start
                                      args.verbose,     # if_print_end
                                      args.verbose      # if_print_skip
                                      )
    progress_tracker.register_step('min_max_dc', 'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc', 'Get overall ApproxDC from load_array.')
    progress_tracker.register_step('calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims.list):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)

    # for each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i]
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims.list:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' + the_dim.dim_name +
                  '), ' + tmp + ', ' + the_dim.dim_name + '), min(' + tmp + '), max(' + tmp + '), count(*))'
                  )
        else:
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name + '), max(' + the_dim.dim_name +
                   '), approxdc(' + the_dim.dim_name + '))'
                   )
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 3)
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
            if args.verbose:
                print 'For ' + the_dim.dim_name + ', min_coord=' + str(min_coord_int) +\
                    ', max_coord=' + str(max_coord_int) +\
                    ', distinct_count=' + str(distinct_count_int)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' + the_dim.dim_name + ' in array ' + load_array +
                            ', not all of min_coord (=' + min_coord + '), max_coord (=' + max_coord +
                            '), and distinct_count (=' + distinct_count + ') are integers.')
        the_dim.set_min_max_dc(min_coord_int, max_coord_int, distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims.list:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = []
    for i, the_dim in enumerate(calculated_dims.list):
        string_values.append('string(' + the_dim.dim_name + ')')
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' + string_concat_of_dim_values + '), approxdc(' + tmp + '))'
           )
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
        if overall_distinct_count > overall_count:
            overall_distinct_count = overall_count
    except ValueError:
        raise scidblib.AppError('Error: The query to get overall_distinct_count failed to return an integer.')
    if args.verbose:
        print 'overall_distinct_count=' + str(overall_distinct_count)
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if len(N)==0:
        print calculated_dims.__str__()
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims.list[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count, int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims.list[i]
        chunk_count = math.pow(num_chunks_from_n, 1.0/len(N))
        if not args.keep_shape:
            # calculate geomean
            product = 1.0
            for k in N:
                product *= calculated_dims.list[k].distinct_count
            geomean = math.pow(product, 1.0/len(N))
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count<1:
            chunk_count = 1.0
        the_dim.chunk_length = int(math.ceil(
                                           (the_dim.max_coord-the_dim.min_coord+1)/chunk_count
                                           ))
        if chunk_count>1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                                   the_dim.chunk_length, args.grid_threshold, use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print calculated_dims.__str__()

    return 0