def get_array_names(iquery_cmd=None, temp_only=False):
    """Get a list of array names.

    @param iquery_cmd the iquery command to use.
    @param temp_only  only get the names of temporary arrays.
    @return a list of array names that are in SciDB, returned by the AFL query
            project(list(), name).
    @exception AppError if SciDB is not running or if the AFL query failed.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    if temp_only:
        query = 'project(filter(list(), temporary=true), name)'
    else:
        query = 'project(list(), name)'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')
    ret = []
    i = 1
    for line in lines[1:]:  # Skip the header line.
        re_name = r'^\{\d+\}\s\'(.+)\'$'  # e.g.: {4} 'MyArray'
        match_name = re.match(re_name, line)
        if not match_name:
            raise scidblib.AppError('I don\'t understand the result line ' +
                                    str(i + 1) + ': ' + line)
        ret.append(match_name.group(1))
        i += 1
    return ret

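# Usage sketch (illustrative addition, not part of the original module). Assumes
# a running SciDB reachable through get_iquery_cmd()'s defaults:
#
#   all_names = get_array_names()
#   temp_names = get_array_names(temp_only=True)
#   print 'arrays: ' + ', '.join(all_names)
#   print 'temp arrays: ' + ', '.join(temp_names)
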
def get_user_names(iquery_cmd=None):
    """Get a list of user names.

    @param iquery_cmd the iquery command to use.
    @return a list of user names that are in SciDB, returned by the AFL query
            project(list('users'), name).
    @exception AppError if SciDB is not running or if the AFL query failed.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = 'project(list(\'users\'), name)'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')
    ret = []
    i = 1
    for line in lines[1:]:  # Skip the header line.
        re_name = r'^\{\d+\}\s\'(.+)\'$'  # e.g.: {4} 'username'
        match_name = re.match(re_name, line)
        if not match_name:
            raise scidblib.AppError('I don\'t understand the result line ' +
                                    str(i + 1) + ': ' + line)
        ret.append(match_name.group(1))
        i += 1
    return ret

def get_libraries(iquery_cmd=None):
    """Get info about the plugin libraries loaded in SciDB.

    @param iquery_cmd the iquery command to use.
    @return a list of LibraryInfo objects, one per line returned by the AFL
            query list('libraries').
    @exception AppError if SciDB is not running or if the AFL query failed.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = "list('libraries')"
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    if len(err_data) > 0:
        raise scidblib.AppError(
            "Failed to list('libraries')\nerr={0}".format(err_data))
    # Successful query execution - out_data format:
    #   {inst,n} name,major,minor,patch,build,build_type
    #   {0,0} 'SciDB',15,12,0,1,'Debug'
    #   {1,0} 'SciDB',15,12,0,1,'Debug'
    #   {2,0} 'SciDB',15,12,0,1,'Debug'
    #   {3,0} 'SciDB',15,12,0,1,'Debug'
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')
    ret = []
    i = 1
    regex1 = re.compile(
        r'^\{(\d+)\,(\d+)\}\s\'(.+)\',(\d+),(\d+),(\d+),(\d+),\s*\'(.+)\'$')
    regex2 = re.compile(
        r'^\{(\d+)\,(\d+)\}\s\'(.+)\',(\d+),(\d+),(\d+),(\d+),\s*null$')
    for line in lines[1:]:  # Skip the header line.
        match1 = regex1.match(line)
        if match1:
            inst, n, name, major, minor, patch, build, build_type = match1.groups()
            libInfo = LibraryInfo(inst, n, name, major, minor, patch, build,
                                  build_type)
            ret.append(libInfo)
        else:
            match2 = regex2.match(line)
            if match2:
                inst, n, name, major, minor, patch, build = match2.groups()
                libInfo = LibraryInfo(inst, n, name, major, minor, patch, build)
                ret.append(libInfo)
            else:
                raise scidblib.AppError(
                    "I don't understand the result line {0}:\n{1}".format(
                        i + 1, line))
        i += 1
    return ret

def __init__(self, s):
    """Given the dimension-specification part of a schema ('?' allowed), parse into Dimensions.

    @param s string representation of the dimension specification.
             All five parts of a dimension specification must be specified.
    """
    self.list = []
    re_one = (
        r'\s*([^=\s]+)' +  # dim_name
        r'\s*=' +          # =
        r'\s*([^:\s]+)' +  # dim_low
        r'\s*:' +          # :
        r'\s*([^,\s]+)' +  # dim_high
        r'\s*,' +          # ,
        r'\s*([^,\s]+)' +  # chunk_length
        r'\s*,' +          # ,
        r'\s*([^,\s]+)' +  # chunk_overlap
        r'\s*'             # trailing space
    )
    re_all_with_leading_comma = r'^\s*,\s*' + re_one + r'(.*)$'
    re_all_without_leading_comma = r'^\s*' + re_one + r'(.*)$'

    remains = s
    re_dim = re_all_without_leading_comma
    while remains:
        match_dim = re.match(re_dim, remains, re.M | re.I)
        if not match_dim:
            raise scidblib.AppError(
                'Error! I cannot parse \'' + remains + '\'.\n' +
                'It is expected to start with '
                'dim_name=dim_low:dim_high,chunk_length,chunk_overlap.')
        dim_name = match_dim.group(1)
        dim_low = match_dim.group(2)
        dim_high = match_dim.group(3)
        chunk_length = match_dim.group(4)
        chunk_overlap = match_dim.group(5)

        # Add to the dimensions, after checking there is not a dimension with the same name.
        for dim in self.list:
            if dim_name == dim.dim_name:
                raise scidblib.AppError(
                    'Error! There are multiple occurrences of the same dim_name (=' +
                    dim_name + ').')
        self.list.append(
            Dimension(dim_name, dim_low, dim_high, chunk_length, chunk_overlap))
        remains = match_dim.group(6)
        re_dim = re_all_with_leading_comma

    if len(self.list) == 0:
        raise scidblib.AppError('Error! Please specify at least one dimension.')

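# Parsing sketch (illustrative addition, not part of the original module). A
# dimension-specification string with '?' placeholders is split into Dimension
# objects, one per dimension:
#
#   dims = Dimensions('i=0:?,8192,0, j=?:?,?,10')
#   for d in dims.list:
#       print d.dim_name, d.dim_low, d.dim_high, d.chunk_length, d.chunk_overlap
#   # i 0 ? 8192 0
#   # j ? ? ? 10
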
def get_array_names(iquery_cmd=None, temp_only=False, versions=False,
                    namespace=None):
    """Get a list of array names.

    @param iquery_cmd the iquery command to use.
    @param temp_only  only get the names of temporary arrays.
    @param versions   set to True to also get all array versions.
    @param namespace  if given, set this namespace prior to getting the arrays.
    @return a list of array names that are in SciDB, returned by the AFL query
            project(list('arrays'), name).
    @exception AppError if SciDB is not running or if the AFL query failed.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    set_namespace_cmd = ''
    if namespace and (namespace != 'public'):
        set_namespace_cmd = make_set_namespace_cmd(namespace, iquery_cmd)

    query_arrays = "list('arrays')"
    if versions:
        query_arrays = "list('arrays', true)"
    if temp_only:
        query_arrays = "filter({0}, temporary=true)".format(query_arrays)
    query = set_namespace_cmd + 'project({0}, name);'.format(query_arrays)

    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')

    ret = []
    if set_namespace_cmd == '':
        start = 1
    else:
        start = 2
        if not lines[0].startswith('Query was executed successfully'):
            raise scidblib.AppError(
                '{0} {1} failed - result={2}'.format(iquery_cmd, query, lines[0]))
    for line in lines[start:]:  # Skip the header line(s).
        re_name = r'^\{\d+\}\s\'(.+)\'$'  # e.g.: {4} 'MyArray'
        match_name = re.match(re_name, line)
        if not match_name:
            raise scidblib.AppError(
                'get_array_names() failed to parse: [' + line + '] '
                "the expected format is {No} 'name'\n"
                'query=' + iquery_cmd + ' ' + query + ' -- start=' + str(start))
        ret.append(match_name.group(1))
    return ret

def __init__(self, s):
    """Given a string representation of some SciDB-array attributes, parse into Attributes.

    @param s the string representation of the attributes.
    """
    self.list = []
    re_one = (
        r'\s*([^:\s]+)\b' +             # the attribute name
        r'\s*:' +                       # :
        r'\s*([^,\s]+)\b' +             # type
        r'(\s+(' +                      # begin of optional clauses
        '|'.join((
            r'not\s+null\b',            # - optional clause: not null
            r'null\b',                  # - optional clause: null
            r'default\s+\\\'.*?\\\'',   # - optional clause: default \'value_of_string_type\'
            r'default\s+[^,\s]+\b',     # - optional clause: default value_of_other_types
            r'compression\s+[^,\s]+\b', # - optional clause: compression constant
            r'reserve\s+[^,\s]+\b'      # - optional clause: reserve constant
        )) +
        r'))*' +                        # end of optional clauses
        r'\s*'                          # trailing space
    )
    re_all_with_leading_comma = r'^\s*,\s*' + re_one + r'(.*)$'
    re_all_without_leading_comma = r'^\s*' + re_one + r'(.*)$'

    remains = s
    re_attr = re_all_without_leading_comma
    while remains:
        match_attr = re.match(re_attr, remains, re.M | re.I)
        if not match_attr:
            raise scidblib.AppError(
                'Error! I cannot parse \'' + remains + '\'.\n' +
                'It does not appear to contain a valid attribute definition.')
        attr_name = match_attr.group(1)
        attr_type = match_attr.group(2)

        # Add to the attributes, after checking there is not an attr with the same name.
        for attr in self.list:
            if attr_name == attr.attr_name:
                raise scidblib.AppError(
                    'Error! There are multiple occurrences of the same attr_name (=' +
                    attr_name + ').')
        self.list.append(Attribute(attr_name, attr_type))
        remains = match_attr.group(5)
        re_attr = re_all_with_leading_comma

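# Parsing sketch (illustrative addition, not part of the original module). The
# attribute part of a schema is split into Attribute objects; optional clauses
# such as NULL / NOT NULL are matched but not retained:
#
#   attrs = Attributes('v:double NOT NULL, name:string NULL')
#   for a in attrs.list:
#       print a.attr_name, a.attr_type
#   # v double
#   # name string
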
def single_cell_afl(iquery_cmd, query, num_attrs):
    """Execute an AFL query that is supposed to return a single cell, and return the attribute values.

    The return type is either a scalar (if num_attrs=1), or a list (if num_attrs>1).
    @example
      - scalar_result1 = single_cell_afl(iquery_cmd, cmd, 1)
      - scalar_result1, scalar_result2 = single_cell_afl(iquery_cmd, cmd, 2)
    @param iquery_cmd the iquery command.
    @param query      the query.
    @param num_attrs  the expected number of attributes in the result array.
    @return the attribute value (if num_attrs=1), or a list of attribute values (if num_attrs>1)
    @exception AssertionError if num_attrs is not a positive integer.
    @exception AppError if the query fails, or the query result is not a single cell,
               or the actual number of attributes is not num_attrs.
    """
    assert isinstance(num_attrs, (int, long)) and num_attrs > 0, \
        'AssertionError: single_cell_afl must be called with a positive num_attrs.'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().split('\n')
    if len(lines) != 2:
        raise scidblib.AppError(
            'The afl query, ' + query +
            ', is supposed to return two lines including the header; but it returned ' +
            str(len(lines)) + ' lines.')

    class DcsvDialect(csv.excel):
        """Dialect slightly tweaked from csv.excel, as a parameter to csv.reader."""

        def __init__(self):
            csv.excel.__init__(self)
            self.quotechar = "'"
            self.lineterminator = '\n'

    re_result = r'^\{0\}\s([^\n]+)$'  # A single-cell afl query returns its result at row 0.
    match_result = re.match(re_result, lines[1], re.M | re.I)
    if not match_result:
        raise scidblib.AppError('The afl query, ' + query + ', did not generate ' +
                                str(num_attrs) + ' attributes as expected.')
    string_io = StringIO(match_result.group(1))
    csv_reader = csv.reader(string_io, DcsvDialect())
    row = csv_reader.next()
    if len(row) != num_attrs:
        raise scidblib.AppError('The afl query, ' + query + ', did not generate ' +
                                str(num_attrs) + ' attributes as expected.')
    if num_attrs == 1:
        return row[0]
    return row

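# Usage sketch (illustrative addition, not part of the original module). A query
# that aggregates to one cell can be read back directly; 'my_array' and the
# attribute 'v' are hypothetical names:
#
#   iquery_cmd = get_iquery_cmd()
#   count = single_cell_afl(iquery_cmd, 'aggregate(my_array, count(*))', 1)
#   lo, hi = single_cell_afl(iquery_cmd, 'aggregate(my_array, min(v), max(v))', 2)
#   print 'count=%s, min=%s, max=%s' % (count, lo, hi)
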
def get_instances_info(iquery_cmd=None):
    """Get the info returned by the list('instances') query, as a list of lists.

    @param iquery_cmd the iquery command to use.
    @return the info returned by the AFL query list('instances'), as a list of lists.
    @exception AppError if SciDB is not running or if #instances <= 0 (for whatever reason)
    """
    iquery_args = scidblib.util.superTuple('args', 'host', 'port')
    iquery_args.host = os.environ.get('IQUERY_HOST', None)
    iquery_args.port = os.environ.get('IQUERY_PORT', None)
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd(args=iquery_args,
                                    base_iquery_cmd='iquery -o csv:l')
    query = 'list(\'instances\')'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = [line.strip() for line in out_data.strip().split('\n')]
    if len(lines) < 2:
        raise scidblib.AppError(query + ' is expected to return at least two lines.')
    tokenized_lines = [[t.strip().replace('\'', '') for t in line.split(',')]
                       for line in lines[1:]]
    return tokenized_lines

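# Usage sketch (illustrative addition, not part of the original module). Each
# inner list holds the comma-separated fields of one list('instances') row with
# the quotes stripped; the exact columns depend on the SciDB version:
#
#   for fields in get_instances_info():
#       print fields
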
def afl(iquery_cmd, query, want_output=False, tolerate_error=False,
        verbose=False):
    """Execute an AFL query.

    @param iquery_cmd     the iquery command.
    @param query          the AFL query.
    @param want_output    request iquery to output the query result.
    @param tolerate_error whether to keep silent when STDERR is not empty.
                          A use case is when trying to delete an array which may or may not exist.
    @param verbose        whether to print the query after it completes.
    @return (stdout_data, stderr_data)
    @exception AppError if STDERR is not empty and the caller passed tolerate_error=False.
    """
    full_command = iquery_cmd + ' -'
    if not want_output:
        full_command += 'n'
    full_command += "aq \"" + query + "\""
    out_data, err_data = execute_it_return_out_err(full_command)
    if not tolerate_error and len(err_data) > 0:
        raise scidblib.AppError('The AFL query, ' + query +
                                ', failed with the following error:\n' + err_data)
    if verbose:
        print verbose_afl_result_line_start() + '%s.' % query
    return (out_data, err_data)

def _print(self, what, step_id):
    """A helper function, servicing all of start_step(), end_step(), and skip_step().

    @param what    a string out of 'start', 'end', or 'skip'.
    @param step_id the previously-registered step_id.
    @exception AssertionError if 'what' is not understood.
    @exception AppError if the step_id was not registered.
    """
    assert what in self._if_print
    if not step_id in self._id_2_index:
        raise scidblib.AppError('The step_id, \'' + step_id +
                                '\', was not registered in ProgressTracker.')
    if self._if_print[what]:
        s = self._prefix[what]
        if self._name:
            s += self._name + ': '
        s += ('Step ' + str(self._id_2_index[step_id]) + ' of ' +
              str(len(self._id_2_index)) + ' ' + self._verb[what])
        # In 'start' and 'skip' messages, print the step name;
        # in 'end' messages, print the elapsed time for the step.
        if what == 'start' or what == 'skip':
            s += '. (' + self._id_2_name[step_id] + ')'  # print the step name
        elif (step_id in self._start_time and step_id in self._end_time and
              self._end_time[step_id] > self._start_time[step_id]):
            timedelta = self._end_time[step_id] - self._start_time[step_id]
            seconds = timedelta_total_seconds(timedelta)
            s += ' after ' + str(seconds) + ' s.'
        else:
            s += '.'
        s += self._suffix[what]
        print >> self._out, s

def remove_array(arrayName, namespace=None, iquery_cmd=None):
    """Remove an array from SciDB.

    @param arrayName  the name of the array to remove.
    @param namespace  the namespace in which the array resides; None means public.
    @param iquery_cmd the iquery command to use.
    @exception AppError if SciDB is not running or if the AFL query failed.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    if namespace and (namespace != 'public'):
        query = ';'.join((make_set_namespace_cmd(namespace),
                          'remove(%s)' % arrayName))
        expected = ('Query was executed successfully\n'
                    'Query was executed successfully\n')
    else:
        query = 'remove(%s)' % arrayName
        expected = 'Query was executed successfully\n'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    if out_data != expected:
        failureMsg = 'Cannot remove array ' + arrayName
        if namespace:
            failureMsg += ' from namespace ' + namespace
        failureMsg += '\nout_data=' + out_data
        failureMsg += '\nexpected=' + expected
        raise scidblib.AppError(failureMsg)

def get_operators(iquery_cmd=None):
    """Get a list of operators.

    @param iquery_cmd the iquery command to use.
    @return a list of (operator, library) tuples for the operators in SciDB,
            returned by the AFL query list('operators').
    @exception AppError if SciDB is not running or if the AFL query failed.

    Example usage:
      operators = scidb_afl.get_operators()
      for (operator, library) in operators:
          print "operator=" + operator + " library=" + library
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = 'list(\'operators\');'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')
    ret = []
    for line in lines:
        try:
            parse_line = line[line.find(" ") + 1:]
            operator, library = csv_splitter(parse_line, "\'")
            ret.append((operator, library))
        except Exception:
            pass  # Skip lines that do not parse as operator rows.
    return ret

def get_array_names(iquery_cmd=None, temp_only=False, namespace=None):
    """Get a list of array names.

    @param iquery_cmd the iquery command to use.
    @param temp_only  only get the names of temporary arrays.
    @param namespace  if given, set this namespace prior to getting the arrays.
    @return a list of array names that are in SciDB, returned by the AFL query
            project(list(), name).
    @exception AppError if SciDB is not running or if the AFL query failed.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    set_namespace_cmd = ''
    if namespace and (namespace != 'public'):
        set_namespace_cmd = make_set_namespace_cmd(namespace)
    if temp_only:
        query = set_namespace_cmd + 'project(filter(list(), temporary=true), name);'
    else:
        query = set_namespace_cmd + 'project(list(), name);'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')
    ret = []
    if set_namespace_cmd == '':
        start = 1
    else:
        start = 2
        if lines[0] != 'Query was executed successfully':
            raise scidblib.AppError('set_namespace(' + namespace +
                                    ') failed - result=' + lines[0])
    for line in lines[start:]:  # Skip the header line(s).
        re_name = r'^\{\d+\}\s\'(.+)\'$'  # e.g.: {4} 'MyArray'
        match_name = re.match(re_name, line)
        if not match_name:
            raise scidblib.AppError(
                'get_array_names() failed to parse: [' + line +
                "] the expected format is {#} 'name'")
        ret.append(match_name.group(1))
    return ret

def register_step(self, step_id, step_name):
    """Register a step.

    @param step_id   an identifier to be used later when a step starts/ends.
    @param step_name what the step does.
    """
    if step_id in self._id_2_index:
        raise scidblib.AppError('The step_id, \'' + step_id +
                                '\', was already registered.')
    self._id_2_name[step_id] = step_name
    self._id_2_index[step_id] = len(self._id_2_name)

def __str__(self):
    """To string.

    @return a string form of the version and date.
    @exception AppError if the VersionAndDate is not valid.
    """
    if not self.valid():
        raise scidblib.AppError(
            'The VersionAndDate cannot be turned to string because it is not valid.')
    return '{0}.{1}.{2} ({3}-{4}-{5})'.format(self.major, self.minor, self.revision,
                                              self.year, self.month, self.day)

def raise_exception(self, err_msg):
    """Raise an exception whose message is prefixed with the dimension name.

    @param err_msg the error string.
    @exception AppError unconditionally.
    """
    msg = 'Error! In dimension \'' + self.dim_name + '\': ' + err_msg
    raise scidblib.AppError(msg)

def find_index(self, name):
    """Given a name, find its index in self.list.

    @param name the name to search for.
    @return the index of the name in self.list.
    @exception AppError if the name does not exist.
    """
    for i, the_name in enumerate(self.list):
        if the_name.name == name:
            return i
    raise scidblib.AppError('System Error: the name \'' + name +
                            '\' does not exist in NamesInLoadArray!')

def __str__(self):
    """Generate a string from Dimensions.

    @return the generated string
    @exception AppError if there is no dimension, or some dimension's __str__() encounters an error.
    """
    if len(self.list) == 0:
        raise scidblib.AppError('System Error! There should be at least one dimension.')
    str_dims = [str(dim) for dim in self.list]
    return ', '.join(str_dims)

def csv_splitter(line, string_delimiter='\"'): output = [] try: lines = [line] output = csv.reader(lines, quotechar=string_delimiter, delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True) output = next(output) except Exception, error: raise scidblib.AppError('csv_splitter exception ' + str(error) + " on line=" + line)
def raise_if_duplicates(it, description):
    """Raise an AppError if iterable 'it' contains duplicates.

    @param[in] it          iterable to test for duplicates
    @param[in] description what 'it' contains, for error reporting
    @throws AppError if duplicate(s) found
    """
    unique = set(it)
    if len(unique) < len(it):
        it2 = list(it)      # local copy
        for x in unique:
            it2.remove(x)   # what remains are the duplicates
        raise scidblib.AppError(
            'Error! There are multiple occurrences of the same {0}: {1}'.format(
                description, it2))

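# Usage sketch (illustrative addition, not part of the original module):
#
#   raise_if_duplicates(['i', 'j'], 'dim_name')        # passes silently
#   raise_if_duplicates(['i', 'j', 'i'], 'dim_name')   # raises AppError listing ['i']
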
def get_num_instances(iquery_cmd=None):
    """Get the number of SciDB instances.

    @param iquery_cmd the iquery command to use.
    @return the number of SciDB instances acquired by the AFL query list('instances')
    @exception AppError if SciDB is not running or if #instances <= 0 (for whatever reason)
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = 'list(\'instances\')'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    num_lines = len(out_data.strip().split('\n'))
    if num_lines < 2:
        raise scidblib.AppError(query + ' is expected to return at least two lines.')
    return num_lines - 1  # skip the header line

def time_afl(iquery_cmd, query):
    """Execute an AFL query, and return the execution time.

    @param iquery_cmd the iquery command.
    @param query      the AFL query.
    @return the execution time in seconds.
    @exception AppError if the query did not execute successfully.
    """
    full_command = ('/usr/bin/time -f \"%e\" ' + iquery_cmd +
                    ' -naq \"' + query + "\" 1>/dev/null")
    out_data, err_data = execute_it_return_out_err(full_command)
    try:
        return float(err_data)
    except ValueError:
        raise scidblib.AppError('Timing the AFL query ' + query +
                                ', failed with the following error:\n' + err_data)

def __init__(self, iquery_cmd, load_array):
    """Call iquery -aq "show(load_array)" to get the schema of the load array, and fill in data members.

    @param iquery_cmd the iquery command.
    @param load_array the name of the load array.
    @exception AppError if the show() command does not produce a valid schema,
               e.g. if load_array is not a valid array name in the database.
    """
    self.list = []
    schema_str = scidb_afl.single_cell_afl(iquery_cmd,
                                           'show(' + load_array + ')', 1)
    re_schema = (
        r'^.*' +          # array_name
        r'\<(.*)\>\s*' +  # <attributes>
        r'\[(.*)\]$'      # [dimensions]
    )
    match_schema = re.match(re_schema, schema_str, re.M | re.I)
    if not match_schema:
        raise scidblib.AppError(
            'System Error! I failed to parse the schema of the load_array.')
    str_attrs = match_schema.group(1)
    str_dims = match_schema.group(2)

    # Attributes.
    self.attrs = Attributes(str_attrs)
    attrs = self.attrs.list
    for i, attr in enumerate(attrs):
        one_name = NameInLoadArray(attr.attr_name,
                                   is_dim=False,
                                   is_int64=attr.attr_type == 'int64',
                                   local_index=i)
        self.list.append(one_name)

    # Dimensions.
    self.dims = Dimensions(str_dims)
    dims = self.dims.list
    for i, dim in enumerate(dims):
        one_name = NameInLoadArray(dim.dim_name, is_dim=True, is_int64=True,
                                   local_index=i)
        self.list.append(one_name)

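# Usage sketch (illustrative addition, not part of the original module). For a
# hypothetical load array 'arr_raw <i:int64,v:double> [dummy=0:*,1000000,0]',
# attributes are listed before dimensions:
#
#   names = NamesInLoadArray(scidb_afl.get_iquery_cmd(), 'arr_raw')
#   for one_name in names.list:
#       print one_name.name, one_name.is_dim, one_name.is_int64
#   # i False True      (int64 attribute)
#   # v False False     (double attribute)
#   # dummy True True   (dimension)
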
def time_afl(iquery_cmd, query, verbose=False):
    """Execute an AFL query, and return the execution time.

    @param iquery_cmd the iquery command.
    @param query      the AFL query.
    @param verbose    whether to print the query and its elapsed time.
    @return the execution time in seconds.
    @exception AppError if the query did not execute successfully.
    """
    full_command = ('/usr/bin/time -f \"%e\" ' + iquery_cmd +
                    ' -naq \"' + query + "\" 1>/dev/null")
    out_data, err_data = execute_it_return_out_err(full_command)
    try:
        t = float(err_data)
        if verbose:
            print verbose_afl_result_line_start() + '%s in %f seconds.' % (query, t)
        return t
    except ValueError:
        raise scidblib.AppError('Timing the AFL query ' + query +
                                ', failed with the following error:\n' + err_data)

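# Usage sketch (illustrative addition, not part of the original module). Requires
# /usr/bin/time; 'my_array' is a hypothetical array name:
#
#   iquery_cmd = get_iquery_cmd()
#   seconds = time_afl(iquery_cmd, 'aggregate(my_array, count(*))', verbose=True)
#   print 'the query took %f seconds' % seconds
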
def earlier_than(self, another):
    """Compare two versions.

    @param another another VersionAndDate object.
    @return whether the current VersionAndDate is earlier than the other one.
    @exception AppError if either of the two objects is not valid.
    """
    if not self.valid() or not another.valid():
        raise scidblib.AppError('I cannot compare invalid VersionAndDate objects.')
    if self.major < another.major:
        return True
    elif self.major > another.major:
        return False
    if self.minor < another.minor:
        return True
    elif self.minor > another.minor:
        return False
    return self.revision < another.revision

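# Usage sketch (illustrative addition, not part of the original module), assuming
# a VersionAndDate constructor that fills in major/minor/revision and
# year/month/day from a string such as '15.12.0 (2015-12-01)':
#
#   a = VersionAndDate('15.7.0 (2015-07-15)')
#   b = VersionAndDate('15.12.0 (2015-12-01)')
#   print a.earlier_than(b)   # True
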
def main():
    """The main function gets command-line arguments and calls calculate_chunk_length().

    @return 0
    @exception AppError if something goes wrong.
    """
    parser = argparse.ArgumentParser(
        description='Chunk-length calculator (c) SciDB, Inc.\n' +
        '\n' +
        'The program calculates a dimension-specification string from a raw_dims string,\n' +
        'by replacing \'?\' with calculated values, for fields such as the chunk length.\n' +
        'The calculated string may be cut & pasted into the dimension-specification part\n' +
        'of a result-array schema, for the redimension() query.',
        epilog='examples:\n' +
        '  Suppose you have a SciDB array:\n' +
        '    arr_raw <i:int64,j:int64,v:double> [dummy=0:*,1000000,0],\n' +
        '  you want to redimension it into a matrix where i and j are dimensions, but you\n' +
        '  need help in choosing chunk lengths and/or low and high coordinates of the\n' +
        '  dimensions.\n' +
        '  You may call:\n' +
        '    calculate_chunk_length.py arr_raw \'i=?:?,?,?, j=?:?,?,?\'\n' +
        '  You are free to interleave \'?\' with values you desire. E.g. you may call:\n' +
        '    calculate_chunk_length.py arr_raw \'i=0:?,8192,0, j=?:?,?,10\'\n' +
        '\n' +
        'assumptions:\n' +
        '  - iquery is in your path.\n' +
        '  - The specified load_array exists in the database, and has data loaded.\n' +
        '  - Every specified dim_name in raw_dims must exist in load_array, either as a\n' +
        '    dimension, or as an attribute which is of type int64.\n' +
        '  - If you choose to specify a \'low\' (or \'high\') value for a dimension, it\n' +
        '    must be a lowerbound (or upperbound) of all actual values for that name in\n' +
        '    load_array.\n' +
        '\n' +
        'limitations:\n' +
        '  - The algorithm does not handle skew, e.g. when the majority of the array is\n' +
        '    empty but there are a few small dense regions. In such cases, the script may\n' +
        '    produce overly large chunk lengths in that chunks covering the dense regions\n' +
        '    may use too much memory.\n' +
        '    The workaround is to reduce the desired_values_per_chunk argument.\n' +
        '  - The algorithm does not handle large-sized attributes, e.g. string attributes\n' +
        '    with thousands or even millions of bytes. In such cases, the script may\n' +
        '    produce overly large chunk lengths in that chunks of such attributes may\n' +
        '    use too much memory.\n' +
        '    The workaround is again to reduce the desired_values_per_chunk argument.',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'load_array',
        help='The name of the source array to redimension from.')
    parser.add_argument(
        'raw_dims',
        help='''A string describing the dimension specification of the result array,
        in the form of 'dim_name=low:high,chunk_length,chunk_overlap [, OTHER_DIMS]'.
        Notes:
        (a) dim_name must be the name of either a dimension or an int64-typed attribute
            in load_array.
        (b) The other four components are either an integer (the algorithm will respect
            that) or '?' (the algorithm will calculate a value for it).
        (c) The 'high' component has an additional choice of '*', in which case the
            calculated schema will also contain '*'.''')
    parser.add_argument('-c', '--host',
                        help='Host name to be passed to iquery.')
    parser.add_argument('-p', '--port',
                        help='Port number to be passed to iquery.')
    parser.add_argument(
        '-d', '--desired_values_per_chunk', type=int, default=1024 * 1024,
        help='''The number of desired non-empty values per chunk.
        The default value is 1 Mebi (i.e. 2^20).
        With the same desired values per chunk, a sparser result array will get
        larger chunk lengths.''')
    parser.add_argument(
        '-k', '--keep_shape', action='store_true',
        help='''If specified, the shape of a chunk will be similar to the shape of the
        array, i.e. every dimension will be partitioned into a similar number of pieces.
        The default is not keep_shape, i.e. a larger dimension will be partitioned into
        more pieces.''')
    parser.add_argument(
        '-v', '--verbose', default=0, action='count',
        help='If specified, progress of the algorithm will be reported.')
    parser.add_argument(
        '-t', '--grid_threshold', type=float, default=0.1,
        help='''The default value is 0.1 (or 10%%).
        The algorithm makes an effort to adjust each calculated chunk length to a nearby
        'gridline value' (see the -g option). A gridline value will be considered only if
        its relative difference from the calculated chunk length is no more than this
        grid_threshold. E.g. with the default value, if a calculated chunk length is 3847,
        the gridline value 4000 may be considered because its relative difference from
        3847, i.e. (4000-3847)/3847=0.04, is less than 0.1; but the gridline value 3000
        may not be considered because its relative difference from 3847 (=0.22) exceeds
        0.1. You may disable any adjustment by setting grid_threshold = 0.''')
    parser.add_argument(
        '-g', '--grid_base10', action='store_true',
        help='''If specified, use a multiple-of-power-of-10 as the gridline.
        The default is not to specify, in which case a power-of-2 is used as the gridline.
        In the power-of-2 case, there is only one candidate gridline value: the closest
        power of 2. In the multiple-of-power-of-10 case, multiple candidates may be
        considered, with different numbers of ending zeros. If multiple gridline values
        are within grid_threshold, the one with the greatest number of ending zeros is
        chosen, breaking ties by favoring the one closer to the calculated chunk length.
        E.g. if a calculated chunk length is 3847, the gridline values 10,000, 4000,
        3800, and 3850 are all considered. If grid_threshold=0.1, 4000 is chosen; if
        grid_threshold=0.01, 3850 is chosen.''')
    args = parser.parse_args()

    global _verbose
    _verbose = args.verbose

    try:
        if args.desired_values_per_chunk <= 0:
            raise scidblib.AppError('Desired values per chunk must be positive.')
        exit_code = calculate_chunk_length(args)
        assert exit_code == 0, \
            'AssertionError: the command is expected to return 0 unless an exception was thrown.'
    except scidblib.AppError as e:
        print >> sys.stderr, '------ Exception -----------------------------'
        print >> sys.stderr, e
        if args.verbose:
            print >> sys.stderr, '------ Traceback (for debug purpose) ---------'
            traceback.print_exc()
        print >> sys.stderr, '----------------------------------------------'
        return -1
    except Exception as e:
        print >> sys.stderr, '------ Unexpected Exception ------------------'
        print >> sys.stderr, e
        print >> sys.stderr, '------ Traceback (for debug purpose) ---------'
        traceback.print_exc()
        print >> sys.stderr, '----------------------------------------------'
        return -2
    return 0

def calculate_chunk_length(args):
    """Calculate the chunk length and other fields which were '?', and print out the schema.

    @param args the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if anything goes wrong.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array
    raw_dims_str = args.raw_dims

    calculated_dims = parse_dimensions(raw_dims_str)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker.
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose   # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step(
        'overall_dc', 'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = dims where chunk_length is Specified;
    # N = dims where chunk_length is Not specified.
    S = []
    N = []
    for i, the_dim in enumerate(calculated_dims):
        if the_dim.chunk_length == '?':
            N.append(i)
        else:
            S.append(i)
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # For each i in [0..d), calculate min_coord[i], max_coord[i], and distinct_count[i].
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
            if args.verbose:
                print 'For ' + the_dim.dim_name + \
                    ', min_coord=' + str(min_coord_int) + \
                    ', max_coord=' + str(max_coord_int) + \
                    ', distinct_count=' + str(distinct_count_int)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' +
                                    the_dim.dim_name + ' in array ' + load_array +
                                    ', not all of min_coord (=' + min_coord +
                                    '), max_coord (=' + max_coord +
                                    '), and distinct_count (=' + distinct_count +
                                    ') are integers.')
        the_dim.set_min_max_dc(min_coord_int, max_coord_int, distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill in dim_low, dim_high, and chunk_overlap (those which were '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    #   string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_values = []
    for i, the_dim in enumerate(calculated_dims):
        string_values.append('string(' + the_dim.dim_name + ')')
    string_concat_of_dim_values = ' + \'|\' + '.join(string_values)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
        if overall_distinct_count > overall_count:
            overall_distinct_count = overall_count
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.')
    if args.verbose:
        print 'overall_distinct_count=' + str(overall_distinct_count)
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, we are done.
    if len(N) == 0:
        print scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims])
        return 0

    # Set num_chunks_from_n.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(num_chunks_from_n,
                                                        chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = math.pow(num_chunks_from_n, 1.0 / len(N))
        if not args.keep_shape:
            # Calculate the geometric mean of the distinct counts over N.
            product = 1.0
            for k in N:
                product *= calculated_dims[k].distinct_count
            geomean = math.pow(product, 1.0 / len(N))
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        the_dim.chunk_length = int(
            math.ceil((the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length, args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print the result.
    print scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims])
    return 0