Beispiel #1
0
def get_array_names(iquery_cmd=None, temp_only=False):
    """Get a list of array names.

    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @param temp_only   only get the names of temp arrays.
    @return a list of array names, parsed from the output of the AFL query project(list(), name)
            (or of its temporary=true filtered form when temp_only is set).
    @exception AppError if the query produced no output, or if a result line does not look
                        like {N} 'name'. Exceptions raised by afl() propagate unchanged.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = 'project(filter(list(), temporary=true), name)' if temp_only else 'project(list(), name)'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query +
                                ' is expected to return at least one line.')

    re_name = re.compile(r'^\{\d+\}\s\'(.+)\'$')  # e.g.: {4} 'MyArray'
    ret = []
    # The header occupies line 1, so the first data line is reported as line 2.
    for line_number, line in enumerate(lines[1:], 2):
        match_name = re_name.match(line)
        if not match_name:
            raise scidblib.AppError('I don\'t understand the result line ' +
                                    str(line_number) + ': ' + line)
        ret.append(match_name.group(1))
    return ret
Beispiel #2
0
def get_user_names(iquery_cmd=None):
    """Get a list of user names.

    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @return a list of the names found in the output of the AFL query project(list('users'), name),
            in output order.
    @exception AppError if the query produced no output, or if a result line does not look
                        like {N} 'name'. Exceptions raised by afl() propagate unchanged.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = 'project(list(\'users\'), name)'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    output_lines = out_data.strip().splitlines()
    if not output_lines:
        raise scidblib.AppError(query +
                                ' is expected to return at least one line.')

    name_pattern = re.compile(r'^\{\d+\}\s\'(.+)\'$')  # e.g.: {4} 'username'
    names = []
    # The header occupies line 1, so the first data line is reported as line 2.
    for line_number, line in enumerate(output_lines[1:], 2):
        found = name_pattern.match(line)
        if found is None:
            raise scidblib.AppError('I don\'t understand the result line ' +
                                    str(line_number) + ': ' + line)
        names.append(found.group(1))
    return names
Beispiel #3
0
def get_libraries(iquery_cmd=None):
    """Get the libraries reported by the AFL query list('libraries').

    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @return a list of LibraryInfo objects, one per data line of the query output.
            A line whose build_type is null yields a LibraryInfo built without build_type.
    @exception AppError if the query wrote anything to STDERR, produced no output,
                        or produced a line that matches neither expected format.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = "list('libraries')"
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    if len(err_data) > 0:
        raise scidblib.AppError(
            "Failed to list('libraries')\nerr={0}".format(err_data))

    # Successful query execution - out_data format
    # {inst,n} name,major,minor,patch,build,build_type
    # {0,0} 'SciDB',15,12,0,1,'Debug'
    # {1,0} 'SciDB',15,12,0,1,'Debug'
    # {2,0} 'SciDB',15,12,0,1,'Debug'
    # {3,0} 'SciDB',15,12,0,1,'Debug'

    output_lines = out_data.strip().splitlines()
    if not output_lines:
        raise scidblib.AppError(query +
                                ' is expected to return at least one line.')

    # Eight captured fields when build_type is a quoted string ...
    with_build_type = re.compile(
        r'^\{(\d+)\,(\d+)\}\s\'(.+)\',(\d+),(\d+),(\d+),(\d+),\s*\'(.+)\'$')
    # ... and seven when build_type is the literal null.
    null_build_type = re.compile(
        r'^\{(\d+)\,(\d+)\}\s\'(.+)\',(\d+),(\d+),(\d+),(\d+),\s*null$')

    libraries = []
    # The header occupies line 1, so the first data line is reported as line 2.
    for line_number, line in enumerate(output_lines[1:], 2):
        parsed = with_build_type.match(line) or null_build_type.match(line)
        if not parsed:
            raise scidblib.AppError(
                "I don\'t understand the result line {0} :\n{1}".format(
                    line_number, line))
        # The captured groups are exactly the positional arguments, in order.
        libraries.append(LibraryInfo(*parsed.groups()))
    return libraries
    def __init__(self, s):
        """Build the list of Dimension objects described by a dimension-specification string.

        Every dimension must be written as dim_name=dim_low:dim_high,chunk_length,chunk_overlap
        with all five parts present ('?' is accepted in any of them); consecutive dimensions
        are separated by commas.

        @param s  string representation of the dimensions specification.
        @exception AppError if some part of s cannot be parsed, if a dim_name occurs more
                            than once, or if s describes no dimension at all.
        """
        self.list = []

        one_dim = (
            r'\s*([^=\s]+)' +  # dim_name
            r'\s*=' +  # =
            r'\s*([^:\s]+)' +  # dim_low
            r'\s*:' +  # :
            r'\s*([^,\s]+)' +  # dim_high
            r'\s*,' +  # ,
            r'\s*([^,\s]+)' +  # chunk_length
            r'\s*,' +  # ,
            r'\s*([^,\s]+)' +  # chunk_overlap
            r'\s*'  #
        )
        first_dim_pattern = r'^\s*' + one_dim + r'(.*)$'
        later_dim_pattern = r'^\s*,\s*' + one_dim + r'(.*)$'

        # Only the first dimension may appear without a separating comma in front of it.
        pattern = first_dim_pattern
        unparsed = s
        while unparsed:
            parsed = re.match(pattern, unparsed, re.M | re.I)
            if not parsed:
                raise scidblib.AppError(
                    'Error! I cannot parse \'' + unparsed + '\'.\n' +
                    'It is expected to start with dim_name=dim_low:dim_high,chunk_length,chunk_overlap.'
                )
            # Groups 1-5 are the dimension's fields; group 6 is whatever text follows it.
            dim_name, dim_low, dim_high, chunk_length, chunk_overlap, tail = parsed.groups()

            # Reject a name that was already collected.
            if any(dim_name == known.dim_name for known in self.list):
                raise scidblib.AppError(
                    'Error! There are multiple occurrences of the same dim_name (='
                    + dim_name + ').')
            self.list.append(
                Dimension(dim_name, dim_low, dim_high, chunk_length,
                          chunk_overlap))

            unparsed = tail
            pattern = later_dim_pattern

        if not self.list:
            raise scidblib.AppError(
                'Error! Please specify at least one dimension.')
Beispiel #5
0
def get_array_names(iquery_cmd=None,
                    temp_only=False,
                    versions=False,
                    namespace=None):
    """Get a list of array names.
    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @param temp_only   only get the names of temporary arrays.
    @param versions    set to true if interested in getting all array names and their versions
    @param namespace used to specify a namespace prior to getting the arrays; an empty value
                     or 'public' sends no namespace command.
    @return a list of array names, parsed from the output of the AFL query
            project(list('arrays'...), name).
    @exception AppError if the query produced no output, if the namespace command did not
                        report success, or if a result line does not look like {N} 'name'.
                        Exceptions raised by afl() propagate unchanged.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()

    set_namespace_cmd = ''
    if namespace and (namespace != 'public'):
        set_namespace_cmd = make_set_namespace_cmd(namespace, iquery_cmd)

    query_arrays = "list('arrays', true)" if versions else "list('arrays')"
    if temp_only:
        query_arrays = "filter({0}, temporary=true)".format(query_arrays)

    query = set_namespace_cmd + 'project({0}, name);'.format(query_arrays)

    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query +
                                ' is expected to return at least one line.')

    if set_namespace_cmd == '':
        start = 1  # skip only the header line
    else:
        # The namespace command contributes its own status line ahead of the header.
        start = 2
        if not lines[0].startswith('Query was executed successfully'):
            # One message string, so that str(exception) reads as a sentence.
            raise scidblib.AppError(iquery_cmd + ' ' + query +
                                    ' failed - result=' + lines[0])

    re_name = re.compile(r'^\{\d+\}\s\'(.+)\'$')  # e.g.: {4} 'MyArray'
    ret = []
    for line in lines[start:]:
        match_name = re_name.match(line)
        if not match_name:
            raise scidblib.AppError(
                'get_array_names() failed to parse: [' + line +
                "] the expected format is {No} 'name'\nquery=" + iquery_cmd +
                " " + query + " -- start=" + str(start))

        ret.append(match_name.group(1))
    return ret
    def __init__(self, s):
        """Build the list of Attribute objects described by an attribute-specification string.

        Each attribute is written as name:type, optionally followed by clauses
        (not null, null, default ..., compression ..., reserve ...) which are accepted
        but not stored; consecutive attributes are separated by commas.

        @param s  the string representation of the attributes.
        @exception AppError if some part of s cannot be parsed, or if an attribute name
                            occurs more than once.
        """
        self.list = []

        optional_clauses = '|'.join((
            r'not\s+null\b',  #   - optional clause: not null
            r'null\b',  #   - optional clause: null
            r'default\s+\\\'.*?\\\'',  #   - optional clause: default \'value_of_string_type\'
            r'default\s+[^,\s]+\b',  #   - optional clause: default value_of_other_types
            r'compression\s+[^,\s]+\b',  #   - optional clause: compression constant
            r'reserve\s+[^,\s]+\b'  #   - optional clause: reserve constant
        ))
        one_attr = (
            r'\s*([^:\s]+)\b' +  # the attribute name
            r'\s*:' +  # :
            r'\s*([^,\s]+)\b' +  # type
            r'(\s+(' + optional_clauses + r'))*' +  # zero or more optional clauses
            r'\s*'  # trailing space
        )
        first_attr_pattern = r'^\s*' + one_attr + r'(.*)$'
        later_attr_pattern = r'^\s*,\s*' + one_attr + r'(.*)$'

        # Only the first attribute may appear without a separating comma in front of it.
        pattern = first_attr_pattern
        unparsed = s
        while unparsed:
            parsed = re.match(pattern, unparsed, re.M | re.I)
            if not parsed:
                raise scidblib.AppError(
                    'Error! I cannot parse \'' + unparsed + '\'.\n' +
                    'It does not appear to contain valid attribute definition.'
                )

            attr_name = parsed.group(1)
            attr_type = parsed.group(2)

            # Reject a name that was already collected.
            if any(attr_name == known.attr_name for known in self.list):
                raise scidblib.AppError(
                    'Error! There are multiple occurrences of the same attr_name (='
                    + attr_name + ').')
            self.list.append(Attribute(attr_name, attr_type))

            # Groups 3 and 4 belong to the optional clauses; group 5 is the text that follows.
            unparsed = parsed.group(5)
            pattern = later_attr_pattern
Beispiel #7
0
def single_cell_afl(iquery_cmd, query, num_attrs):
    """Run an AFL query that must produce exactly one cell, and return that cell's attribute values.

    The return type is either a scalar (if num_attrs=1), or a list (if num_attrs>1).
    @example
      - scalar_result1 = single_cell_afl(iquery_cmd, cmd, 1)
      - scalar_result1, scalar_result2 = single_cell_afl(iquery_cmd, cmd, 2)

    @param iquery_cmd the iquery command
    @param query the query.
    @param num_attrs the expected number of attributes in the return array.
    @return the attribute value (if num_attrs=1), or a list of attribute values (if num_attrs>1);
            values are the strings produced by the csv reader.
    @exception AssertionError if num_attrs is not a positive integer.
    @exception AppError if the output is not exactly a header line plus one line, if that line
                     does not start with {0}, or if the number of values is not num_attrs.
                     Exceptions raised by afl() propagate unchanged.
    """
    assert isinstance(num_attrs, (int, long)) and num_attrs>0, \
        'AssertionError: single_cell_afl must be called with a positive num_attrs.'

    class DcsvDialect(csv.excel):
        """Dialect slightly tweaked from csv.excel, as a parameter to csv.reader."""
        quotechar = "'"
        lineterminator = '\n'

    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().split('\n')
    line_count = len(lines)
    if line_count != 2:
        raise scidblib.AppError(
            'The afl query, ' + query +
            ', is supposed to return two lines including header; but it returned '
            + str(line_count) + ' lines.')

    # Both failure modes below are reported with the same message.
    unexpected_attrs = ('The afl query, ' + query +
                        ', did not generate ' + str(num_attrs) +
                        ' attributes as expected.')

    # A single-cell afl query returns result at row 0.
    cell = re.match(r'^\{0\}\s([^\n]+)$', lines[1], re.M | re.I)
    if not cell:
        raise scidblib.AppError(unexpected_attrs)

    values = next(csv.reader(StringIO(cell.group(1)), DcsvDialect()))
    if len(values) != num_attrs:
        raise scidblib.AppError(unexpected_attrs)
    return values[0] if num_attrs == 1 else values
Beispiel #8
0
def get_instances_info(iquery_cmd=None):
    """Get the info returned by the list('instances') query as a list of lists.

    @param iquery_cmd  the iquery command to use; when empty, one is built by get_iquery_cmd()
                       from 'iquery -o csv:l' and the IQUERY_HOST / IQUERY_PORT environment variables.
    @return one list per data line of the query output; each inner list holds the line's
            comma-separated tokens, stripped of surrounding whitespace and of single quotes.
    @exception AppError if the output has fewer than two lines (header plus one instance).
                        Exceptions raised by afl() propagate unchanged.
    """
    iquery_args = scidblib.util.superTuple('args', 'host', 'port')
    iquery_args.host = os.environ.get('IQUERY_HOST')
    iquery_args.port = os.environ.get('IQUERY_PORT')

    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd(args=iquery_args,
                                    base_iquery_cmd='iquery -o csv:l')

    query = 'list(\'instances\')'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)

    stripped_lines = []
    for raw_line in out_data.strip().split('\n'):
        stripped_lines.append(raw_line.strip())
    if len(stripped_lines) < 2:
        raise scidblib.AppError(query +
                                ' is expected to return at least two lines.')

    rows = []
    for data_line in stripped_lines[1:]:  # the first line is the header
        tokens = []
        for token in data_line.split(','):
            tokens.append(token.strip().replace('\'', ''))
        rows.append(tokens)
    return rows
Beispiel #9
0
def afl(iquery_cmd,
        query,
        want_output=False,
        tolerate_error=False,
        verbose=False):
    """Run an AFL query through iquery.

    The command executed is iquery_cmd followed by -aq "query" (or -naq "query" when
    no output is wanted).

    @param iquery_cmd     the iquery command.
    @param query          the AFL query.
    @param want_output    requesting iquery to output query result.
    @param tolerate_error whether to keep silent when STDERR is not empty.
                          A use case is when trying to delete an array which may or may not exist.
    @param verbose        whether to print a line naming the query once it has been executed.
    @return (stdout_data, stderr_data)
    @exception AppError if STDERR is not empty and the caller says tolerate_error=False.
    """
    option = ' -aq "' if want_output else ' -naq "'
    full_command = iquery_cmd + option + query + '"'
    out_data, err_data = execute_it_return_out_err(full_command)
    if not (tolerate_error or len(err_data) == 0):
        raise scidblib.AppError('The AFL query, ' + query +
                                ', failed with the following error:\n' +
                                err_data)
    if verbose:
        print(verbose_afl_result_line_start() + '%s.' % query)
    return (out_data, err_data)
Beispiel #10
0
    def _print(self, what, step_id):
        """Print the progress message for one step event, when printing of that event is enabled.

        A helper function, servicing all of start_step(), end_step(), and skip_step().

        @param what     a string out of 'start', 'end', or 'skip'.
        @param step_id  the previously-registered step_id.
        @exception AssertionError if 'what' is not understood.
        @exception AppError if the step_id was not registered.
        """
        assert what in self._if_print
        if step_id not in self._id_2_index:
            raise scidblib.AppError('The step_id, \'' + step_id + '\', was not registered in ProgressTracker.')

        if not self._if_print[what]:
            return  # messages of this kind are switched off

        message = self._prefix[what]
        if self._name:
            message += self._name + ': '
        step_index = str(self._id_2_index[step_id])
        step_count = str(len(self._id_2_index))
        message += 'Step ' + step_index + ' of ' + step_count + ' ' + self._verb[what]

        # 'start' and 'skip' messages carry the step name;
        # other messages carry the elapsed time, when both timestamps exist and are ordered.
        if what in ('start', 'skip'):
            message += '. (' + self._id_2_name[step_id] + ')'
        elif (step_id in self._start_time and step_id in self._end_time
              and self._end_time[step_id] > self._start_time[step_id]):
            elapsed = self._end_time[step_id] - self._start_time[step_id]
            message += ' after ' + str(timedelta_total_seconds(elapsed)) + ' s.'
        else:
            message += '.'
        message += self._suffix[what]

        print >> self._out, message
Beispiel #11
0
def remove_array(arrayName, namespace=None, iquery_cmd = None):
    """Remove an array from scidb
    @param arrayName the name of the array to remove
    @param namespace the namespace in which the array resides; None or 'public' sends
                     no namespace command.
    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @exception AppError if the query output is not exactly the expected success line(s).
                        Exceptions raised by afl() propagate unchanged.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()

    success_line = 'Query was executed successfully\n'
    if namespace and (namespace != 'public'):
        query = ';'.join((make_set_namespace_cmd(namespace),
                          'remove(%s)' % arrayName))
        # One success line for the namespace command, one for remove().
        expected = success_line * 2
    else:
        query = 'remove(%s)' % arrayName
        expected = success_line

    out_data, err_data = afl(iquery_cmd, query, want_output=True)

    if out_data != expected:
        # Assemble one message string; the namespace is mentioned whenever one was given.
        failure_msg = 'Cannot remove array {0}'.format(arrayName)
        if namespace:
            failure_msg += ' from namespace {0}'.format(namespace)
        failure_msg += '\nout_data={0}'.format(out_data)
        failure_msg += '\nexpected={0}'.format(expected)
        raise scidblib.AppError(failure_msg)
Beispiel #12
0
def get_operators(iquery_cmd = None):
    """Get the list of operators and the libraries they belong to.
    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @return a list of (operator, library) tuples, parsed from the output of AFL query list('operators').
            Output lines that cannot be split into exactly two fields are skipped.
    @exception AppError if the query produced no output. Exceptions raised by afl() propagate unchanged.
    Example usage:
        operators = scidb_afl.get_operators()
        for (operator, library) in operators:
            print("operator=" + operator + " library=" + library)
    """

    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()

    query = 'list(\'operators\');'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')

    operators = []
    for line in lines:
        # Everything up to the first space is dropped (presumably the {N} cell
        # coordinate - TODO confirm); the remainder is split as quoted CSV.
        parse_line = line[line.find(" ") + 1:]
        try:
            operator, library = csv_splitter(parse_line, "\'")
        except Exception:
            # Deliberate best effort: a line that does not yield two fields is ignored.
            continue
        operators.append((operator, library))
    return operators
Beispiel #13
0
def get_array_names(iquery_cmd = None, temp_only = False, namespace=None):
    """Get a list of array names.
    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @param temp_only   only get the names of temporary arrays.
    @param namespace used to specify a namespace prior to getting the arrays; an empty value
                     or 'public' sends no namespace command.
    @return a list of array names, parsed from the output of AFL query project(list(), name).
    @exception AppError if the query produced no output, if the namespace command did not
                        report success, or if a result line does not look like {N} 'name'.
                        Exceptions raised by afl() propagate unchanged.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()

    # The namespace command is sent exactly when its status line is skipped below,
    # so the two decisions share one flag.
    use_namespace = bool(namespace) and namespace != 'public'
    set_namespace_cmd = make_set_namespace_cmd(namespace) if use_namespace else ''

    if temp_only:
        query = set_namespace_cmd + 'project(filter(list(), temporary=true), name);'
    else:
        query = set_namespace_cmd + 'project(list(), name);'

    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    lines = out_data.strip().splitlines()
    if not lines:
        raise scidblib.AppError(query + ' is expected to return at least one line.')

    if not use_namespace:
        start = 1  # skip only the header line
    else:
        # The namespace command contributes its own status line ahead of the header.
        start = 2
        # splitlines() removes line terminators, so compare on the text alone.
        if not lines[0].startswith('Query was executed successfully'):
            raise scidblib.AppError(
                'set_namespace(' + namespace + ') failed - result=' + lines[0])

    re_name = re.compile(r'^\{\d+\}\s\'(.+)\'$')  # e.g.: {4} 'MyArray'
    ret = []
    for line in lines[start:]:
        match_name = re_name.match(line)
        if not match_name:
            raise scidblib.AppError(
                'get_array_names() failed to parse: ['
                + line
                + "] the expected format is {#} 'name'")

        ret.append(match_name.group(1))
    return ret
Beispiel #14
0
    def register_step(self, step_id, step_name):
        """Record a new step under step_id.

        The step's name is stored, and the step receives the next 1-based index
        (the number of names stored so far, including this one).

        @param step_id    an identifier to be used later when a step starts/ends.
        @param step_name  what the step does.
        @exception AppError if step_id was registered before.
        """
        already_registered = step_id in self._id_2_index
        if already_registered:
            raise scidblib.AppError('The step_id, \'' + step_id + '\', was already registered.')

        self._id_2_name[step_id] = step_name
        next_index = len(self._id_2_name)
        self._id_2_index[step_id] = next_index
Beispiel #15
0
    def __str__(self):
        """Format the object as 'major.minor.revision (year-month-day)'.

        @return a string form of the version and date.
        @exception AppError if the VersionAndDate is not valid.
        """
        if self.valid():
            version = '{0}.{1}.{2}'.format(self.major, self.minor, self.revision)
            date = '{0}-{1}-{2}'.format(self.year, self.month, self.day)
            return version + ' (' + date + ')'
        raise scidblib.AppError('The VersionAndDate cannot be turned to string because it is not valid.')
Beispiel #16
0
    def raise_exception(self, err_msg):
        """Raise an AppError whose message names this dimension.

        The message has the form: Error! In dimension '<dim_name>': <err_msg>
        @param err_msg  the error string.
        @exception AppError unconditionally.
        """
        message = 'Error! In dimension \'' + self.dim_name + '\': ' + err_msg
        raise scidblib.AppError(message)
    def find_index(self, name):
        """Look up the position of the first entry of self.list that carries the given name.

        @param name  the name to search for.
        @return the index of name in self.list.
        @exception AppError if the name does not exist.
        """
        positions = (index for index, entry in enumerate(self.list)
                     if entry.name == name)
        found = next(positions, None)
        if found is None:
            raise scidblib.AppError('System Error: the name \'' + name + '\' does not exist in NamesInLoadArray!')
        return found
    def __str__(self):
        """Join the string forms of all dimensions with ', '.

        @return the generated string
        @exception AppError if there is no dimension, or some dimension's __str__() encounters an error.
        """
        if not self.list:
            raise scidblib.AppError('System Error! There should be at least one dimension.')
        return ', '.join([dim.__str__() for dim in self.list])
Beispiel #19
0
def csv_splitter(line, string_delimiter='\"'):
    """Split one comma-separated line into its fields.

    Fields may be quoted with string_delimiter, in which case embedded commas are kept;
    whitespace directly after a comma is ignored.

    @param line              the text to split.
    @param string_delimiter  the quote character that encloses string fields.
    @return the list of fields found in line.
    @exception AppError if the csv reader fails on the line.
    """
    try:
        reader = csv.reader([line],
                            quotechar=string_delimiter,
                            delimiter=',',
                            quoting=csv.QUOTE_ALL,
                            skipinitialspace=True)
        return next(reader)
    except Exception as error:
        raise scidblib.AppError('csv_splitter exception ' + str(error) +
                                " on line=" + line)
Beispiel #20
0
def raise_if_duplicates(it, description):
    """Raise an AppError if iterable 'it' contains duplicates.

    @param[in] it iterable of hashable items to test for duplicates; it is traversed once,
                  so one-shot iterators are accepted as well as sequences
    @param[in] description what 'it' contains, for error reporting
    @throws AppError if duplicate(s) found; the message lists every repeated occurrence
                     (all occurrences of a value after its first), in input order
    """
    seen = set()
    duplicates = []
    for item in it:
        if item in seen:
            duplicates.append(item)
        else:
            seen.add(item)
    if duplicates:
        raise scidblib.AppError(
            'Error! There are multiple occurrences of the same {0}: {1}'.
            format(description, duplicates))
Beispiel #21
0
def get_num_instances(iquery_cmd = None):
    """Count the SciDB instances listed by the AFL query list('instances').

    @param iquery_cmd  the iquery command to use; when empty, get_iquery_cmd() provides one.
    @return the number of lines in the query output, not counting the header line.
    @exception AppError if the output has fewer than two lines (header plus one instance).
                        Exceptions raised by afl() propagate unchanged.
    """
    if not iquery_cmd:
        iquery_cmd = get_iquery_cmd()
    query = 'list(\'instances\')'
    out_data, err_data = afl(iquery_cmd, query, want_output=True)
    output_lines = out_data.strip().split('\n')
    num_instances = len(output_lines) - 1  # the header line is not an instance
    if num_instances < 1:
        raise scidblib.AppError(query + ' is expected to return at least two lines.')
    return num_instances
Beispiel #22
0
def time_afl(iquery_cmd, query):
    """Run an AFL query under /usr/bin/time and report how long it took.

    The query runs with -naq and its STDOUT is discarded, so that STDERR carries
    only the elapsed time written by /usr/bin/time (format %e) when all goes well.

    @param iquery_cmd the iquery command.
    @param query  the AFL query.
    @return the execution time, as a float.
    @exception AppError if STDERR cannot be converted to a float, i.e. the query
                        did not execute successfully.
    """
    timer = '/usr/bin/time -f \"%e\" '
    quoted_query = ' -naq \"' + query + "\" 1>/dev/null"
    full_command = timer + iquery_cmd + quoted_query
    out_data, err_data = execute_it_return_out_err(full_command)
    try:
        elapsed = float(err_data)
    except ValueError:
        raise scidblib.AppError('Timing the AFL query ' + query +
                                ', failed with the following error:\n' +
                                err_data)
    return elapsed
    def __init__(self, iquery_cmd, load_array):
        """Query the schema of the load array with show(load_array), and fill in data members.

        After construction, self.attrs and self.dims hold the parsed attributes and dimensions,
        and self.list holds one NameInLoadArray per attribute followed by one per dimension.

        @param iquery_cmd  the iquery command.
        @param load_array  the name of the load array.
        @exception AppError if the show() command does not produce a valid schema,
                             e.g. if load_array is not a valid array name in the database.
        """
        self.list = []

        show_query = 'show(' + load_array + ')'
        schema_str = scidb_afl.single_cell_afl(iquery_cmd, show_query, 1)

        schema_pattern = (
            r'^.*' +  # array_name
            r'\<(.*)\>\s*' +  # <attributes>
            r'\[(.*)\]$'  # [dimensions]
        )
        parsed = re.match(schema_pattern, schema_str, re.M | re.I)
        if not parsed:
            raise scidblib.AppError(
                'System Error! I failed to parse the schema of the load_array.'
            )
        str_attrs, str_dims = parsed.group(1), parsed.group(2)

        # attributes: only those of type int64 are flagged as is_int64
        self.attrs = Attributes(str_attrs)
        self.list.extend(
            NameInLoadArray(attr.attr_name,
                            is_dim=False,
                            is_int64=attr.attr_type == 'int64',
                            local_index=position)
            for position, attr in enumerate(self.attrs.list))

        # dimensions: always flagged as is_int64
        self.dims = Dimensions(str_dims)
        self.list.extend(
            NameInLoadArray(dim.dim_name,
                            is_dim=True,
                            is_int64=True,
                            local_index=position)
            for position, dim in enumerate(self.dims.list))
Beispiel #24
0
def time_afl(iquery_cmd, query, verbose=False):
    """Run an AFL query under /usr/bin/time and report how long it took.

    The query runs with -naq and its STDOUT is discarded, so that STDERR carries
    only the elapsed time written by /usr/bin/time (format %e) when all goes well.

    @param iquery_cmd the iquery command.
    @param query  the AFL query.
    @param verbose  whether to print a line naming the query and its execution time.
    @return the execution time, as a float.
    @exception AppError if a ValueError occurs while converting STDERR to a float
                        (or while printing), i.e. the query did not execute successfully.
    """
    command_parts = ('/usr/bin/time -f \"%e\" ', iquery_cmd, ' -naq \"', query,
                     "\" 1>/dev/null")
    full_command = ''.join(command_parts)
    out_data, err_data = execute_it_return_out_err(full_command)
    try:
        elapsed = float(err_data)
        if verbose:
            print(verbose_afl_result_line_start() + '%s in %f seconds.' % (query, elapsed))
        return elapsed
    except ValueError:
        raise scidblib.AppError('Timing the AFL query ' + query + ', failed with the following error:\n' +
                        err_data)
Beispiel #25
0
    def earlier_than(self, another):
        """Tell whether this version precedes another one.

        Versions are ordered by major, then minor, then revision; the date fields
        take no part in the comparison.

        @param another  another VersionAndDate object.
        @return whether the current VersionAndDate is earlier than the other one.
        @exception AppError if any of the two objects is not valid.
        """
        if not (self.valid() and another.valid()):
            raise scidblib.AppError('I cannot compare invalid VersionAndDate objects.')

        # Tuples compare element by element, which is the major/minor/revision precedence.
        mine = (self.major, self.minor, self.revision)
        theirs = (another.major, another.minor, another.revision)
        return mine < theirs
Beispiel #26
0
def main():
    """Parse the command-line arguments and call calculate_chunk_length().

    Exceptions raised while calculating are not propagated: they are reported on
    stderr and turned into a non-zero return value.

    @return 0 on success; -1 if a scidblib.AppError was raised; -2 on any other exception.
    """
    # Adjacent string literals are concatenated by the compiler; the layout below
    # mirrors what RawDescriptionHelpFormatter prints verbatim.
    description = (
        'Chunk-length calculator (c) SciDB, Inc.\n'
        '\n'
        'The program calculates a dimension-specification string from a raw_dims string,\n'
        "by replacing '?' with calculated values, for fields such as the chunk length.\n"
        'The calculated string may be cut & pasted into the dimension-specification part\n'
        'of a result-array schema, for the redimension() query.')
    epilog = (
        'examples:\n'
        '  Suppose you have a SciDB array:\n'
        '      arr_raw <i:int64,j:int64,v:double> [dummy=0:*,1000000,0],\n'
        '  you want to redimension it into a matrix where i and j are dimensions, but you\n'
        '  need help in choosing chunk lengths and/or low and high coordinates of the\n'
        '  dimensions.\n'
        '  You may call:\n'
        "      calculate_chunk_length.py  arr_raw  'i=?:?,?,?, j=?:?,?,?'\n"
        "  You are free to interleave '?' with values you desire. E.g. you may call:\n"
        "      calculate_chunk_length.py  arr_raw  'i=0:?,8192,0, j=?:?,?,10'\n"
        '\n'
        'assumptions:\n'
        '  - iquery is in your path.\n'
        '  - The specified load_array exists in the database, and has data loaded.\n'
        '  - Every specified dim_name in raw_dims must exist in load_array, either as a\n'
        '    dimension, or as an attribute which is of type int64.\n'
        "  - If you choose to specify a 'low' (or 'high') value for a dimension, it\n"
        '    must be a lowerbound (or upperbound) of all actual values for that name in\n'
        '    load_array.\n'
        '\n'
        'limitations:\n'
        '  - The algorithm does not handle skew, e.g. when majority of the array is empty\n'
        '    but there are a few small dense regions. In such cases, the script may\n'
        '    produce overly large chunk lengths in that chunks covering the dense regions\n'
        '    may use too much memory.\n'
        '    The workaround is to reduce the desired_values_per_chunk argument.\n'
        '  - The algorithm does not handle large-sized attributes, e.g. string attributes\n'
        '    with thousands or even millions of bytes. In such cases, the script may\n'
        '    produce overly large chunk lengths in that chunks of such attributes may\n'
        '    use too much memory.\n'
        '    The workaround is again to reduce the desired_values_per_chunk argument.')

    parser = argparse.ArgumentParser(
        description=description,
        epilog=epilog,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'load_array', help='The name of the source array to redimension from.')
    parser.add_argument(
        'raw_dims',
        help=
        '''A string describing the dimension specification of the result array,
                        in the form of \'dim_name=low:high,chunk_length,chunk_overlap [, OTHER_DIMS]\'.
                        Some note:
                        (a) dim_name must be the name of either a dimension or an int64-typed attribute in load_array.
                        (b) The other four components are either an integer (the algorithm will respect that)
                          or \'?\' (the algorithm will calculate a value for it).
                        (c) The \'high\' component has an additional choice of \'*\', in which case the calculated
                          schema will also contains \'*\'.''')
    parser.add_argument('-c',
                        '--host',
                        help='Host name to be passed to iquery.')
    parser.add_argument('-p',
                        '--port',
                        help='Port number to be passed to iquery.')
    parser.add_argument(
        '-d',
        '--desired_values_per_chunk',
        type=int,
        default=1024 * 1024,
        help='''The number of desired non-empty values per chunk.
                        The default value is 1 Mebi (i.e. 2^20).
                        With the same desired values per chunk, a sparser result array will get at larger
                        chunk lengths.''')
    parser.add_argument(
        '-k',
        '--keep_shape',
        action='store_true',
        help=
        '''If specified, the shape of a chunk will be similar to the shape of the array,
                        i.e. every dimension will be partitioned to a similar number of pieces.
                        The default is not keep_shape, i.e. a larger dimension will be partitioned to more pieces.'''
    )
    parser.add_argument(
        '-v',
        '--verbose',
        default=0,
        action='count',
        help='If specified, progress of the algorithm will be reported.')
    # Fixed help text: the first line used to end with a stray "\n' +" that showed
    # up literally in --help, and the worked example quoted 0.40 where
    # (4000-3847)/3847 is about 0.04.
    parser.add_argument('-t',
                        '--grid_threshold',
                        type=float,
                        default=0.1,
                        help='''The default value is 0.1 (or 10%%).
                        The algorithm makes an effort to adjust each calculated chunk length to a
                        closeby \'gridline value\' (see the -g option).
                        A gridline value will be considered only if its relative difference from the
                        calculated chunk length is no more than this grid_threshold.
                        E.g. with the default value, if a calculated chunk length is 3847, the gridline value 4000
                        may be considered because its relative difference from 3847, i.e. (4000-3847)/3847=0.04, is
                        less than 0.1; but the gridline value 3000 may not be considered because its relative
                        difference from 3847 (=0.22) exceeds 0.1.
                        You may disable any adjustment by setting grid_threshold = 0.'''
                        )
    parser.add_argument(
        '-g',
        '--grid_base10',
        action='store_true',
        help='''If specified, use a multiple-of-power-of-10 as the gridline.
                        The default is not to specify, in which case a power-of-2 is used as the gridline.
                        In the power-of-2 case, there is only one candidate gridline value: the closest power of 2.
                        In the multiple-of-power-of-10 case, multiple candidates may be considered, with different
                        numbers of ending zeros. If multiple gridline values are within grid_threshold, the one with
                        the most number of ending zeros is chosen, breaking ties by favoring the one closer to the
                        calculated chunk length.
                        E.g. if a calculated chunk length is 3847, gridline values 10,000, 4000, 3800, and 3850 are
                        all considered. If grid_threshold=0.1, 4000 is chosen; if grid_threshold=0.01, 3850 is chosen.'''
    )
    args = parser.parse_args()

    # The verbosity level is published through a module-level variable.
    global _verbose
    _verbose = args.verbose

    try:
        if args.desired_values_per_chunk <= 0:
            raise scidblib.AppError(
                'Desired values per chunk must be positive.')
        exit_code = calculate_chunk_length(args)
        assert exit_code == 0, 'AssertionError: the command is expected to return 0 unless an exception was thrown.'
    except scidblib.AppError as e:
        # Expected failures: show the traceback only when verbose was requested.
        sys.stderr.write('------ Exception -----------------------------\n')
        sys.stderr.write(str(e) + '\n')
        if args.verbose:
            sys.stderr.write('------ Traceback (for debug purpose) ---------\n')
            traceback.print_exc()
        sys.stderr.write('----------------------------------------------\n')
        return -1
    except Exception as e:
        # Unexpected failures: always show the traceback.
        sys.stderr.write('------ Unexpected Exception ------------------\n')
        sys.stderr.write(str(e) + '\n')
        sys.stderr.write('------ Traceback (for debug purpose) ---------\n')
        traceback.print_exc()
        sys.stderr.write('----------------------------------------------\n')
        return -2

    return 0
Beispiel #27
0
def calculate_chunk_length(args):
    """Calculate chunk length and other fields which were '?', and print out the schema.

    The work is done in three tracked steps:
      - 'min_max_dc': query load_array for the minimum, maximum and distinct count of
        every dimension named in raw_dims;
      - 'overall_dc': query load_array for the distinct count of the combined dimension
        values, capped by the total cell count;
      - 'calculate': spread the desired number of chunks over the dimensions whose
        chunk length was '?', and derive each chunk length from it.

    @param args  the result of argparse.ArgumentParser.parse_args().
    @return 0
    @exception AppError if a query result cannot be converted to an integer. The helpers
               called here presumably raise AppError on their own failures as well.
    """
    iquery_cmd = scidb_afl.get_iquery_cmd(args)
    load_array = args.load_array

    calculated_dims = parse_dimensions(args.raw_dims)
    dbg("Calculated dims:", [x.to_tuple() for x in calculated_dims])

    # Initialize the progress tracker
    progress_tracker = scidb_progress.ProgressTracker(
        sys.stdout,
        '',
        args.verbose,  # if_print_start
        args.verbose,  # if_print_end
        args.verbose  # if_print_skip
    )
    progress_tracker.register_step(
        'min_max_dc',
        'Get min_coord, max_coord, and ApproxDC for each dim from load_array.')
    progress_tracker.register_step('overall_dc',
                                   'Get overall ApproxDC from load_array.')
    progress_tracker.register_step(
        'calculate', 'Calculate and adjust dimension specification.')

    # S = indices of dims where chunk_length is Specified;
    # N = indices of dims where chunk_length is Not specified.
    S = [i for i, d in enumerate(calculated_dims) if d.chunk_length != '?']
    N = [i for i, d in enumerate(calculated_dims) if d.chunk_length == '?']
    dbg("S:", S)
    dbg("N:", N)

    # Get the (dimension and attribute) names of the load_array.
    names_in_load_array = NamesInLoadArray(iquery_cmd, load_array)
    dbg("names...:", names_in_load_array.list)

    # For each requested dim, get its min_coord, max_coord, and distinct_count.
    progress_tracker.start_step('min_max_dc')
    for the_dim in calculated_dims:
        index = names_in_load_array.find_index(the_dim.dim_name)
        the_name_in_load_array = names_in_load_array.list[index]

        if the_name_in_load_array.is_dim:
            # A dimension of load_array: aggregate along the dimension first, copy the
            # coordinate into a uniquely named attribute, then take min/max/count of it.
            tmp = names_in_load_array.gen_uniq_name()
            cmd = ('aggregate(apply(aggregate(' + load_array + ', count(*), ' +
                   the_dim.dim_name + '), ' + tmp + ', ' + the_dim.dim_name +
                   '), min(' + tmp + '), max(' + tmp + '), count(*))')
        else:
            # An attribute of load_array: aggregate it directly, using approxdc.
            cmd = ('aggregate(' + load_array + ', min(' + the_dim.dim_name +
                   '), max(' + the_dim.dim_name + '), approxdc(' +
                   the_dim.dim_name + '))')
        dbg("Cmd:", cmd)
        min_coord, max_coord, distinct_count = scidb_afl.single_cell_afl(
            iquery_cmd, cmd, 3)
        dbg("(min,max,dc):", (min_coord, max_coord, distinct_count))
        try:
            min_coord_int = int(min_coord)
            max_coord_int = int(max_coord)
            distinct_count_int = int(distinct_count)
        except ValueError:
            raise scidblib.AppError('Error: I cannot proceed because for ' +
                                    the_dim.dim_name + ' in array ' +
                                    load_array + ', not all of min_coord (=' +
                                    min_coord + '), max_coord (=' + max_coord +
                                    '), and distinct_count (=' +
                                    distinct_count + ') are integers.')
        if args.verbose:
            print('For ' + the_dim.dim_name + ', min_coord=' + str(min_coord_int) +
                  ', max_coord=' + str(max_coord_int) +
                  ', distinct_count=' + str(distinct_count_int))
        the_dim.set_min_max_dc(min_coord_int, max_coord_int,
                               distinct_count_int)
    progress_tracker.end_step('min_max_dc')

    # Fill dim_low, dim_high, and chunk_overlap (which was a '?' before).
    for the_dim in calculated_dims:
        if the_dim.dim_low == '?':
            the_dim.dim_low = the_dim.min_coord
        if the_dim.dim_high == '?':
            the_dim.dim_high = the_dim.max_coord
        if the_dim.chunk_overlap == '?':
            the_dim.chunk_overlap = 0

    # Generate string_concat_of_dim_values in the form of:
    # string(dim_name1) + '|' + string(dim_name2) + '|' + string(dim_name3)
    string_concat_of_dim_values = " + '|' + ".join(
        'string(' + d.dim_name + ')' for d in calculated_dims)

    # Calculate overall_distinct_count.
    tmp = names_in_load_array.gen_uniq_name()
    cmd = ('aggregate(apply(' + load_array + ', ' + tmp + ', ' +
           string_concat_of_dim_values + '), approxdc(' + tmp + '))')
    progress_tracker.start_step('overall_dc')
    overall_distinct_count = scidb_afl.single_cell_afl(iquery_cmd, cmd, 1)
    overall_count = scidb_afl.single_cell_afl(
        iquery_cmd, 'aggregate(' + load_array + ', count(*))', 1)
    try:
        overall_distinct_count = int(overall_distinct_count)
        overall_count = int(overall_count)
    except ValueError:
        raise scidblib.AppError(
            'Error: The query to get overall_distinct_count failed to return an integer.'
        )
    # The distinct count can never exceed the number of cells, so cap it there.
    overall_distinct_count = min(overall_distinct_count, overall_count)
    if args.verbose:
        print('overall_distinct_count=' + str(overall_distinct_count))
    progress_tracker.end_step('overall_dc')

    progress_tracker.start_step('calculate')

    # Shortcut: if |N| == 0, every chunk length was given and we are done.
    # Close the step before returning, as the regular path below does.
    if len(N) == 0:
        progress_tracker.end_step('calculate')
        print(scidb_schema.unparse(
            dims=[x.to_tuple() for x in calculated_dims]))
        return 0

    # Set num_chunks_from_n: start from the total number of chunks needed, then divide
    # out the chunk counts already fixed by the dims in S.
    num_chunks_from_n = scidb_math.ceil_of_division(
        overall_distinct_count, args.desired_values_per_chunk)
    for i in S:
        the_dim = calculated_dims[i]
        chunk_count = scidb_math.ceil_of_division(the_dim.distinct_count,
                                                  int(the_dim.chunk_length))
        num_chunks_from_n = scidb_math.ceil_of_division(
            num_chunks_from_n, chunk_count)
    if num_chunks_from_n <= 1:
        num_chunks_from_n = 1

    # Values shared by all dims in N; computed once instead of once per dim.
    root_exponent = 1.0 / len(N)
    even_chunk_count = math.pow(num_chunks_from_n, root_exponent)
    geomean = None
    if not args.keep_shape:
        # Geometric mean of the distinct counts of the dims in N.
        product = 1.0
        for k in N:
            product *= calculated_dims[k].distinct_count
        geomean = math.pow(product, root_exponent)

    # For each dimension i in N, calculate chunk_count[i], then set chunk_length.
    for i in N:
        the_dim = calculated_dims[i]
        chunk_count = even_chunk_count
        if geomean is not None:
            # Not keep_shape: dims with more distinct values get more chunks.
            chunk_count *= the_dim.distinct_count / geomean
        if chunk_count < 1:
            chunk_count = 1.0
        # chunk_count is a float, so this is a true division on Python 2 as well.
        the_dim.chunk_length = int(
            math.ceil(
                (the_dim.max_coord - the_dim.min_coord + 1) / chunk_count))
        if chunk_count > 1:
            the_dim.chunk_length = scidb_math.snap_to_grid(
                the_dim.chunk_length,
                args.grid_threshold,
                use_binary=(not args.grid_base10))
    progress_tracker.end_step('calculate')

    # Print result.
    print(scidb_schema.unparse(dims=[x.to_tuple() for x in calculated_dims]))
    return 0