Example No. 1
def aslabeledarray(data):
    sequence = (tuple, list)
    if isinstance(data, la.LArray):
        return data
    elif (isinstance(data, sequence) and len(data)
          and isinstance(data[0], la.LArray)):
        # XXX: use la.stack?
        # TODO: check that all arrays have the same axes
        axes = [la.Axis(len(data))] + list(data[0].axes)
        return la.LArray(data, axes)
    else:
        return la.LArray(data)
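
A short usage sketch (illustrative only; it assumes larray is importable as la, as in the snippet above): the helper returns an LArray unchanged, stacks a list of LArrays along a new anonymous leading axis, and wraps any other sequence in a plain LArray.

import larray as la

arr = la.LArray([1.0, 2.0, 3.0], [la.Axis(['a', 'b', 'c'], 'x')])

aslabeledarray(arr)          # returned unchanged
aslabeledarray([arr, arr])   # 2 x 3 array: anonymous leading axis + 'x'
aslabeledarray([4, 5, 6])    # plain sequence wrapped in a default LArray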
Example No. 2
    geo = la.Axis(belgium, 'geo')

    # data1 = np.arange(30).reshape(2, 15)
    # arr1 = la.LArray(data1, axes=(sex, lipro))
    # edit(arr1)

    # data2 = np.arange(116 * 44 * 2 * 15).reshape(116, 44, 2, 15) \
    #           .astype(float)
    # data2 = np.random.random(116 * 44 * 2 * 15).reshape(116, 44, 2, 15) \
    #           .astype(float)
    # data2 = (np.random.randint(10, size=(116, 44, 2, 15)) - 5) / 17
    # data2 = np.random.randint(10, size=(116, 44, 2, 15)) / 100 + 1567
    # data2 = np.random.normal(51000000, 10000000, size=(116, 44, 2, 15))
    data2 = np.random.normal(0, 1, size=(116, 44, 2, 15))
    arr2 = la.LArray(data2, axes=(age, geo, sex, lipro))
    # arr2 = la.ndrange([100, 100, 100, 100, 5])
    # arr2 = arr2['F', 'A11', 1]

    # view(arr2[0, 'A11', 'F', 'P01'])
    # view(arr1)
    # view(arr2[0, 'A11'])
    # edit(arr1)
    # print(arr2[0, 'A11', :, 'P01'])
    # edit(arr2.astype(int), minvalue=-99, maxvalue=55.123456)
    # edit(arr2.astype(int), minvalue=-99)
    # arr2.i[0, 0, 0, 0] = np.inf
    # arr2.i[0, 0, 1, 1] = -np.inf
    # arr2 = [0.0000111, 0.0000222]
    # arr2 = [0.00001, 0.00002]
    # edit(arr2, minvalue=-99, maxvalue=25.123456)
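
For context, a self-contained sketch of the same pattern (assumptions: the age/sex/lipro axes and the belgium label list are defined elsewhere in the original script, so placeholder labels are used here; view/edit come from the larray editor):

import numpy as np
import larray as la

age = la.Axis(range(116), 'age')
sex = la.Axis(['M', 'F'], 'sex')
lipro = la.Axis(['P%02d' % i for i in range(1, 16)], 'lipro')
geo = la.Axis(['A11', 'A12'], 'geo')   # stand-in for the 44-label 'belgium' list

data2 = np.random.normal(0, 1, size=(116, 2, 2, 15))
arr2 = la.LArray(data2, axes=(age, geo, sex, lipro))
arr2['F', 'A11', 'P01']    # label-based selection, as in the commented view(...) calls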
Example No. 3
def load_ndarray(fpath, celltype=None):
    print(" - reading", fpath)
    # FIXME: implement celltype
    a = la.read_csv(fpath, dialect='liam2')
    # print(a.info)
    return a

    # NOTE: the code below is the former pure-Python loader; it is unreachable
    # because of the early return above and is only kept for reference.
    with open(fpath, "r", newline='') as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        header = next(line_stream)
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)

    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print(("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath,
                 ", ".join(str(v) for v in dupe_last_d))))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in str_table]
               for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print(("Duplicate row header value(s) in '%s':" % fpath))
            print((PrettyTable(dupe_combos)))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues))) for pvalues in headers]
    possible_values.append(np.array(unique_last_d))

    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception("incoherent data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath,
                           len(str_table),
                           num_possible_values,
                           ' * '.join(str(len(values))
                                      for values in possible_values)))

    # TODO: compare timing with numpy's built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself,
    # which it does well when the values are already numeric; if dtype
    # is provided, numpy does a good job of converting from strings.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    axes = [la.Axis(labels, axis_name)
            for axis_name, labels in zip(header, possible_values)]
    return la.LArray(array.reshape(shape), axes)
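
A hedged usage sketch (the file name is a placeholder; the file layout must follow the LIAM2 CSV convention that la.read_csv(dialect='liam2') expects):

# hypothetical file path, for illustration only
weights = load_ndarray('pop_weights.csv')
print(weights.info)       # axes and labels detected from the file headers
total = weights.sum()     # the result supports the usual LArray aggregates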
Example No. 4
    def compute(self, context, *expressions, **kwargs):
        if not expressions:
            raise TypeError("groupby() takes at least 1 argument")

        # TODO: allow lists/tuples of arguments to group by the combinations
        # of keys
        for expr in expressions:
            if isinstance(expr, (bool, int, float)):
                raise TypeError("groupby() does not work with constant "
                                "arguments")
            if isinstance(expr, (tuple, list)):
                raise TypeError("groupby() takes expressions as arguments, "
                                "not a list of expressions")

        # In Python 3, we could clean up this code (keyword-only arguments).
        expr = kwargs.pop('expr', None)
        if expr is None:
            expr = Count()

#        by = kwargs.pop('by', None)
        filter_value = kwargs.pop('filter', None)
        percent = kwargs.pop('percent', False)
        possible_values = kwargs.pop('pvalues', None)
        totals = kwargs.pop('totals', True)

        expr_vars = [v.name for v in collect_variables(expr)]
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        columns = [expand(c, context_length(context)) for c in columns]

        if filter_value is not None:
            filtered_columns = [col[filter_value] for col in columns]
            # FIXME: use the actual filter_expr instead of not_hashable
            filtered_context = context.subset(filter_value, expr_vars,
                                              not_hashable)
        else:
            filtered_columns = columns
            filtered_context = context

        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to partition_nd
        # because it is a bit faster this way. The indices are still correct,
        # because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            # return la.LArray([], labels, possible_values)
            return la.LArray([])

        # evaluate the expression on each group
        # we use not_hashable to avoid storing the subset in the cache
        contexts = [
            filtered_context.subset(indices, expr_vars, not_hashable)
            for indices in groups
        ]
        data = [expr_eval(expr, c) for c in contexts]

        # TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
        #        if self.filter is not None:
        #            filter_value = expr_eval(self.filter, context)
        #        else:
        #            filter_value = True
        #
        #        d = group_indices_nd(columns, filter_value)
        #        pvalues = sorted(d.keys())
        #        ndim = len(columns)
        #        possible_values = [[pv[i] for pv in pvalues]
        #                           for i in range(ndim)]
        #        groups = [d[k] for k in pvalues]

        # groups is a (flat) list of list.
        # the first variable is the outer-most "loop",
        # the last one the inner most.

        # add total for each row
        len_pvalues = [len(vals) for vals in possible_values]

        if percent:
            totals = True

        if totals:
            width = len_pvalues[-1]
            height = prod(len_pvalues[:-1])
            rows_indices = [
                np.concatenate([groups[y * width + x] for x in range(width)])
                for y in range(height)
            ]
            cols_indices = [
                np.concatenate([groups[y * width + x] for y in range(height)])
                for x in range(width)
            ]
            # the appended entry concatenates every column, i.e. all groups,
            # so the corresponding total is the grand total
            cols_indices.append(np.concatenate(cols_indices))

            # evaluate the expression on each "combined" group (ie compute totals)
            row_ctxs = [
                filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in rows_indices
            ]
            row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
            col_ctxs = [
                filtered_context.subset(indices, expr_vars, not_hashable)
                for indices in cols_indices
            ]
            col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
        else:
            row_totals = None
            col_totals = None

        if percent:
            # convert to np.float64 to get +-inf if total_value is int(0)
            # instead of Python's built-in behaviour of raising an exception.
            # This can happen at least when using the default expr (count())
            # and the filter yields empty groups
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value for value in row_totals]
            col_totals = [100.0 * value / total_value for value in col_totals]


#        if self.by or self.percent:
#            if self.percent:
#                total_value = data[-1]
#                divisors = [total_value for _ in data]
#            else:
#                num_by = len(self.by)
#                inc = prod(len_pvalues[-num_by:])
#                num_groups = len(groups)
#                num_categories = prod(len_pvalues[:-num_by])
#
#                categories_groups_idx = [range(cat_idx, num_groups, inc)
#                                         for cat_idx in range(num_categories)]
#
#                divisors = ...
#
#            data = [100.0 * value / divisor
#                    for value, divisor in zip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data),
        # because if data is a list of ndarrays (for example if we use
        # groupby(a, expr=id)) *and* all the ndarrays have the same length,
        # the result is a 2d array instead of an array of ndarrays like we
        # need (at this point).
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr

        # and reshape it
        data = data.reshape(len_pvalues)
        axes = [
            la.Axis(axis_labels, axis_name)
            for axis_name, axis_labels in zip(labels, possible_values)
        ]
        # FIXME: also handle totals
        return la.LArray(data, axes)
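
To make the final reshape more concrete, here is a small standalone sketch (illustrative only; the axis names and values are made up) of the row-major group layout that compute() relies on, where the first grouping variable is the outermost loop and the last one varies fastest:

import numpy as np
import larray as la

len_pvalues = [2, 3]                 # 2 x 3 possible values -> 6 groups
data = np.arange(6, dtype=float)     # one aggregated value per group
data = data.reshape(len_pvalues)     # shape (2, 3), rows = first variable

axes = [la.Axis([False, True], 'is_male'),
        la.Axis(['low', 'mid', 'high'], 'income_band')]
result = la.LArray(data, axes)
# a row total sums over the last axis, a column total over the first one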
Example No. 5
def index_tables(globals_def, entities, fpath):
    print("reading data from %s ..." % fpath)
    input_file = tables.open_file(fpath)
    try:
        input_root = input_file.root

        def must_load_from_input_file(gdef):
            return isinstance(gdef, dict) and 'path' not in gdef

        any_global_from_input_file = any(
            must_load_from_input_file(gdef) for gdef in globals_def.values())
        if any_global_from_input_file and 'globals' not in input_root:
            raise Exception(
                'could not find any globals in the input data file '
                '(but some are declared in the simulation file)')

        globals_data = load_path_globals(globals_def)
        constant_globals_data = handle_constant_globals(globals_def)
        globals_data.update(constant_globals_data)
        globals_node = getattr(input_root, 'globals', None)
        for name, global_def in globals_def.items():
            # already loaded from another source (path)
            if name in globals_data:
                continue

            if name not in globals_node:
                raise Exception("could not find 'globals/%s' in the input "
                                "data file" % name)

            global_data = getattr(globals_node, name)

            global_type = global_def.get('type', global_def.get('fields'))
            # TODO: move the checking (assertValidType) to a separate function
            assert_valid_type(global_data, global_type, context=name)
            array = global_data.read()
            if isinstance(global_type, list):
                # make sure we do not keep in memory columns which are
                # present in the input file but were not asked for by the
                # modeller. They are not accessible anyway.
                array = add_and_drop_fields(array, global_type)
            attrs = global_data.attrs
            dim_names = getattr(attrs, 'dimensions', None)
            if dim_names is not None:
                # dim_names is serialised as a numpy array so that it is
                # stored as a native hdf type and not a pickle, but we
                # prefer to work with simple lists; also, files serialised
                # with Python 2 yield "bytes" instead of "str"
                dim_names = [str(dim_name) for dim_name in dim_names]
                pvalues = [
                    getattr(attrs, 'dim%d_pvalues' % i)
                    for i in range(len(dim_names))
                ]
                axes = [
                    la.Axis(labels, axis_name)
                    for axis_name, labels in zip(dim_names, pvalues)
                ]
                array = la.LArray(array, axes)
            globals_data[name] = array

        input_entities = input_root.entities

        entities_tables = {}
        print(" * indexing tables")
        for ent_name, entity in entities.items():
            print("    -", ent_name, "...", end=' ')

            table = getattr(input_entities, ent_name)
            assert_valid_type(table, list(entity.fields.in_input.name_types))

            rows_per_period, id_to_rownum_per_period = \
                timed(index_table, table)
            indexed_table = IndexedTable(table, rows_per_period,
                                         id_to_rownum_per_period)
            entities_tables[ent_name] = indexed_table
    except:
        input_file.close()
        raise

    return input_file, {'globals': globals_data, 'entities': entities_tables}
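
A hedged usage sketch (the path and the globals_def/entities structures are placeholders coming from the LIAM2 simulation file; only the call pattern is shown):

# hypothetical call, for illustration only
h5file, data = index_tables(globals_def, entities, 'input/simulation.h5')
try:
    globals_data = data['globals']               # dict of name -> array / LArray
    person_table = data['entities']['person']    # IndexedTable for a hypothetical 'person' entity
finally:
    h5file.close()                               # the caller owns the open HDF5 file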