Example #1
0
 def keys(self, extra=True):
     res = list(self.entity.array.dtype.names)
     res.extend(sorted(self.entity.temp_variables.keys()))
     if extra:
         res.extend(sorted(self.extra.keys()))
     # in theory, this should not be needed because we present defining a function argument with the same
     # name than an entity field
     assert list(unique(res)) == res
     return res
Example #2
0
 def keys(self, extra=True):
     res = list(self.entity.array.dtype.names)
     res.extend(sorted(self.entity.temp_variables.keys()))
     if extra:
         res.extend(sorted(self.extra.keys()))
     # in theory, this should not be needed because we present defining a function argument with the same
     # name than an entity field
     assert list(unique(res)) == res
     return res
Example #3
0
    def _modified_fields(self):
        fnames = self.predictors
        if not fnames:
            return []

        fnames.insert(0, 'id')
        temp = self.entity.temp_variables
        array = self.entity.array
        length = len(array)

        fields = [(k, temp[k] if k in temp else array[k])
                  for k in utils.unique(fnames)]
        return [(k, v) for k, v in fields
                if isinstance(v, np.ndarray) and v.shape == (length, )]
Example #4
0
def load_ndarray(fpath, celltype=None):
    print(" - reading", fpath)
    # FIXME: implement celltype
    a = la.read_csv(fpath, dialect='liam2')
    # print(a.info)
    return a
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        header = line_stream.next()
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)

    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print(("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath,
                 ", ".join(str(v) for v in dupe_last_d))))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in str_table]
               for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print(("Duplicate row header value(s) in '%s':" % fpath))
            print((PrettyTable(dupe_combos)))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues))) for pvalues in headers]
    possible_values.append(np.array(unique_last_d))

    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception("incoherent data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath,
                           len(str_table),
                           num_possible_values,
                           ' * '.join(str(len(values))
                                      for values in possible_values)))

    # TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return la.LArray(array.reshape(shape), header, possible_values)
Example #5
0
def load_ndarray(fpath, celltype=None):
    print(" - reading", fpath)
    # FIXME: implement celltype
    a = la.read_csv(fpath, dialect='liam2')
    # print(a.info)
    return a
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        header = line_stream.next()
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)

    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print(("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath,
                 ", ".join(str(v) for v in dupe_last_d))))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in str_table]
               for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print(("Duplicate row header value(s) in '%s':" % fpath))
            print((PrettyTable(dupe_combos)))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues))) for pvalues in headers]
    possible_values.append(np.array(unique_last_d))

    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception("incoherent data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath,
                           len(str_table),
                           num_possible_values,
                           ' * '.join(str(len(values))
                                      for values in possible_values)))

    # TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return la.LArray(array.reshape(shape), header, possible_values)