Example #1
0
    def load(self, fpath):
        from exprparser import parse

        with open(os.path.join(config.input_directory, fpath), "rb") as f:
            reader = csv.reader(f)
            lines = skip_comment_cells(strip_rows(reader))
            header = lines.next()
            self.expressions = [parse(s, autovariables=True) for s in header]
            table = []
            for line in lines:
                if any(value == "" for value in line):
                    raise Exception("empty cell found in %s" % fpath)
                table.append([eval(value) for value in line])
        ndim = len(header)
        unique_last_d, dupe_last_d = unique_duplicate(table.pop(0))
        if dupe_last_d:
            print(
                "Duplicate column header value(s) (for '%s') in '%s': %s"
                % (header[-1], fpath, ", ".join(str(v) for v in dupe_last_d))
            )
            raise Exception(
                "bad alignment data in '%s': found %d " "duplicate column header value(s)" % (fpath, len(dupe_last_d))
            )

        # strip the ndim-1 first columns
        headers = [[line.pop(0) for line in table] for _ in range(ndim - 1)]

        possible_values = [list(unique(values)) for values in headers]
        if ndim > 1:
            # having duplicate values is normal when there are more than 2
            # dimensions but we need to test whether there are duplicates of
            # combinations.
            dupe_combos = list(duplicates(zip(*headers)))
            if dupe_combos:
                print("Duplicate row header value(s) in '%s':" % fpath)
                print(PrettyTable(dupe_combos))
                raise Exception(
                    "bad alignment data in '%s': found %d " "duplicate row header value(s)" % (fpath, len(dupe_combos))
                )

        possible_values.append(unique_last_d)
        self.possible_values = possible_values
        self.probabilities = list(chain.from_iterable(table))
        num_possible_values = prod(len(values) for values in possible_values)
        if len(self.probabilities) != num_possible_values:
            raise Exception(
                "incoherent alignment data in '%s': %d data cells "
                "found while it should be %d based on the number "
                "of possible values in headers (%s)"
                % (
                    fpath,
                    len(self.probabilities),
                    num_possible_values,
                    " * ".join(str(len(values)) for values in possible_values),
                )
            )
Example #2
0
def load_ndarray(fpath, celltype=None):
    print(" - reading", fpath)
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        header = line_stream.next()
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)

    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print(("Duplicate column header value(s) (for '%s') in '%s': %s"
              % (header[-1], fpath,
                 ", ".join(str(v) for v in dupe_last_d))))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)"
                        % (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in str_table]
               for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print(("Duplicate row header value(s) in '%s':" % fpath))
            print((PrettyTable(dupe_combos)))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)"
                            % (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues))) for pvalues in headers]
    possible_values.append(np.array(unique_last_d))

    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception("incoherent data in '%s': %d data cells "
                        "found while it should be %d based on the number "
                        "of possible values in headers (%s)"
                        % (fpath,
                           len(str_table),
                           num_possible_values,
                           ' * '.join(str(len(values))
                                      for values in possible_values)))

    #TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return LabeledArray(array.reshape(shape), header, possible_values)
Example #3
0
def load_ndarray(fpath, celltype=None):
    print(" - reading", fpath)
    with open(fpath, "rb") as f:
        reader = csv.reader(f)
        line_stream = skip_comment_cells(strip_rows(reader))
        header = line_stream.next()
        str_table = []
        for line in line_stream:
            if any(value == '' for value in line):
                raise Exception("empty cell found in %s" % fpath)
            str_table.append(line)
    ndim = len(header)

    # handle last dimension header (horizontal values)
    last_d_header = str_table.pop(0)
    # auto-detect type of values for the last d and convert them
    last_d_pvalues = convert_1darray(last_d_header)

    unique_last_d, dupe_last_d = unique_duplicate(last_d_pvalues)
    if dupe_last_d:
        print(("Duplicate column header value(s) (for '%s') in '%s': %s" %
               (header[-1], fpath, ", ".join(str(v) for v in dupe_last_d))))
        raise Exception("bad data in '%s': found %d "
                        "duplicate column header value(s)" %
                        (fpath, len(dupe_last_d)))

    # handle other dimensions header

    # strip the ndim-1 first columns
    headers = [[line.pop(0) for line in str_table] for _ in range(ndim - 1)]
    headers = [convert_1darray(pvalues_str) for pvalues_str in headers]
    if ndim > 1:
        # having duplicate values is normal when there are more than 2
        # dimensions but we need to test whether there are duplicates of
        # combinations.
        dupe_combos = list(duplicates(zip(*headers)))
        if dupe_combos:
            print(("Duplicate row header value(s) in '%s':" % fpath))
            print((PrettyTable(dupe_combos)))
            raise Exception("bad alignment data in '%s': found %d "
                            "duplicate row header value(s)" %
                            (fpath, len(dupe_combos)))

    possible_values = [np.array(list(unique(pvalues))) for pvalues in headers]
    possible_values.append(np.array(unique_last_d))

    shape = tuple(len(values) for values in possible_values)
    num_possible_values = prod(shape)

    # transform the 2d table into a 1d list
    str_table = list(chain.from_iterable(str_table))
    if len(str_table) != num_possible_values:
        raise Exception(
            "incoherent data in '%s': %d data cells "
            "found while it should be %d based on the number "
            "of possible values in headers (%s)" %
            (fpath, len(str_table), num_possible_values, ' * '.join(
                str(len(values)) for values in possible_values)))

    # TODO: compare time with numpy built-in conversion:
    # if dtype is None, numpy tries to detect the best type itself
    # which it does a good job of if the values are already numeric values
    # if dtype is provided, numpy does a good job to convert from string
    # values.
    if celltype is None:
        celltype = detect_column_type(str_table)
    data = convert_1darray(str_table, celltype)
    array = np.array(data, dtype=celltype)
    return LabeledArray(array.reshape(shape), header, possible_values)