コード例 #1
0
ファイル: io.py プロジェクト: electricFeel/BeatKeeperHRM
def load_csv(file, create_new_on=MakeStatus.Incompatible, **kwargs):
    """Load an Orange.data.Table from a csv file.

    The csv dialect is sniffed from (at most) the first 5MB of the file.
    The first row is always used as the header (variable names). The second
    row is used as the variable types row when ``is_var_types_row`` accepts
    it and, only in that case, the third row is used as the variable
    attributes row when ``is_var_attributes_row`` accepts it. Variables whose
    type is not given (or is given as discrete without values) are resolved
    after all data rows were read, from the set of values found in their
    column.

    :param file: passed to ``as_open_file`` together with the mode ``"rb"``.
    :param create_new_on: forwarded unchanged to every ``make`` call.
    :param kwargs: accepted for interface compatibility; not used.
    :returns: the loaded table, with the ``"metaAttributeLoadStatus"`` and
        ``"attributeLoadStatus"`` attributes set on it.
    :raises ValueError: if a column with undeclared type is not recognised
        as continuous, discrete or string, or if more than one variable is
        flagged as ``"class"``.
    :raises NotImplementedError: if a types cell declares a Python variable
        together with values.
    :raises StopIteration: if the file ends while the header or the
        candidate definition rows are being read (e.g. a header-only file).
    """
    import csv

    file = as_open_file(file, "rb")
    sniffer = csv.Sniffer()
    # Max 5MB sample.
    # TODO: What if this is not enough. Try with a bigger sample
    sample = file.read(5 * 2**20)
    dialect = sniffer.sniff(sample)
    file.seek(0)  # Rewind
    reader = csv.reader(file, dialect=dialect)

    types = var_attrs = None

    header = next(reader)

    if header:
        # Try to get variable definitions
        types_row = next(reader)
        if is_var_types_row(types_row):
            types = var_types(types_row)

    if types:
        # Try to get the variable attributes
        # (third line in the standard orange tab format).
        labels_row = next(reader)
        if is_var_attributes_row(labels_row):
            var_attrs = var_attributes(labels_row)

    # If definitions not present fill with blanks
    if not types:
        types = [None] * len(header)
    if not var_attrs:
        var_attrs = [None] * len(header)

    # The rows probed above may have been data rows, so start again from the
    # beginning and skip only the definition rows that are really present.
    file.seek(0)
    reader = csv.reader(file, dialect=dialect)
    for defined in [header, types, var_attrs]:
        if any(defined):  # skip definition rows if present in the file
            next(reader)

    # Entries are the results of ``make`` (unpacked below as
    # ``(variable, status)`` pairs) or placeholders that get replaced once
    # the column values are known.
    # NOTE(review): ``variable.Python(name)`` is appended as is, while the
    # loop further down unpacks every entry as a pair -- confirm that
    # Python variables are actually usable here.
    variables = []
    undefined_vars = []
    for i, (name, var_t) in enumerate(zip(header, types)):
        if var_t == variable.Discrete:  # We do not have values yet.
            variables.append(_disc_placeholder(name))
            undefined_vars.append((i, variables[-1]))
        elif var_t == variable.Continuous:
            variables.append(
                make(name, Orange.feature.Type.Continuous, [], [],
                     create_new_on))
        elif var_t == variable.String:
            variables.append(
                make(name, Orange.feature.Type.String, [], [], create_new_on))
        elif var_t == variable.Python:
            variables.append(variable.Python(name))
        elif isinstance(var_t, tuple):
            var_t, values = var_t
            if var_t == variable.Discrete:
                variables.append(
                    make(name, Orange.feature.Type.Discrete, values, [],
                         create_new_on))
            elif var_t == variable.Python:
                raise NotImplementedError()
        elif var_t is None:
            variables.append(_var_placeholder(name))
            undefined_vars.append((i, variables[-1]))

    # Read all the rows, collecting the values of the undefined variables
    data = []
    for row in reader:
        data.append(row)
        for ind, var_def in undefined_vars:
            var_def.values.add(row[ind])

    # Resolve the placeholders now that all values are known
    for ind, var_def in undefined_vars:
        # TODO: Other unknown strings?
        values = sorted(var_def.values - set(["?", ""]))
        if isinstance(var_def, _disc_placeholder):
            variables[ind] = make(var_def.name, Orange.feature.Type.Discrete,
                                  [], values, create_new_on)
        elif isinstance(var_def, _var_placeholder):
            if is_variable_cont(values):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Continuous, [], [],
                                      create_new_on)
            elif is_variable_discrete(values):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Discrete, [], values,
                                      create_new_on)
            elif is_variable_string(values):
                variables[ind] = make(var_def.name, Orange.feature.Type.String,
                                      [], [], create_new_on)
            else:
                raise ValueError("Strange column in the data")

    attribute_load_status = []
    meta_attribute_load_status = {}
    class_var_load_status = []

    attributes = []
    class_var = []
    metas = {}
    attribute_indices = []
    class_indices = []
    meta_indices = []
    # Split the variables into attributes, class and metas according to the
    # flag found in the variable attributes row (plain attribute by default).
    for i, ((var, status), var_attr) in enumerate(zip(variables, var_attrs)):
        if var_attr:
            flag, attrs = var_attr
            if flag == "class":
                class_var.append(var)
                class_var_load_status.append(status)
                class_indices.append(i)
            elif flag == "meta":
                mid = Orange.feature.Descriptor.new_meta_id()
                metas[mid] = var
                meta_attribute_load_status[mid] = status
                meta_indices.append((i, var))
            else:
                attributes.append(var)
                attribute_load_status.append(status)
                attribute_indices.append(i)
            var.attributes.update(attrs)
        else:
            attributes.append(var)
            attribute_load_status.append(status)
            attribute_indices.append(i)

    if len(class_var) > 1:
        raise ValueError("Multiple class variables defined")

    class_var = class_var[0] if class_var else None

    # The class column comes last, both in the status list and in the rows
    attribute_load_status += class_var_load_status
    variable_indices = attribute_indices + class_indices
    domain = Orange.data.Domain(attributes, class_var)
    domain.add_metas(metas)
    normal = [[row[i] for i in variable_indices] for row in data]
    meta_part = [[row[i] for i, _ in meta_indices] for row in data]
    table = Orange.data.Table(domain, normal)
    # Meta values are not part of the rows given to the table constructor;
    # set them on each instance afterwards.
    for ex, m_part in zip(table, meta_part):
        for (column, var), val in zip(meta_indices, m_part):
            ex[var] = var(val)

    table.setattr("metaAttributeLoadStatus", meta_attribute_load_status)
    table.setattr("attributeLoadStatus", attribute_load_status)

    return table
コード例 #2
0
def load_csv(file,
             create_new_on=MakeStatus.Incompatible,
             delimiter=None,
             quotechar=None,
             escapechar=None,
             skipinitialspace=None,
             has_header=None,
             has_types=None,
             has_annotations=None,
             has_simplified_header=False,
             DK=None,
             **kwargs):
    """Load an Orange.data.Table from a csv file.

    The csv dialect is sniffed from (at most) the first 5MB of the file;
    if sniffing fails the ``"excel"`` dialect is used. Variables whose type
    is not declared (or is declared discrete without values) are resolved
    after all data rows were read, from the set of values found in their
    column.

    :param file: passed to ``as_open_file`` together with the mode ``"rU"``.
    :param create_new_on: forwarded unchanged to every ``make`` call.
    :param delimiter: if not None, passed to ``csv.reader`` in addition to
        the dialect.
    :param quotechar: if not None, passed to ``csv.reader``.
    :param escapechar: if not None, passed to ``csv.reader``.
    :param skipinitialspace: if not None, passed to ``csv.reader``.
    :param has_header: whether the first row holds the variable names. If
        None it is guessed with ``csv.Sniffer.has_header`` (False when the
        sniffer raises ``csv.Error``). Without a header the variables are
        named ``F_0``, ``F_1``, ...
    :param has_types: whether the row after the header holds variable
        types. If None (and no simplified header is used) it is guessed
        with ``is_var_types_row``, and only when there is a header.
    :param has_annotations: whether the next row holds variable attributes.
        If None (and no simplified header is used) it is guessed with
        ``is_var_attributes_row``, and only when there are header and types.
    :param has_simplified_header: if true (and there is a header) the first
        row is parsed with ``parse_simplified_header`` to obtain names,
        types and attributes at once. Exclusive with ``has_types`` and
        ``has_annotations``.
    :param DK: comma separated string of the cell values to be treated as
        missing; by default ``"?"``, ``""``, ``"NA"``, ``"~"`` and ``"*"``.
    :param kwargs: accepted for interface compatibility; not used.
    :returns: the loaded table, with the ``"metaAttributeLoadStatus"`` and
        ``"attributeLoadStatus"`` attributes set on it.
    :raises ValueError: if ``has_simplified_header`` is combined with
        ``has_types`` or ``has_annotations``, if more than one variable is
        flagged ``"class"``, or if both ``"class"`` and ``"multiclass"``
        flags are used.
    :raises VariableDefinitionError: if the types or the attributes row can
        not be parsed.
    :raises NotImplementedError: if a types cell declares a Python variable
        together with values.
    :raises StopIteration: if the file ends while the header or definition
        rows are being read.

    A ``CSVFormatError`` warning is issued for every data row whose length
    differs from the header; such rows are padded with ``"?"`` or truncated.
    """

    file = as_open_file(file, "rU")
    sniffer = csv.Sniffer()

    # Max 5MB sample
    # TODO: What if this is not enough. Try with a bigger sample
    sample = file.read(5 * 2**20)
    try:
        dialect = sniffer.sniff(sample)
    except csv.Error:
        # try the default, hope the provided arguments are correct
        dialect = "excel"

    if has_header is None:
        try:
            has_header = sniffer.has_header(sample)
        except csv.Error:
            has_header = False

    file.seek(0)  # Rewind

    # Only the explicitly given (non-None) format parameters are passed on,
    # so that the sniffed dialect supplies the rest.
    requested = {
        "delimiter": delimiter,
        "quotechar": quotechar,
        "escapechar": escapechar,
        "skipinitialspace": skipinitialspace,
    }
    fmtparam = {key: val for key, val in requested.items()
                if val is not None}

    reader = csv.reader(file, dialect=dialect, **fmtparam)

    header = types = var_attrs = None

    row = first_row = next(reader)

    # Truthiness is used here for consistency with the checks below.
    if has_simplified_header and (has_types or has_annotations):
        raise ValueError("'has_simplified_header' and 'has_types', "
                         "'has_anotations' are exclusive'")

    if has_header and not has_simplified_header:
        header = row
        # Eat this row and move to the next
        row = next(reader)
    elif has_header and has_simplified_header:
        header, types, var_attrs = parse_simplified_header(row)
        row = next(reader)

    # Guess types row
    if has_types is None and not has_simplified_header:
        has_types = has_header and is_var_types_row(row)

    if has_types:
        try:
            types = var_types(row)
        except ValueError as err:
            raise VariableDefinitionError(*err.args)

        # Eat this row and move to the next
        row = next(reader)

    # Guess variable annotations row
    if has_annotations is None and not has_simplified_header:
        has_annotations = has_header and has_types and \
                          is_var_attributes_row(row)

    if has_annotations:
        try:
            var_attrs = var_attributes(row)
        except ValueError as err:
            raise VariableDefinitionError(*err.args)
        # Eat this row and move to the next
        row = next(reader)

    if not header:
        # Create a default header
        header = ["F_%i" % i for i in range(len(first_row))]

    if not types:
        # Create blank variable types
        types = [None] * len(header)

    if not var_attrs:
        # Create blank variable attributes
        var_attrs = [None] * len(header)
    elif len(var_attrs) < len(header):
        # Pad the vars_attrs if it is not complete
        # (orange tab format allows this line to be shorter then header).
        var_attrs += [None] * (len(header) - len(var_attrs))

    # The rows probed above may have been data rows, so start again from the
    # beginning and skip only the definition rows that are really present.
    file.seek(0)
    reader = csv.reader(file, dialect=dialect, **fmtparam)

    for defined in (has_header, has_types, has_annotations):
        if defined:
            next(reader)

    variables = []
    undefined_vars = []
    # Missing value flags
    if DK is not None:
        missing_flags = DK.split(",")
    else:
        missing_flags = ["?", "", "NA", "~", "*"]
    # Every missing value flag is rewritten to "?" when the rows are read
    missing_map = dict.fromkeys(missing_flags, "?")
    # "?" itself must be excluded from the collected values as well: it is
    # what the flags are rewritten to and what short rows are padded with,
    # and a user supplied DK does not necessarily list it.
    unknown_values = set(missing_flags)
    unknown_values.add("?")

    # Create domain variables or corresponding place holders.
    # Entries are the results of ``make`` (unpacked below as
    # ``(variable, status)`` pairs) or placeholders that get replaced once
    # the column values are known.
    # NOTE(review): ``variable.Python(name)`` is appended as is, while the
    # loop further down unpacks every entry as a pair -- confirm that
    # Python variables are actually usable here.
    for i, (name, var_t) in enumerate(zip(header, types)):
        if var_t == variable.Discrete:
            # We do not have values yet
            variables.append(_disc_placeholder(name))
            undefined_vars.append((i, variables[-1]))
        elif var_t == variable.Continuous:
            variables.append(
                make(name, Orange.feature.Type.Continuous, [], [],
                     create_new_on))
        elif var_t == variable.String:
            variables.append(
                make(name, Orange.feature.Type.String, [], [], create_new_on))
        elif var_t == variable.Python:
            variables.append(variable.Python(name))
        elif isinstance(var_t, tuple):
            var_t, values = var_t
            if var_t == variable.Discrete:
                # We have values for discrete variable
                variables.append(
                    make(name, Orange.feature.Type.Discrete, values, [],
                         create_new_on))
            elif var_t == variable.Python:
                # Python variables are not supported yet
                raise NotImplementedError()
        elif var_t is None or var_t is variable.Descriptor:
            # Unknown variable type, to be deduced at the end
            variables.append(_var_placeholder(name))
            undefined_vars.append((i, variables[-1]))

    data = []
    n_columns = len(header)
    # Read all the rows
    for i, row in enumerate(reader):
        # Empty rows (e.g. a final newline) are skipped
        if not row:
            continue

        # A real list is needed below (len, +=, slicing)
        row = [missing_map.get(cell, cell) for cell in row]
        if len(row) != n_columns:
            warnings.warn("row {} has {} cells, expected {}.".format(
                i, len(row), n_columns),
                          CSVFormatError,
                          stacklevel=2)
        # Pad or strip the row to ensure it has the same length
        if len(row) < n_columns:
            row += ["?"] * (n_columns - len(row))
        elif len(row) > n_columns:
            row = row[:n_columns]

        data.append(row)
        # For undefined variables collect all their values
        for ind, var_def in undefined_vars:
            var_def.values.add(row[ind])

    # Process undefined variables now that we can deduce their type
    for ind, var_def in undefined_vars:
        values = sorted(var_def.values - unknown_values)
        if isinstance(var_def, _disc_placeholder):
            variables[ind] = make(var_def.name, Orange.feature.Type.Discrete,
                                  [], values, create_new_on)
        elif isinstance(var_def, _var_placeholder):
            if is_variable_cont(values, cutoff=1.0):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Continuous, [], [],
                                      create_new_on)
            elif is_variable_discrete(values, cutoff=0.0):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Discrete, [], values,
                                      create_new_on)
            elif is_variable_string(values):
                variables[ind] = make(var_def.name, Orange.feature.Type.String,
                                      [], [], create_new_on)
            else:
                # Treat it as a string anyway
                variables[ind] = make(var_def.name, Orange.feature.Type.String,
                                      [], [], create_new_on)

    attribute_load_status = []
    meta_attribute_load_status = {}
    class_var_load_status = []
    multiclass_var_load_status = []

    attributes = []
    class_var = []
    class_vars = []
    metas = {}
    attribute_indices = []
    class_indices = []
    multiclass_indices = []
    meta_indices = []
    ignore_indices = []
    # Split the variables into attributes, class, multiple classes, metas
    # and ignored columns according to the flag found in the variable
    # attributes (plain attribute by default).
    for i, ((var, status), var_attr) in enumerate(zip(variables, var_attrs)):
        if var_attr:
            flag, attrs = var_attr
            if flag == "class":
                class_var.append(var)
                class_var_load_status.append(status)
                class_indices.append(i)
            elif flag == "multiclass":
                class_vars.append(var)
                multiclass_var_load_status.append(status)
                multiclass_indices.append(i)
            elif flag == "meta":
                mid = Orange.feature.Descriptor.new_meta_id()
                metas[mid] = var
                meta_attribute_load_status[mid] = status
                meta_indices.append((i, var))
            elif flag == "ignore":
                ignore_indices.append(i)
            else:
                attributes.append(var)
                attribute_load_status.append(status)
                attribute_indices.append(i)
            var.attributes.update(attrs)
        else:
            attributes.append(var)
            attribute_load_status.append(status)
            attribute_indices.append(i)

    if len(class_var) > 1:
        raise ValueError("Multiple class variables defined")
    if class_var and class_vars:
        raise ValueError("Both 'class' and 'multiclass' used.")

    class_var = class_var[0] if class_var else None

    # The class column comes last, both in the status list and in the rows
    attribute_load_status += class_var_load_status
    variable_indices = attribute_indices + class_indices
    domain = Orange.data.Domain(attributes, class_var, class_vars=class_vars)
    domain.add_metas(metas)
    normal = [[row[i] for i in variable_indices] for row in data]
    meta_part = [[row[i] for i, _ in meta_indices] for row in data]
    multiclass_part = [[row[i] for i in multiclass_indices] for row in data]
    table = Orange.data.Table(domain, normal)
    # Meta and multiclass values are not part of the rows given to the table
    # constructor; set them on each instance afterwards.
    for ex, m_part, mc_part in zip(table, meta_part, multiclass_part):
        for (column, var), val in zip(meta_indices, m_part):
            ex[var] = var(val)
        if mc_part:
            ex.set_classes(mc_part)

    table.setattr("metaAttributeLoadStatus", meta_attribute_load_status)
    table.setattr("attributeLoadStatus", attribute_load_status)

    return table