def load_csv(file, create_new_on=MakeStatus.Incompatible, **kwargs):
    """Load an Orange.data.Table from a csv file.

    The csv dialect and the presence of a header are sniffed from the
    first 5MB of the file.  Variable types and annotations are read from
    the optional second and third rows (standard orange tab format);
    missing definitions are deduced from the column values.

    Parameters
    ----------
    file : file-like or str
        An open file object or something `as_open_file` accepts.
    create_new_on : MakeStatus
        Status passed to `make` when reusing/creating feature descriptors.

    Returns
    -------
    Orange.data.Table
        The loaded table, with `attributeLoadStatus` and
        `metaAttributeLoadStatus` attributes set on it.
    """
    import csv
    file = as_open_file(file, "rb")
    snifer = csv.Sniffer()
    # Max 5MB sample.
    # TODO: What if this is not enough. Try with a bigger sample.
    sample = file.read(5 * 2 ** 20)
    dialect = snifer.sniff(sample)
    has_header = snifer.has_header(sample)
    file.seek(0)  # Rewind

    reader = csv.reader(file, dialect=dialect)

    header = types = var_attrs = None

    header = reader.next()
    if header:
        # Try to get variable definitions (second row).
        types_row = reader.next()
        if is_var_types_row(types_row):
            types = var_types(types_row)

    if types:
        # Try to get the variable attributes
        # (third line in the standard orange tab format).
        labels_row = reader.next()
        if is_var_attributes_row(labels_row):
            var_attrs = var_attributes(labels_row)

    # If definitions not present fill with blanks.
    if not types:
        types = [None] * len(header)
    if not var_attrs:
        var_attrs = [None] * len(header)

    # Start from the beginning.
    file.seek(0)
    reader = csv.reader(file, dialect=dialect)

    for defined in [header, types, var_attrs]:
        if any(defined):
            # Skip definition rows if present in the file.
            reader.next()

    variables = []
    undefined_vars = []
    for i, (name, var_t) in enumerate(zip(header, types)):
        if var_t == variable.Discrete:
            # We do not have the values yet; collect them while reading.
            variables.append(_disc_placeholder(name))
            undefined_vars.append((i, variables[-1]))
        elif var_t == variable.Continuous:
            variables.append(make(name, Orange.feature.Type.Continuous, [], [],
                                  create_new_on))
        elif var_t == variable.String:
            variables.append(make(name, Orange.feature.Type.String, [], [],
                                  create_new_on))
        elif var_t == variable.Python:
            variables.append(variable.Python(name))
        elif isinstance(var_t, tuple):
            # A (type, values) pair from the types row.
            var_t, values = var_t
            if var_t == variable.Discrete:
                variables.append(make(name, Orange.feature.Type.Discrete,
                                      values, [], create_new_on))
            elif var_t == variable.Python:
                raise NotImplementedError()
            # NOTE(review): any other type inside a tuple appends nothing,
            # which would misalign `variables` with `header` — presumably
            # `var_types` only emits Discrete/Python tuples; confirm.
        elif var_t is None:
            # Unknown type; deduce it from the collected values at the end.
            variables.append(_var_placeholder(name))
            undefined_vars.append((i, variables[-1]))

    # Read all data rows, collecting the values of undefined variables.
    data = []
    for row in reader:
        data.append(row)
        for ind, var_def in undefined_vars:
            var_def.values.add(row[ind])

    # Resolve the placeholders now that all column values are known.
    for ind, var_def in undefined_vars:
        values = var_def.values - set(["?", ""])  # TODO: Other unknown strings?
        values = sorted(values)
        if isinstance(var_def, _disc_placeholder):
            variables[ind] = make(var_def.name, Orange.feature.Type.Discrete,
                                  [], values, create_new_on)
        elif isinstance(var_def, _var_placeholder):
            if is_variable_cont(values):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Continuous, [], [],
                                      create_new_on)
            elif is_variable_discrete(values):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Discrete, [], values,
                                      create_new_on)
            elif is_variable_string(values):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.String, [], [],
                                      create_new_on)
            else:
                raise ValueError("Strange column in the data")

    attribute_load_status = []
    meta_attribute_load_status = {}
    class_var_load_status = []

    attributes = []
    class_var = []
    metas = {}
    attribute_indices = []
    class_indices = []
    meta_indices = []
    # Distribute the variables into attributes/class/metas based on the
    # annotation row flags.
    for i, ((var, status), var_attr) in enumerate(zip(variables, var_attrs)):
        if var_attr:
            flag, attrs = var_attr
            if flag == "class":
                class_var.append(var)
                class_var_load_status.append(status)
                class_indices.append(i)
            elif flag == "meta":
                mid = Orange.feature.Descriptor.new_meta_id()
                metas[mid] = var
                meta_attribute_load_status[mid] = status
                meta_indices.append((i, var))
            else:
                attributes.append(var)
                attribute_load_status.append(status)
                attribute_indices.append(i)
            var.attributes.update(attrs)
        else:
            attributes.append(var)
            attribute_load_status.append(status)
            attribute_indices.append(i)

    if len(class_var) > 1:
        raise ValueError("Multiple class variables defined")

    class_var = class_var[0] if class_var else None

    attribute_load_status += class_var_load_status
    variable_indices = attribute_indices + class_indices
    domain = Orange.data.Domain(attributes, class_var)
    domain.add_metas(metas)
    normal = [[row[i] for i in variable_indices] for row in data]
    meta_part = [[row[i] for i, _ in meta_indices] for row in data]
    table = Orange.data.Table(domain, normal)
    for ex, m_part in zip(table, meta_part):
        for (column, var), val in zip(meta_indices, m_part):
            ex[var] = var(val)

    table.setattr("metaAttributeLoadStatus", meta_attribute_load_status)
    table.setattr("attributeLoadStatus", attribute_load_status)
    return table
def load_csv(file, create_new_on=MakeStatus.Incompatible,
             delimiter=None, quotechar=None, escapechar=None,
             skipinitialspace=None, has_header=None,
             has_types=None, has_annotations=None,
             has_simplified_header=False, DK=None, **kwargs):
    """Load an Orange.data.Table from a csv file.

    Parameters
    ----------
    file : file-like or str
        An open file or something `as_open_file` accepts.
    create_new_on : MakeStatus
        Status passed to `make` when reusing/creating feature descriptors.
    delimiter, quotechar, escapechar, skipinitialspace :
        Optional csv format parameters; any that are None are taken from
        the sniffed dialect instead.
    has_header, has_types, has_annotations : bool or None
        Whether the file contains a header/types/annotations row.  When
        None, presence is guessed from the file contents.
    has_simplified_header : bool
        The header row also encodes types and annotations (mutually
        exclusive with explicit `has_types`/`has_annotations`).
    DK : str or None
        Comma separated string of values to treat as missing (default
        flags: "?", "", "NA", "~", "*").

    Returns
    -------
    Orange.data.Table
        The loaded table, with `attributeLoadStatus` and
        `metaAttributeLoadStatus` attributes set on it.

    Raises
    ------
    ValueError
        On conflicting header options or multiple/conflicting class
        variable definitions.
    VariableDefinitionError
        When the types or annotations row cannot be parsed.
    """
    file = as_open_file(file, "rU")
    snifer = csv.Sniffer()

    # Max 5MB sample.
    # TODO: What if this is not enough. Try with a bigger sample.
    sample = file.read(5 * 2 ** 20)
    try:
        dialect = snifer.sniff(sample)
    except csv.Error:
        # Try the default; hope the provided arguments are correct.
        dialect = "excel"

    if has_header is None:
        try:
            has_header = snifer.has_header(sample)
        except csv.Error:
            has_header = False

    file.seek(0)  # Rewind

    def kwparams(**kwargs):
        """Return only the kwargs that are not None."""
        return dict([(k, v) for k, v in kwargs.items() if v is not None])

    # Explicitly supplied (non-None) format parameters override the dialect.
    fmtparam = kwparams(delimiter=delimiter,
                        quotechar=quotechar,
                        escapechar=escapechar,
                        skipinitialspace=skipinitialspace)

    reader = csv.reader(file, dialect=dialect, **fmtparam)

    header = types = var_attrs = None

    row = first_row = reader.next()

    if has_simplified_header and (has_types or has_annotations):
        raise ValueError("'has_simplified_header' and 'has_types', "
                         "'has_annotations' are exclusive")

    if has_header and not has_simplified_header:
        header = row
        # Consume this row and move to the next.
        row = reader.next()
    elif has_header and has_simplified_header:
        # The header row also carries the type/annotation definitions.
        header, types, var_attrs = parse_simplified_header(row)
        row = reader.next()

    # Guess the types row.
    if has_types is None and not has_simplified_header:
        has_types = has_header and is_var_types_row(row)

    if has_types:
        try:
            types = var_types(row)
        except ValueError as err:
            raise VariableDefinitionError(*err.args)
        # Consume this row and move to the next.
        row = reader.next()

    # Guess the variable annotations row.
    if has_annotations is None and not has_simplified_header:
        has_annotations = has_header and has_types and \
                          is_var_attributes_row(row)

    if has_annotations:
        try:
            var_attrs = var_attributes(row)
        except ValueError as err:
            raise VariableDefinitionError(*err.args)
        # Consume this row and move to the next.
        row = reader.next()

    if not header:
        # Create a default header.
        header = ["F_%i" % i for i in range(len(first_row))]

    if not types:
        # Create blank variable types.
        types = [None] * len(header)

    if not var_attrs:
        # Create blank variable attributes.
        var_attrs = [None] * len(header)
    else:
        # Pad var_attrs if it is not complete
        # (orange tab format allows this line to be shorter than header).
        if len(var_attrs) < len(header):
            var_attrs += [None] * (len(header) - len(var_attrs))

    # Start from the beginning.
    file.seek(0)
    reader = csv.reader(file, dialect=dialect, **fmtparam)

    for defined in [has_header, has_types, has_annotations]:
        if defined:
            # Skip definition rows if present in the file.
            reader.next()

    variables = []
    undefined_vars = []

    # Missing value flags.
    missing_flags = DK.split(",") if DK is not None else \
                    ["?", "", "NA", "~", "*"]
    missing_map = dict.fromkeys(missing_flags, "?")

    def missing_translate(val):
        """Map any missing-value flag to "?"; leave other values alone."""
        return missing_map.get(val, val)

    # Create domain variables or corresponding place holders.
    for i, (name, var_t) in enumerate(zip(header, types)):
        if var_t == variable.Discrete:
            # We do not have the values yet; collect them while reading.
            variables.append(_disc_placeholder(name))
            undefined_vars.append((i, variables[-1]))
        elif var_t == variable.Continuous:
            variables.append(make(name, Orange.feature.Type.Continuous, [], [],
                                  create_new_on))
        elif var_t == variable.String:
            variables.append(make(name, Orange.feature.Type.String, [], [],
                                  create_new_on))
        elif var_t == variable.Python:
            variables.append(variable.Python(name))
        elif isinstance(var_t, tuple):
            var_t, values = var_t
            if var_t == variable.Discrete:
                # We already have the values for the discrete variable.
                variables.append(make(name, Orange.feature.Type.Discrete,
                                      values, [], create_new_on))
            elif var_t == variable.Python:
                # Python variables are not supported yet.
                raise NotImplementedError()
        elif var_t is None or var_t is variable.Descriptor:
            # Unknown variable type, to be deduced at the end.
            variables.append(_var_placeholder(name))
            undefined_vars.append((i, variables[-1]))

    data = []
    # Read all the rows.
    for i, row in enumerate(reader):
        # An empty row signals the final newline - skip it.
        if row:
            row = map(missing_translate, row)
            if len(row) != len(header):
                warnings.warn("row {} has {} cells, expected {}.".format(
                                  i, len(row), len(header)),
                              CSVFormatError, stacklevel=2)
                # Pad or strip the row to ensure it has the same length.
                if len(row) < len(header):
                    row += ["?"] * (len(header) - len(row))
                elif len(row) > len(header):
                    row = row[:len(header)]
            data.append(row)
            # For undefined variables collect all their values.
            for ind, var_def in undefined_vars:
                var_def.values.add(row[ind])

    # Process undefined variables now that we can deduce their type.
    for ind, var_def in undefined_vars:
        values = var_def.values - set(missing_flags)
        values = sorted(values)
        if isinstance(var_def, _disc_placeholder):
            variables[ind] = make(var_def.name, Orange.feature.Type.Discrete,
                                  [], values, create_new_on)
        elif isinstance(var_def, _var_placeholder):
            if is_variable_cont(values, cutoff=1.0):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Continuous, [], [],
                                      create_new_on)
            elif is_variable_discrete(values, cutoff=0.0):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.Discrete, [], values,
                                      create_new_on)
            elif is_variable_string(values):
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.String, [], [],
                                      create_new_on)
            else:
                # Treat it as a string anyway.
                variables[ind] = make(var_def.name,
                                      Orange.feature.Type.String, [], [],
                                      create_new_on)

    attribute_load_status = []
    meta_attribute_load_status = {}
    class_var_load_status = []
    multiclass_var_load_status = []

    attributes = []
    class_var = []
    class_vars = []
    metas = {}
    attribute_indices = []
    class_indices = []
    multiclass_indices = []
    meta_indices = []
    ignore_indices = []
    # Distribute the variables into attributes/class/multiclass/metas
    # based on the annotation row flags; unflagged columns are attributes.
    for i, ((var, status), var_attr) in enumerate(zip(variables, var_attrs)):
        if var_attr:
            flag, attrs = var_attr
            if flag == "class":
                class_var.append(var)
                class_var_load_status.append(status)
                class_indices.append(i)
            elif flag == "multiclass":
                class_vars.append(var)
                multiclass_var_load_status.append(status)
                multiclass_indices.append(i)
            elif flag == "meta":
                mid = Orange.feature.Descriptor.new_meta_id()
                metas[mid] = var
                meta_attribute_load_status[mid] = status
                meta_indices.append((i, var))
            elif flag == "ignore":
                ignore_indices.append(i)
            else:
                attributes.append(var)
                attribute_load_status.append(status)
                attribute_indices.append(i)
            var.attributes.update(attrs)
        else:
            attributes.append(var)
            attribute_load_status.append(status)
            attribute_indices.append(i)

    if len(class_var) > 1:
        raise ValueError("Multiple class variables defined")
    if class_var and class_vars:
        raise ValueError("Both 'class' and 'multiclass' used.")

    class_var = class_var[0] if class_var else None

    attribute_load_status += class_var_load_status
    variable_indices = attribute_indices + class_indices
    domain = Orange.data.Domain(attributes, class_var, class_vars=class_vars)
    domain.add_metas(metas)
    normal = [[row[i] for i in variable_indices] for row in data]
    meta_part = [[row[i] for i, _ in meta_indices] for row in data]
    multiclass_part = [[row[i] for i in multiclass_indices] for row in data]
    table = Orange.data.Table(domain, normal)
    for ex, m_part, mc_part in zip(table, meta_part, multiclass_part):
        for (column, var), val in zip(meta_indices, m_part):
            ex[var] = var(val)
        if mc_part:
            ex.set_classes(mc_part)

    table.setattr("metaAttributeLoadStatus", meta_attribute_load_status)
    table.setattr("attributeLoadStatus", attribute_load_status)
    return table