def add_new_class_label(self, undoable=True):
    """
    Append a fresh, not yet used label to ``self.class_model``.

    Candidate labels are drawn from ``namegen("C", 1)``; the first one that
    is not already contained in ``self.class_model`` is taken.

    If `undoable` is true the change is pushed onto ``self.undo_stack``,
    otherwise the command's ``redo`` is invoked directly.
    """
    def is_unused(label):
        return label not in self.class_model

    newlabel = next(filter(is_unused, namegen("C", 1)))

    def append_label():
        return self.class_model.append(newlabel)

    def drop_last_label():
        # Undo removes the last item of the model.
        return self.class_model.__delitem__(-1)

    command = SimpleUndoCommand(append_label, drop_last_label)
    if not undoable:
        command.redo()
    else:
        self.undo_stack.push(command)
def add_new_class_label(self, undoable=True):
    """
    Add a new class label, named with the first free name from
    ``namegen('C', 1)``, to ``self.class_model``.

    With `undoable` set, the operation goes through ``self.undo_stack``;
    without it, the command is executed immediately via ``redo``.
    """
    candidates = namegen('C', 1)
    newlabel = next(
        filter(lambda label: label not in self.class_model, candidates))

    def do_append():
        return self.class_model.append(newlabel)

    def undo_append():
        # Reverting the command deletes the model's last entry.
        return self.class_model.__delitem__(-1)

    command = SimpleUndoCommand(do_append, undo_append)
    if undoable:
        self.undo_stack.push(command)
        return
    command.redo()
def __init__(self, data: np.ndarray, ncols: int,
             header: _TableHeader, offset: int):
    """
    Remember the given inputs and start with empty column/variable lists.

    `data`, `ncols`, `header` and `offset` are stored unchanged on the
    instance. A name generator producing 'Feature ' names starting at 1 is
    created, together with empty lists for the X, Y, meta and W columns
    and for attribute, class and meta variables.
    """
    # Inputs, stored as passed in.
    self.data = data
    self.ncols = ncols
    self.header = header
    self.offset = offset

    # Source of names, created via namegen('Feature ', 1).
    self.namegen: Generator[str] = namegen('Feature ', 1)

    # Column accumulators; all start empty.
    self.cols_X: List[np.ndarray] = []
    self.cols_Y: List[np.ndarray] = []
    self.cols_M: List[np.ndarray] = []
    self.cols_W: List[np.ndarray] = []

    # Variable accumulators; all start empty.
    self.attrs: List[Variable] = []
    self.clses: List[Variable] = []
    self.metas: List[Variable] = []
def data_table(cls, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.

    Data rows longer than the header are truncated (a warning is issued);
    shorter rows are padded with empty strings.

    Raises
    ------
    ValueError
        If a column declared as continuous contains a value that cannot
        be converted to float.
    """
    if not headers:
        headers, data = cls.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if len(headers) == 3:
        names, types, flags = map(list, headers)
    else:
        if len(headers) == 1:
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[
                i.split(HEADER1_FLAG_SEP, 1)
                if HEADER1_FLAG_SEP in i else ('', i)
                for i in headers[0]
            ])
            names = list(names)
        elif len(headers) == 2:
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # Upper-case letters of a combined flag encode the type,
        # lower-case letters encode the flags.
        types = [''.join(filter(str.isupper, flag)).lower()
                 for flag in _flags]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    strip = False

    def _equal_length(lst):
        # Truncate to `rowlen` (only when headers exist) or pad with ''.
        nonlocal strip
        if len(lst) > rowlen > 0:
            lst = lst[:rowlen]
            strip = True
        elif len(lst) < rowlen:
            lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array
    data = [_equal_length([s.strip() for s in row])
            for row in data if any(row)]
    data = np.array(data, dtype=object, order='F')

    if strip:
        warnings.warn("Columns with no headers were removed.")

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Rename variables if necessary
    # Reusing across files still works if both files have same duplicates
    name_counts = Counter(names)
    del name_counts[""]  # Counter deletion does not raise for missing keys
    if len(name_counts) != len(names) and name_counts:
        uses = {name: 0
                for name, count in name_counts.items() if count > 1}
        for i, name in enumerate(names):
            if name in uses:
                uses[name] += 1
                names[i] = "{}_{}".format(name, uses[name])

    # Buffer reused as the `out` argument of isnastr for every column
    namask = np.empty(data.shape[0], dtype=bool)
    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # Columns carrying the `i` flag are skipped entirely
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = data[:, col]
        except IndexError:
            orig_values = np.array([], dtype=object)

        namask = isnastr(orig_values, out=namask)

        coltype_kwargs = {}
        valuemap = None
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            values = np.empty(data.shape[0], dtype=float)
            try:
                np.copyto(values, orig_values, casting="unsafe",
                          where=~namask)
                values[namask] = np.nan
            except ValueError:
                # Locate the offending row to report it to the user
                for row, num in enumerate(orig_values):
                    if not isnastr(num):
                        try:
                            float(num)
                        except ValueError:
                            break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
            values = np.where(namask, "", orig_values)
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            coltype = DiscreteVariable
            orig_values = values = np.where(namask, "", orig_values)
            if _RE_DISCRETE_LIST.match(type_flag):
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {""})
        else:
            # No known type specified, use heuristics
            valuemap, values, coltype = guess_data_type(orig_values, namask)

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        cols, domain_vars = append_to

        # Weight columns have no variable list and hence no variable
        if domain_vars is not None:
            var_name = names and names[col]
            if not var_name:
                var_name = next(NAMEGEN)

            values, var = sanitize_variable(
                valuemap, values, orig_values, coltype, coltype_kwargs,
                name=var_name)
            var.attributes.update(flag.attributes)
            domain_vars.append(var)
        else:
            var = None

        if isinstance(values, np.ndarray) and not values.flags.owndata:
            values = values.copy()  # might view `data` (string columns)
        cols.append(values)

        try:
            # allow gc to reclaim memory used by string values
            data[:, col] = None
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    # np.float64 is used instead of the np.float_ alias, which was
    # removed in NumPy 2.0.
    X = Y = M = W = None
    if Xcols:
        X = np.c_[tuple(Xcols)]
        assert X.dtype == np.float64
    else:
        X = np.empty((data.shape[0], 0), dtype=np.float64)
    if Ycols:
        Y = np.c_[tuple(Ycols)]
        assert Y.dtype == np.float64
    if Mcols:
        M = np.c_[tuple(Mcols)].astype(object)
    if Wcols:
        W = np.c_[tuple(Wcols)].astype(float)

    table = Table.from_numpy(domain, X, Y, M, W)
    return table
def data_table(cls, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
    as well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.

    Raises
    ------
    ValueError
        If a column declared as continuous contains a value that cannot
        be converted to float.
    """
    if not headers:
        headers, data = cls.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if len(headers) == 3:
        names, types, flags = map(list, headers)
    else:
        if len(headers) == 1:
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[
                i.split(HEADER1_FLAG_SEP, 1)
                if HEADER1_FLAG_SEP in i else ('', i)
                for i in headers[0]
            ])
            names = list(names)
        elif len(headers) == 2:
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # Upper-case letters of a combined flag encode the type,
        # lower-case letters encode the flags.
        types = [''.join(filter(str.isupper, flag)).lower()
                 for flag in _flags]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        # Pad in place with '' up to `rowlen`; longer lists are untouched.
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array.
    # No `copy=False` here: the array is built from a fresh list, which
    # always requires a copy, and NumPy >= 2 raises ValueError when
    # `copy=False` cannot be honoured.
    data = np.array(
        [_equal_length(list(row)) for row in data if any(row)],
        dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Rename variables if necessary
    # Reusing across files still works if both files have same duplicates
    name_counts = Counter(names)
    del name_counts[""]  # Counter deletion does not raise for missing keys
    if len(name_counts) != len(names) and name_counts:
        uses = {name: 0
                for name, count in name_counts.items() if count > 1}
        for i, name in enumerate(names):
            if name in uses:
                uses[name] += 1
                names[i] = "{}_{}".format(name, uses[name])

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # Columns carrying the `i` flag are skipped entirely
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = [np.nan if i in MISSING_VALUES else i
                           for i in (i.strip() for i in data[:, col])]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                # Locate the offending row to report it to the user
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            coltype = DiscreteVariable
            if _RE_DISCRETE_LIST.match(type_flag):
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})
        else:
            # No known type specified, use heuristics
            valuemap, values, coltype = guess_data_type(orig_values)

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        # Only the column index is recorded; the values stay in `data`
        cols, domain_vars = append_to
        cols.append(col)

        existing_var, new_var_name = None, None
        if domain_vars is not None:
            existing_var = names and names[col]
            if not existing_var:
                new_var_name = next(NAMEGEN)

        values, var = sanitize_variable(
            valuemap, values, orig_values, coltype, coltype_kwargs,
            domain_vars, existing_var, new_var_name, data)
        if domain_vars is not None:
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table
def data_table(cls, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.

    When headers exist, data rows wider than the header are cut to the
    header width and a warning is emitted; narrower rows are filled up
    with empty strings.

    Raises
    ------
    ValueError
        If a column whose type is given as continuous holds a value that
        ``float`` cannot parse.
    """
    if not headers:
        headers, data = cls.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if len(headers) == 3:
        names, types, flags = map(list, headers)
    else:
        if len(headers) == 1:
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1)
                                  if HEADER1_FLAG_SEP in i else ('', i)
                                  for i in headers[0]])
            names = list(names)
        elif len(headers) == 2:
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # The type is spelled with the upper-case letters of each combined
        # flag, the flags proper with the lower-case ones.
        types = [''.join(filter(str.isupper, flag)).lower()
                 for flag in _flags]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    strip = False

    def _equal_length(lst):
        # Cut to `rowlen` (only if there are headers) or pad with ''.
        nonlocal strip
        if len(lst) > rowlen > 0:
            lst = lst[:rowlen]
            strip = True
        elif len(lst) < rowlen:
            lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array
    data = [_equal_length([s.strip() for s in row])
            for row in data if any(row)]
    data = np.array(data, dtype=object, order='F')

    if strip:
        warnings.warn("Columns with no headers were removed.")

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Rename variables if necessary
    # Reusing across files still works if both files have same duplicates
    name_counts = Counter(names)
    del name_counts[""]  # a Counter tolerates deleting an absent key
    if len(name_counts) != len(names) and name_counts:
        uses = {name: 0
                for name, count in name_counts.items() if count > 1}
        for i, name in enumerate(names):
            if name in uses:
                uses[name] += 1
                names[i] = "{}_{}".format(name, uses[name])

    # Single boolean buffer handed to isnastr as `out` for each column
    namask = np.empty(data.shape[0], dtype=bool)
    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # `i`-flagged columns do not end up in the table
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = data[:, col]
        except IndexError:
            orig_values = np.array([], dtype=object)

        namask = isnastr(orig_values, out=namask)

        coltype_kwargs = {}
        valuemap = None
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            values = np.empty(data.shape[0], dtype=float)
            try:
                np.copyto(values, orig_values, casting="unsafe",
                          where=~namask)
                values[namask] = np.nan
            except ValueError:
                # Find the row at fault so the message can point to it
                for row, num in enumerate(orig_values):
                    if not isnastr(num):
                        try:
                            float(num)
                        except ValueError:
                            break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
            values = np.where(namask, "", orig_values)
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            coltype = DiscreteVariable
            orig_values = values = np.where(namask, "", orig_values)
            if _RE_DISCRETE_LIST.match(type_flag):
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {""})
        else:
            # No known type specified, use heuristics
            valuemap, values, coltype = guess_data_type(orig_values, namask)

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        cols, domain_vars = append_to

        # There is no variable list (and so no variable) for weights
        if domain_vars is not None:
            var_name = names and names[col]
            if not var_name:
                var_name = next(NAMEGEN)

            values, var = sanitize_variable(
                valuemap, values, orig_values, coltype, coltype_kwargs,
                name=var_name)
            var.attributes.update(flag.attributes)
            domain_vars.append(var)
        else:
            var = None

        if isinstance(values, np.ndarray) and not values.flags.owndata:
            values = values.copy()  # might view `data` (string columns)
        cols.append(values)

        try:
            # allow gc to reclaim memory used by string values
            data[:, col] = None
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    # The np.float_ alias no longer exists in NumPy 2.0; np.float64 is
    # the same type and available in all versions.
    X = Y = M = W = None
    if Xcols:
        X = np.c_[tuple(Xcols)]
        assert X.dtype == np.float64
    else:
        X = np.empty((data.shape[0], 0), dtype=np.float64)
    if Ycols:
        Y = np.c_[tuple(Ycols)]
        assert Y.dtype == np.float64
    if Mcols:
        M = np.c_[tuple(Mcols)].astype(object)
    if Wcols:
        W = np.c_[tuple(Wcols)].astype(float)

    table = Table.from_numpy(domain, X, Y, M, W)
    return table
def data_table(self, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
    as well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.

    Raises
    ------
    ValueError
        If a column declared as continuous contains a value that cannot
        be converted to float.
    """
    if not headers:
        headers, data = self.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if len(headers) == 3:
        names, types, flags = map(list, headers)
    else:
        if len(headers) == 1:
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1)
                                  if HEADER1_FLAG_SEP in i else ('', i)
                                  for i in headers[0]])
            names = list(names)
        elif len(headers) == 2:
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # Upper-case letters of a combined flag encode the type,
        # lower-case letters encode the flags.
        types = [''.join(filter(str.isupper, flag)).lower()
                 for flag in _flags]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        # Pad in place with '' up to `rowlen`; longer lists are untouched.
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array.
    # `copy=False` is deliberately not passed: the input is a new list, so
    # a copy is unavoidable, and NumPy >= 2 raises ValueError in that case.
    data = np.array(
        [_equal_length(list(row)) for row in data if any(row)],
        dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Rename variables if necessary
    # Reusing across files still works if both files have same duplicates
    name_counts = Counter(names)
    del name_counts[""]  # Counter deletion does not raise for missing keys
    if len(name_counts) != len(names) and name_counts:
        uses = {name: 0
                for name, count in name_counts.items() if count > 1}
        for i, name in enumerate(names):
            if name in uses:
                uses[name] += 1
                names[i] = "{}_{}".format(name, uses[name])

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # Columns carrying the `i` flag are skipped entirely
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = [np.nan if i in MISSING_VALUES else i
                           for i in (i.strip() for i in data[:, col])]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                # Locate the offending row to report it to the user
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            coltype = DiscreteVariable
            if _RE_DISCRETE_LIST.match(type_flag):
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})
        else:
            # No known type specified, use heuristics
            valuemap, values, coltype = guess_data_type(orig_values)

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        # Only the column index is recorded; the values stay in `data`
        cols, domain_vars = append_to
        cols.append(col)

        existing_var, new_var_name = None, None
        if domain_vars is not None:
            existing_var = names and names[col]
            if not existing_var:
                new_var_name = next(NAMEGEN)

        values, var = sanitize_variable(
            valuemap, values, orig_values, coltype, coltype_kwargs,
            domain_vars, existing_var, new_var_name, data)
        if domain_vars is not None:
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table
def data_table(self, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
    as well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.

    Raises
    ------
    ValueError
        If a column declared as continuous contains a value that cannot
        be converted to float.
    """
    if not headers:
        headers, data = self.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if len(headers) == 3:
        names, types, flags = map(list, headers)
    else:
        if len(headers) == 1:
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1)
                                  if HEADER1_FLAG_SEP in i else ('', i)
                                  for i in headers[0]])
            names = list(names)
        elif len(headers) == 2:
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # Upper-case letters of a combined flag encode the type,
        # lower-case letters encode the flags.
        types = [''.join(filter(str.isupper, flag)).lower()
                 for flag in _flags]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        # Pad in place with '' up to `rowlen`; longer lists are untouched.
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array.
    # No `copy=False` here: the array is built from a fresh list, which
    # always requires a copy, and NumPy >= 2 raises ValueError when
    # `copy=False` cannot be honoured.
    data = np.array(
        [_equal_length(list(row)) for row in data if any(row)],
        dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # Columns carrying the `i` flag are skipped entirely
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = [np.nan if i in MISSING_VALUES else i
                           for i in (i.strip() for i in data[:, col])]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                # Locate the offending row to report it to the user
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            # Set the type explicitly. The mapping step below assigns it
            # only for a non-empty value map, so an all-missing column
            # would otherwise use a stale or unbound `coltype`.
            coltype = DiscreteVariable
            if _RE_DISCRETE_LIST.match(type_flag):
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})
        else:
            # No known type specified, use heuristics
            is_discrete = is_discrete_values(orig_values)
            if is_discrete:
                valuemap = sorted(is_discrete)
            else:
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    tvar = TimeVariable('_')
                    try:
                        values = [tvar.parse(i) for i in orig_values]
                    except ValueError:
                        coltype = StringVariable
                    else:
                        coltype = TimeVariable
                else:
                    coltype = ContinuousVariable

        if valuemap:
            # Map discrete data to ints
            def valuemap_index(val):
                try:
                    return valuemap.index(val)
                except ValueError:
                    return np.nan

            values = np.vectorize(valuemap_index,
                                  otypes=[float])(orig_values)
            coltype = DiscreteVariable
            coltype_kwargs.update(values=valuemap)

        if coltype is StringVariable:
            values = ['' if i is np.nan else i for i in orig_values]

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        # Only the column index is recorded; the values stay in `data`
        cols, domain_vars = append_to
        cols.append(col)

        if domain_vars is not None:
            if names and names[col]:
                # Use existing variable if available
                var = coltype.make(names[col].strip(), **coltype_kwargs)
            else:
                # Never use existing for un-named variables
                var = coltype(next(NAMEGEN), **coltype_kwargs)
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

            # Reorder discrete values to match existing variable
            if var.is_discrete and not var.ordered:
                new_order = var.values
                old_order = coltype_kwargs.get('values', var.values)
                if new_order != old_order:
                    # Shift all codes past the valid range first, so that
                    # already remapped codes are not replaced a second time
                    offset = len(new_order)
                    column = values if data.ndim > 1 else data
                    column += offset
                    for val in var.values:
                        try:
                            oldval = old_order.index(val)
                        except ValueError:
                            continue
                        bn.replace(column, offset + oldval,
                                   new_order.index(val))

        if coltype is TimeVariable:
            # Re-parse the values because only now after coltype.make call
            # above, variable var is the correct one
            values = [var.parse(i) for i in orig_values]

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table
def data_table(self, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
    as well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.

    Raises
    ------
    ValueError
        If a column whose type is given as continuous holds a value that
        ``float`` cannot parse.
    """
    if not headers:
        headers, data = self.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if len(headers) == 3:
        names, types, flags = map(list, headers)
    else:
        if len(headers) == 1:
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[
                i.split(HEADER1_FLAG_SEP, 1)
                if HEADER1_FLAG_SEP in i else ('', i)
                for i in headers[0]
            ])
            names = list(names)
        elif len(headers) == 2:
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        # The type is spelled with the upper-case letters of each combined
        # flag, the flags proper with the lower-case ones.
        types = [
            ''.join(filter(str.isupper, flag)).lower() for flag in _flags
        ]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        # Extend in place with '' until `rowlen` items; never shortens.
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array.
    # `copy=False` is deliberately not passed: the input is a new list, so
    # a copy is unavoidable, and NumPy >= 2 raises ValueError in that case.
    data = np.array(
        [_equal_length(list(row)) for row in data if any(row)],
        dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            # `i`-flagged columns do not end up in the table
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = [
                np.nan if i in MISSING_VALUES else i
                for i in (i.strip() for i in data[:, col])
            ]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                # Find the row at fault so the message can point to it
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            # The type must be assigned here: when every value is missing
            # the value map stays empty, the mapping step below is skipped
            # and `coltype` would be stale or unbound.
            coltype = DiscreteVariable
            if _RE_DISCRETE_LIST.match(type_flag):
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})
        else:
            # No known type specified, use heuristics
            is_discrete = is_discrete_values(orig_values)
            if is_discrete:
                valuemap = sorted(is_discrete)
            else:
                try:
                    values = [float(i) for i in orig_values]
                except ValueError:
                    tvar = TimeVariable('_')
                    try:
                        values = [tvar.parse(i) for i in orig_values]
                    except ValueError:
                        coltype = StringVariable
                    else:
                        coltype = TimeVariable
                else:
                    coltype = ContinuousVariable

        if valuemap:
            # Map discrete data to ints
            def valuemap_index(val):
                try:
                    return valuemap.index(val)
                except ValueError:
                    return np.nan

            values = np.vectorize(valuemap_index,
                                  otypes=[float])(orig_values)
            coltype = DiscreteVariable
            coltype_kwargs.update(values=valuemap)

        if coltype is StringVariable:
            values = ['' if i is np.nan else i for i in orig_values]

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        # The lists hold column indices into `data`, not the values
        cols, domain_vars = append_to
        cols.append(col)

        if domain_vars is not None:
            if names and names[col]:
                # Use existing variable if available
                var = coltype.make(names[col].strip(), **coltype_kwargs)
            else:
                # Never use existing for un-named variables
                var = coltype(next(NAMEGEN), **coltype_kwargs)
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

            # Reorder discrete values to match existing variable
            if var.is_discrete and not var.ordered:
                new_order = var.values
                old_order = coltype_kwargs.get('values', var.values)
                if new_order != old_order:
                    # Move every code out of the valid range before
                    # remapping, so no code gets translated twice
                    offset = len(new_order)
                    column = values if data.ndim > 1 else data
                    column += offset
                    for val in var.values:
                        try:
                            oldval = old_order.index(val)
                        except ValueError:
                            continue
                        bn.replace(column, offset + oldval,
                                   new_order.index(val))

        if coltype is TimeVariable:
            # Re-parse the values because only now after coltype.make call
            # above, variable var is the correct one
            values = [var.parse(i) for i in orig_values]

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table