def write_data(cls, write, data):
    """`write` is a callback that accepts an iterable"""
    vars = list(chain((ContinuousVariable('_w'),) if data.has_weights() else (),
                      data.domain.attributes,
                      data.domain.class_vars,
                      data.domain.metas))

    def formatter(var):
        # type: (Variable) -> Callable[[Variable], Any]
        # Return a column 'formatter' function. The function must return
        # something that `write` knows how to write
        if var.is_time:
            return var.repr_val
        elif var.is_continuous:
            return lambda value: "" if isnan(value) else value
        elif var.is_discrete:
            return lambda value: "" if isnan(value) else var.values[int(value)]
        elif var.is_string:
            return lambda value: value
        else:
            return var.repr_val

    formatters = [formatter(v) for v in vars]
    for row in zip(data.W if data.W.ndim > 1 else data.W[:, np.newaxis],
                   data.X,
                   data.Y if data.Y.ndim > 1 else data.Y[:, np.newaxis],
                   data.metas):
        write([fmt(v) for fmt, v in zip(formatters, flatten(row))])
def write_data(cls, write, data):
    """`write` is a callback that accepts an iterable"""
    vars = list(
        chain(
            (ContinuousVariable("_w"),) if data.has_weights() else (),
            data.domain.attributes,
            data.domain.class_vars,
            data.domain.metas,
        )
    )
    for row in zip(
        data.W if data.W.ndim > 1 else data.W[:, np.newaxis],
        data.X,
        data.Y if data.Y.ndim > 1 else data.Y[:, np.newaxis],
        data.metas,
    ):
        write(
            [
                "" if isinstance(val, Number) and isnan(val) else
                var.values[int(val)] if var.is_discrete else
                var.repr_val(val) if isinstance(var, TimeVariable) else
                val
                for var, val in zip(vars, flatten(row))
            ]
        )
def write_data(cls, write, data):
    """`write` is a callback that accepts an iterable"""
    vars = list(chain((ContinuousVariable('_w'),) if data.has_weights() else (),
                      data.domain.attributes,
                      data.domain.class_vars,
                      data.domain.metas))
    formatters = [cls.formatter(v) for v in vars]
    for row in zip(data.W if data.W.ndim > 1 else data.W[:, np.newaxis],
                   data.X,
                   data.Y if data.Y.ndim > 1 else data.Y[:, np.newaxis],
                   data.metas):
        write([fmt(v) for fmt, v in zip(formatters, flatten(row))])
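# A minimal sketch of how write_data's `write` callback might be supplied,
# here via csv.writer on a tab-separated file. `TabWriter` is a hypothetical
# stand-in for whichever FileFormat subclass hosts write_data above.
import csv

def save_tab(table, path):
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        # Header rows would normally be written first; write_data then
        # emits one formatted row per data instance through the callback.
        TabWriter.write_data(writer.writerow, table)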
class Flags:
    """Parser for column flags (i.e. third header row)"""
    DELIMITER = ' '
    _RE_SPLIT = re.compile(r'(?<!\\)' + DELIMITER).split
    _RE_ATTR_UNQUOTED_STR = re.compile(r'^[a-zA-Z_]').match
    ALL = OrderedDict((
        ('class', 'c'),
        ('ignore', 'i'),
        ('meta', 'm'),
        ('weight', 'w'),
        ('.+?=.*?', ''),  # general key=value attributes
    ))
    _RE_ALL = re.compile(r'^({})$'.format(
        '|'.join(filter(None, flatten(ALL.items())))))

    def __init__(self, flags):
        for v in filter(None, self.ALL.values()):
            setattr(self, v, False)
        self.attributes = {}
        for flag in flags or []:
            flag = flag.strip()
            if self._RE_ALL.match(flag):
                if '=' in flag:
                    k, v = flag.split('=', 1)
                    if not Flags._RE_ATTR_UNQUOTED_STR(v):
                        try:
                            v = literal_eval(v)
                        except (ValueError, SyntaxError):
                            # literal_eval raises ValueError for malformed
                            # nodes as well; in either case, if parsing
                            # failed, treat value as string
                            pass
                    self.attributes[k] = v
                else:
                    setattr(self, flag, True)
                    setattr(self, self.ALL.get(flag, ''), True)
            elif flag:
                warnings.warn("Invalid attribute flag '{}'".format(flag))

    @staticmethod
    def join(iterable, *args):
        return Flags.DELIMITER.join(
            i.strip().replace(Flags.DELIMITER, '\\' + Flags.DELIMITER)
            for i in chain(iterable, args)).lstrip()

    @staticmethod
    def split(s):
        return [i.replace('\\' + Flags.DELIMITER, Flags.DELIMITER)
                for i in Flags._RE_SPLIT(s)]
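# A quick illustration of what the Flags parser above accepts, assuming the
# class is in scope. The flag names and key=value pairs here are invented.
flags = Flags(Flags.split('class weight color=red n=3'))
assert flags.c and flags.w                  # short aliases set alongside long names
assert flags.i is False and flags.m is False
assert flags.attributes == {'color': 'red', 'n': 3}  # '3' parsed via literal_eval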
def apply(self):
    self.Warning.renamed_variables.clear()
    tables, domain, source_var = [], None, None
    if self.primary_data is not None:
        tables = [self.primary_data] + list(self.more_data.values())
        domain = self.primary_data.domain
    elif self.more_data:
        if self.ignore_compute_value:
            tables = self._dumb_tables()
        else:
            tables = self.more_data.values()
        domains = [table.domain for table in tables]
        domain = self.merge_domains(domains)

    if tables and self.append_source_column:
        assert domain is not None
        names = [getattr(t, 'name', '') for t in tables]
        if len(names) != len(set(names)):
            names = ['{} ({})'.format(name, i)
                     for i, name in enumerate(names)]
        source_var = Orange.data.DiscreteVariable(
            get_unique_names(domain, self.source_attr_name),
            values=names)
        places = ["class_vars", "attributes", "metas"]
        domain = add_columns(
            domain,
            **{places[self.source_column_role]: (source_var,)})

    tables = [table.transform(domain) for table in tables]
    if tables:
        data = type(tables[0]).concatenate(tables)
        if source_var:
            source_ids = np.array(list(flatten(
                [i] * len(table) for i, table in enumerate(tables)
            ))).reshape((-1, 1))
            data[:, source_var] = source_ids
        self.info.set_output_summary(len(data),
                                     format_summary_details(data))
    else:
        data = None
        self.info.set_output_summary(self.info.NoOutput)

    self.Outputs.data.send(data)
def write_file(cls, filename, data):
    vars = list(chain((ContinuousVariable('_w'),) if data.has_weights() else (),
                      data.domain.attributes,
                      data.domain.class_vars,
                      data.domain.metas))
    formatters = [cls.formatter(v) for v in vars]
    zipped_list_data = zip(data.W if data.W.ndim > 1 else data.W[:, np.newaxis],
                           data.X,
                           data.Y if data.Y.ndim > 1 else data.Y[:, np.newaxis],
                           data.metas)
    headers = cls.header_names(data)
    workbook = xlsxwriter.Workbook(filename)
    sheet = workbook.add_worksheet()
    for c, header in enumerate(headers):
        sheet.write(0, c, header)
    for i, row in enumerate(zipped_list_data, 1):
        for j, (fmt, v) in enumerate(zip(formatters, flatten(row))):
            sheet.write(i, j, fmt(v))
    workbook.close()
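# A hedged usage sketch for the xlsx variant above. `ExcelWriter` is a
# hypothetical name for the FileFormat subclass that hosts write_file,
# assumed to be a classmethod.
from Orange.data import Table

table = Table('iris')                        # any Orange Table works
ExcelWriter.write_file('iris.xlsx', table)   # hypothetical host class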
def apply(self):
    tables, domain, source_var = [], None, None
    if self.primary_data is not None:
        tables = [self.primary_data] + list(self.more_data.values())
        domain = self.primary_data.domain
    elif self.more_data:
        tables = self.more_data.values()
        if self.merge_type == OWConcatenate.MergeUnion:
            domain = reduce(domain_union,
                            (table.domain for table in tables))
        else:
            domain = reduce(domain_intersection,
                            (table.domain for table in tables))

    if tables and self.append_source_column:
        assert domain is not None
        names = [getattr(t, 'name', '') for t in tables]
        if len(names) != len(set(names)):
            names = ['{} ({})'.format(name, i)
                     for i, name in enumerate(names)]
        source_var = Orange.data.DiscreteVariable(self.source_attr_name,
                                                  values=names)
        places = ["class_vars", "attributes", "metas"]
        domain = add_columns(
            domain,
            **{places[self.source_column_role]: (source_var,)})

    tables = [table.transform(domain) for table in tables]
    if tables:
        data = type(tables[0]).concatenate(tables)
        if source_var:
            source_ids = np.array(list(flatten(
                [i] * len(table) for i, table in enumerate(tables)
            ))).reshape((-1, 1))
            data[:, source_var] = source_ids
    else:
        data = None

    self.Outputs.data.send(data)
def apply(self):
    tables, domain, source_var = [], None, None
    if self.primary_data is not None:
        tables = [self.primary_data] + list(self.more_data.values())
        domain = self.primary_data.domain
    elif self.more_data:
        tables = self.more_data.values()
        if self.merge_type == OWConcatenate.MergeUnion:
            domain = reduce(domain_union,
                            (table.domain for table in tables))
        else:
            domain = reduce(domain_intersection,
                            (table.domain for table in tables))

    if tables and self.append_source_column:
        assert domain is not None
        names = [getattr(t, 'name', '') for t in tables]
        if len(names) != len(set(names)):
            names = ['{} ({})'.format(name, i)
                     for i, name in enumerate(names)]
        source_var = Orange.data.DiscreteVariable(
            self.source_attr_name, values=names
        )
        places = ["class_vars", "attributes", "metas"]
        domain = add_columns(
            domain,
            **{places[self.source_column_role]: (source_var,)})

    tables = [table.transform(domain) for table in tables]
    if tables:
        data = type(tables[0]).concatenate(tables, axis=0)
        if source_var:
            source_ids = np.array(list(flatten(
                [i] * len(table) for i, table in enumerate(tables)
            ))).reshape((-1, 1))
            data[:, source_var] = source_ids
    else:
        data = None

    self.Outputs.data.send(data)
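# The two merge modes above reduce pairwise over the input domains. Below is
# a simplified, self-contained illustration of the semantics on plain
# attribute-name lists. This is NOT the Orange domain_union /
# domain_intersection API, just the set logic they follow (order-preserving
# on the left operand).
from functools import reduce

def union(a, b):
    # keep a's order, append b's extras
    return a + [x for x in b if x not in a]

def intersection(a, b):
    # keep only names present in both, in a's order
    return [x for x in a if x in b]

domains = [['sepal length', 'petal length'],
           ['petal length', 'petal width']]
assert reduce(union, domains) == ['sepal length', 'petal length', 'petal width']
assert reduce(intersection, domains) == ['petal length']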
def apply(self):
    tables, domain, source_var = [], None, None
    if self.primary_data is not None:
        tables = [self.primary_data] + list(self.more_data.values())
        domain = self.primary_data.domain
    elif self.more_data:
        tables = self.more_data.values()
        if self.merge_type == OWConcatenate.MergeUnion:
            domain = reduce(domain_union,
                            (table.domain for table in tables))
        else:
            domain = reduce(domain_intersection,
                            (table.domain for table in tables))

    if self.append_source_column:
        source_var = Orange.data.DiscreteVariable(
            self.source_attr_name,
            values=["{}".format(i) for i in range(len(tables))])
        places = ["class_vars", "attributes", "metas"]
        domain = add_columns(
            domain,
            **{places[self.source_column_role]: (source_var,)})

    tables = [table.transform(domain) for table in tables]
    if tables:
        data = type(tables[0]).concatenate(tables, axis=0)
        if source_var:
            source_ids = np.array(list(flatten(
                [i] * len(table) for i, table in enumerate(tables)
            ))).reshape((-1, 1))
            data[:, source_var] = source_ids
    else:
        data = None

    self.send("Data", data)
class SpotbugsReader(FileFormat):
    """Reader for SpotBugs XML reports"""

    EXTENSIONS = ('.xml',)
    DESCRIPTION = 'SpotBugs XML report'
    SUPPORT_SPARSE_DATA = True

    def read(self):
        exported_data = []

        def append_bug(info):
            exported_data.append(info)

        parse_spotbugs_report(self.filename, append_bug)
        headers = [["m#SourceFile", "D#BugClass", "D#BugMethod", "D#BugType",
                    "D#Priority", "C#Rank", "D#Category", "D#SinkMethod",
                    "D#UnknownSource"]]
        # ["d" ,"d" ,"d" ,"c" ,"d" ,"d" ,"d"]]
        return self.data_table(exported_data, headers)


# Matches discrete specification where all the values are listed, space-separated
_RE_DISCRETE_LIST = re.compile(r'^\s*[^\s]+(\s[^\s]+)+\s*$')
_RE_TYPES = re.compile(r'^\s*({}|{}|)\s*$'.format(
    _RE_DISCRETE_LIST.pattern,
    '|'.join(flatten(getattr(vartype, 'TYPE_HEADERS')
                     for vartype in Variable.registry.values()))))
_RE_FLAGS = re.compile(r'^\s*( |{}|)*\s*$'.format(
    '|'.join(flatten(filter(None, i) for i in Flags.ALL.items()))))


@classmethod
def data_table(cls, data, headers=None):
    """
    Return Orange.data.Table given rows of `headers` (iterable of iterable)
    and rows of `data` (iterable of iterable; if ``numpy.ndarray``, might
    as well **have it sorted column-major**, e.g. ``order='F'``).

    Basically, the idea of subclasses is to produce those two iterables,
    however they might.

    If `headers` is not provided, the header rows are extracted from `data`,
    assuming they precede it.
    """
    if not headers:
        headers, data = cls.parse_headers(data)

    # Consider various header types (single-row, two-row, three-row, none)
    if len(headers) == 3:
        names, types, flags = map(list, headers)
    else:
        if len(headers) == 1:
            HEADER1_FLAG_SEP = '#'
            # First row format either:
            #   1) delimited column names
            #   2) -||- with type and flags prepended, separated by #,
            #      e.g. d#sex,c#age,cC#IQ
            _flags, names = zip(*[i.split(HEADER1_FLAG_SEP, 1)
                                  if HEADER1_FLAG_SEP in i else ('', i)
                                  for i in headers[0]])
            names = list(names)
        elif len(headers) == 2:
            names, _flags = map(list, headers)
        else:
            # Use heuristics for everything
            names, _flags = [], []
        types = [''.join(filter(str.isupper, flag)).lower()
                 for flag in _flags]
        flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

    # Determine maximum row length
    rowlen = max(map(len, (names, types, flags)))

    def _equal_length(lst):
        lst.extend([''] * (rowlen - len(lst)))
        return lst

    # Ensure all data is of equal width in a column-contiguous array
    data = np.array([_equal_length(list(row)) for row in data if any(row)],
                    copy=False, dtype=object, order='F')

    # Data may actually be longer than headers were
    try:
        rowlen = data.shape[1]
    except IndexError:
        pass
    else:
        for lst in (names, types, flags):
            _equal_length(lst)

    NAMEGEN = namegen('Feature ', 1)
    Xcols, attrs = [], []
    Mcols, metas = [], []
    Ycols, clses = [], []
    Wcols = []

    # Rename variables if necessary
    # Reusing across files still works if both files have same duplicates
    name_counts = Counter(names)
    del name_counts[""]
    if len(name_counts) != len(names) and name_counts:
        uses = {name: 0 for name, count in name_counts.items() if count > 1}
        for i, name in enumerate(names):
            if name in uses:
                uses[name] += 1
                names[i] = "{}_{}".format(name, uses[name])

    # Iterate through the columns
    for col in range(rowlen):
        flag = Flags(Flags.split(flags[col]))
        if flag.i:
            continue

        type_flag = types and types[col].strip()
        try:
            orig_values = [np.nan if i in MISSING_VALUES else i
                           for i in (i.strip() for i in data[:, col])]
        except IndexError:
            # No data instances leads here
            orig_values = []
            # In this case, coltype could be anything. It's set as-is
            # only to satisfy test_table.TableTestCase.test_append
            coltype = DiscreteVariable

        coltype_kwargs = {}
        valuemap = []
        values = orig_values

        if type_flag in StringVariable.TYPE_HEADERS:
            coltype = StringVariable
        elif type_flag in ContinuousVariable.TYPE_HEADERS:
            coltype = ContinuousVariable
            try:
                values = [float(i) for i in orig_values]
            except ValueError:
                for row, num in enumerate(orig_values):
                    try:
                        float(num)
                    except ValueError:
                        break
                raise ValueError('Non-continuous value in (1-based) '
                                 'line {}, column {}'.format(
                                     row + len(headers) + 1, col + 1))
        elif type_flag in TimeVariable.TYPE_HEADERS:
            coltype = TimeVariable
        elif (type_flag in DiscreteVariable.TYPE_HEADERS or
              _RE_DISCRETE_LIST.match(type_flag)):
            coltype = DiscreteVariable
            if _RE_DISCRETE_LIST.match(type_flag):
                valuemap = Flags.split(type_flag)
                coltype_kwargs.update(ordered=True)
            else:
                valuemap = sorted(set(orig_values) - {np.nan})
        else:
            # No known type specified, use heuristics
            valuemap, values, coltype = guess_data_type(orig_values)

        if flag.m or coltype is StringVariable:
            append_to = (Mcols, metas)
        elif flag.w:
            append_to = (Wcols, None)
        elif flag.c:
            append_to = (Ycols, clses)
        else:
            append_to = (Xcols, attrs)

        cols, domain_vars = append_to
        cols.append(col)

        existing_var, new_var_name = None, None
        if domain_vars is not None:
            existing_var = names and names[col]
            if not existing_var:
                new_var_name = next(NAMEGEN)

        values, var = sanitize_variable(valuemap, values, orig_values,
                                        coltype, coltype_kwargs,
                                        domain_vars, existing_var,
                                        new_var_name, data)
        if domain_vars is not None:
            var.attributes.update(flag.attributes)
            domain_vars.append(var)

        # Write back the changed data. This is needed to pass the
        # correct, converted values into Table.from_numpy below
        try:
            data[:, col] = values
        except IndexError:
            pass

    domain = Domain(attrs, clses, metas)

    if not data.size:
        return Table.from_domain(domain, 0)

    table = Table.from_numpy(domain,
                             data[:, Xcols].astype(float, order='C'),
                             data[:, Ycols].astype(float, order='C'),
                             data[:, Mcols].astype(object, order='C'),
                             data[:, Wcols].astype(float, order='C'))
    return table
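# To make the header handling above concrete, a hedged sketch of the two
# header conventions data_table consumes. Column names and values are
# invented for illustration; the exact type-header spellings come from each
# variable class's TYPE_HEADERS.

# Three-row header: names, then types, then flags
three_row = [
    ["sex",      "age",        "IQ"],
    ["discrete", "continuous", "continuous"],
    ["",         "",           "class"],
]

# Equivalent single-row header: codes prepended with '#'; uppercase letters
# encode the type (D discrete, C continuous, S string, T time), lowercase
# letters the flags (c class, m meta, i ignore, w weight)
one_row = [["D#sex", "C#age", "cC#IQ"]]

rows = [["M", "38", "112"],
        ["F", "31", "129"]]

# Hypothetical call, assuming a FileFormat subclass is in scope:
# table = SomeReader.data_table(rows, headers=one_row)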
@staticmethod
def join(iterable, *args):
    return Flags.DELIMITER.join(
        i.strip().replace(Flags.DELIMITER, '\\' + Flags.DELIMITER)
        for i in chain(iterable, args)).lstrip()

@staticmethod
def split(s):
    return [i.replace('\\' + Flags.DELIMITER, Flags.DELIMITER)
            for i in Flags._RE_SPLIT(s)]


# Matches discrete specification where all the values are listed, space-separated
_RE_DISCRETE_LIST = re.compile(r'^\s*[^\s]+(\s[^\s]+)+\s*$')
_RE_TYPES = re.compile(r'^\s*({}|{}|)\s*$'.format(
    _RE_DISCRETE_LIST.pattern,
    '|'.join(flatten(getattr(vartype, 'TYPE_HEADERS')
                     for vartype in Variable.registry.values()))))
_RE_FLAGS = re.compile(r'^\s*( |{}|)*\s*$'.format(
    '|'.join(flatten(filter(None, i) for i in Flags.ALL.items()))))


class FileFormatMeta(Registry):
    def __new__(cls, name, bases, attrs):
        newcls = super().__new__(cls, name, bases, attrs)

        # Optionally add compressed versions of extensions as supported
        if getattr(newcls, 'SUPPORT_COMPRESSED', False):
            new_extensions = list(getattr(newcls, 'EXTENSIONS', ()))
            for compression in Compression.all:
                for ext in newcls.EXTENSIONS:
                    new_extensions.append(ext + compression)
            # OSX file dialog doesn't support filtering on double
def test_flatten(self):
    self.assertEqual(list(flatten([[1, 2], [3]])), [1, 2, 3])
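# flatten is used throughout the snippets above. A minimal sketch of an
# equivalent helper, assuming it chains exactly one level of nesting; this
# is an illustration consistent with the test, not necessarily Orange's
# exact implementation.
from itertools import chain

def flatten(lst):
    # Flatten one level of nesting, lazily
    return chain.from_iterable(lst)

assert list(flatten([[1, 2], [3]])) == [1, 2, 3]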