def read_header(ofile):
    """Parse the header part of an ARFF line iterator.

    Leading comment lines are skipped, then lines are consumed one at a
    time until a line matching ``r_datameta`` is found.  That line is
    consumed from ``ofile`` but not otherwise processed.

    Parameters
    ----------
    ofile : iterator of str
        Line iterator; it is advanced with ``next``.

    Returns
    -------
    relation : str or None
        First group of the ``r_relation`` match, or None when no relation
        line was encountered.
    attributes : list of tuple
        ``(name, type)`` pairs as returned by ``tokenize_attribute``, in
        the order they appear.

    Raises
    ------
    ValueError
        If a line matches ``r_headerline`` but is neither an attribute
        nor a relation declaration.
    StopIteration
        Propagated from ``next`` if ``ofile`` is exhausted before a line
        matching ``r_datameta`` is seen.
    """
    line = next(ofile)

    # Skip the comment lines that precede the header proper.
    while r_comment.match(line):
        line = next(ofile)

    relation = None
    attributes = []
    while not r_datameta.match(line):
        if not r_headerline.match(line):
            # Not a header declaration: ignore it and move on.
            line = next(ofile)
            continue

        if r_attribute.match(line):
            # tokenize_attribute also hands back the next line to examine,
            # so the iterator must not be advanced again here.
            attr_name, attr_type, line = tokenize_attribute(ofile, line)
            attributes.append((attr_name, attr_type))
            continue

        rel_match = r_relation.match(line)
        if not rel_match:
            raise ValueError("Error parsing line %s" % line)
        relation = rel_match.group(1)
        line = next(ofile)

    return relation, attributes
def generator(row_iter, delim=','):
    """Yield one tuple of converted values per data line of ``row_iter``.

    Lines matching ``r_comment`` or ``r_empty`` are skipped wherever they
    occur.  Every other line is split on ``delim`` and field ``i`` is
    passed through ``convertors[i]`` for ``i`` in ``range(ni)``.

    ``ni`` and ``convertors`` are not parameters: they are looked up in the
    enclosing scope at call time.

    Parameters
    ----------
    row_iter : iterable of str
        The data lines.
    delim : str, optional
        Field separator handed to ``str.split``.

    Yields
    ------
    tuple
        The converted fields of one data line.

    Notes
    -----
    The previous implementation advanced ``row_iter`` with ``next`` from
    inside the generator body.  When the input ended on a comment or blank
    line (or had no data at all) the resulting ``StopIteration`` surfaces
    as ``RuntimeError`` on interpreters implementing PEP 479.  It also
    skipped comments and blank lines in two separate passes, so some
    interleavings of the two reached the converters.  A single ``for`` loop
    avoids both problems.
    """
    # 'compiling' the range since it does not change
    # Note, I have already tried zipping the converters and
    # row elements and got slightly worse performance.
    elems = list(range(ni))

    for raw in row_iter:
        # Comments and blank lines carry no data, in whatever order they
        # appear.
        if r_comment.match(raw) or r_empty.match(raw):
            continue
        row = raw.split(delim)
        yield tuple([convertors[i](row[i]) for i in elems])
def next_data_line(row_iter):
    """Return the next line that is neither blank nor a comment.

    Assumes we are already in the data part (eg after @data).

    Parameters
    ----------
    row_iter : iterator of str
        Line iterator; it is advanced with ``next``.

    Returns
    -------
    str
        The first line matching neither ``r_empty`` nor ``r_comment``.

    Raises
    ------
    StopIteration
        Propagated from ``next`` if ``row_iter`` runs out first.

    Notes
    -----
    Blank lines and comments are skipped in a single loop.  Skipping all
    blank lines first and all comments afterwards, as was done before,
    returned a blank line whenever one followed a comment.
    """
    raw = next(row_iter)
    while r_empty.match(raw) or r_comment.match(raw):
        raw = next(row_iter)
    return raw
def read_data_list(ofile):
    """Read every remaining line of the iterator into a list.

    Parameters
    ----------
    ofile : iterator of str
        Line iterator; it is advanced with ``next`` and then drained.

    Returns
    -------
    list of str
        All lines, in order and unmodified.  An exhausted iterator gives
        an empty list.

    Raises
    ------
    ValueError
        If the first line, ignoring surrounding whitespace, starts with
        ``{`` (treated as a sparse ARFF row, which is not supported).
    """
    first = next(ofile, None)
    if first is None:
        # Nothing to read at all.
        return []

    # startswith() instead of indexing [0]: a blank first line strips to
    # the empty string and must not raise IndexError.
    if first.strip().startswith('{'):
        raise ValueError("This looks like a sparse ARFF: not supported yet")

    data = [first]
    data.extend(ofile)
    return data
def floupi(filename):
    """Load an ARFF file and print a summary of it.

    Calls ``loadarff(filename)``, passes the data to
    ``attrselect.print_dataset_info``, prints the relation name together
    with ``data.size``, and finally calls ``print_attribute`` once per
    field name of ``data.dtype``.

    Parameters
    ----------
    filename
        Passed unchanged to ``loadarff``.
    """
    data, meta = loadarff(filename)

    # Imported only once the file has been loaded, at function scope.
    from attrselect import print_dataset_info
    print_dataset_info(data)

    summary = "relation %s, has %d instances" % (meta.name, data.size)
    print(summary)

    # NOTE(review): `types` is not defined in this function; it has to be
    # provided by the surrounding module -- confirm it holds one entry per
    # field, otherwise next() below raises StopIteration.
    type_iter = iter(types)
    for field_name in data.dtype.names:
        field_type = next(type_iter)
        print_attribute(field_name, field_type, data[field_name])
def get_ndata(ofile):
    """Consume the iterator and return how many lines it held.

    Every line is counted, including the first one.

    Parameters
    ----------
    ofile : iterator of str
        Line iterator; it is advanced with ``next`` and then drained.

    Returns
    -------
    int
        Number of lines read.  An exhausted iterator gives 0.

    Raises
    ------
    ValueError
        If the first line, ignoring surrounding whitespace, starts with
        ``{`` (treated as a sparse ARFF row, which is not supported).
    """
    first = next(ofile, None)
    if first is None:
        return 0

    # startswith() instead of indexing [0]: a blank first line strips to
    # the empty string and must not raise IndexError.
    if first.strip().startswith('{'):
        raise ValueError("This looks like a sparse ARFF: not supported yet")

    # One for the line already read, plus whatever is left.
    return 1 + sum(1 for _ in ofile)
def tokenize_multilines(iterable, val):
    """Can tokenize an attribute spread over several lines.

    If ``val`` does not match ``r_mcomattrval``, the following lines are
    read up to the next line matching ``r_meta``.  The collected lines,
    minus the last one, are joined and parsed with ``r_comattrval``.

    Parameters
    ----------
    iterable : iterator of str
        Source of the continuation lines; advanced with ``next``.
    val : str
        First piece of the attribute declaration.

    Returns
    -------
    name : str
        First group of the ``r_comattrval`` match.
    value : str
        Second group of the ``r_comattrval`` match.
    next : str
        The line matching ``r_meta`` at which reading stopped.

    Raises
    ------
    ValueError
        If ``val`` matches ``r_mcomattrval``, if the stopping line matches
        ``r_mend``, or if the joined text cannot be parsed.
    """
    if r_mcomattrval.match(val):
        raise ValueError("Cannot parse attribute names spread over multi "
                         "lines yet")

    # Read all the following lines up to the next line with a meta
    # character, and try to parse everything up to there.
    lines = [val]
    line = next(iterable)
    while not r_meta.match(line):
        lines.append(line)
        line = next(iterable)
    if r_mend.search(line):
        raise ValueError("relational attribute not supported yet")

    # NOTE(review): the last collected line is deliberately left out, as in
    # the original code; the reason is not visible from here -- confirm.
    joined = "".join(lines[:-1])
    # NOTE(review): this print looks like leftover debugging output; it is
    # kept so that the function's visible side effects do not change.
    print(joined)

    m = r_comattrval.match(joined)
    if m is None:
        # Previously this fell through to m.group() and raised an
        # AttributeError on None instead of a parse error.
        raise ValueError("Cannot parse multi line attribute: %s" % joined)
    return m.group(1), m.group(2), line
def read_header(ofile, returnclasses=False):
    """Read the header of the iterable ofile.

    Leading comment lines are skipped, then lines are consumed until one
    matches ``r_datameta``.  On a relation line, the token following a
    ``-C`` token (with single quotes removed) is converted to ``int`` and
    kept as the class count; if ``-C`` occurs several times the last
    occurrence wins.

    Parameters
    ----------
    ofile : iterator of str
        Line iterator; it is advanced with ``next``.
    returnclasses : bool, optional
        When true, the class count is returned as a third item.

    Returns
    -------
    relation : str or None
        First group of the ``r_relation`` match, or None when no relation
        line was encountered.
    attributes : list of tuple
        ``(name, type)`` pairs as returned by ``tokenize_attribute``.
    nrclasses : int
        Only when ``returnclasses`` is true.  0 when no ``-C`` option was
        found.

    Raises
    ------
    ValueError
        If a line matches ``r_headerline`` but is neither an attribute nor
        a relation declaration, or if the ``-C`` argument is not an
        integer.
    StopIteration
        Propagated from ``next`` if ``ofile`` is exhausted before a line
        matching ``r_datameta`` is seen.
    """
    line = next(ofile)

    # Pass first comments
    while r_comment.match(line):
        line = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    nrclasses = 0
    while not r_datameta.match(line):
        if not r_headerline.match(line):
            line = next(ofile)
            continue

        if r_attribute.match(line):
            # tokenize_attribute also returns the next line to examine.
            name, attr_type, line = tokenize_attribute(ofile, line)
            attributes.append((name, attr_type))
            continue

        isrel = r_relation.match(line)
        if not isrel:
            raise ValueError("Error parsing line %s" % line)
        relation = isrel.group(1)

        # Pair each token with its successor so that a trailing '-C'
        # without an argument is simply ignored.
        words = line.split()
        for option, argument in zip(words, words[1:]):
            if option == '-C':
                nrclasses = int(argument.replace("'", ''))
        line = next(ofile)

    if returnclasses:
        return relation, attributes, nrclasses
    return relation, attributes
def _loadarff(ofile):
    """Parse an open ARFF file into a structured array and its metadata.

    The header is read with ``read_header``, a dtype descriptor and one
    converter per attribute are built, the delimiter is taken from the
    first data line, and the file is then rewound and its data section fed
    through ``np.fromiter``.

    Parameters
    ----------
    ofile : file-like
        Must support iteration with ``next`` and ``seek(0, 0)``.

    Returns
    -------
    data : ndarray
        Structured array built with ``np.fromiter`` from the data rows.
    meta : MetaData
        ``MetaData(rel, attr)`` built from the parsed header.

    Raises
    ------
    ParseArffError
        If the header or the delimiter cannot be parsed.
    NotImplementedError
        If any attribute is of type ``string``.
    """
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg)

    # Classify every attribute once; the result is used both for the
    # string check and for building the converters below.
    attr_types = [parse_type(value) for name, value in attr]

    # Check whether we have a string attribute (not supported yet)
    hasstr = 'string' in attr_types

    meta = MetaData(rel, attr)

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).
    # The builtin float is used because the np.float alias no longer exists
    # in current NumPy releases; it was an alias of the builtin.
    acls2dtype = {'real': float, 'integer': float, 'numeric': float}

    descr = []
    convertors = []
    for (name, value), attr_type in zip(attr, attr_types):
        if attr_type == 'date':
            date_format, datetime_unit = get_date_format(value)
            descr.append((name, "datetime64[%s]" % datetime_unit))
            convertors.append(
                partial(safe_date, date_format=date_format,
                        datetime_unit=datetime_unit))
        elif attr_type == 'nominal':
            n = maxnomlen(value)
            descr.append((name, 'S%d' % n))
            pvalue = get_nom_val(value)
            convertors.append(partial(safe_nominal, pvalue=pvalue))
        else:
            descr.append((name, acls2dtype[attr_type]))
            convertors.append(safe_float)

    ni = len(convertors)

    # Get the delimiter from the first line of data:
    def next_data_line(row_iter):
        """Assumes we are already in the data part (eg after @data)."""
        raw = next(row_iter)
        # Blank lines and comments may be interleaved in any order.
        while r_empty.match(raw) or r_comment.match(raw):
            raw = next(row_iter)
        return raw

    try:
        try:
            dtline = next_data_line(ofile)
            delim = get_delim(dtline)
        except ValueError as e:
            raise ParseArffError("Error while parsing delimiter: " + str(e))
    finally:
        # Rewind and reposition at the data section, whatever happened.
        ofile.seek(0, 0)
        ofile = go_data(ofile)
        # skip the @data line
        next(ofile)

    def generator(row_iter, delim=','):
        """Yield one tuple of converted values per data line."""
        # TODO: this is where we are spending times (~80%).
        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # A plain for loop: calling next() inside a generator body turns
        # end-of-input into RuntimeError under PEP 479.
        for raw in row_iter:
            if r_comment.match(raw) or r_empty.match(raw):
                continue
            row = raw.split(delim)
            yield tuple([convertors[i](row[i]) for i in elems])

    a = generator(ofile, delim=delim)
    # No error should happen here: it is a bug otherwise
    data = np.fromiter(a, descr)
    return data, meta
def tokenize_attribute(iterable, attribute):
    """Split a raw ``@attribute`` header line into its name and type.

    Constraints:

    * The line must start with @attribute (case insensitive; space like
      characters before @attribute are allowed).
    * Only single-line declarations are handled; anything else is
      rejected with ``ValueError``.

    Parameters
    ----------
    iterable : iterator
        Source of header lines; advanced by one item on success.
    attribute : str
        The attribute string.

    Returns
    -------
    name : str
        Name of the attribute.
    value : str
        Value (type) of the attribute.
    next : str
        Next line to be parsed, i.e. the item just read from ``iterable``.

    Raises
    ------
    ValueError
        If the line is not an attribute declaration, if it matches neither
        single-line form, or if the attribute type is ``relational``.

    Examples
    --------
    If attribute is a string defined in python as r"floupi real", will
    return floupi as name, and real as value.

    >>> iterable = iter([0] * 10) # dummy iterator
    >>> tokenize_attribute(iterable, r"@attribute floupi real")
    ('floupi', 'real', 0)

    If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
    and real as value.

    >>> tokenize_attribute(iterable, r"  @attribute 'floupi 2' real   ")
    ('floupi 2', 'real', 0)

    """
    stripped = attribute.strip()
    header = r_attribute.match(stripped)
    if not header:
        raise ValueError("First line unparsable: %s" % stripped)

    # Everything that follows the @attribute keyword.
    declaration = header.group(1)
    if r_comattrval.match(declaration):
        split_declaration = tokenize_single_comma
    elif r_wcomattrval.match(declaration):
        split_declaration = tokenize_single_wcomma
    else:
        # Not sure we should support this, as it does not seem supported by
        # weka.
        raise ValueError("multi line not supported yet")

    name, attr_type = split_declaration(declaration)
    next_item = next(iterable)

    if attr_type == 'relational':
        raise ValueError("relational attributes not supported yet")
    return name, attr_type, next_item
def _loadarff(ofile):
    """Parse an open ARFF file into a structured array and its metadata.

    The header is read with ``read_header``, a dtype descriptor and one
    converter per attribute are built, the delimiter is taken from the
    first data line, and the file is then rewound and its data section fed
    through ``np.fromiter``.

    Parameters
    ----------
    ofile : file-like
        Must support iteration with ``next`` and ``seek(0, 0)``.

    Returns
    -------
    data : ndarray
        Structured array built with ``np.fromiter`` from the data rows.
    meta : MetaData
        ``MetaData(rel, attr)`` built from the parsed header.

    Raises
    ------
    ParseArffError
        If the header or the delimiter cannot be parsed.
    NotImplementedError
        If any attribute is of type ``string``.
    ValueError
        If any attribute is of type ``date`` (and none is a string).
    """
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg)

    # Classify every attribute once; the result is used both for the
    # string check and for building the converters below.
    attr_types = [parse_type(value) for name, value in attr]

    # Check whether we have a string attribute (not supported yet)
    hasstr = 'string' in attr_types

    meta = MetaData(rel, attr)

    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).
    # The builtin float is used because the np.float alias no longer exists
    # in current NumPy releases; it was an alias of the builtin.
    acls2dtype = {'real': float, 'integer': float, 'numeric': float}

    descr = []
    convertors = []
    for (name, value), attr_type in zip(attr, attr_types):
        if attr_type == 'date':
            raise ValueError("date type not supported yet, sorry")
        elif attr_type == 'nominal':
            n = maxnomlen(value)
            descr.append((name, 'S%d' % n))
            pvalue = get_nom_val(value)
            convertors.append(partial(safe_nominal, pvalue=pvalue))
        else:
            descr.append((name, acls2dtype[attr_type]))
            convertors.append(safe_float)

    ni = len(convertors)

    # Get the delimiter from the first line of data:
    def next_data_line(row_iter):
        """Assumes we are already in the data part (eg after @data)."""
        raw = next(row_iter)
        # Blank lines and comments may be interleaved in any order.
        while r_empty.match(raw) or r_comment.match(raw):
            raw = next(row_iter)
        return raw

    try:
        try:
            dtline = next_data_line(ofile)
            delim = get_delim(dtline)
        except ValueError as e:
            raise ParseArffError("Error while parsing delimiter: " + str(e))
    finally:
        # Rewind and reposition at the data section, whatever happened.
        ofile.seek(0, 0)
        ofile = go_data(ofile)
        # skip the @data line
        next(ofile)

    def generator(row_iter, delim=','):
        """Yield one tuple of converted values per data line."""
        # TODO: this is where we are spending times (~80%).
        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # A plain for loop: calling next() inside a generator body turns
        # end-of-input into RuntimeError under PEP 479.
        for raw in row_iter:
            if r_comment.match(raw) or r_empty.match(raw):
                continue
            row = raw.split(delim)
            yield tuple([convertors[i](row[i]) for i in elems])

    a = generator(ofile, delim=delim)
    # No error should happen here: it is a bug otherwise
    data = np.fromiter(a, descr)
    return data, meta