def read_header(ofile): """Read the header of the iterable ofile.""" i = next(ofile) # Pass first comments while r_comment.match(i): i = next(ofile) # Header is everything up to DATA attribute ? relation = None attributes = [] while not r_datameta.match(i): m = r_headerline.match(i) if m: isattr = r_attribute.match(i) if isattr: name, type, i = tokenize_attribute(ofile, i) attributes.append((name, type)) else: isrel = r_relation.match(i) if isrel: relation = isrel.group(1) else: raise ValueError("Error parsing line %s" % i) i = next(ofile) else: i = next(ofile) return relation, attributes
def generator(row_iter, delim=','): # TODO: this is where we are spending times (~80%). I think things # could be made more efficiently: # - We could for example "compile" the function, because some values # do not change here. # - The function to convert a line to dtyped values could also be # generated on the fly from a string and be executed instead of # looping. # - The regex are overkill: for comments, checking that a line starts # by % should be enough and faster, and for empty lines, same thing # --> this does not seem to change anything. # We do not abstract skipping comments and empty lines for performances # reason. raw = next(row_iter) while r_empty.match(raw) or r_comment.match(raw): raw = next(row_iter) # 'compiling' the range since it does not change # Note, I have already tried zipping the converters and # row elements and got slightly worse performance. elems = list(range(ni)) row = raw.split(delim) yield tuple([convertors[i](row[i]) for i in elems]) for raw in row_iter: while r_comment.match(raw) or r_empty.match(raw): raw = next(row_iter) row = raw.split(delim) yield tuple([convertors[i](row[i]) for i in elems])
def split_data_line(line, dialect=None): delimiters = ",\t" # This can not be done in a per reader basis, and relational fields # can be HUGE csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2)) # Remove the line end if any if line[-1] == '\n': line = line[:-1] sniff_line = line # Add a delimiter if none is present, so that the csv.Sniffer # does not complain for a single-field CSV. if not any(d in line for d in delimiters): sniff_line += "," if dialect is None: dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters) workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line, dialect=dialect, delimiters=delimiters) row = next(csv.reader([line], dialect)) return row, dialect
def read_data_list(ofile): """Read each line of the iterable and put it in a list.""" data = [next(ofile)] if data[0].strip()[0] == '{': raise ValueError("This looks like a sparse ARFF: not supported yet") data.extend([i for i in ofile]) return data
def get_ndata(ofile): """Read the whole file to get number of data attributes.""" data = [next(ofile)] loc = 1 if data[0].strip()[0] == '{': raise ValueError("This looks like a sparse ARFF: not supported yet") for i in ofile: loc += 1 return loc
def read_relational_attribute(ofile, relational_attribute, i): """Read the nested attributes of a relational attribute""" r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' + relational_attribute.name + r'\s*$') while not r_end_relational.match(i): m = r_headerline.match(i) if m: isattr = r_attribute.match(i) if isattr: attr, i = tokenize_attribute(ofile, i) relational_attribute.attributes.append(attr) else: raise ValueError("Error parsing line %s" % i) else: i = next(ofile) i = next(ofile) return i
def split_data_line(line, dialect=None): delimiters = ",\t" # This can not be done in a per reader basis, and relational fields # can be HUGE csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2)) # Remove the line end if any if line[-1] == '\n': line = line[:-1] sniff_line = line # Add a delimiter if none is present, so that the csv.Sniffer # does not complain for a single-field CSV. if not any(d in line for d in delimiters): sniff_line += "," if dialect is None: dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters) row = next(csv.reader([line], dialect)) return row, dialect
def next_data_line(row_iter): """Assumes we are already in the data part (eg after @data).""" raw = next(row_iter) while r_empty.match(raw) or r_comment.match(raw): raw = next(row_iter) return raw
def _loadarff(ofile): # Parse the header file try: rel, attr = read_header(ofile) except ValueError as e: msg = "Error while parsing header, error was: " + str(e) raise ParseArffError(msg) # Check whether we have a string attribute (not supported yet) hasstr = False for name, value in attr: type = parse_type(value) if type == 'string': hasstr = True meta = MetaData(rel, attr) # XXX The following code is not great # Build the type descriptor descr and the list of convertors to convert # each attribute to the suitable type (which should match the one in # descr). # This can be used once we want to support integer as integer values and # not as numeric anymore (using masked arrays ?). acls2dtype = {'real': float, 'integer': float, 'numeric': float} acls2conv = { 'real': safe_float, 'integer': safe_float, 'numeric': safe_float } descr = [] convertors = [] if not hasstr: for name, value in attr: type = parse_type(value) if type == 'date': date_format, datetime_unit = get_date_format(value) descr.append((name, "datetime64[%s]" % datetime_unit)) convertors.append( partial(safe_date, date_format=date_format, datetime_unit=datetime_unit)) elif type == 'nominal': n = maxnomlen(value) descr.append((name, 'S%d' % n)) pvalue = get_nom_val(value) convertors.append(partial(safe_nominal, pvalue=pvalue)) else: descr.append((name, acls2dtype[type])) convertors.append(safe_float) #dc.append(acls2conv[type]) #sdescr.append((name, acls2sdtype[type])) else: # How to support string efficiently ? Ideally, we should know the max # size of the string before allocating the numpy array. raise NotImplementedError("String attributes not supported yet, sorry") ni = len(convertors) # Get the delimiter from the first line of data: def next_data_line(row_iter): """Assumes we are already in the data part (eg after @data).""" raw = next(row_iter) while r_empty.match(raw) or r_comment.match(raw): raw = next(row_iter) return raw try: try: dtline = next_data_line(ofile) delim = get_delim(dtline) except ValueError as e: raise ParseArffError("Error while parsing delimiter: " + str(e)) finally: ofile.seek(0, 0) ofile = go_data(ofile) # skip the @data line next(ofile) def generator(row_iter, delim=','): # TODO: this is where we are spending times (~80%). I think things # could be made more efficiently: # - We could for example "compile" the function, because some values # do not change here. # - The function to convert a line to dtyped values could also be # generated on the fly from a string and be executed instead of # looping. # - The regex are overkill: for comments, checking that a line starts # by % should be enough and faster, and for empty lines, same thing # --> this does not seem to change anything. # We do not abstract skipping comments and empty lines for performances # reason. raw = next(row_iter) while r_empty.match(raw) or r_comment.match(raw): raw = next(row_iter) # 'compiling' the range since it does not change # Note, I have already tried zipping the converters and # row elements and got slightly worse performance. elems = list(range(ni)) row = raw.split(delim) yield tuple([convertors[i](row[i]) for i in elems]) for raw in row_iter: while r_comment.match(raw) or r_empty.match(raw): raw = next(row_iter) row = raw.split(delim) yield tuple([convertors[i](row[i]) for i in elems]) a = generator(ofile, delim=delim) # No error should happen here: it is a bug otherwise data = np.fromiter(a, descr) return data, meta
def _loadarff(ofile): # Parse the header file try: rel, attr = read_header(ofile) except ValueError as e: msg = "Error while parsing header, error was: " + str(e) raise ParseArffError(msg) # Check whether we have a string attribute (not supported yet) hasstr = False for name, value in attr: type = parse_type(value) if type == 'string': hasstr = True meta = MetaData(rel, attr) # XXX The following code is not great # Build the type descriptor descr and the list of convertors to convert # each attribute to the suitable type (which should match the one in # descr). # This can be used once we want to support integer as integer values and # not as numeric anymore (using masked arrays ?). acls2dtype = {'real': float, 'integer': float, 'numeric': float} acls2conv = {'real': safe_float, 'integer': safe_float, 'numeric': safe_float} descr = [] convertors = [] if not hasstr: for name, value in attr: type = parse_type(value) if type == 'date': date_format, datetime_unit = get_date_format(value) descr.append((name, "datetime64[%s]" % datetime_unit)) convertors.append(partial(safe_date, date_format=date_format, datetime_unit=datetime_unit)) elif type == 'nominal': n = maxnomlen(value) descr.append((name, 'S%d' % n)) pvalue = get_nom_val(value) convertors.append(partial(safe_nominal, pvalue=pvalue)) else: descr.append((name, acls2dtype[type])) convertors.append(safe_float) #dc.append(acls2conv[type]) #sdescr.append((name, acls2sdtype[type])) else: # How to support string efficiently ? Ideally, we should know the max # size of the string before allocating the numpy array. raise NotImplementedError("String attributes not supported yet, sorry") ni = len(convertors) # Get the delimiter from the first line of data: def next_data_line(row_iter): """Assumes we are already in the data part (eg after @data).""" raw = next(row_iter) while r_empty.match(raw) or r_comment.match(raw): raw = next(row_iter) return raw try: try: dtline = next_data_line(ofile) delim = get_delim(dtline) except ValueError as e: raise ParseArffError("Error while parsing delimiter: " + str(e)) finally: ofile.seek(0, 0) ofile = go_data(ofile) # skip the @data line next(ofile) def generator(row_iter, delim=','): # TODO: this is where we are spending times (~80%). I think things # could be made more efficiently: # - We could for example "compile" the function, because some values # do not change here. # - The function to convert a line to dtyped values could also be # generated on the fly from a string and be executed instead of # looping. # - The regex are overkill: for comments, checking that a line starts # by % should be enough and faster, and for empty lines, same thing # --> this does not seem to change anything. # We do not abstract skipping comments and empty lines for performances # reason. raw = next(row_iter) while r_empty.match(raw) or r_comment.match(raw): raw = next(row_iter) # 'compiling' the range since it does not change # Note, I have already tried zipping the converters and # row elements and got slightly worse performance. elems = list(range(ni)) row = raw.split(delim) yield tuple([convertors[i](row[i]) for i in elems]) for raw in row_iter: while r_comment.match(raw) or r_empty.match(raw): raw = next(row_iter) row = raw.split(delim) yield tuple([convertors[i](row[i]) for i in elems]) a = generator(ofile, delim=delim) # No error should happen here: it is a bug otherwise data = np.fromiter(a, descr) return data, meta
def tokenize_attribute(iterable, attribute): """Parse a raw string in header (eg starts by @attribute). Given a raw string attribute, try to get the name and type of the attribute. Constraints: * The first line must start with @attribute (case insensitive, and space like characters before @attribute are allowed) * Works also if the attribute is spread on multilines. * Works if empty lines or comments are in between Parameters ---------- attribute : str the attribute string. Returns ------- name : str name of the attribute value : str value of the attribute next : str next line to be parsed Examples -------- If attribute is a string defined in python as r"floupi real", will return floupi as name, and real as value. >>> iterable = iter([0] * 10) # dummy iterator >>> tokenize_attribute(iterable, r"@attribute floupi real") ('floupi', 'real', 0) If attribute is r"'floupi 2' real", will return 'floupi 2' as name, and real as value. >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ") ('floupi 2', 'real', 0) """ sattr = attribute.strip() mattr = r_attribute.match(sattr) if mattr: # atrv is everything after @attribute atrv = mattr.group(1) if r_comattrval.match(atrv): name, type = tokenize_single_comma(atrv) next_item = next(iterable) elif r_wcomattrval.match(atrv): name, type = tokenize_single_wcomma(atrv) next_item = next(iterable) else: # Not sure we should support this, as it does not seem supported by # weka. raise ValueError("multi line not supported yet") #name, type, next_item = tokenize_multilines(iterable, atrv) else: raise ValueError("First line unparsable: %s" % sattr) if type == 'relational': raise ValueError("relational attributes not supported yet") return name, type, next_item