Example 1
def read_header(ofile):
    """Read the header of the iterable ofile."""
    i = next(ofile)

    # Skip leading comment lines
    while r_comment.match(i):
        i = next(ofile)

    # The header is everything up to the @data marker.
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                name, type, i = tokenize_attribute(ofile, i)
                attributes.append((name, type))
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            i = next(ofile)

    return relation, attributes
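The helper above relies on module-level compiled regexes (r_comment, r_datameta, r_headerline, r_attribute, r_relation, plus r_empty used in later excerpts) that the listing does not show. Below is a minimal sketch of plausible definitions, inferred from how the parser uses them; the exact patterns in the library may differ.

import re

# Plausible patterns inferred from usage (assumptions, not verbatim
# library code):
r_comment = re.compile(r'^%')                    # ARFF comments start with %
r_empty = re.compile(r'^\s+$')                   # whitespace-only lines
r_headerline = re.compile(r'^\s*@\S*')           # any @<keyword> header line
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')   # the @data marker
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')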
Example 2
    def generator(row_iter, delim=','):
        # TODO: this is where most of the time is spent (~80%). Things
        # could be made more efficient:
        #   - We could, for example, "compile" the function, because some
        #   values do not change here.
        #   - The function converting a line to dtyped values could also be
        #   generated on the fly from a string and executed instead of
        #   looping.
        #   - The regexes are overkill: for comments, checking that a line
        #   starts with % should be enough and faster, and the same goes for
        #   empty lines --> in practice this did not seem to change anything.

        # We do not abstract away skipping comments and empty lines, for
        # performance reasons.
        raw = next(row_iter)
        while r_empty.match(raw):
            raw = next(row_iter)
        while r_comment.match(raw):
            raw = next(row_iter)

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        row = raw.split(delim)
        yield tuple([convertors[i](row[i]) for i in elems])
        for raw in row_iter:
            while r_comment.match(raw):
                raw = next(row_iter)
            while r_empty.match(raw):
                raw = next(row_iter)
            row = raw.split(delim)
            yield tuple([convertors[i](row[i]) for i in elems])
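generator is a closure: convertors, ni and the regexes come from the enclosing _loadarff (Example 9 below), so it cannot run standalone. The sketch below reproduces just the per-row conversion step, with hypothetical data and converters, to show what each yielded tuple looks like.

# Hypothetical setup: two numeric attributes, ',' as delimiter.
convertors = [float, float]
ni = len(convertors)
elems = list(range(ni))

for raw in ["1.0,2.0", "3.5,4.5"]:
    row = raw.split(',')
    print(tuple(convertors[i](row[i]) for i in elems))
# (1.0, 2.0)
# (3.5, 4.5)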
Example 3
def next_data_line(row_iter):
    """Assumes we are already in the data section (e.g., after @data)."""
    raw = next(row_iter)
    while r_empty.match(raw):
        raw = next(row_iter)
    while r_comment.match(raw):
        raw = next(row_iter)
    return raw
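In _loadarff (Example 9) the line returned here feeds get_delim, which autodetects the field separator. A plausible sketch of such a helper, assuming the only candidates are ',' and tab; the library's actual implementation may differ.

def get_delim(line):
    """Guess the delimiter of a dense ARFF data line (assumed ',' or tab)."""
    if ',' in line:
        return ','
    if '\t' in line:
        return '\t'
    raise ValueError("delimiter not understood: %s" % line)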
Example 4
def read_data_list(ofile):
    """Read each line of the iterable and put it in a list."""
    data = [next(ofile)]
    if data[0].strip()[0] == '{':
        raise ValueError("This looks like a sparse ARFF: not supported yet")
    data.extend(ofile)
    return data
Example 5
def floupi(filename):
    data, meta = loadarff(filename)
    from attrselect import print_dataset_info
    print_dataset_info(data)
    print("relation %s, has %d instances" % (meta.name, data.size))
    # `types` is assumed to be defined at module level (one entry per
    # attribute); it is not part of this snippet.
    itp = iter(types)
    for name in data.dtype.names:
        print_attribute(name, next(itp), data[name])
Example 6
def get_ndata(ofile):
    """Read the whole file to count the number of data rows."""
    data = [next(ofile)]
    loc = 1
    if data[0].strip()[0] == '{':
        raise ValueError("This looks like a sparse ARFF: not supported yet")
    for _ in ofile:
        loc += 1
    return loc
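The same count can be written more compactly, avoiding the throwaway list and the manual counter; a behavior-preserving sketch:

def get_ndata(ofile):
    """Count the data rows of the iterable (equivalent sketch)."""
    first = next(ofile)
    if first.strip()[0] == '{':
        raise ValueError("This looks like a sparse ARFF: not supported yet")
    return 1 + sum(1 for _ in ofile)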
Example 7
def tokenize_multilines(iterable, val):
    """Tokenize an attribute definition spread over several lines."""
    # If one line does not match, read all the following lines up to the next
    # line with a meta character, and try to parse everything up to there.
    if not r_mcomattrval.match(val):
        lines = [val]
        i = next(iterable)
        while not r_meta.match(i):
            lines.append(i)
            i = next(iterable)
        if r_mend.search(i):
            raise ValueError("relational attribute not supported yet")
        print("".join(lines[:-1]))  # debug output left in the original
        m = r_comattrval.match("".join(lines[:-1]))
        return m.group(1), m.group(2), i
    else:
        raise ValueError("Cannot parse attribute names spread over multiple "
                         "lines yet")
Example 8
def read_header(ofile, returnclasses=False):
    """Read the header of the iterable ofile."""
    i = next(ofile)

    # Skip leading comment lines
    while r_comment.match(i):
        i = next(ofile)

    # The header is everything up to the @data marker.
    relation = None
    attributes = []
    nrclasses = 0
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                name, type, i = tokenize_attribute(ofile, i)
                attributes.append((name, type))
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                    # MEKA-style multi-label headers encode the number of
                    # class attributes as '-C <n>' inside the relation name.
                    words = i.split()
                    for tk in range(len(words)):
                        if words[tk] == '-C':
                            if tk + 1 < len(words):
                                nrclasses = int(words[tk+1].replace("'", ''))
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            i = next(ofile)

    if returnclasses:
        return relation, attributes, nrclasses
    else:
        return relation, attributes
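The '-C' scan follows the MEKA convention for multi-label ARFF files, where the relation name embeds the number of class attributes. A hypothetical header and the value this function would extract:

# Hypothetical MEKA-style multi-label header (illustration only):
#
#   @relation 'Music: -C 6'
#   @attribute amazed numeric
#   ...
#   @data
#
# read_header(ofile, returnclasses=True) would return nrclasses == 6,
# parsed from the token following '-C'.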
Example 9
def _loadarff(ofile):
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg)

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for name, value in attr:
        type = parse_type(value)
        if type == 'string':
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # These mappings can be adjusted once we want to support integers as
    # integer values and not as numerics anymore (using masked arrays?).
    acls2dtype = {'real': np.float64, 'integer': np.float64,
                  'numeric': np.float64}
    acls2conv = {
        'real': safe_float,
        'integer': safe_float,
        'numeric': safe_float
    }
    descr = []
    convertors = []
    if not hasstr:
        for name, value in attr:
            type = parse_type(value)
            if type == 'date':
                date_format, datetime_unit = get_date_format(value)
                descr.append((name, "datetime64[%s]" % datetime_unit))
                convertors.append(
                    partial(safe_date,
                            date_format=date_format,
                            datetime_unit=datetime_unit))
            elif type == 'nominal':
                n = maxnomlen(value)
                descr.append((name, 'S%d' % n))
                pvalue = get_nom_val(value)
                convertors.append(partial(safe_nominal, pvalue=pvalue))
            else:
                descr.append((name, acls2dtype[type]))
                convertors.append(safe_float)
                #dc.append(acls2conv[type])
                #sdescr.append((name, acls2sdtype[type]))
    else:
        # How to support strings efficiently? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(convertors)

    # Get the delimiter from the first line of data:
    def next_data_line(row_iter):
        """Assumes we are already in the data part (eg after @data)."""
        raw = next(row_iter)
        while r_empty.match(raw):
            raw = next(row_iter)
        while r_comment.match(raw):
            raw = next(row_iter)
        return raw

    try:
        try:
            dtline = next_data_line(ofile)
            delim = get_delim(dtline)
        except ValueError as e:
            raise ParseArffError("Error while parsing delimiter: " + str(e))
    finally:
        ofile.seek(0, 0)
        ofile = go_data(ofile)
        # skip the @data line
        next(ofile)

    def generator(row_iter, delim=','):
        # TODO: this is where most of the time is spent (~80%). Things
        # could be made more efficient:
        #   - We could, for example, "compile" the function, because some
        #   values do not change here.
        #   - The function converting a line to dtyped values could also be
        #   generated on the fly from a string and executed instead of
        #   looping.
        #   - The regexes are overkill: for comments, checking that a line
        #   starts with % should be enough and faster, and the same goes for
        #   empty lines --> in practice this did not seem to change anything.

        # We do not abstract away skipping comments and empty lines, for
        # performance reasons.
        raw = next(row_iter)
        while r_empty.match(raw):
            raw = next(row_iter)
        while r_comment.match(raw):
            raw = next(row_iter)

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        row = raw.split(delim)
        yield tuple([convertors[i](row[i]) for i in elems])
        for raw in row_iter:
            while r_comment.match(raw):
                raw = next(row_iter)
            while r_empty.match(raw):
                raw = next(row_iter)
            row = raw.split(delim)
            yield tuple([convertors[i](row[i]) for i in elems])

    a = generator(ofile, delim=delim)
    # No error should happen here: it is a bug otherwise
    data = np.fromiter(a, descr)
    return data, meta
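These excerpts appear to come from SciPy's ARFF reader, where _loadarff is the private worker behind the public scipy.io.arff.loadarff entry point. Typical usage, assuming a dense numeric/nominal file 'iris.arff' on disk (the path and column name are illustrative):

from scipy.io import arff

data, meta = arff.loadarff('iris.arff')   # hypothetical path
print(meta)                     # relation name, attribute names and types
print(data['sepallength'][:5])  # columns are accessed by attribute name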
Example 10
def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (eg starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The first line must start with @attribute (case insensitive;
      whitespace before @attribute is allowed)
    * Also works if the attribute is spread over multiple lines.
    * Works if empty lines or comments are in between

    Parameters
    ----------
    attribute : str
       the attribute string.

    Returns
    -------
    name : str
       name of the attribute
    value : str
       value of the attribute
    next : str
       next line to be parsed

    Examples
    --------
    If attribute is a string such as r"@attribute floupi real", this will
    return 'floupi' as the name and 'real' as the value.

    >>> iterable = iter([0] * 10) # dummy iterator
    >>> tokenize_attribute(iterable, r"@attribute floupi real")
    ('floupi', 'real', 0)

    If attribute is r"@attribute 'floupi 2' real", this will return
    'floupi 2' as the name and 'real' as the value.

    >>> tokenize_attribute(iterable, r"  @attribute 'floupi 2' real   ")
    ('floupi 2', 'real', 0)

    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if mattr:
        # atrv is everything after @attribute
        atrv = mattr.group(1)
        if r_comattrval.match(atrv):
            name, type = tokenize_single_comma(atrv)
            next_item = next(iterable)
        elif r_wcomattrval.match(atrv):
            name, type = tokenize_single_wcomma(atrv)
            next_item = next(iterable)
        else:
            # Not sure we should support this, as it does not seem to be
            # supported by Weka.
            raise ValueError("multi line not supported yet")
            #name, type, next_item = tokenize_multilines(iterable, atrv)
    else:
        raise ValueError("First line unparsable: %s" % sattr)

    if type == 'relational':
        raise ValueError("relational attributes not supported yet")
    return name, type, next_item
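tokenize_single_comma and tokenize_single_wcomma are the leaf helpers that split an attribute body into (name, type): one handles the comma-separated (quoted-name) form, the other the whitespace-separated form. A rough sketch of the comma variant, under the assumption that r_comattrval captures the name and the type as its two groups (as its use in tokenize_multilines suggests); not the library's exact code.

def tokenize_single_comma(val):
    """Split a "'name', type" attribute body into (name, type) -- sketch."""
    m = r_comattrval.match(val)
    if not m:
        raise ValueError("Error while tokenizing single %s" % val)
    return m.group(1).strip(), m.group(2).strip()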
Example 11

(A variant of the loader in Example 9 without date support: date attributes raise an error instead of being converted.)
def _loadarff(ofile):
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg)

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for name, value in attr:
        type = parse_type(value)
        if type == 'string':
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # These mappings can be adjusted once we want to support integers as
    # integer values and not as numerics anymore (using masked arrays?).
    acls2dtype = {'real': np.float64, 'integer': np.float64,
                  'numeric': np.float64}
    acls2conv = {'real': safe_float, 'integer': safe_float, 'numeric': safe_float}
    descr = []
    convertors = []
    if not hasstr:
        for name, value in attr:
            type = parse_type(value)
            if type == 'date':
                raise ValueError("date type not supported yet, sorry")
            elif type == 'nominal':
                n = maxnomlen(value)
                descr.append((name, 'S%d' % n))
                pvalue = get_nom_val(value)
                convertors.append(partial(safe_nominal, pvalue=pvalue))
            else:
                descr.append((name, acls2dtype[type]))
                convertors.append(safe_float)
                #dc.append(acls2conv[type])
                #sdescr.append((name, acls2sdtype[type]))
    else:
        # How to support strings efficiently? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(convertors)

    # Get the delimiter from the first line of data:
    def next_data_line(row_iter):
        """Assumes we are already in the data part (eg after @data)."""
        raw = next(row_iter)
        while r_empty.match(raw):
            raw = next(row_iter)
        while r_comment.match(raw):
            raw = next(row_iter)
        return raw

    try:
        try:
            dtline = next_data_line(ofile)
            delim = get_delim(dtline)
        except ValueError as e:
            raise ParseArffError("Error while parsing delimiter: " + str(e))
    finally:
        ofile.seek(0, 0)
        ofile = go_data(ofile)
        # skip the @data line
        next(ofile)

    def generator(row_iter, delim=','):
        # TODO: this is where most of the time is spent (~80%). Things
        # could be made more efficient:
        #   - We could, for example, "compile" the function, because some
        #   values do not change here.
        #   - The function converting a line to dtyped values could also be
        #   generated on the fly from a string and executed instead of
        #   looping.
        #   - The regexes are overkill: for comments, checking that a line
        #   starts with % should be enough and faster, and the same goes for
        #   empty lines --> in practice this did not seem to change anything.

        # We do not abstract away skipping comments and empty lines, for
        # performance reasons.
        raw = next(row_iter)
        while r_empty.match(raw):
            raw = next(row_iter)
        while r_comment.match(raw):
            raw = next(row_iter)

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        row = raw.split(delim)
        yield tuple([convertors[i](row[i]) for i in elems])
        for raw in row_iter:
            while r_comment.match(raw):
                raw = next(row_iter)
            while r_empty.match(raw):
                raw = next(row_iter)
            row = raw.split(delim)
            yield tuple([convertors[i](row[i]) for i in elems])

    a = generator(ofile, delim=delim)
    # No error should happen here: it is a bug otherwise
    data = np.fromiter(a, descr)
    return data, meta