Example #1
0
def _csv2rec(fname,
             comments='#',
             skiprows=0,
             checkrows=0,
             delimiter=',',
             converterd=None,
             names=None,
             missing='',
             missingd=None,
             use_mrecords=False,
             dayfirst=False,
             yearfirst=False):
    """
    Load data from comma/space/tab delimited file in *fname* into a
    numpy record array and return the record array.

    If *names* is *None*, a header row is required to automatically
    assign the recarray names.  The headers will be lower cased,
    spaces will be converted to underscores, and illegal attribute
    name characters removed.  If *names* is not *None*, it is a
    sequence of names to use for the column names.  In this case, it
    is assumed there is no header row.


    - *fname*: can be a filename or a file handle.  Support for gzipped
      files is automatic, if the filename ends in '.gz'

    - *comments*: the character used to indicate the start of a comment
      in the file, or *None* to switch off the removal of comments

    - *skiprows*: is the number of rows from the top to skip

    - *checkrows*: is the number of rows to check to validate the column
      data type.  When set to zero all rows are validated.

    - *converterd*: if not *None*, is a dictionary mapping column number or
      munged column name to a converter function.

    - *names*: if not None, is a list of header names.  In this case, no
      header will be read from the file

    - *missingd* is a dictionary mapping munged column names to field values
      which signify that the field does not contain actual data and should
      be masked, e.g., '0000-00-00' or 'unused'

    - *missing*: a string whose value signals a missing field regardless of
      the column it appears in

    - *use_mrecords*: if True, return an mrecords.fromrecords record array if
      any of the data are missing

    - *dayfirst*: default is False so that MM-DD-YY has precedence over
      DD-MM-YY.  See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

    - *yearfirst*: default is False so that MM-DD-YY has precedence over
      YY-MM-DD. See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

      If no rows are found, *None* is returned
    """

    if converterd is None:
        converterd = dict()

    if missingd is None:
        missingd = {}

    import dateutil.parser
    import datetime

    fh = cbook.to_filehandle(fname)

    delimiter = str(delimiter)

    class FH:
        """
        For space-delimited files, we want different behavior than
        comma or tab.  Generally, we want multiple spaces to be
        treated as a single separator, whereas with comma and tab we
        want multiple commas to return multiple (empty) fields.  The
        join/strip trick below effects this.
        """
        def __init__(self, fh):
            self.fh = fh

        def close(self):
            self.fh.close()

        def seek(self, arg):
            self.fh.seek(arg)

        def fix(self, s):
            return ' '.join(s.split())

        def __next__(self):
            return self.fix(next(self.fh))

        def __iter__(self):
            for line in self.fh:
                yield self.fix(line)

    if delimiter == ' ':
        fh = FH(fh)

    reader = csv.reader(fh, delimiter=delimiter)

    def process_skiprows(reader):
        if skiprows:
            for i, row in enumerate(reader):
                if i >= (skiprows - 1):
                    break

        return fh, reader

    process_skiprows(reader)

    def ismissing(name, val):
        "Should the value val in column name be masked?"
        return val == missing or val == missingd.get(name) or val == ''

    def with_default_value(func, default):
        def newfunc(name, val):
            if ismissing(name, val):
                return default
            else:
                return func(val)

        return newfunc

    def mybool(x):
        if x == 'True':
            return True
        elif x == 'False':
            return False
        else:
            raise ValueError('invalid bool')

    dateparser = dateutil.parser.parse

    def mydateparser(x):
        # try and return a datetime object
        d = dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)
        return d

    mydateparser = with_default_value(mydateparser, datetime.datetime(1, 1, 1))

    myfloat = with_default_value(float, np.nan)
    myint = with_default_value(int, -1)
    mystr = with_default_value(str, '')
    mybool = with_default_value(mybool, None)

    def mydate(x):
        # try and return a date object
        d = dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)

        if d.hour > 0 or d.minute > 0 or d.second > 0:
            raise ValueError('not a date')
        return d.date()

    mydate = with_default_value(mydate, datetime.date(1, 1, 1))

    def get_func(name, item, func):
        # promote functions in this order
        funcs = [mybool, myint, myfloat, mydate, mydateparser, mystr]
        for func in funcs[funcs.index(func):]:
            try:
                func(name, item)
            except Exception:
                continue
            return func
        raise ValueError('Could not find a working conversion function')

    # map column names that clash with builtins -- TODO - extend this list
    itemd = {
        'return': 'return_',
        'file': 'file_',
        'print': 'print_',
    }

    def get_converters(reader, comments):

        converters = None
        i = 0
        for row in reader:
            if (len(row) and comments is not None
                    and row[0].startswith(comments)):
                continue
            if i == 0:
                converters = [mybool] * len(row)
            if checkrows and i > checkrows:
                break
            i += 1

            for j, (name, item) in enumerate(zip(names, row)):
                func = converterd.get(j)
                if func is None:
                    func = converterd.get(name)
                if func is None:
                    func = converters[j]
                    if len(item.strip()):
                        func = get_func(name, item, func)
                else:
                    # how should we handle custom converters and defaults?
                    func = with_default_value(func, None)
                converters[j] = func
        return converters

    # Get header and remove invalid characters
    needheader = names is None

    if needheader:
        for row in reader:
            if (len(row) and comments is not None
                    and row[0].startswith(comments)):
                continue
            headers = row
            break

        # remove these chars
        delete = set(r"""~!@#$%^&*()-=+~\|}[]{';: /?.>,<""")
        delete.add('"')

        names = []
        seen = dict()
        for i, item in enumerate(headers):
            item = item.strip().lower().replace(' ', '_')
            item = ''.join([c for c in item if c not in delete])
            if not len(item):
                item = 'column%d' % i

            item = itemd.get(item, item)
            cnt = seen.get(item, 0)
            if cnt > 0:
                names.append(item + '_%d' % cnt)
            else:
                names.append(item)
            seen[item] = cnt + 1

    else:
        if isinstance(names, str):
            names = [n.strip() for n in names.split(',')]

    # get the converter functions by inspecting checkrows
    converters = get_converters(reader, comments)
    if converters is None:
        raise ValueError('Could not find any valid data in CSV file')

    # reset the reader and start over
    fh.seek(0)
    reader = csv.reader(fh, delimiter=delimiter)
    process_skiprows(reader)

    if needheader:
        while True:
            # skip past any comments and consume one line of column header
            row = next(reader)
            if (len(row) and comments is not None
                    and row[0].startswith(comments)):
                continue
            break

    # iterate over the remaining rows and convert the data to date
    # objects, ints, or floats as appropriate
    rows = []
    rowmasks = []
    for i, row in enumerate(reader):
        if not len(row):
            continue
        if comments is not None and row[0].startswith(comments):
            continue
        # Ensure that the row returned always has the same nr of elements
        row.extend([''] * (len(converters) - len(row)))
        rows.append([
            func(name, val) for func, name, val in zip(converters, names, row)
        ])
        rowmasks.append(
            [ismissing(name, val) for name, val in zip(names, row)])
    fh.close()

    if not len(rows):
        return None

    if use_mrecords and np.any(rowmasks):
        r = np.ma.mrecords.fromrecords(rows, names=names, mask=rowmasks)
    else:
        r = np.rec.fromrecords(rows, names=names)
    return r
Example #2
0
def _csv2rec(fname, comments='#', skiprows=0, checkrows=0, delimiter=',',
             converterd=None, names=None, missing='', missingd=None,
             use_mrecords=False, dayfirst=False, yearfirst=False):
    """
    Load data from comma/space/tab delimited file in *fname* into a
    numpy record array and return the record array.

    If *names* is *None*, a header row is required to automatically
    assign the recarray names.  The headers will be lower cased,
    spaces will be converted to underscores, and illegal attribute
    name characters removed.  If *names* is not *None*, it is a
    sequence of names to use for the column names.  In this case, it
    is assumed there is no header row.


    - *fname*: can be a filename or a file handle.  Support for gzipped
      files is automatic, if the filename ends in '.gz'

    - *comments*: the character used to indicate the start of a comment
      in the file, or *None* to switch off the removal of comments

    - *skiprows*: is the number of rows from the top to skip

    - *checkrows*: is the number of rows to check to validate the column
      data type.  When set to zero all rows are validated.

    - *converterd*: if not *None*, is a dictionary mapping column number or
      munged column name to a converter function.

    - *names*: if not None, is a list of header names.  In this case, no
      header will be read from the file

    - *missingd* is a dictionary mapping munged column names to field values
      which signify that the field does not contain actual data and should
      be masked, e.g., '0000-00-00' or 'unused'

    - *missing*: a string whose value signals a missing field regardless of
      the column it appears in

    - *use_mrecords*: if True, return an mrecords.fromrecords record array if
      any of the data are missing

    - *dayfirst*: default is False so that MM-DD-YY has precedence over
      DD-MM-YY.  See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

    - *yearfirst*: default is False so that MM-DD-YY has precedence over
      YY-MM-DD. See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

      If no rows are found, *None* is returned
    """

    if converterd is None:
        converterd = dict()

    if missingd is None:
        missingd = {}

    import dateutil.parser
    import datetime

    fh = cbook.to_filehandle(fname)

    delimiter = str(delimiter)

    class FH:
        """
        For space-delimited files, we want different behavior than
        comma or tab.  Generally, we want multiple spaces to be
        treated as a single separator, whereas with comma and tab we
        want multiple commas to return multiple (empty) fields.  The
        join/strip trick below effects this.
        """
        def __init__(self, fh):
            self.fh = fh

        def close(self):
            self.fh.close()

        def seek(self, arg):
            self.fh.seek(arg)

        def fix(self, s):
            return ' '.join(s.split())

        def __next__(self):
            return self.fix(next(self.fh))

        def __iter__(self):
            for line in self.fh:
                yield self.fix(line)

    if delimiter == ' ':
        fh = FH(fh)

    reader = csv.reader(fh, delimiter=delimiter)

    def process_skiprows(reader):
        if skiprows:
            for i, row in enumerate(reader):
                if i >= (skiprows-1):
                    break

        return fh, reader

    process_skiprows(reader)

    def ismissing(name, val):
        "Should the value val in column name be masked?"
        return val == missing or val == missingd.get(name) or val == ''

    def with_default_value(func, default):
        def newfunc(name, val):
            if ismissing(name, val):
                return default
            else:
                return func(val)
        return newfunc

    def mybool(x):
        if x == 'True':
            return True
        elif x == 'False':
            return False
        else:
            raise ValueError('invalid bool')

    dateparser = dateutil.parser.parse

    def mydateparser(x):
        # try and return a datetime object
        d = dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)
        return d

    mydateparser = with_default_value(mydateparser, datetime.datetime(1, 1, 1))

    myfloat = with_default_value(float, np.nan)
    myint = with_default_value(int, -1)
    mystr = with_default_value(str, '')
    mybool = with_default_value(mybool, None)

    def mydate(x):
        # try and return a date object
        d = dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)

        if d.hour > 0 or d.minute > 0 or d.second > 0:
            raise ValueError('not a date')
        return d.date()
    mydate = with_default_value(mydate, datetime.date(1, 1, 1))

    def get_func(name, item, func):
        # promote functions in this order
        funcs = [mybool, myint, myfloat, mydate, mydateparser, mystr]
        for func in funcs[funcs.index(func):]:
            try:
                func(name, item)
            except Exception:
                continue
            return func
        raise ValueError('Could not find a working conversion function')

    # map column names that clash with builtins -- TODO - extend this list
    itemd = {
        'return': 'return_',
        'file':   'file_',
        'print':  'print_',
        }

    def get_converters(reader, comments):

        converters = None
        i = 0
        for row in reader:
            if (len(row) and comments is not None and
                    row[0].startswith(comments)):
                continue
            if i == 0:
                converters = [mybool]*len(row)
            if checkrows and i > checkrows:
                break
            i += 1

            for j, (name, item) in enumerate(zip(names, row)):
                func = converterd.get(j)
                if func is None:
                    func = converterd.get(name)
                if func is None:
                    func = converters[j]
                    if len(item.strip()):
                        func = get_func(name, item, func)
                else:
                    # how should we handle custom converters and defaults?
                    func = with_default_value(func, None)
                converters[j] = func
        return converters

    # Get header and remove invalid characters
    needheader = names is None

    if needheader:
        for row in reader:
            if (len(row) and comments is not None and
                    row[0].startswith(comments)):
                continue
            headers = row
            break

        # remove these chars
        delete = set(r"""~!@#$%^&*()-=+~\|}[]{';: /?.>,<""")
        delete.add('"')

        names = []
        seen = dict()
        for i, item in enumerate(headers):
            item = item.strip().lower().replace(' ', '_')
            item = ''.join([c for c in item if c not in delete])
            if not len(item):
                item = 'column%d' % i

            item = itemd.get(item, item)
            cnt = seen.get(item, 0)
            if cnt > 0:
                names.append(item + '_%d' % cnt)
            else:
                names.append(item)
            seen[item] = cnt+1

    else:
        if isinstance(names, str):
            names = [n.strip() for n in names.split(',')]

    # get the converter functions by inspecting checkrows
    converters = get_converters(reader, comments)
    if converters is None:
        raise ValueError('Could not find any valid data in CSV file')

    # reset the reader and start over
    fh.seek(0)
    reader = csv.reader(fh, delimiter=delimiter)
    process_skiprows(reader)

    if needheader:
        while True:
            # skip past any comments and consume one line of column header
            row = next(reader)
            if (len(row) and comments is not None and
                    row[0].startswith(comments)):
                continue
            break

    # iterate over the remaining rows and convert the data to date
    # objects, ints, or floats as appropriate
    rows = []
    rowmasks = []
    for i, row in enumerate(reader):
        if not len(row):
            continue
        if comments is not None and row[0].startswith(comments):
            continue
        # Ensure that the row returned always has the same nr of elements
        row.extend([''] * (len(converters) - len(row)))
        rows.append([func(name, val)
                     for func, name, val in zip(converters, names, row)])
        rowmasks.append([ismissing(name, val)
                         for name, val in zip(names, row)])
    fh.close()

    if not len(rows):
        return None

    if use_mrecords and np.any(rowmasks):
        r = np.ma.mrecords.fromrecords(rows, names=names, mask=rowmasks)
    else:
        r = np.rec.fromrecords(rows, names=names)
    return r