Example #1
0
 def _conv_date(self, *date_cols):
     if self.date_parser is None:
         return lib.try_parse_dates(_concat_date_cols(date_cols),
                                    dayfirst=self.dayfirst)
     else:
         try:
             return self.date_parser(*date_cols)
         except:
             return lib.try_parse_dates(_concat_date_cols(date_cols),
                                        parser=self.date_parser,
                                        dayfirst=self.dayfirst)
Example #2
0
def test_try_parse_dates():
    """``lib.try_parse_dates`` with ``dayfirst=True`` must agree with dateutil."""
    from dateutil.parser import parse as parse_date

    raw = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object)

    # reference values: dateutil applied to each string, day before month
    wanted = [parse_date(text, dayfirst=True) for text in raw]
    parsed = lib.try_parse_dates(raw, dayfirst=True)

    assert np.array_equal(parsed, wanted)
Example #3
0
def _maybe_convert_int_mindex(index, parse_dates, date_parser):
    for i in range(len(index)):
        try:
            int(index[i][0])
            index[i] = map(int, index[i])
        except ValueError:
            if parse_dates:
                index[i] = lib.try_parse_dates(index[i], date_parser)

    return index
Example #4
0
    def get_chunk(self, rows=None):
        """
        Read the next batch of lines and assemble them into a DataFrame.

        Parameters
        ----------
        rows : optional
            Passed straight to ``self._get_lines`` (presumably the number of
            rows to read -- confirm against that method).  When given while
            ``self.skip_footer`` is set, a message is printed and reading
            continues anyway.

        Returns
        -------
        DataFrame
            Built from the lines read, with ``self.columns`` as columns.  An
            empty frame is returned when no lines were read.

        Raises
        ------
        StopIteration
            Re-raised from ``self._get_lines`` on any call but the first.
        Exception
            If the index has duplicates, or the number of data columns does
            not match ``len(self.columns)``.
        """
        if rows is not None and self.skip_footer:
            print("skip_footer not supported for iteration")

        try:
            lines = self._get_lines(rows)
        except StopIteration:
            # an exhausted source is tolerated only on the very first read
            if not self._first_chunk:
                raise
            lines = []

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        has_index = self.index_col is not None
        single_index = has_index and np.isscalar(self.index_col)

        if len(lines) == 0:  # pragma: no cover
            if not has_index:
                empty_index = Index([])
            elif single_index:
                empty_index = Index([], name=self.index_name)
            else:
                empty_index = MultiIndex.from_arrays(
                    [[]] * len(self.index_col), names=self.index_name)

            return DataFrame(index=empty_index, columns=self.columns)

        # one array per column
        column_arrays = list(lib.to_object_array(lines).T)

        if not has_index:
            # no index column: number the rows
            index = Index(np.arange(len(lines)))
        elif single_index:
            raw_index = column_arrays.pop(self.index_col)
            if self.parse_dates:
                raw_index = lib.try_parse_dates(raw_index,
                                                parser=self.date_parser)
            index = Index(_convert_types(raw_index, self.na_values),
                          name=self.index_name)
        else:
            # collect every level first, then drop them highest position
            # first so that earlier pops do not shift later positions
            raw_levels = [column_arrays[pos] for pos in self.index_col]
            for pos in sorted(self.index_col, reverse=True):
                column_arrays.pop(pos)

            levels = []
            for level in raw_levels:
                if self.parse_dates:
                    level = lib.try_parse_dates(level, parser=self.date_parser)
                levels.append(_convert_types(level, self.na_values))
            index = MultiIndex.from_arrays(levels, names=self.index_name)

        if not index._verify_integrity():
            duplicates = index._get_duplicates()
            raise Exception("Index has duplicates: %s" % str(duplicates))

        if len(self.columns) != len(column_arrays):
            raise Exception("wrong number of columns")

        data = dict(zip(self.columns, column_arrays))

        # apply converters; an integer key that is not itself a column name
        # is treated as a position into self.columns
        for key, converter in self.converters.iteritems():
            if isinstance(key, int) and key not in self.columns:
                key = self.columns[key]
            data[key] = np.vectorize(converter)(data[key])

        converted = _convert_to_ndarrays(data, self.na_values)

        return DataFrame(data=converted, columns=self.columns, index=index)
Example #5
0
    def get_chunk(self, rows=None):
        """
        Read the next batch of lines and assemble them into a DataFrame.

        Parameters
        ----------
        rows : optional
            Passed straight to ``self._get_lines`` (presumably the number of
            rows to read -- confirm against that method).  Must not be given
            while ``self.skip_footer`` is set.

        Returns
        -------
        DataFrame
            Built from the lines read, with ``self.columns`` as columns.  An
            empty frame is returned when no lines were read.

        Raises
        ------
        ValueError
            If ``rows`` is given while ``self.skip_footer`` is set.
        StopIteration
            Re-raised from ``self._get_lines`` on any call but the first.
        Exception
            If the index has duplicate values, or the number of data columns
            does not match ``len(self.columns)``.
        """
        if rows is not None and self.skip_footer:
            raise ValueError('skip_footer not supported for iteration')

        try:
            lines = self._get_lines(rows)
        except StopIteration:
            # an exhausted source is tolerated only on the very first read
            if not self._first_chunk:
                raise
            lines = []

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        has_index = self.index_col is not None
        single_index = has_index and np.isscalar(self.index_col)

        if len(lines) == 0:  # pragma: no cover
            if not has_index:
                empty_index = Index([])
            elif single_index:
                empty_index = Index([], name=self.index_name)
            else:
                empty_index = MultiIndex.from_arrays(
                    [[]] * len(self.index_col), names=self.index_name)

            return DataFrame(index=empty_index, columns=self.columns)

        # one array per column
        column_arrays = list(lib.to_object_array(lines).T)

        if not has_index:
            # no index column: number the rows
            index = Index(np.arange(len(lines)))
        elif single_index:
            raw_index = column_arrays.pop(self.index_col)
            if self.parse_dates:
                raw_index = lib.try_parse_dates(raw_index,
                                                parser=self.date_parser)
            index_values, na_count = _convert_types(raw_index, self.na_values)
            index = Index(index_values, name=self.index_name)
            if self.verbose and na_count:
                print('Found %d NA values in the index' % na_count)
        else:
            # collect every level first, then drop them highest position
            # first so that earlier pops do not shift later positions
            raw_levels = [column_arrays[pos] for pos in self.index_col]
            for pos in sorted(self.index_col, reverse=True):
                column_arrays.pop(pos)

            levels = []
            for level in raw_levels:
                if self.parse_dates:
                    level = lib.try_parse_dates(level, parser=self.date_parser)
                level, _ = _convert_types(level, self.na_values)
                levels.append(level)
            index = MultiIndex.from_arrays(levels, names=self.index_name)

        if not index._verify_integrity():
            duplicates = index.get_duplicates()
            label = 'Implicit index' if self.implicit_idx else 'Index'
            raise Exception('%s (columns %s) have duplicate values %s'
                            % (label, self.index_col, str(duplicates)))

        if len(self.columns) != len(column_arrays):
            raise Exception('wrong number of columns')

        data = dict(izip(self.columns, column_arrays))

        # apply converters; an integer key that is not itself a column name
        # is treated as a position into self.columns
        for key, converter in self.converters.iteritems():
            if isinstance(key, int) and key not in self.columns:
                key = self.columns[key]
            data[key] = lib.map_infer(data[key], converter)

        converted = _convert_to_ndarrays(data, self.na_values, self.verbose)

        return DataFrame(data=converted, columns=self.columns, index=index)
Example #6
0
    def get_chunk(self, rows=None):
        """
        Fetch the next group of lines and turn them into a DataFrame.

        Parameters
        ----------
        rows : optional
            Handed unchanged to ``self._get_lines`` (presumably a row count
            -- confirm against that method).  Not allowed together with
            ``self.skip_footer``.

        Returns
        -------
        DataFrame
            Uses ``self.columns`` as its columns.  Empty when no lines were
            read.

        Raises
        ------
        ValueError
            If ``rows`` is given while ``self.skip_footer`` is set.
        StopIteration
            Re-raised from ``self._get_lines`` except on the first call.
        Exception
            On duplicate index values, or when the number of data columns
            differs from ``len(self.columns)``.
        """
        if rows is not None and self.skip_footer:
            raise ValueError('skip_footer not supported for iteration')

        try:
            chunk = self._get_lines(rows)
        except StopIteration:
            # running dry is acceptable only for the first chunk
            if not self._first_chunk:
                raise
            chunk = []

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        indexed = self.index_col is not None
        scalar_index = indexed and np.isscalar(self.index_col)

        if len(chunk) == 0: # pragma: no cover
            if not indexed:
                blank = Index([])
            elif scalar_index:
                blank = Index([], name=self.index_name)
            else:
                blank = MultiIndex.from_arrays([[]] * len(self.index_col),
                                               names=self.index_name)

            return DataFrame(index=blank, columns=self.columns)

        # transpose the rows into one array per column
        fields = list(lib.to_object_array(chunk).T)

        if not indexed:
            # without an index column the rows are simply numbered
            index = Index(np.arange(len(chunk)))
        elif scalar_index:
            labels = fields.pop(self.index_col)
            if self.parse_dates:
                labels = lib.try_parse_dates(labels, parser=self.date_parser)
            labels, na_count = _convert_types(labels, self.na_values)
            index = Index(labels, name=self.index_name)
            if self.verbose and na_count:
                print('Found %d NA values in the index' % na_count)
        else:
            # read all index columns before removing any of them, then
            # remove from the highest position down so positions stay valid
            picked = [fields[pos] for pos in self.index_col]
            for pos in sorted(self.index_col, reverse=True):
                fields.pop(pos)

            arrays = []
            for labels in picked:
                if self.parse_dates:
                    labels = lib.try_parse_dates(labels,
                                                 parser=self.date_parser)
                labels, _ = _convert_types(labels, self.na_values)
                arrays.append(labels)
            index = MultiIndex.from_arrays(arrays, names=self.index_name)

        if not index._verify_integrity():
            repeated = index.get_duplicates()
            kind = 'Implicit index' if self.implicit_idx else 'Index'
            raise Exception('%s (columns %s) have duplicate values %s'
                            % (kind, self.index_col, str(repeated)))

        if len(self.columns) != len(fields):
            raise Exception('wrong number of columns')

        data = dict(izip(self.columns, fields))

        # apply converters; an int key that is not a column name is taken
        # as a position into self.columns
        for target, convert in self.converters.iteritems():
            if isinstance(target, int) and target not in self.columns:
                target = self.columns[target]
            data[target] = lib.map_infer(data[target], convert)

        arrays_by_name = _convert_to_ndarrays(data, self.na_values,
                                              self.verbose)

        return DataFrame(data=arrays_by_name, columns=self.columns,
                         index=index)
Example #7
0
 def func(*date_cols):
     """
     Parse dates from the given columns after merging them into one.

     The columns are combined by ``parsers._concat_date_cols`` and the
     result is passed to ``lib.try_parse_dates`` with its default settings.
     """
     combined = parsers._concat_date_cols(date_cols)
     return lib.try_parse_dates(combined)
Example #8
0
def _simple_parser(lines, colNames=None, header=0, index_col=0,
                   na_values=None, date_parser=None, parse_dates=True):
    """
    Workhorse function for processing nested list into DataFrame

    Should be replaced by np.genfromtxt eventually?

    Parameters
    ----------
    lines : list of sequences
        Parsed rows, one inner sequence of field values per row.
    colNames : list, optional
        Column names to use when ``header`` is None; must have one entry per
        field of the first row.  The list is used directly (not copied), so
        index columns are removed from it in place.
    header : int or None, default 0
        Position in ``lines`` of the row holding the column names; only the
        rows after it are treated as data.  Empty names become
        ``Unnamed: <position>`` and repeated names get a ``.<n>`` suffix.
        If None, names come from ``colNames`` or are generated as
        ``X.1``, ``X.2``, ...
    index_col : int, sequence of int, or None, default 0
        Position(s) of the column(s) to use as the index.  If None and the
        first data row has exactly one more field than there are column
        names, column 0 is used as an unnamed index.
    na_values : iterable, optional
        Extra values to treat as missing, in addition to the built-in set.
    date_parser : callable, optional
        Forwarded to the date-parsing helpers applied to the index.
    parse_dates : bool, default True
        Whether to attempt date parsing on the index.

    Returns
    -------
    DataFrame

    Raises
    ------
    Exception
        If the index contains duplicates, or the number of data columns does
        not match the number of column names.
    """
    if header is not None:
        columns = []
        for i, c in enumerate(lines[header]):
            if c == '':
                columns.append('Unnamed: %d' % i)
            else:
                columns.append(c)

        content = lines[header+1:]

        # make repeated names unique by suffixing the repeat count
        counts = {}
        for i, col in enumerate(columns):
            cur_count = counts.get(col, 0)
            if cur_count > 0:
                columns[i] = '%s.%d' % (col, cur_count)
            counts[col] = cur_count + 1
    else:
        ncols = len(lines[0])
        if not colNames:
            columns = ['X.%d' % (i + 1) for i in range(ncols)]
        else:
            assert(len(colNames) == ncols)
            columns = colNames
        content = lines

    if len(content) == 0: # pragma: no cover
        if index_col is not None:
            if np.isscalar(index_col):
                index = Index([], name=columns.pop(index_col))
            else:
                # look names up in a copy so that removals from ``columns``
                # do not shift the positions still to be read
                cp_cols = list(columns)
                names = []
                for i in index_col:
                    name = cp_cols[i]
                    columns.remove(name)
                    names.append(name)
                # was ``MultiIndex.fromarrays``; the constructor used for
                # the non-empty case below is ``from_arrays``
                index = MultiIndex.from_arrays([[]] * len(index_col),
                                               names=names)
        else:
            index = Index([])

        return DataFrame(index=index, columns=columns)


    # common NA values
    # no longer excluding inf representations
    # '1.#INF','-1.#INF', '1.#INF000000',
    NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                     '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
                     'nan', ''])
    if na_values is None:
        na_values = NA_VALUES
    else:
        na_values = set(list(na_values)) | NA_VALUES


    # one array per column
    zipped_content = list(lib.to_object_array(content).T)

    # no index column specified, but the data rows carry one extra leading
    # field: infer that the first column is the index
    if index_col is None and len(content[0]) == len(columns) + 1:
        index_col = 0

    if index_col is not None:
        if np.isscalar(index_col):
            index = zipped_content.pop(index_col)

            if len(content[0]) == len(columns) + 1:
                # the extra field has no header entry, hence no name
                name = None
            else:
                name = columns.pop(index_col)

        else: # given a list of index
            idx_names = []
            index = []
            for idx in index_col:
                idx_names.append(columns[idx])
                index.append(zipped_content[idx])
            # Remove the index items from content and columns by position,
            # highest first so earlier pops do not shift later positions.
            # Removing by value (list.remove) compares with ==, which is
            # ambiguous for the column arrays here and can raise or drop
            # the wrong entry.
            for i in reversed(sorted(index_col)):
                columns.pop(i)
                zipped_content.pop(i)

        if np.isscalar(index_col):
            if parse_dates:
                index = lib.try_parse_dates(index, parser=date_parser)
            index = Index(_convert_types(index, na_values), name=name)
        else:
            arrays = _maybe_convert_int_mindex(index, parse_dates,
                                               date_parser)
            index = MultiIndex.from_arrays(arrays,
                                                 names=idx_names)
    else:
        index = Index(np.arange(len(content)))

    if not index._verify_integrity():
        dups = index._get_duplicates()
        raise Exception('Index has duplicates: %s' % str(dups))

    if len(columns) != len(zipped_content):
        raise Exception('wrong number of columns')

    data = dict((k, v) for k, v in zip(columns, zipped_content))
    data = _convert_to_ndarrays(data, na_values)
    return DataFrame(data=data, columns=columns, index=index)
Example #9
0
    def get_chunk(self, rows=None):
        """
        Read the next batch of lines and assemble them into a DataFrame.

        Parameters
        ----------
        rows : optional
            Passed straight to ``self._get_lines`` (presumably the number of
            rows to read -- confirm against that method).  Must not be given
            while ``self.skip_footer`` is set.

        Returns
        -------
        DataFrame
            Built from the lines read, with ``self.columns`` as columns.  An
            empty frame is returned when no lines were read.

        Raises
        ------
        ValueError
            If ``rows`` is given while ``self.skip_footer`` is set, or if
            the number of data columns does not match ``len(self.columns)``.
        StopIteration
            Re-raised from ``self._get_lines`` on any call but the first.
        """
        if rows is not None and self.skip_footer:
            raise ValueError('skip_footer not supported for iteration')

        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        if len(content) == 0: # pragma: no cover
            if self.index_col is not None:
                if np.isscalar(self.index_col):
                    index = Index([], name=self.index_name)
                else:
                    index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                                   names=self.index_name)
            else:
                index = Index([])

            return DataFrame(index=index, columns=self.columns)

        # one array per column
        zipped_content = list(lib.to_object_array(content).T)

        # pull the index column(s) out of the data when any are designated
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = zipped_content.pop(self.index_col)
            else: # given a list of index
                index = []
                for idx in self.index_col:
                    index.append(zipped_content[idx])
                # remove index items from content and columns, don't pop in
                # loop
                for i in reversed(sorted(self.index_col)):
                    zipped_content.pop(i)

            if np.isscalar(self.index_col):
                if self._should_parse_dates(0):
                    index = lib.try_parse_dates(index, parser=self.date_parser,
                                                dayfirst=self.dayfirst)
                index, na_count = _convert_types(index, self.na_values)
                index = Index(index, name=self.index_name)
                if self.verbose and na_count:
                    print('Found %d NA values in the index' % na_count)
            else:
                arrays = []
                for i, arr in enumerate(index):
                    if self._should_parse_dates(i):
                        arr = lib.try_parse_dates(arr, parser=self.date_parser,
                                                  dayfirst=self.dayfirst)
                    arr, _ = _convert_types(arr, self.na_values)
                    arrays.append(arr)
                index = MultiIndex.from_arrays(arrays, names=self.index_name)
        else:
            index = Index(np.arange(len(content)))

        # if not index.is_unique:
        #     dups = index.get_duplicates()
        #     idx_str = 'Index' if not self._implicit_index else 'Implicit index'
        #     err_msg = ('%s (columns %s) have duplicate values %s'
        #                % (idx_str, self.index_col, str(dups)))
        #     raise Exception(err_msg)

        col_len, zip_len = len(self.columns), len(zipped_content)
        if col_len != zip_len:
            # locate the first row whose length differs from the expected
            # column count; ``i`` stays at the last row if none differs
            for (i, l) in enumerate(content):
                if len(l) != col_len:
                    break

            footers = 0
            if self.skip_footer:
                footers = self.skip_footer
            # translate the position within this chunk into a row number
            # relative to self.pos
            row_num = self.pos - (len(content) - i + footers)

            msg = ('Expecting %d columns, got %d in row %d' %
                   (col_len, zip_len, row_num))
            raise ValueError(msg)

        data = dict((k, v) for k, v in izip(self.columns, zipped_content))

        # apply converters; an integer key that is not itself a column name
        # is treated as a position into self.columns
        for col, f in self.converters.iteritems():
            if isinstance(col, int) and col not in self.columns:
                col = self.columns[col]
            data[col] = lib.map_infer(data[col], f)

        if not isinstance(self.parse_dates, bool):
            # Normalise the index description to lists so the membership
            # tests below also work when index_col is None or a single
            # position; testing ``x in None`` or ``x in <int>`` raises
            # TypeError.
            if self.index_col is None:
                index_cols, index_names = [], []
            elif np.isscalar(self.index_col):
                index_cols, index_names = [self.index_col], [self.index_name]
            else:
                index_cols = self.index_col
                index_names = self.index_name
                if index_names is None:
                    index_names = []

            for x in self.parse_dates:
                # an integer that is not a data key is a column position
                if isinstance(x, int) and x not in data:
                    x = self.orig_columns[x]
                # index columns were already handled above
                if x in index_cols or x in index_names:
                    continue
                data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
                                              dayfirst=self.dayfirst)

        data = _convert_to_ndarrays(data, self.na_values, self.verbose)

        return DataFrame(data=data, columns=self.columns, index=index)