def test_append_empty_preserve_name(self):
    left = Index([], name="foo")
    right = Index([1, 2, 3], name="foo")
    result = left.append(right)
    self.assert_(result.name == "foo")

    left = Index([], name="foo")
    right = Index([1, 2, 3], name="bar")
    result = left.append(right)
    self.assert_(result.name is None)

def test_append_multiple(self):
    index = Index(['a', 'b', 'c', 'd', 'e', 'f'])

    foos = [index[:2], index[2:4], index[4:]]
    result = foos[0].append(foos[1:])
    self.assert_(result.equals(index))

    # empty
    result = index.append([])
    self.assert_(result.equals(index))

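# Illustrative sketch (not part of the test suite) of the Index.append
# semantics the two tests above assert: a shared name survives the append,
# differing names collapse to None, a list of indexes can be appended in a
# single call, and appending an empty list is a no-op.
from pandas import Index

combined = Index([], name='foo').append(Index([1, 2, 3], name='foo'))
assert combined.name == 'foo'
assert Index([], name='foo').append(Index([1, 2, 3], name='bar')).name is None

idx = Index(['a', 'b', 'c', 'd', 'e', 'f'])
pieces = [idx[:2], idx[2:4], idx[4:]]
assert pieces[0].append(pieces[1:]).equals(idx)
assert idx.append([]).equals(idx)
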
def get_chunk(self, rows=None):
    if rows is not None and self.skip_footer:
        print "skip_footer not supported for iteration"

    try:
        content = self._get_lines(rows)
    except StopIteration:
        if self._first_chunk:
            content = []
        else:
            raise

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    if len(content) == 0:  # pragma: no cover
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = Index([], name=self.index_name)
            else:
                index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                               names=self.index_name)
        else:
            index = Index([])
        return DataFrame(index=index, columns=self.columns)

    zipped_content = list(lib.to_object_array(content).T)

    # no index column specified, so infer that's what is wanted
    if self.index_col is not None:
        if np.isscalar(self.index_col):
            index = zipped_content.pop(self.index_col)
        else:  # given a list of index
            index = []
            for idx in self.index_col:
                index.append(zipped_content[idx])
            # remove index items from content and columns, don't pop in loop
            for i in reversed(sorted(self.index_col)):
                zipped_content.pop(i)

        if np.isscalar(self.index_col):
            if self.parse_dates:
                index = lib.try_parse_dates(index, parser=self.date_parser)
            index = Index(_convert_types(index, self.na_values),
                          name=self.index_name)
        else:
            arrays = []
            for arr in index:
                if self.parse_dates:
                    arr = lib.try_parse_dates(arr, parser=self.date_parser)
                arrays.append(_convert_types(arr, self.na_values))
            index = MultiIndex.from_arrays(arrays, names=self.index_name)
    else:
        index = Index(np.arange(len(content)))

    if not index._verify_integrity():
        dups = index._get_duplicates()
        raise Exception("Index has duplicates: %s" % str(dups))

    if len(self.columns) != len(zipped_content):
        raise Exception("wrong number of columns")

    data = dict((k, v) for k, v in zip(self.columns, zipped_content))

    # apply converters
    for col, f in self.converters.iteritems():
        if isinstance(col, int) and col not in self.columns:
            col = self.columns[col]
        data[col] = np.vectorize(f)(data[col])

    data = _convert_to_ndarrays(data, self.na_values)

    return DataFrame(data=data, columns=self.columns, index=index)

def get_chunk(self, rows=None):
    if rows is not None and self.skip_footer:
        raise ValueError('skip_footer not supported for iteration')

    try:
        content = self._get_lines(rows)
    except StopIteration:
        if self._first_chunk:
            content = []
        else:
            raise

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    if len(content) == 0:  # pragma: no cover
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = Index([], name=self.index_name)
            else:
                index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                               names=self.index_name)
        else:
            index = Index([])
        return DataFrame(index=index, columns=self.columns)

    zipped_content = list(lib.to_object_array(content).T)

    # no index column specified, so infer that's what is wanted
    if self.index_col is not None:
        if np.isscalar(self.index_col):
            index = zipped_content.pop(self.index_col)
        else:  # given a list of index
            index = []
            for idx in self.index_col:
                index.append(zipped_content[idx])
            # remove index items from content and columns, don't pop in loop
            for i in reversed(sorted(self.index_col)):
                zipped_content.pop(i)

        if np.isscalar(self.index_col):
            if self.parse_dates:
                index = lib.try_parse_dates(index, parser=self.date_parser)
            index, na_count = _convert_types(index, self.na_values)
            index = Index(index, name=self.index_name)
            if self.verbose and na_count:
                print 'Found %d NA values in the index' % na_count
        else:
            arrays = []
            for arr in index:
                if self.parse_dates:
                    arr = lib.try_parse_dates(arr, parser=self.date_parser)
                arr, _ = _convert_types(arr, self.na_values)
                arrays.append(arr)
            index = MultiIndex.from_arrays(arrays, names=self.index_name)
    else:
        index = Index(np.arange(len(content)))

    if not index._verify_integrity():
        dups = index.get_duplicates()
        idx_str = 'Index' if not self.implicit_idx else 'Implicit index'
        err_msg = ('%s (columns %s) have duplicate values %s'
                   % (idx_str, self.index_col, str(dups)))
        raise Exception(err_msg)

    if len(self.columns) != len(zipped_content):
        raise Exception('wrong number of columns')

    data = dict((k, v) for k, v in izip(self.columns, zipped_content))

    # apply converters
    for col, f in self.converters.iteritems():
        if isinstance(col, int) and col not in self.columns:
            col = self.columns[col]
        data[col] = lib.map_infer(data[col], f)

    data = _convert_to_ndarrays(data, self.na_values, self.verbose)

    return DataFrame(data=data, columns=self.columns, index=index)

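# Hedged usage sketch: assuming this get_chunk lives on the TextParser
# object behind read_csv in this era of pandas, the per-column converters
# registered at construction are applied above via lib.map_infer.  The
# file name and the 'price' converter are hypothetical.
from pandas import read_csv

parser = read_csv('data.csv', iterator=True,
                  converters={'price': lambda x: float(x.strip('$'))})
chunk = parser.get_chunk(1000)  # DataFrame built from the next 1000 rows
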
def _simple_parser(lines, colNames=None, header=0, index_col=0,
                   na_values=None, date_parser=None, parse_dates=True):
    """
    Workhorse function for processing nested list into DataFrame

    Should be replaced by np.genfromtxt eventually?
    """
    if header is not None:
        columns = []
        for i, c in enumerate(lines[header]):
            if c == '':
                columns.append('Unnamed: %d' % i)
            else:
                columns.append(c)

        content = lines[header+1:]

        # de-duplicate repeated column names by suffixing '.1', '.2', ...
        counts = {}
        for i, col in enumerate(columns):
            cur_count = counts.get(col, 0)
            if cur_count > 0:
                columns[i] = '%s.%d' % (col, cur_count)
            counts[col] = cur_count + 1
    else:
        ncols = len(lines[0])
        if not colNames:
            columns = ['X.%d' % (i + 1) for i in range(ncols)]
        else:
            assert(len(colNames) == ncols)
            columns = colNames
        content = lines

    if len(content) == 0:  # pragma: no cover
        if index_col is not None:
            if np.isscalar(index_col):
                index = Index([], name=columns.pop(index_col))
            else:
                cp_cols = list(columns)
                names = []
                for i in index_col:
                    name = cp_cols[i]
                    columns.remove(name)
                    names.append(name)
                index = MultiIndex.from_arrays([[]] * len(index_col),
                                               names=names)
        else:
            index = Index([])

        return DataFrame(index=index, columns=columns)

    # common NA values
    # no longer excluding inf representations
    # '1.#INF','-1.#INF', '1.#INF000000',
    NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
                     '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN',
                     'nan', ''])
    if na_values is None:
        na_values = NA_VALUES
    else:
        na_values = set(list(na_values)) | NA_VALUES

    zipped_content = list(lib.to_object_array(content).T)

    if index_col is None and len(content[0]) == len(columns) + 1:
        index_col = 0

    # no index column specified, so infer that's what is wanted
    if index_col is not None:
        if np.isscalar(index_col):
            index = zipped_content.pop(index_col)
            if len(content[0]) == len(columns) + 1:
                name = None
            else:
                name = columns.pop(index_col)
        else:  # given a list of index
            idx_names = []
            index = []
            for idx in index_col:
                idx_names.append(columns[idx])
                index.append(zipped_content[idx])

            # remove index items from content and columns, don't pop in loop
            for i in range(len(index_col)):
                columns.remove(idx_names[i])
                zipped_content.remove(index[i])

        if np.isscalar(index_col):
            if parse_dates:
                index = lib.try_parse_dates(index, parser=date_parser)
            index = Index(_convert_types(index, na_values), name=name)
        else:
            arrays = _maybe_convert_int_mindex(index, parse_dates,
                                               date_parser)
            index = MultiIndex.from_arrays(arrays, names=idx_names)
    else:
        index = Index(np.arange(len(content)))

    if not index._verify_integrity():
        dups = index._get_duplicates()
        raise Exception('Index has duplicates: %s' % str(dups))

    if len(columns) != len(zipped_content):
        raise Exception('wrong number of columns')

    data = dict((k, v) for k, v in zip(columns, zipped_content))
    data = _convert_to_ndarrays(data, na_values)

    return DataFrame(data=data, columns=columns, index=index)

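# Illustrative call, assuming _simple_parser and its helpers (lib,
# _convert_types, _convert_to_ndarrays) are importable from this module:
# rows arrive already split into lists of strings, row `header` supplies
# the column names, and the default index_col=0 pops the first column
# (date-parsed when parse_dates=True) to use as the index.
rows = [['date', 'A', 'B'],
        ['2011-01-01', '1', '2'],
        ['2011-01-02', '3', '4']]
df = _simple_parser(rows)  # index: parsed dates; columns: ['A', 'B']
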
def get_chunk(self, rows=None):
    if rows is not None and self.skip_footer:
        raise ValueError('skip_footer not supported for iteration')

    try:
        content = self._get_lines(rows)
    except StopIteration:
        if self._first_chunk:
            content = []
        else:
            raise

    # done with first read, next time raise StopIteration
    self._first_chunk = False

    if len(content) == 0:  # pragma: no cover
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = Index([], name=self.index_name)
            else:
                index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                               names=self.index_name)
        else:
            index = Index([])
        return DataFrame(index=index, columns=self.columns)

    zipped_content = list(lib.to_object_array(content).T)

    if self.index_col is not None:
        if np.isscalar(self.index_col):
            index = zipped_content.pop(self.index_col)
        else:  # given a list of index
            index = []
            for idx in self.index_col:
                index.append(zipped_content[idx])
            # remove index items from content and columns, don't pop in
            # loop
            for i in reversed(sorted(self.index_col)):
                zipped_content.pop(i)

        if np.isscalar(self.index_col):
            if self._should_parse_dates(self.index_col):
                index = self._conv_date(index)
            index, na_count = _convert_types(index, self.na_values)
            index = Index(index, name=self.index_name)
            if self.verbose and na_count:
                print 'Found %d NA values in the index' % na_count
        else:
            arrays = []
            for i, arr in enumerate(index):
                if self._should_parse_dates(self.index_col[i]):
                    arr = self._conv_date(arr)
                arr, _ = _convert_types(arr, self.na_values)
                arrays.append(arr)
            index = MultiIndex.from_arrays(arrays, names=self.index_name)
    else:
        index = Index(np.arange(len(content)))

    # if not index.is_unique:
    #     dups = index.get_duplicates()
    #     idx_str = 'Index' if not self._implicit_index else 'Implicit index'
    #     err_msg = ('%s (columns %s) have duplicate values %s'
    #                % (idx_str, self.index_col, str(dups)))
    #     raise Exception(err_msg)

    col_len, zip_len = len(self.columns), len(zipped_content)
    if col_len != zip_len:
        row_num = -1
        for (i, l) in enumerate(content):
            if len(l) != col_len:
                break

        footers = 0
        if self.skip_footer:
            footers = self.skip_footer
        row_num = self.pos - (len(content) - i + footers)

        msg = ('Expecting %d columns, got %d in row %d' %
               (col_len, zip_len, row_num))
        raise ValueError(msg)

    data = dict((k, v) for k, v in izip(self.columns, zipped_content))

    # apply converters
    for col, f in self.converters.iteritems():
        if isinstance(col, int) and col not in self.columns:
            col = self.columns[col]
        data[col] = lib.map_infer(data[col], f)

    columns = self.columns
    if self.parse_dates is not None:
        data, columns = self._process_date_conversion(data)

    data = _convert_to_ndarrays(data, self.na_values, self.verbose)

    return DataFrame(data=data, columns=columns, index=index)

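# Hedged sketch of chunked reading built on the StopIteration contract
# above: once _get_lines is exhausted, get_chunk re-raises StopIteration
# and the loop ends.  The file name and chunk size are hypothetical, and
# this assumes read_csv(..., iterator=True) returns the parser object
# that owns this method.
from pandas import read_csv

reader = read_csv('big.csv', iterator=True)
total_rows = 0
while True:
    try:
        chunk = reader.get_chunk(5000)
    except StopIteration:
        break
    total_rows += len(chunk)
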