Beispiel #1
0
 def _append_new_column(self, name, value):
     '''
         Append `value` as a new column called `name`.

         `value` is first converted with `self._create_array`. The
         resulting column must contain exactly `self._nrow` elements;
         otherwise a ValueError is raised before anything is stored.
         `name` must be a string that is not already in `self._names`
         (both enforced with assertions).
     '''
     assert is_string(name)
     assert name not in self._names
     column = self._create_array(value)
     if len(column) != self._nrow:
         template = 'value does not have match existing number of rows = {}'
         raise ValueError(template.format(self._nrow))
     # FIXME: Override Array.append by checking for type
     self._names.extend(Array([name]))
     self._data.extend(Array([column]))
     # Bring the derived bookkeeping in line with the new column.
     self._update_nrow_ncol()
     self._update_names_to_index()
Beispiel #2
0
 def names(self, value):
     '''
         Replace all column names with `value`.

         Raises ValueError when `get_length(value)` differs from the
         number of current names, or when `is_iterable_string(value)`
         is false. The length check runs first.
     '''
     if len(self._names) != get_length(value):
         raise ValueError('number of names must match the number of columns')
     if not is_iterable_string(value):
         raise ValueError('non string names are not allowed')
     self._names = Array(value)
     self._update_names_to_index()
Beispiel #3
0
def which(array, ignore_missing=False):
    '''
        Return the positions of the truthy elements of a logical Array.

        Args
        -----
        array (Array): must have `dtype` equal to `bool`.
        ignore_missing (bool): when False (the default), any None element
            causes an IndexError. When True, None elements are simply
            skipped because they are falsy.

        Returns
        --------
        An Array of integer positions.
    '''
    assert isinstance(array, Array)
    if array.dtype is not bool:
        raise TypeError('array must be logical (dtype = bool)')
    if not ignore_missing and any(is_na(array)):
        raise IndexError('logical array contains missing values (None)')
    return Array([pos for pos, flag in enumerate(array) if flag])
Beispiel #4
0
    def _init_from_dict(self, data):
        '''
            Populate the frame from a mapping of column name to value.

            When the values are all scalars or all non-scalars (as judged
            by `is_scalar`), each value is passed to `Array` as is. When
            they are mixed, every scalar is repeated to the common length
            of the non-scalar values.

            Raises
            -------
            ValueError: the non-scalar values differ in length, the keys
                are not all strings, or the final columns differ in
                length.
            InternalError: a mixed input yielded no non-scalar length.
        '''
        values = list(data.values())
        scalar_flags = [is_scalar(value) for value in values]

        if any(scalar_flags) and not all(scalar_flags):
            # Mixed input: build the non-scalar columns first so their
            # common length is known before the scalars are expanded.
            columns = [None] * len(values)
            seen_lengths = set()
            for pos, value in enumerate(values):
                if scalar_flags[pos]:
                    continue
                columns[pos] = Array(value)
                seen_lengths.add(len(columns[pos]))

            if len(seen_lengths) > 1:
                raise ValueError('columns do not have the same length')
            if not seen_lengths:
                raise InternalError('you found a bug, please report it')
            (target_length,) = seen_lengths

            # Expand each scalar into a column of the common length.
            for pos, value in enumerate(values):
                if scalar_flags[pos]:
                    columns[pos] = Array([value] * target_length)
        else:
            # Uniform input (all scalars or all non-scalars): every
            # value goes through Array unchanged.
            columns = [Array(value) for value in values]

        # Only string keys may be used as column names.
        if not is_iterable_string(data.keys()):
            raise ValueError('non string names are not allowed')
        column_names = data.keys()

        # Final guard: every column must have the same length.
        if not is_list_same([len(column) for column in columns]):
            raise ValueError('columns do not have the same lengths')

        self._data = Array(columns)
        self._names = Array(column_names)

        # Refresh the derived fields.
        self._update_nrow_ncol()
        self._update_names_to_index()
Beispiel #5
0
 def groupby(self, names=None):
     '''
         Compute a logical row selector for every group key.

         Args
         -----
         names: forwarded unchanged to `self._get_groupby_dict`.

         Returns
         --------
         The dictionary returned by `self._get_groupby_dict`, with the
         value of every key replaced by an Array of `self._nrow`
         booleans. A row stays selected only if, for each selected
         column, `_eqnone` reports a match against the corresponding
         element of the key (presumably an equality test that also
         handles None; confirm against `Array._eqnone`).
     '''
     groupby_dict, selected_columns = self._get_groupby_dict(names)
     # Iterate over a snapshot of the keys: the values are reassigned
     # in the loop, and `iterkeys` does not exist on Python 3.
     for row in list(groupby_dict):
         selected_rows = Array([True] * self._nrow)
         for i, e in zip(selected_columns, row):
             if not any(selected_rows):
                 # Nothing is selected any more, so the remaining
                 # columns cannot change the outcome.
                 break
             # Test only the rows that are still selected, then narrow
             # the selection to those that also match in this column.
             tmp = self._data[i][selected_rows]._eqnone(e)
             selected_rows[selected_rows] = \
                 selected_rows[selected_rows] & tmp
         groupby_dict[row] = selected_rows
     return groupby_dict
Beispiel #6
0
    def rename(self, rename_dict):
        '''
            Rename the columns of the DataFrame object. Renaming happens
            in place.

            Args
            -----
            rename_dict (dict): a dictionary of the form
                {'existing_column_name': 'new_column_name', ... }. Keys of
                `rename_dict` are the existing column names. Values of
                `rename_dict` are the intended new column names.

            Returns
            --------
            Nothing. Renaming happens in place.

            Raises
            -------
            ValueError: a new name is not a string, or the renaming
                would produce duplicate column names. The stored names
                are left untouched in both cases.
            InternalError: the name-to-index mapping disagrees with the
                names after the update.
        '''
        assert isinstance(rename_dict, dict)
        # Work on a copy so a rejected renaming leaves self._names as is.
        updated_names = Array(self._names)
        for current, new in rename_dict.items():
            # FIXME: This will fail when some names are unicode but
            # others are not.
            updated_names[self._names_to_index[current]] = new
        if not is_iterable_string(updated_names):
            msg = 'non string names are not allowed'
            raise ValueError(msg)
        if not is_iterable_unique(updated_names):
            msg = 'renaming cannot create duplicate names'
            raise ValueError(msg)
        self._names = updated_names
        self._update_names_to_index()
        if set(self._names_to_index.keys()) != set(self._names):
            # Adjacent literals are concatenated into a single string;
            # a comma between them would build a tuple instead.
            msg = ('renaming violated internal consistency '
                   'this is a bug, please report it')
            raise InternalError(msg)
Beispiel #7
0
 def _create_array(self, value):
     '''
         Convert `value` into an Array.

         A scalar (as judged by `is_scalar`) is repeated `self._nrow`
         times; any other value is handed to `Array` unchanged.
     '''
     if not is_scalar(value):
         return Array(value)
     repeated = [value] * self._nrow
     return Array(repeated)
Beispiel #8
0
 def __getitem__(self, key):
     '''
         Select columns, or rows and columns, of the DataFrame.

         Args
         -----
         key: one of
             * int: position of a single column.
             * string: name of a single column.
             * slice, or non-tuple iterable of positions or of names:
               a selection of several columns.
             * 2-tuple `(rowkey, colkey)`: `colkey` is any of the
               above and `rowkey` is used to index each selected
               column.

         Returns
         --------
         For an int or string column key, the result of indexing
         `self._data` (and then that column by `rowkey`, if given).
         For a slice or iterable column key, a new object of the same
         type as `self`, built from the selected data and names.

         Raises
         -------
         KeyError: float or bool keys, a tuple whose length is not 2,
             a selection containing duplicate column names, or any
             other unsupported key.
     '''
     if is_float(key):
         msg = 'float index is not supported; please cast to int'
         raise KeyError(msg)
     if is_bool(key):
         msg = 'logical indexing must provide a list of full length'
         raise KeyError(msg)
     if is_integer(key):
         return self._data[key]
     if is_string(key):
         return self._data[self._names_to_index[key]]
     if isinstance(key, slice):
         return type(self)(_DataFrameSlice(self._data[key],
                                           self._names[key]))
     if isinstance(key, Iterable) and not isinstance(key, tuple):
         if is_iterable_string(key):
             # Translate column names into positions.
             key = [self._names_to_index[k] for k in key]
         if not is_iterable_unique(self._names[key]):
             msg = 'duplicate column names found'
             raise KeyError(msg)
         return type(self)(_DataFrameSlice(self._data[key],
                                           self._names[key]))
     if isinstance(key, tuple):
         # Dual Indexing. Select both rows and columns.
         if len(key) != 2:
             msg = 'tuple indexing must have exactly 2 elements'
             raise KeyError(msg)
         rowkey = key[0]
         colkey = key[1]
         if is_float(colkey):
             msg = ('float column index is not supported; '
                    'please cast to int')
             raise KeyError(msg)
         if is_bool(colkey):
             msg = ('logical column indexing must provide a '
                    'list of full length')
             raise KeyError(msg)
         if is_integer(colkey):
             return self._data[colkey][rowkey]
         if is_string(colkey):
             return self._data[self._names_to_index[colkey]][rowkey]
         if isinstance(colkey, (slice, Iterable)):
             if isinstance(colkey, Iterable) and is_iterable_string(colkey):
                 # Translate column names into positions.
                 colkey = [self._names_to_index[k] for k in colkey]
             _names = self._names[colkey]
             if not is_iterable_unique(_names):
                 msg = 'duplicate column names found'
                 raise KeyError(msg)
             if is_integer(rowkey):
                 # Wrap a single row position in a list before indexing
                 # the columns (presumably so each column yields a
                 # one-element selection, not a bare value).
                 rowkey = [rowkey]
             _data = Array(
                 [column[rowkey] for column in self._data[colkey]])
             return type(self)(_DataFrameSlice(_data, _names))
         # Catchall for all other column addresses
         msg = ('column address must be int, string, slice,'
                ' or iterable')
         raise KeyError(msg)
     # Catchall for all other addresses. Every Iterable key has already
     # been handled above (tuples and non-tuples alike), so no further
     # Iterable fallback is needed here.
     msg = 'address must be int, string, list, slice, or a 2-tuple'
     raise KeyError(msg)
Beispiel #9
0
 def keys(self):
     '''
         Return the column names wrapped in a fresh Array.
     '''
     # The caller gets its own Array, not self._names, to avoid
     # accidental changes; Array (not list) keeps array functions usable.
     names_copy = Array(self._names)
     return names_copy
Beispiel #10
0
 def dtypes(self):
     '''
         Return an Array with the `dtype` of every column, in the
         order the columns appear in `self._data`.
     '''
     collected = []
     for column in self._data:
         collected.append(column.dtype)
     return Array(collected)
Beispiel #11
0
def is_na(array):
    '''
        Return an Array of booleans that is True exactly where the
        corresponding element of `array` is None.
    '''
    assert isinstance(array, Array)
    missing_flags = []
    for element in array:
        missing_flags.append(element is None)
    return Array(missing_flags)
Beispiel #12
0
def unique(array):
    '''
        Return an Array of the distinct elements of `array`.

        The elements pass through a `set`, so they must be hashable and
        their original order is not guaranteed to be kept.
    '''
    distinct = set(array)
    return Array(list(distinct))