Example #1
 def row_to_doc(obj):
     # yield one document per group, keyed by the groupby columns
     for gval, gdf in obj.groupby(groupby):
         # normalize the group value(s); numpy values are converted
         # to native python objects first
         if hasattr(gval, 'astype'):
             gval = make_tuple(gval.astype('O'))
         else:
             gval = make_tuple(gval)
         doc = dict(zip(groupby, gval))
         # all non-groupby columns become the group's _data records
         datacols = list(set(gdf.columns) - set(groupby))
         doc['_data'] = gdf[datacols].astype('O').to_dict('records')
         yield doc
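
All of the examples on this page funnel their arguments through make_tuple. Its implementation is not shown here; the following is a minimal sketch of the normalization behavior the snippets rely on, assuming it only needs to turn a scalar or a sequence into a tuple (the real helper may also cover numpy arrays and other cases):

# minimal sketch of a make_tuple-style helper (an assumption, not the
# project's actual implementation)
def make_tuple(value):
    if isinstance(value, (list, tuple)):
        return tuple(value)
    return (value,)

assert make_tuple('city') == ('city',)
assert make_tuple(['city', 'year']) == ('city', 'year')
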
Example #2
    def __setitem__(self, sel, val):
        """
        add a projection to a sub context

        ctx['col'] = value-expression
        """
        mapping = {
            col: v
            for (col, v) in zip(make_tuple(sel), make_tuple(val))
        }
        self.project(mapping)
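
Because both the selector and the value pass through make_tuple and zip, a single-column assignment and a multi-column assignment produce the same kind of mapping. A small illustration with hypothetical expressions:

# hypothetical assignments and the mapping each passes to self.project():
#   ctx['total'] = {'$add': ['$a', '$b']}  ->  {'total': {'$add': ['$a', '$b']}}
#   ctx['x', 'y'] = ('$a', '$b')           ->  {'x': '$a', 'y': '$b'}
mapping = dict(zip(('x', 'y'), ('$a', '$b')))
assert mapping == {'x': '$a', 'y': '$b'}
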
Example #3
 def _get_cursor(self):
     projection = make_tuple(self.columns)
     projection += make_tuple(self._get_frame_index())
     if not self.sort_order:
         # implicit sort
         projection += make_tuple(self._get_frame_om_fields())
     cursor = self.collection.find(projection=projection)
     if self.sort_order:
         cursor.sort(qops.make_sortkey(make_tuple(self.sort_order)))
     if self.head_limit:
         cursor.limit(self.head_limit)
     if self.skip_topn:
         cursor.skip(self.skip_topn)
     return cursor
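
The cursor built here is ordinary PyMongo; roughly the equivalent direct calls, with a hypothetical collection and column names standing in for self.collection and the frame's columns plus its index fields:

from pymongo import MongoClient

coll = MongoClient()['testdb']['sales']            # hypothetical database/collection
cursor = coll.find(projection=['city', 'amount'])  # columns + frame index fields
cursor = cursor.sort([('city', 1)])                # only applied when sort_order is set
cursor = cursor.limit(10)                          # head_limit
cursor = cursor.skip(5)                            # skip_topn
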
Example #4
    def value(self):
        """
        return the value of the series

        this is a Series unless unique() was called; in that case
        only the distinct values are returned as an array, matching
        the behavior of pandas.Series.unique()

        :return: pandas.Series
        """
        cursor = self._get_cursor()
        column = make_tuple(self.columns)[0]
        if self.is_unique:
            # the .distinct() cursor returns a list of values
            # this is to make sure we return the same thing as pandas
            val = [v for v in cursor]
        else:
            val = self._get_dataframe_from_cursor(cursor)
            val = val[column]
            val.name = self.name
            if len(val) == 1 and self.from_loc_indexer:
                val = val.iloc[0]
        if self.auto_inspect:
            self._inspect_cache.append(self.inspect(explain=True, cursor=cursor, raw=True))
        if self._preparefn:
            val = self._preparefn(val)
        return val
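
The unique branch mirrors pandas, where .unique() returns a plain array rather than a Series; a pandas-only comparison of the two return shapes:

import pandas as pd

s = pd.Series(['a', 'b', 'a'], name='col')
unique_values = s.unique()   # array(['a', 'b'], dtype=object), like the unique branch
series_value = s             # a pandas.Series, like the non-unique branch
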
Example #5
    def groupby(self, by, expr=None, append=None, **kwargs):
        """
        add a groupby accumulation using $group

        :param by: the groupby column or list of columns; normalized to a
                   tuple via make_tuple
        :param expr: optional dict of accumulator expressions per output column
        :param append:
        :param kwargs: accumulator expressions given as keyword arguments,
                       used when expr is not provided
        :return: self

        """
        by = make_tuple(by)
        self.index_columns = self.index_columns + list(by)
        # define groupby
        by = {col: '$' + col for col in by}
        stage = self._getGroupBy(by)
        groupby = stage['$group']
        # add accumulators
        expr = expr or {col: colExpr for col, colExpr in six.iteritems(kwargs)}
        groupby.update(expr)
        # add a projection to extract groupby values
        extractId = {col: '$_id.' + col for col in by}
        # add a projection to keep accumulator columns
        keepCols = {col: 1 for col in expr}
        keepCols.update(extractId)
        self.project(keepCols, append=True)
        # sort by groupby keys
        self.add({'$sort': {col: 1 for col in by}})
        return self
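
For illustration, a call such as groupby(['city'], total={'$sum': '$amount'}) (hypothetical column names) assembles stages along these lines, assuming _getGroupBy creates a $group stage keyed by the given columns:

# hypothetical stages produced by groupby(['city'], total={'$sum': '$amount'})
stages = [
    # $group keyed by the groupby columns, extended with the accumulator
    {'$group': {'_id': {'city': '$city'},
                'total': {'$sum': '$amount'}}},
    # projection that keeps accumulators and extracts groupby values from _id
    {'$project': {'total': 1, 'city': '$_id.city'}},
    # sort by the groupby keys
    {'$sort': {'city': 1}},
]
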
Example #6
 def inner(self, other, *args):
     # get all values passed and build terms from them
     values = list(make_tuple(other) + args)
     terms = []
     for term in values:
         if isinstance(term, six.string_types):
             # if the term names a column, reference it as a field ($<column>)
             if term in self.columns:
                 term = '$' + term
             # allow specifying literal values explicitly via $$<value> => <value>
             term = term.replace('$$', '')
         terms.append(term)
     # limit number of terms if requested
     if max_terms:
         terms = terms[:max_terms]
     # add projection of output columns to operator
     mapping = {
         col: {
             op: terms if base is None else ['$' + col] + terms,
         }
         for col in self.columns
     }
     self.project(mapping)
     # unwind all columns if requested
     if unwind:
         exprs = [{
             '$unwind': {
                 'path': '$' + col
             }
         } for col in self.columns]
         self.stages.extend(exprs)
     return self
Example #7
 def _get_cursor(self):
     if self.is_unique:
         # this way indexes get applied
         cursor = self.collection.distinct(make_tuple(self.columns)[0])
     else:
         cursor = super(MSeries, self)._get_cursor()
     return cursor
Example #8
 def inner(self, columns=None):
     columns = make_tuple(columns or self.columns)
     mapping = {col: {
         op: '$' + col,
     }
                for col in columns}
     self.project(mapping)
     return self
Example #9
 def __getitem__(self, sel):
     """
     return a stage subset on a column
     """
     subctx = ApplyContext(self.caller,
                           columns=make_tuple(sel),
                           index=self.index_columns)
     self.add(subctx)
     return subctx
Example #10
 def __init__(self,
              collection,
              columns=None,
              query=None,
              limit=None,
              skip=None,
              sort_order=None,
              force_columns=None,
              immediate_loc=False,
              auto_inspect=False,
              preparefn=None,
              **kwargs):
     self.collection = PickableCollection(collection)
     # columns in frame
     self.columns = make_tuple(columns) if columns else self._get_fields()
     self.columns = [str(col) for col in self.columns]
     # columns to sort by, defaults to not sorted
     self.sort_order = sort_order
     # top n documents to fetch
     self.head_limit = limit
     # top n documents to skip before returning
     self.skip_topn = skip
     # filter criteria
     self.filter_criteria = query or {}
     # force columns -- on output add columns not present
     self.force_columns = force_columns or []
     # was this created from the loc indexer?
     self.from_loc_indexer = kwargs.get('from_loc_indexer', False)
     # was the loc index used a range? Else a single value
     self.from_loc_range = None
     # set up the query for filter criteria, if provided
     if self.filter_criteria:
         # make sure we have a filtered collection with the criteria given
         if isinstance(self.filter_criteria, dict):
             self.query_inplace(**self.filter_criteria)
         elif isinstance(self.filter_criteria, Filter):
             self.query_inplace(self.filter_criteria)
         else:
             raise ValueError(
                 'Invalid query specification of type {}'.format(
                     type(self.filter_criteria)))
     # if immediate_loc is True, .loc and .iloc always evaluate
     self.immediate_loc = immediate_loc
     # __array__ returns this value if set, otherwise it evaluates and caches it here
     self._evaluated = None
     # set to True to automatically capture inspect output on .value; retrieve it using .inspect(cached=True)
     self.auto_inspect = auto_inspect
     self._inspect_cache = INSPECT_CACHE
     # apply mixins
     self._applyto = str(self.__class__)
     self._apply_mixins()
     # prepare function to be applied just before returning from .value
     self._preparefn = preparefn
Example #11
        def inner(self, columns=None):
            columns = make_tuple(columns or self.columns)
            mapping = {col: {
                op: '$' + col,
            }
                       for col in columns}
            self.project(mapping)
            if unwind:
                # unwind each projected column so array results
                # become individual documents
                self.stages.extend({'$unwind': {'path': '$' + col}}
                                   for col in columns)
            return self

        inner.__doc__ = op.replace('$', '')
Example #12
 def inner(self, columns=None):
     columns = make_tuple(columns or self.columns)
     stage = self._getGroupBy(by='$$last')
     groupby = stage['$group']
     groupby.update({
         '{}_{}'.format(col, opname): {
             op: '$' + col
         }
         for col in columns
     })
     self.computed.extend(groupby.keys())
     self.project_keeper_columns()
     return self
Example #13
 def _getcopy_kwargs(self, without=None):
     """ return all parameters required on a copy of this MDataFrame """
     kwargs = dict(columns=self.columns,
                   sort_order=self.sort_order,
                   limit=self.head_limit,
                   skip=self.skip_topn,
                   from_loc_indexer=self.from_loc_indexer,
                   immediate_loc=self.immediate_loc,
                   query=self.filter_criteria,
                   auto_inspect=self.auto_inspect,
                   preparefn=self._preparefn)
     # remove any parameters explicitly excluded from the copy
     for k in make_tuple(without or []):
         kwargs.pop(k)
     return kwargs
Example #14
    def sort(self, columns):
        """
        sort by specified columns

        :param columns: a single column name (str) or a list of columns. Sort order
                        is specified as the + (ascending) or - (descending)
                        prefix to the column name. Default sort order is
                        ascending.
        :return: the MDataFrame
        """
        self._evaluated = None
        self.sort_order = make_tuple(columns)
        return self
Example #15
    def make_index(self, columns, **kwargs):
        """
        return an index specification suitable for collection.create_index()

        using column specs like ['+A', '-B'] returns (key, direction)
        pairs suitable for passing on to create_index. Also generates
        a name for the index based on the columns and ordering. Use
        '@coord' to create a geospatial index; the coord column must
        be in GeoJSON format.

        :param columns: a single index column, or a list of columns
        :param kwargs: optional kwargs to merge. If kwargs contains the
        'name' key it will be preserved
        :return: an (idx, kwargs) tuple; pass as create_index(idx, **kwargs)
        """
        SORTPREFIX = ['-', '+', '@']
        DIRECTIONMAP = {
            '-': pymongo.DESCENDING,
            '+': pymongo.ASCENDING,
            '@': pymongo.GEOSPHERE,
            'default': pymongo.ASCENDING,
        }
        columns = make_tuple(columns)
        direction_default = DIRECTIONMAP.get('default')
        sort_cols = [
            '+' + col if col[0] not in SORTPREFIX else col for col in columns
        ]

        # get sort kwargs
        def direction(col):
            return DIRECTIONMAP.get(col[0], direction_default)

        idx = [(col.replace('+', '').replace('-',
                                             '').replace('@',
                                                         ''), direction(col))
               for col in sort_cols]
        name = '__'.join([
            (col.replace('-', 'desc_').replace('+',
                                               'asc_').replace('@', 'geo_'))
            for col in sort_cols
        ])
        kwargs.setdefault('name', name)
        return idx, kwargs
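
A standalone re-derivation of the mapping applied above, using hypothetical column specs ['+A', '-B', '@coord']:

import pymongo

# mirror make_index's prefix handling for hypothetical columns
direction = {'+': pymongo.ASCENDING, '-': pymongo.DESCENDING, '@': pymongo.GEOSPHERE}
sort_cols = ['+A', '-B', '@coord']
idx = [(col[1:], direction[col[0]]) for col in sort_cols]
name = '__'.join(col.replace('-', 'desc_').replace('+', 'asc_').replace('@', 'geo_')
                 for col in sort_cols)
# idx  == [('A', 1), ('B', -1), ('coord', '2dsphere')]
# name == 'asc_A__desc_B__geo_coord'
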
Example #16
    def aggregate(self, specs, **kwargs):
        """
        aggregate by given specs

        See the following link for a list of supported operations.
        https://docs.mongodb.com/manual/reference/operator/aggregation/group/

        :param specs: a dictionary of { column : stat | list[stats] } pairs,
           where each stat is an aggregation name such as 'mean' or 'sum'
        :return: the aggregated data as a pandas.DataFrame
        """

        def add_stats(specs, column, stat):
            specs['%s_%s' % (column, stat)] = {
                '$%s' % MGrouper.STATS_MAP.get(stat, stat): '$%s' % column}

        # generate $group command
        _specs = {}
        for column, stats in six.iteritems(specs):
            stats = make_tuple(stats)
            for stat in stats:
                add_stats(_specs, column, stat)
        groupby = qops.GROUP(columns=self.columns,
                             **_specs)
        # execute and return a dataframe
        pipeline = self._amend_pipeline([groupby])
        data = self.collection.aggregate(pipeline, allowDiskUse=True)

        def get_data():
            # flatten each group's _id back into the record so that
            # from_records can build proper columns from the cursor
            for group in data:
                _id = group.pop('_id')
                if isinstance(_id, dict):
                    group.update(_id)
                yield group

        df = pd.DataFrame.from_records(get_data())
        columns = make_list(self.columns)
        if columns:
            df = df.set_index(columns, drop=True)
        return df
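
For example, assuming MGrouper.STATS_MAP translates pandas-style names such as 'mean' into MongoDB accumulator names such as 'avg', a call like aggregate({'amount': ['mean', 'sum']}) (hypothetical column) builds a $group spec along these lines:

# hypothetical stand-in for MGrouper.STATS_MAP; the real mapping is not shown here
STATS_MAP = {'mean': 'avg', 'std': 'stdDevSamp'}

specs = {'amount': ['mean', 'sum']}
_specs = {}
for column, stats in specs.items():
    for stat in stats:
        _specs['%s_%s' % (column, stat)] = {
            '$%s' % STATS_MAP.get(stat, stat): '$%s' % column}
assert _specs == {'amount_mean': {'$avg': '$amount'},
                  'amount_sum': {'$sum': '$amount'}}
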
Example #17
        def inner(self, other):
            terms = []
            for term in make_tuple(other):
                if isinstance(term, six.string_types):
                    term = '$' + term
                terms.append(term)

            def wrap(expr):
                if wrap_op is not None:
                    expr = {wrap_op: expr}
                return expr

            mapping = {
                col: wrap({
                    op: ['$' + col] + terms,
                })
                for col in self.columns
            }
            keepCols = {col: '$' + col for col in self.index_columns}
            mapping.update(keepCols)
            self.project(mapping)
            return self
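
To illustrate the shape of the projection: with op='$add', wrap_op=None, other='x', columns ('a', 'b') and a single index column '_idx' (all hypothetical), the mapping passed to self.project() would be:

# hypothetical mapping for op='$add', other='x', columns=('a', 'b'),
# index_columns=('_idx',), wrap_op=None
mapping = {
    'a': {'$add': ['$a', '$x']},
    'b': {'$add': ['$b', '$x']},
    '_idx': '$_idx',   # index columns are carried along unchanged
}
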
Example #18
 def _get_filter(self, specs):
     filterq = []
     projection = []
     if self.positional:
         idx_cols = ['_om#rowid']
     else:
         idx_cols = self.mdataframe._get_frame_index()
     flt_kwargs = {}
     enumerable_types = (list, tuple, np.ndarray)
     if isinstance(specs, np.ndarray):
         specs = specs.tolist()
     if (isinstance(specs, enumerable_types)
             and isscalar(specs[0]) and len(idx_cols) == 1
             and not any(isinstance(s, slice) for s in specs)):
         # single column index with list of scalar values
         if (self.positional and isinstance(specs, tuple) and len(specs) == 2
                 and all(isscalar(v) for v in specs)):
             # iloc[int, int] is a cell access
             flt_kwargs[idx_cols[0]] = specs[0]
             projection.extend(self._get_projection(specs[1]))
         else:
             flt_kwargs['{}__in'.format(idx_cols[0])] = specs
             self._from_range = True
     elif isinstance(specs, (int, str)):
         flt_kwargs[idx_cols[0]] = specs
     else:
         specs = make_tuple(specs)
         # list/tuple of slices or scalar values, or MultiIndex
         for i, spec in enumerate(specs):
             if i < len(idx_cols):
                 col = idx_cols[i]
                 if isinstance(spec, slice):
                     self._from_range = True
                     start, stop = spec.start, spec.stop
                     if start is not None:
                         flt_kwargs['{}__gte'.format(col)] = start
                     if stop is not None:
                         if isinstance(stop, int):
                             stop -= int(self.positional)
                         flt_kwargs['{}__lte'.format(col)] = stop
                 elif isinstance(spec, enumerable_types) and isscalar(spec[0]):
                     self._from_range = True
                     # single column index with list of scalar values
                     # -- convert to list for PyMongo serialization
                     if isinstance(spec, np.ndarray):
                         spec = spec.tolist()
                     flt_kwargs['{}__in'.format(col)] = spec
                 elif isscalar(col):
                     flt_kwargs[col] = spec
             else:
                 # no more index columns; treat remaining specs as column projections
                 projection.extend(self._get_projection(spec))
     if flt_kwargs:
         filterq.append(MongoQ(**flt_kwargs))
     finalq = None
     for q in filterq:
         if finalq:
             finalq |= q
         else:
             finalq = q
     return finalq, projection
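
As a concrete illustration of the slice branch: for positional access on the row id index, a spec like slice(10, 20) turns into the following filter kwargs before being wrapped in a MongoQ (hypothetical values):

# hypothetical translation of specs=slice(10, 20) with idx_cols == ['_om#rowid']
flt_kwargs = {
    '_om#rowid__gte': 10,   # spec.start
    '_om#rowid__lte': 19,   # spec.stop - 1, since positional slices are end-exclusive
}
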
Example #19
 def __init__(self, mdataframe, collection, columns, sort=True):
     self.mdataframe = mdataframe
     self.collection = collection
     self.columns = make_tuple(columns)
     self.should_sort = sort
Example #20
 def _as_mseries(self, column):
     kwargs = self._getcopy_kwargs()
     kwargs.update(columns=make_tuple(column))
     return MSeries(self.collection, **kwargs)
Example #21
 def set_index(self, columns):
     self.index_columns = make_tuple(columns)
     return self
Example #22
 def PROJECT(self, fields, include=True):
     fields = make_tuple(fields)
     return {'$project': {key: 1 if include else 0 for key in fields}}
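
The returned stage is a plain $project document; for hypothetical fields ['city', 'amount'] it comes out as:

# hypothetical result of PROJECT(['city', 'amount'])
stage = {'$project': {'city': 1, 'amount': 1}}
# with include=False the values would be 0, i.e. the fields are excluded
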