Exemple #1
0
 def count(self):
     """ return counts by group columns """
     counts = self._count()
     # remove mongo object _id
     for group in counts:
         group.update(group.pop('_id'))
     # transform results to dataframe, then return as pandas would
     resultdf = pd.DataFrame(counts).set_index(make_list(self.columns),
                                               drop=True)
     return resultdf
Exemple #2
0
    def aggregate(self, specs, **kwargs):
        """
        aggregate by given specs

        See the following link for a list of supported operations. 
        https://docs.mongodb.com/manual/reference/operator/aggregation/group/

        :param specs: a dictionary of { column : function | list[functions] } 
           pairs. 
        """

        def add_stats(specs, column, stat):
            specs['%s_%s' % (column, stat)] = {
                '$%s' % MGrouper.STATS_MAP.get(stat, stat): '$%s' % column}

        # generate $group command
        _specs = {}
        for column, stats in six.iteritems(specs):
            stats = make_tuple(stats)
            for stat in stats:
                add_stats(_specs, column, stat)
        groupby = qops.GROUP(columns=self.columns,
                             **_specs)
        # execute and return a dataframe
        pipeline = self._amend_pipeline([groupby])
        data = self.collection.aggregate(pipeline, allowDiskUse=True)

        def get_data():
            # we need this to build a pipeline for from_records
            # to process, otherwise the cursor will be exhausted already
            for group in data:
                _id = group.pop('_id')
                if isinstance(_id, dict):
                    group.update(_id)
                yield group

        df = pd.DataFrame.from_records(get_data())
        columns = make_list(self.columns)
        if columns:
            df = df.set_index(columns, drop=True)
        return df
Exemple #3
0
    def merge(self, right, on=None, left_on=None, right_on=None,
              how='inner', target=None, suffixes=('_x', '_y'),
              sort=False, inspect=False):
        """
        merge this dataframe with another dataframe. only left outer joins
        are currently supported. the output is saved as a new collection,
        target name (defaults to a generated name if not specified).

        :param right: the other MDataFrame
        :param on: the list of key columns to merge by
        :param left_on: the list of the key columns to merge on this dataframe
        :param right_on: the list of the key columns to merge on the other 
           dataframe
        :param how: the method to merge. supported are left, inner, right. 
           Defaults to inner
        :param target: the name of the collection to store the merge results
           in. If not provided a temporary name will be created.
        :param suffixes: the suffixes to apply to identical left and right 
           columns
        :param sort: if True the merge results will be sorted. If False the
           MongoDB natural order is implied.
        :returns: the MDataFrame to the target MDataFrame
        """
        # validate input
        supported_how = ["left", 'inner', 'right']
        assert how in supported_how, "only %s merges are currently supported" % supported_how
        for key in [on, left_on, right_on]:
            if key:
                assert isinstance(
                    key, six.string_types), "only single column merge keys are supported (%s)" % key
        if isinstance(right, Collection):
            right = MDataFrame(right)
        assert isinstance(
            right, MDataFrame), "both must be MDataFrames, got right=%" % type(right)
        if how == 'right':
            # A right B == B left A
            return right.merge(self, on=on, left_on=right_on, right_on=left_on,
                               how='left', target=target, suffixes=suffixes)
        # generate lookup parameters
        on = on or '_id'
        right_name = self._get_collection_name_of(right, right)
        target_name = self._get_collection_name_of(
            target, '_temp.merge.%s' % uuid4().hex)
        target_field = (
                "%s_%s" % (right_name.replace('.', '_'), right_on or on))
        lookup = qops.LOOKUP(right_name,
                             key=on,
                             left_key=left_on,
                             right_key=right_on,
                             target=target_field)
        # unwind merged documents from arrays to top-level document fields
        unwind = qops.UNWIND(target_field, preserve=how != 'inner')
        # get all fields from left, right
        project = {}
        for left_col in self.columns:
            source_left_col = left_col
            if left_col == '_id':
                project[left_col] = 1
                continue
            if left_col.startswith('_idx'):
                continue
            if left_col.startswith('_om#'):
                continue
            if left_col != (on or left_on) and left_col in right.columns:
                left_col = '%s%s' % (left_col, suffixes[0])
            project[left_col] = "$%s" % source_left_col
        for right_col in right.columns:
            if right_col == '_id':
                continue
            if right_col.startswith('_idx'):
                continue
            if right_col.startswith('_om#'):
                continue
            if right_col == (on or right_on) and right_col == (on or left_on):
                # if the merge field is the same in both frames, we already
                # have it from left
                continue
            if right_col in self.columns:
                left_col = '%s%s' % (right_col, suffixes[1])
            else:
                left_col = '%s' % right_col
            project[left_col] = '$%s.%s' % (target_field, right_col)
        expected_columns = list(project.keys())
        project = {"$project": project}
        # store merged documents and return an MDataFrame to it
        out = qops.OUT(target_name)
        pipeline = [lookup, unwind, project]
        if sort:
            sort_cols = make_list(on or [left_on, right_on])
            sort_key = qops.make_sortkey(sort_cols)
            sort = qops.SORT(**dict(sort_key))
            pipeline.append(sort)
        pipeline.append(out)
        if inspect:
            result = pipeline
        else:
            result = self.collection.aggregate(pipeline, allowDiskUse=True)
            result = MDataFrame(self.collection.database[target_name],
                                force_columns=expected_columns)
        return result