def count(self):
    """
    return counts by group columns

    The groups returned by self._count() carry their group keys in the
    ``_id`` field. The keys are moved to top-level fields and the result
    is returned as pandas would, i.e. indexed by the group columns.

    :returns: a pandas DataFrame of the counts, indexed by the group
       columns (if any). If there are no groups the DataFrame is empty.
    """
    # materialize once: the groups are iterated here and again by
    # pd.DataFrame, which a one-shot iterable would not survive
    groups = list(self._count())
    # remove mongo object _id
    for group in groups:
        group_key = group.pop('_id')
        # same guard as in aggregate(): only a dict _id carries group keys
        if isinstance(group_key, dict):
            group.update(group_key)
    index_columns = make_list(self.columns)
    # transform results to dataframe, then return as pandas would
    if groups:
        resultdf = pd.DataFrame(groups)
    else:
        # without any rows pandas cannot infer the columns, so
        # set_index would raise KeyError -- provide them explicitly
        resultdf = pd.DataFrame(columns=list(index_columns))
    if index_columns:
        resultdf = resultdf.set_index(index_columns, drop=True)
    return resultdf
def aggregate(self, specs, **kwargs):
    """
    aggregate by given specs

    See the following link for a list of supported operations.
    https://docs.mongodb.com/manual/reference/operator/aggregation/group/

    Every (column, stat) pair results in an output column named
    ``<column>_<stat>``. The stat name is translated to the operator name
    by MGrouper.STATS_MAP; names not in the map are passed through as is.

    :param specs: a dictionary of { column : function | list[functions] }
       pairs.
    :param kwargs: accepted, but not used by this method
    :returns: a pandas DataFrame of the aggregated values, indexed by the
       group columns (if any). If the aggregation yields no groups, an
       empty DataFrame with the expected columns is returned.
    """
    # generate $group command
    group_specs = {}
    for column, stats in six.iteritems(specs):
        for stat in make_tuple(stats):
            operator = '$%s' % MGrouper.STATS_MAP.get(stat, stat)
            group_specs['%s_%s' % (column, stat)] = {
                operator: '$%s' % column}
    groupby = qops.GROUP(columns=self.columns, **group_specs)
    # execute and return a dataframe
    pipeline = self._amend_pipeline([groupby])
    cursor = self.collection.aggregate(pipeline, allowDiskUse=True)

    def flattened_groups():
        # lazily move the group keys from _id to top-level fields so that
        # from_records consumes the cursor in a single pass. iterating
        # the cursor up front would leave it exhausted already
        for group in cursor:
            group_key = group.pop('_id')
            if isinstance(group_key, dict):
                group.update(group_key)
            yield group

    df = pd.DataFrame.from_records(flattened_groups())
    index_columns = make_list(self.columns)
    if len(df) == 0:
        # no groups: from_records cannot infer any columns, so set_index
        # below would raise KeyError -- provide the expected columns
        df = pd.DataFrame(columns=list(index_columns) + list(group_specs))
    if index_columns:
        df = df.set_index(index_columns, drop=True)
    return df
def merge(self, right, on=None, left_on=None, right_on=None,
          how='inner', target=None, suffixes=('_x', '_y'),
          sort=False, inspect=False):
    """
    merge this dataframe with another dataframe.

    left, inner and right merges are supported. A right merge is
    executed as a left merge of right with this dataframe. The merge
    is built as an aggregation pipeline (qops.LOOKUP, qops.UNWIND, a
    $project stage, optionally qops.SORT, and qops.OUT). The output is
    saved as a new collection, target name (defaults to a generated name
    if not specified).

    :param right: the other MDataFrame. A Collection is wrapped into
       an MDataFrame.
    :param on: the key column to merge by, defaults to '_id'
    :param left_on: the key column to merge on this dataframe
    :param right_on: the key column to merge on the other dataframe
    :param how: the method to merge. supported are left, inner, right.
       Defaults to inner
    :param target: the name of the collection to store the merge results
       in. If not provided a temporary name will be created.
    :param suffixes: the suffixes to apply to identical left and right
       columns
    :param sort: if True the merge results will be sorted. If False the
       MongoDB natural order is implied.
    :param inspect: if True the pipeline is not executed, the list of
       pipeline stages is returned instead
    :returns: the MDataFrame to the target collection, or the pipeline
       if inspect is True
    :raises AssertionError: if how is not supported, a merge key is not a
       single column name, or right is not an MDataFrame
    """
    # validate input
    # -- kept as assertions so that callers see the same exception type
    supported_how = ["left", 'inner', 'right']
    assert how in supported_how, (
        "only %s merges are currently supported" % supported_how)
    for key in [on, left_on, right_on]:
        if key:
            # format with a 1-tuple, so that a tuple key is reported in
            # the message instead of breaking the string formatting
            assert isinstance(key, six.string_types), (
                "only single column merge keys are supported (%s)" % (key,))
    if isinstance(right, Collection):
        right = MDataFrame(right)
    assert isinstance(right, MDataFrame), (
        "both must be MDataFrames, got right=%s" % (type(right),))
    if how == 'right':
        # A right B == B left A
        # NOTE(review): suffixes are passed on unswapped, i.e. the columns
        # of right receive suffixes[0] -- confirm this is intended
        return right.merge(self, on=on, left_on=right_on, right_on=left_on,
                           how='left', target=target, suffixes=suffixes,
                           sort=sort, inspect=inspect)
    # generate lookup parameters
    on = on or '_id'
    right_name = self._get_collection_name_of(right, right)
    target_name = self._get_collection_name_of(
        target, '_temp.merge.%s' % uuid4().hex)
    target_field = (
        "%s_%s" % (right_name.replace('.', '_'), right_on or on))
    lookup = qops.LOOKUP(right_name, key=on, left_key=left_on,
                         right_key=right_on, target=target_field)
    # unwind merged documents from arrays to top-level document fields
    unwind = qops.UNWIND(target_field, preserve=how != 'inner')
    # get all fields from left, right
    # -- read the columns once instead of on every membership test
    left_columns = self.columns
    right_columns = right.columns
    # columns with these prefixes are never part of the merge result
    internal_prefixes = ('_idx', '_om#')
    # NOTE(review): on always has a value at this point (it defaults to
    # '_id'), so left_on/right_on never take part in deciding which key
    # column is shared by both frames. Kept as is, since changing it
    # would change the names of the resulting columns.
    projection = {}
    for left_col in left_columns:
        if left_col == '_id':
            projection[left_col] = 1
            continue
        if left_col.startswith(internal_prefixes):
            continue
        target_col = left_col
        if left_col != on and left_col in right_columns:
            target_col = '%s%s' % (left_col, suffixes[0])
        projection[target_col] = "$%s" % left_col
    for right_col in right_columns:
        if right_col == '_id':
            continue
        if right_col.startswith(internal_prefixes):
            continue
        if right_col == on:
            # if the merge field is the same in both frames, we already
            # have it from left
            continue
        target_col = right_col
        if right_col in left_columns:
            target_col = '%s%s' % (right_col, suffixes[1])
        projection[target_col] = '$%s.%s' % (target_field, right_col)
    expected_columns = list(projection)
    # store merged documents and return an MDataFrame to it
    pipeline = [lookup, unwind, {"$project": projection}]
    if sort:
        sort_cols = make_list(on)
        sort_key = qops.make_sortkey(sort_cols)
        pipeline.append(qops.SORT(**dict(sort_key)))
    pipeline.append(qops.OUT(target_name))
    if inspect:
        return pipeline
    # the merged documents are read back from the target collection,
    # the return value of aggregate is not needed
    self.collection.aggregate(pipeline, allowDiskUse=True)
    return MDataFrame(self.collection.database[target_name],
                      force_columns=expected_columns)