def append(self, other):
    """
    append the documents of another dataframe to this dataframe's collection

    The documents of other are copied into this dataframe's collection by
    a server-side map-reduce followed by an aggregation that rewrites the
    collection in place. Every copied document gets a new ``_id`` and, if
    it has a ``_om#rowid`` field, that field is offset by the current
    length of this dataframe.

    :param other: the MDataFrame to append. A Collection is accepted and
       wrapped into an MDataFrame
    :returns: this MDataFrame (self)
    :raises AssertionError: if other is neither an MDataFrame nor a
       Collection
    """
    if isinstance(other, Collection):
        # wrap the raw collection; the result must be bound to other
        # (not to an unrelated name) so the type check below passes
        other = MDataFrame(other)
    assert isinstance(
        other, MDataFrame), "both must be MDataFrames, got other={}".format(
        type(other))
    outname = self.collection.name
    # merge the map-reduce output into this dataframe's own collection
    mrout = {
        'merge': outname,
        'nonAtomic': True,
    }
    # assign a fresh _id to every appended document and shift its row id
    # by the number of rows already in this dataframe
    mapfn = Code("""
    function() {
       this._id = ObjectId();
       if(this['_om#rowid']) {
          this['_om#rowid'] += %s;
       }
       emit(this._id, this);
    }
    """ % len(self))
    # reduce and finalize pass the emitted value through unchanged
    reducefn = Code("""
    function(key, value) {
       return value;
    }
    """)
    finfn = Code("""
    function(key, value) {
       return value;
    }
    """)
    other.collection.map_reduce(mapfn, reducefn, mrout, finalize=finfn,
                                jsMode=True)
    # promote the 'value' sub-document to the top level where present,
    # documents without a 'value' field are kept as they are
    unwind = {
        "$replaceRoot": {
            "newRoot": {
                "$ifNull": ["$value", "$$CURRENT"],
            }
        }
    }
    output = qops.OUT(outname)
    pipeline = [unwind, output]
    self.collection.aggregate(pipeline, allowDiskUse=True)
    return self
def merge(self, right, on=None, left_on=None, right_on=None,
          how='inner', target=None, suffixes=('_x', '_y'),
          sort=False, inspect=False):
    """
    merge this dataframe with another dataframe.

    left, inner and right joins are supported. A right join is executed
    as a left join with the two dataframes swapped. the output is saved
    as a new collection, target name (defaults to a generated name if
    not specified).

    :param right: the other MDataFrame. A Collection is accepted and
       wrapped into an MDataFrame
    :param on: the key column to merge by (a single column name),
       defaults to _id
    :param left_on: the key column to merge on this dataframe
    :param right_on: the key column to merge on the other dataframe
    :param how: the method to merge. supported are left, inner, right.
       Defaults to inner
    :param target: the name of the collection to store the merge results
       in. If not provided a temporary name will be created.
    :param suffixes: the suffixes to apply to identical left and right
       columns
    :param sort: if True the merge results will be sorted. If False the
       MongoDB natural order is implied.
    :param inspect: if True the aggregation pipeline is returned instead
       of being executed
    :returns: the MDataFrame to the target MDataFrame, or the pipeline
       (a list) if inspect is True
    :raises AssertionError: if how is not supported, a key is not a
       single column name, or right is not an MDataFrame or Collection
    """
    # validate input
    supported_how = ["left", 'inner', 'right']
    assert how in supported_how, "only %s merges are currently supported" % supported_how
    for key in [on, left_on, right_on]:
        if key:
            assert isinstance(
                key, six.string_types), "only single column merge keys are supported (%s)" % key
    if isinstance(right, Collection):
        right = MDataFrame(right)
    assert isinstance(
        right, MDataFrame), "both must be MDataFrames, got right=%s" % type(right)
    if how == 'right':
        # A right B == B left A
        # NOTE(review): suffixes are passed through unswapped, so in a right
        # merge the columns of right receive suffixes[0] -- confirm intended
        return right.merge(self, on=on, left_on=right_on, right_on=left_on,
                           how='left', target=target, suffixes=suffixes,
                           sort=sort, inspect=inspect)
    # generate lookup parameters
    on = on or '_id'
    right_name = self._get_collection_name_of(right, right)
    target_name = self._get_collection_name_of(
        target, '_temp.merge.%s' % uuid4().hex)
    target_field = (
        "%s_%s" % (right_name.replace('.', '_'), right_on or on))
    lookup = qops.LOOKUP(right_name,
                         key=on,
                         left_key=left_on,
                         right_key=right_on,
                         target=target_field)
    # unwind merged documents from arrays to top-level document fields
    unwind = qops.UNWIND(target_field, preserve=how != 'inner')
    # get all fields from left, right
    # NOTE(review): on is always set at this point, so (on or left_on) and
    # (on or right_on) always evaluate to on -- confirm this is intended
    # when left_on/right_on are given without on
    project = {}
    for left_col in self.columns:
        source_left_col = left_col
        if left_col == '_id':
            project[left_col] = 1
            continue
        if left_col.startswith('_idx'):
            continue
        if left_col.startswith('_om#'):
            continue
        if left_col != (on or left_on) and left_col in right.columns:
            # same non-key column exists in both frames, disambiguate
            left_col = '%s%s' % (left_col, suffixes[0])
        project[left_col] = "$%s" % source_left_col
    for right_col in right.columns:
        if right_col == '_id':
            continue
        if right_col.startswith('_idx'):
            continue
        if right_col.startswith('_om#'):
            continue
        if right_col == (on or right_on) and right_col == (on or left_on):
            # if the merge field is the same in both frames, we already
            # have it from left
            continue
        if right_col in self.columns:
            left_col = '%s%s' % (right_col, suffixes[1])
        else:
            left_col = '%s' % right_col
        # right columns are read from the looked-up sub-document
        project[left_col] = '$%s.%s' % (target_field, right_col)
    expected_columns = list(project.keys())
    project = {"$project": project}
    # store merged documents and return an MDataFrame to it
    out = qops.OUT(target_name)
    pipeline = [lookup, unwind, project]
    if sort:
        sort_cols = make_list(on or [left_on, right_on])
        sort_key = qops.make_sortkey(sort_cols)
        # use a separate name, do not shadow the sort parameter
        sort_stage = qops.SORT(**dict(sort_key))
        pipeline.append(sort_stage)
    pipeline.append(out)
    if inspect:
        result = pipeline
    else:
        # executed for its side effect, the output is written to target_name
        self.collection.aggregate(pipeline, allowDiskUse=True)
        result = MDataFrame(self.collection.database[target_name],
                            force_columns=expected_columns)
    return result