Ejemplo n.º 1
0
 def append(self, other):
     if isinstance(other, Collection):
         right = MDataFrame(other)
     assert isinstance(
         other,
         MDataFrame), "both must be MDataFrames, got other={}".format(
             type(other))
     outname = self.collection.name
     mrout = {
         'merge': outname,
         'nonAtomic': True,
     }
     mapfn = Code("""
     function() {
        this._id = ObjectId();
        if(this['_om#rowid']) {
           this['_om#rowid'] += %s;
        }
        emit(this._id, this);
     }
     """ % len(self))
     reducefn = Code("""
     function(key, value) {
        return value;
     }
     """)
     finfn = Code("""
     function(key, value) {
        return value;
     }
     """)
     other.collection.map_reduce(mapfn,
                                 reducefn,
                                 mrout,
                                 finalize=finfn,
                                 jsMode=True)
     unwind = {
         "$replaceRoot": {
             "newRoot": {
                 "$ifNull": ["$value", "$$CURRENT"],
             }
         }
     }
     output = qops.OUT(outname)
     pipeline = [unwind, output]
     self.collection.aggregate(pipeline, allowDiskUse=True)
     return self
Ejemplo n.º 2
0
    def merge(self, right, on=None, left_on=None, right_on=None,
              how='inner', target=None, suffixes=('_x', '_y'),
              sort=False, inspect=False):
        """
        merge this dataframe with another dataframe. only left outer joins
        are currently supported. the output is saved as a new collection,
        target name (defaults to a generated name if not specified).

        :param right: the other MDataFrame
        :param on: the list of key columns to merge by
        :param left_on: the list of the key columns to merge on this dataframe
        :param right_on: the list of the key columns to merge on the other 
           dataframe
        :param how: the method to merge. supported are left, inner, right. 
           Defaults to inner
        :param target: the name of the collection to store the merge results
           in. If not provided a temporary name will be created.
        :param suffixes: the suffixes to apply to identical left and right 
           columns
        :param sort: if True the merge results will be sorted. If False the
           MongoDB natural order is implied.
        :returns: the MDataFrame to the target MDataFrame
        """
        # validate input
        supported_how = ["left", 'inner', 'right']
        assert how in supported_how, "only %s merges are currently supported" % supported_how
        for key in [on, left_on, right_on]:
            if key:
                assert isinstance(
                    key, six.string_types), "only single column merge keys are supported (%s)" % key
        if isinstance(right, Collection):
            right = MDataFrame(right)
        assert isinstance(
            right, MDataFrame), "both must be MDataFrames, got right=%" % type(right)
        if how == 'right':
            # A right B == B left A
            return right.merge(self, on=on, left_on=right_on, right_on=left_on,
                               how='left', target=target, suffixes=suffixes)
        # generate lookup parameters
        on = on or '_id'
        right_name = self._get_collection_name_of(right, right)
        target_name = self._get_collection_name_of(
            target, '_temp.merge.%s' % uuid4().hex)
        target_field = (
                "%s_%s" % (right_name.replace('.', '_'), right_on or on))
        lookup = qops.LOOKUP(right_name,
                             key=on,
                             left_key=left_on,
                             right_key=right_on,
                             target=target_field)
        # unwind merged documents from arrays to top-level document fields
        unwind = qops.UNWIND(target_field, preserve=how != 'inner')
        # get all fields from left, right
        project = {}
        for left_col in self.columns:
            source_left_col = left_col
            if left_col == '_id':
                project[left_col] = 1
                continue
            if left_col.startswith('_idx'):
                continue
            if left_col.startswith('_om#'):
                continue
            if left_col != (on or left_on) and left_col in right.columns:
                left_col = '%s%s' % (left_col, suffixes[0])
            project[left_col] = "$%s" % source_left_col
        for right_col in right.columns:
            if right_col == '_id':
                continue
            if right_col.startswith('_idx'):
                continue
            if right_col.startswith('_om#'):
                continue
            if right_col == (on or right_on) and right_col == (on or left_on):
                # if the merge field is the same in both frames, we already
                # have it from left
                continue
            if right_col in self.columns:
                left_col = '%s%s' % (right_col, suffixes[1])
            else:
                left_col = '%s' % right_col
            project[left_col] = '$%s.%s' % (target_field, right_col)
        expected_columns = list(project.keys())
        project = {"$project": project}
        # store merged documents and return an MDataFrame to it
        out = qops.OUT(target_name)
        pipeline = [lookup, unwind, project]
        if sort:
            sort_cols = make_list(on or [left_on, right_on])
            sort_key = qops.make_sortkey(sort_cols)
            sort = qops.SORT(**dict(sort_key))
            pipeline.append(sort)
        pipeline.append(out)
        if inspect:
            result = pipeline
        else:
            result = self.collection.aggregate(pipeline, allowDiskUse=True)
            result = MDataFrame(self.collection.database[target_name],
                                force_columns=expected_columns)
        return result