def MRorphans(colA, colAfld, colAq, colB, colBfld, colBq, out='mr_orphans', jsMode=False, verbose=3):
    """A kind of set operation on two collections.

    Map-reduces two collection objects (colA, colB) on a common field
    (colAfld, colBfld), each optionally filtered by a query (colAq, colBq).
    Phase 1 runs over colA and replaces collection `out`; phase 2 runs over
    colB and reduces into the same collection.  Both runs pass
    ``'db': 'local'`` in the output spec handed to MRsimple.

    Each result document carries:
        value.A   -- count of documents in A
        value.B   -- count of documents in B
        value.sum -- count of documents in A + B

    Useful queries on the result collection:
        documents missing from colA:
            resultCollection.find({'value.A': 0})
        documents present in both colA and colB:
            resultCollection.find({'value.A': {'$gt': 0}, 'value.B': {'$gt': 0}})
        or, when documents are unique in both collections:
            resultCollection.find({'value.sum': 2})

    Call example:
        MRorphans(bof.TstatusesSrc, '_id', {}, colB=ag13, colBfld='_id.vl',
                  colBq={'_id.kt': 'src', '_id.idsu': ''},
                  out='mr_join', jsMode=False, verbose=3)

    Returns:
        tuple (result collection, phase-1 results, phase-2 results).
    """
    # NOTE(review): the map source is used as a %-format template that
    # receives the field name -- confirm against the JS file.
    map_template = parseJSfunFromFile(PATH_JS, 'MapOrphans')
    reduce_js = parseJSfunFromFile(PATH_JS, 'ReduceOrphans')
    # One scope dict is shared by both runs; only its 'phase' entry changes.
    scope = {}

    def run_phase(phase, col, fld, query, out_mode):
        scope['phase'] = phase
        # Only request a sort when the key field is reported as indexed.
        if fld in collectionIndexedFields(col):
            sort = {fld: 1}
        else:
            sort = None
        return MRsimple(col, map_template % fld, FunReduce=reduce_js,
                        query=query, out={'db': 'local', out_mode: out},
                        finalize=None, scope=scope, sort=sort,
                        verbose=verbose, jsMode=jsMode)

    first = run_phase(1, colA, colAfld, colAq, 'replace')
    second = run_phase(2, colB, colBfld, colBq, 'reduce')
    return second[1], first[0], second[0]
def MRfields(collection, query=None, out=None, verbose=2, doMeta=False, scope=None):
    """Identify all field names used by the documents of a collection.

    Runs the 'MapKeys' / 'ReduceKeys' JavaScript functions (loaded from
    PATH_JS) through MRsimple, then writes onto every result document the
    share of input documents it accounts for, as ``value.percent``, and
    finally ensures an index on ``_id.type`` of the result collection.

    Args:
        collection: source collection handed to MRsimple.
        query: filter passed to MRsimple; defaults to ``{}``.
        out: map-reduce output spec; defaults to
            ``{"replace": 'mrfields.tmp'}``.
        verbose: verbosity level; progress messages are printed when > 0.
        doMeta: when true, additionally run MRfieldsMeta on the results.
        scope: scope passed to MRsimple; defaults to
            ``{'parms': {'levelMax': -1, 'inclHeaderKeys': True,
            'ExamplesMax': 2}}``.

    Returns:
        The MRsimple result ``rt`` (``rt[0]`` holds the run statistics,
        ``rt[1]`` the result collection), or ``(rt, rtMeta)`` when doMeta
        is true.
    """
    # Defaults are built per call: dict literals in the signature would be
    # shared (and possibly mutated) across calls.
    if query is None:
        query = {}
    if out is None:
        out = {"replace": 'mrfields.tmp'}
    if scope is None:
        scope = {'parms': {'levelMax': -1, 'inclHeaderKeys': True, 'ExamplesMax': 2}}

    FunMap = parseJSfunFromFile(PATH_JS, 'MapKeys')
    FunReduce = parseJSfunFromFile(PATH_JS, 'ReduceKeys')
    rt = MRsimple(collection, FunMap, FunReduce=FunReduce, query=query,
                  out=out, scope=scope, verbose=verbose)
    results = rt[1]
    # float() forces true division on Python 2.
    totalRecords = float(rt[0]['counts']['input'])

    if verbose > 0:
        print("calculating percentages")
    for doc in results.find():
        percent = (doc['value']['cnt'] / totalRecords) * 100
        # WARNING: do not match on {'_id': doc['_id']} -- it does not work,
        # possibly because of a different sub-field order.
        results.update({'_id.fields': doc['_id']['fields']},
                       {"$set": {"value.percent": percent}},
                       safe=True, multi=False)

    if verbose > 0:
        print("creating intexes")
    results.ensure_index('_id.type', background=True)

    if doMeta:
        rtMeta = MRfieldsMeta(rt, verbose=verbose)
        return rt, rtMeta
    return rt
def MRfieldsMeta(mr_keys_results, verbose=2):
    """Run the 'MetaMapKeys' / 'MetaReduceKeys' pass over a field-keys result.

    Args:
        mr_keys_results: result of an earlier MRsimple run (as produced in
            MRfields); index 0 holds the run statistics, index 1 the
            result collection.
        verbose: verbosity level forwarded to MRsimple.

    Only documents whose ``_id.type`` is 'fieldsGrp' are processed, and the
    output is reduced back into the same collection.  The 'output' count of
    the earlier run is exposed to the JavaScript code as
    ``parms.totalRecords``.

    Returns:
        Whatever MRsimple returns for this run.
    """
    map_js = parseJSfunFromFile(PATH_JS, 'MetaMapKeys')
    reduce_js = parseJSfunFromFile(PATH_JS, 'MetaReduceKeys')
    scope = {'parms': {'totalRecords': mr_keys_results[0]['counts']['output']}}
    keys_collection = mr_keys_results[1]
    return MRsimple(keys_collection, map_js, FunReduce=reduce_js,
                    query={'_id.type': 'fieldsGrp'},
                    out={'reduce': keys_collection.name},
                    scope=scope, verbose=verbose)
def MRorphans(colA, colAfld, colAq, colB, colBfld, colBq, out='mr_orphans', jsMode=False, verbose=3):
    """A kind of set operation on two collections.

    Map-reduces two collection objects (colA, colB) on a common field
    (colAfld, colBfld), each optionally filtered by a query (colAq, colBq).
    Phase 1 runs over colA and replaces collection `out`; phase 2 runs over
    colB and reduces into the same collection.  Both runs pass
    ``'db': 'local'`` in the output spec handed to MRsimple.

    Each result document carries:
        value.A   -- count of documents in A
        value.B   -- count of documents in B
        value.sum -- count of documents in A + B

    Useful queries on the result collection:
        documents missing from colA:
            resultCollection.find({'value.A': 0})
        documents present in both colA and colB:
            resultCollection.find({'value.A': {'$gt': 0}, 'value.B': {'$gt': 0}})
        or, when documents are unique in both collections:
            resultCollection.find({'value.sum': 2})

    Call example:
        MRorphans(bof.TstatusesSrc, '_id', {}, colB=ag13, colBfld='_id.vl',
                  colBq={'_id.kt': 'src', '_id.idsu': ''},
                  out='mr_join', jsMode=False, verbose=3)

    Returns:
        tuple (result collection, phase-1 results, phase-2 results).
    """
    # NOTE(review): the map source is used as a %-format template that
    # receives the field name -- confirm against the JS file.
    map_template = parseJSfunFromFile(PATH_JS, 'MapOrphans')
    reduce_js = parseJSfunFromFile(PATH_JS, 'ReduceOrphans')
    # One scope dict is shared by both runs; only its 'phase' entry changes.
    scope = {}

    def run_phase(phase, col, fld, query, out_mode):
        scope['phase'] = phase
        # Only request a sort when the key field is reported as indexed.
        if fld in collectionIndexedFields(col):
            sort = {fld: 1}
        else:
            sort = None
        return MRsimple(col, map_template % fld, FunReduce=reduce_js,
                        query=query, out={'db': 'local', out_mode: out},
                        finalize=None, scope=scope, sort=sort,
                        verbose=verbose, jsMode=jsMode)

    first = run_phase(1, colA, colAfld, colAq, 'replace')
    second = run_phase(2, colB, colBfld, colBq, 'reduce')
    return second[1], first[0], second[0]
def MRJoin(colA, colAfld, colAq, colB, colBfld, colBq, out='mr_join', jsMode=False, verbose=3):
    """Two-phase map-reduce of colA and colB into one collection.

    Uses the 'MapJoin' / 'ReduceJoin' JavaScript functions from PATH_JS.
    Phase 1 runs over colA (filtered by colAq, keyed on colAfld) and
    replaces collection `out`; phase 2 runs over colB (filtered by colBq,
    keyed on colBfld) and reduces into the same collection.  Both runs pass
    ``'db': 'local'`` in the output spec handed to MRsimple, and the
    current phase number is exposed to the JavaScript code as scope
    variable ``phase``.

    Returns:
        tuple (index 1 of the phase-2 MRsimple result, index 0 of the
        phase-1 result, index 0 of the phase-2 result).
        NOTE(review): presumably (result collection, phase-1 statistics,
        phase-2 statistics) -- confirm against MRsimple.
    """
    # NOTE(review): the map source is used as a %-format template that
    # receives the field name -- confirm against the JS file.
    map_template = parseJSfunFromFile(PATH_JS, 'MapJoin')
    reduce_js = parseJSfunFromFile(PATH_JS, 'ReduceJoin')
    # One scope dict is shared by both runs; only its 'phase' entry changes.
    scope = {}
    phases = (
        (colA, colAfld, colAq, 'replace'),
        (colB, colBfld, colBq, 'reduce'),
    )
    runs = []
    for phase, (col, fld, query, out_mode) in enumerate(phases, 1):
        scope['phase'] = phase
        # Only request a sort when the key field is reported as indexed.
        sort = None
        if fld in collectionIndexedFields(col):
            sort = {fld: 1}
        runs.append(MRsimple(col, map_template % fld, FunReduce=reduce_js,
                             query=query, out={'db': 'local', out_mode: out},
                             finalize=None, scope=scope, sort=sort,
                             verbose=verbose, jsMode=jsMode))
    first, second = runs
    return second[1], first[0], second[0]
def tmpEduMR(colA, query=None, verbose=2):
    """Run the 'map_closest' / 'red_closest' map-reduce over colA.

    Args:
        colA: source collection handed to MRsimple.
        query: filter passed to MRsimple; defaults to ``{}``.
        verbose: verbosity level forwarded to MRsimple.

    The output spec asks MRsimple to replace collection 'mr_closest' in
    database 'edu'.

    Returns:
        Whatever MRsimple returns for this run.
    """
    # Built per call: a dict literal in the signature would be shared (and
    # possibly mutated) across calls.
    if query is None:
        query = {}
    map_closest = parseJSfunFromFile(PATH_JS, 'map_closest')
    red_closest = parseJSfunFromFile(PATH_JS, 'red_closest')
    return MRsimple(colA, map_closest, FunReduce=red_closest, query=query,
                    out={'db': 'edu', 'replace': 'mr_closest'},
                    finalize=None, verbose=verbose)
def MRfields(collection, query=None, out=None, verbose=2, doMeta=False, scope=None):
    """Identify all field names used by the documents of a collection.

    Runs the 'MapKeys' / 'ReduceKeys' JavaScript functions (loaded from
    PATH_JS) through MRsimple, then writes onto every result document the
    share of input documents it accounts for, as ``value.percent``, and
    finally ensures an index on ``_id.type`` of the result collection.

    Args:
        collection: source collection handed to MRsimple.
        query: filter passed to MRsimple; defaults to ``{}``.
        out: map-reduce output spec; defaults to
            ``{"replace": 'mrfields.tmp'}``.
        verbose: verbosity level; progress messages are printed when > 0.
        doMeta: when true, additionally run MRfieldsMeta on the results.
        scope: scope passed to MRsimple; defaults to
            ``{'parms': {'levelMax': -1, 'inclHeaderKeys': True,
            'ExamplesMax': 2}}``.

    Returns:
        The MRsimple result ``rt`` (``rt[0]`` holds the run statistics,
        ``rt[1]`` the result collection), or ``(rt, rtMeta)`` when doMeta
        is true.
    """
    # Defaults are built per call: dict literals in the signature would be
    # shared (and possibly mutated) across calls.
    if query is None:
        query = {}
    if out is None:
        out = {"replace": 'mrfields.tmp'}
    if scope is None:
        scope = {'parms': {'levelMax': -1, 'inclHeaderKeys': True, 'ExamplesMax': 2}}

    FunMap = parseJSfunFromFile(PATH_JS, 'MapKeys')
    FunReduce = parseJSfunFromFile(PATH_JS, 'ReduceKeys')
    rt = MRsimple(collection, FunMap, FunReduce=FunReduce, query=query,
                  out=out, scope=scope, verbose=verbose)
    results = rt[1]
    # float() forces true division on Python 2.
    totalRecords = float(rt[0]['counts']['input'])

    if verbose > 0:
        print("calculating percentages")
    for doc in results.find():
        percent = (doc['value']['cnt'] / totalRecords) * 100
        # WARNING: do not match on {'_id': doc['_id']} -- it does not work,
        # possibly because of a different sub-field order.
        results.update({'_id.fields': doc['_id']['fields']},
                       {"$set": {"value.percent": percent}},
                       safe=True, multi=False)

    if verbose > 0:
        print("creating intexes")
    results.ensure_index('_id.type', background=True)

    if doMeta:
        rtMeta = MRfieldsMeta(rt, verbose=verbose)
        return rt, rtMeta
    return rt
def MRfieldsMeta(mr_keys_results, verbose=2):
    """Run the 'MetaMapKeys' / 'MetaReduceKeys' pass over a field-keys result.

    Args:
        mr_keys_results: result of an earlier MRsimple run (as produced in
            MRfields); index 0 holds the run statistics, index 1 the
            result collection.
        verbose: verbosity level forwarded to MRsimple.

    Only documents whose ``_id.type`` is 'fieldsGrp' are processed, and the
    output is reduced back into the same collection.  The 'output' count of
    the earlier run is exposed to the JavaScript code as
    ``parms.totalRecords``.

    Returns:
        Whatever MRsimple returns for this run.
    """
    map_js = parseJSfunFromFile(PATH_JS, 'MetaMapKeys')
    reduce_js = parseJSfunFromFile(PATH_JS, 'MetaReduceKeys')
    scope = {'parms': {'totalRecords': mr_keys_results[0]['counts']['output']}}
    keys_collection = mr_keys_results[1]
    return MRsimple(keys_collection, map_js, FunReduce=reduce_js,
                    query={'_id.type': 'fieldsGrp'},
                    out={'reduce': keys_collection.name},
                    scope=scope, verbose=verbose)
def MRJoin(colA, colAfld, colAq, colB, colBfld, colBq, out='mr_join', jsMode=False, verbose=3):
    """Two-phase map-reduce of colA and colB into one collection.

    Uses the 'MapJoin' / 'ReduceJoin' JavaScript functions from PATH_JS.
    Phase 1 runs over colA (filtered by colAq, keyed on colAfld) and
    replaces collection `out`; phase 2 runs over colB (filtered by colBq,
    keyed on colBfld) and reduces into the same collection.  Both runs pass
    ``'db': 'local'`` in the output spec handed to MRsimple, and the
    current phase number is exposed to the JavaScript code as scope
    variable ``phase``.

    Returns:
        tuple (index 1 of the phase-2 MRsimple result, index 0 of the
        phase-1 result, index 0 of the phase-2 result).
        NOTE(review): presumably (result collection, phase-1 statistics,
        phase-2 statistics) -- confirm against MRsimple.
    """
    # NOTE(review): the map source is used as a %-format template that
    # receives the field name -- confirm against the JS file.
    map_template = parseJSfunFromFile(PATH_JS, 'MapJoin')
    reduce_js = parseJSfunFromFile(PATH_JS, 'ReduceJoin')
    # One scope dict is shared by both runs; only its 'phase' entry changes.
    scope = {}
    phases = (
        (colA, colAfld, colAq, 'replace'),
        (colB, colBfld, colBq, 'reduce'),
    )
    runs = []
    for phase, (col, fld, query, out_mode) in enumerate(phases, 1):
        scope['phase'] = phase
        # Only request a sort when the key field is reported as indexed.
        sort = None
        if fld in collectionIndexedFields(col):
            sort = {fld: 1}
        runs.append(MRsimple(col, map_template % fld, FunReduce=reduce_js,
                             query=query, out={'db': 'local', out_mode: out},
                             finalize=None, scope=scope, sort=sort,
                             verbose=verbose, jsMode=jsMode))
    first, second = runs
    return second[1], first[0], second[0]
def tmpEduMR(colA, query=None, verbose=2):
    """Run the 'map_closest' / 'red_closest' map-reduce over colA.

    Args:
        colA: source collection handed to MRsimple.
        query: filter passed to MRsimple; defaults to ``{}``.
        verbose: verbosity level forwarded to MRsimple.

    The output spec asks MRsimple to replace collection 'mr_closest' in
    database 'edu'.

    Returns:
        Whatever MRsimple returns for this run.
    """
    # Built per call: a dict literal in the signature would be shared (and
    # possibly mutated) across calls.
    if query is None:
        query = {}
    map_closest = parseJSfunFromFile(PATH_JS, 'map_closest')
    red_closest = parseJSfunFromFile(PATH_JS, 'red_closest')
    return MRsimple(colA, map_closest, FunReduce=red_closest, query=query,
                    out={'db': 'edu', 'replace': 'mr_closest'},
                    finalize=None, verbose=verbose)