Exemple #1
0
def MRorphans(colA, colAfld, colAq, colB, colBfld, colBq, out='mr_orphans',
              jsMode=False, verbose=3):
    """Set-like comparison of two collections on a common field.

    Two map-reduce passes are run with the 'MapOrphans' / 'ReduceOrphans'
    JavaScript functions loaded from PATH_JS:

      1. colA, filtered by colAq, keyed on colAfld, with out action
         'replace' on the collection named `out`;
      2. colB, filtered by colBq, keyed on colBfld, with out action
         'reduce' on that same collection.

    Both passes share one scope dict whose 'phase' entry is 1, then 2.
    A pass is sorted on its key field only when that field is reported by
    collectionIndexedFields() for the collection.

    Per the original author's notes (the JS itself is not visible here),
    each result document holds value.A = count of documents in A,
    value.B = count of documents in B and value.sum = A + B, so that:

      documents missing from colA:
          resultCollection.find({'value.A': 0})
      documents present in both colA and colB:
          resultCollection.find({'value.A': {'$gt': 0}, 'value.B': {'$gt': 0}})
      or, if keys are unique within each collection:
          resultCollection.find({'value.sum': 2})

    Returns the tuple (result collection, MapReduce1 results,
    MapReduce2 results), i.e. (mr2[1], mr1[0], mr2[0]).

    Call example:
        MRorphans(bof.TstatusesSrc, '_id', {}, colB=ag13, colBfld='_id.vl',
                  colBq={'_id.kt': 'src', '_id.idsu': ''}, out='mr_join',
                  jsMode=False, verbose=3)
    """
    map_template = parseJSfunFromFile(PATH_JS, 'MapOrphans')
    reduce_js = parseJSfunFromFile(PATH_JS, 'ReduceOrphans')
    # A single scope object is handed to both passes; only 'phase' changes.
    scope = {'phase': 1}
    passes = ((colA, colAfld, colAq, 'replace'),
              (colB, colBfld, colBq, 'reduce'))
    runs = []
    for phase, (col, fld, qry, out_action) in enumerate(passes, 1):
        scope['phase'] = phase
        if fld in collectionIndexedFields(col):
            sort = {fld: 1}
        else:
            sort = None
        runs.append(MRsimple(col, map_template % fld, FunReduce=reduce_js, query=qry,
                             out={'db': 'local', out_action: out}, finalize=None,
                             scope=scope, sort=sort, verbose=verbose, jsMode=jsMode))
    mr1, mr2 = runs
    return mr2[1], mr1[0], mr2[0]
Exemple #2
0
def MRfields(collection, query=None, out=None, verbose=2, doMeta=False, scope=None):
    """Identify all field names used by the documents of a collection.

    Runs the 'MapKeys' / 'ReduceKeys' JavaScript functions (loaded from
    PATH_JS) through MRsimple, then annotates every result document with
    value.percent = value.cnt / <number of input documents> * 100 and
    ensures a background index on '_id.type' in the result collection.

    Parameters:
        collection: collection to analyse.
        query:      filter passed to MRsimple; None (default) means {}.
        out:        map-reduce out spec; None (default) means
                    {"replace": 'mrfields.tmp'}.
        verbose:    progress messages are printed when > 0; also forwarded
                    to MRsimple (and to MRfieldsMeta when doMeta is set).
        doMeta:     when true, MRfieldsMeta is additionally run on the result.
        scope:      scope passed to MRsimple; None (default) means
                    {'parms': {'levelMax': -1, 'inclHeaderKeys': True,
                               'ExamplesMax': 2}}.

    Returns:
        The MRsimple result `rt` (rt[0] = statistics, rt[1] = result
        collection), or the tuple (rt, rtMeta) when doMeta is true.
    """
    # Build the defaults per call: dict literals in the signature would be
    # shared (and could be mutated) across calls.
    if query is None:
        query = {}
    if out is None:
        out = {"replace": 'mrfields.tmp'}
    if scope is None:
        scope = {'parms': {'levelMax': -1, 'inclHeaderKeys': True, 'ExamplesMax': 2}}
    FunMap = parseJSfunFromFile(PATH_JS, 'MapKeys')
    FunReduce = parseJSfunFromFile(PATH_JS, 'ReduceKeys')
    rt = MRsimple(collection, FunMap, FunReduce=FunReduce, query=query, out=out,
                  scope=scope, verbose=verbose)
    results = rt[1]
    # float() keeps the division below from truncating on Python 2 integers.
    totalRecords = float(rt[0]['counts']['input'])
    if verbose > 0:
        print("calculating percentages")
    for doc in results.find():
        percent = (doc['value']['cnt'] / totalRecords) * 100
        # WARNING: match on '_id.fields', not on the whole {_id: ...} document;
        # matching the full _id did not work, possibly because of a different
        # sub-field order.
        results.update({'_id.fields': doc['_id']['fields']},
                       {"$set": {"value.percent": percent}}, safe=True, multi=False)
    if verbose > 0:
        print("creating intexes")
    results.ensure_index('_id.type', background=True)
    if doMeta:
        rtMeta = MRfieldsMeta(rt, verbose=verbose)
        return rt, rtMeta
    return rt
Exemple #3
0
def MRfieldsMeta(mr_keys_results, verbose=2):
    """Run the 'MetaMapKeys' / 'MetaReduceKeys' pass over a field-keys result.

    mr_keys_results is indexed as [0] = statistics (its ['counts']['output']
    value is handed to the JS as scope parameter 'totalRecords') and
    [1] = result collection. Only documents with '_id.type' == 'fieldsGrp'
    are processed, and the output is reduced back into that same collection.

    Returns whatever MRsimple returns for this pass.
    """
    meta_map = parseJSfunFromFile(PATH_JS, 'MetaMapKeys')
    meta_reduce = parseJSfunFromFile(PATH_JS, 'MetaReduceKeys')
    output_count = mr_keys_results[0]['counts']['output']
    keys_collection = mr_keys_results[1]
    # Reduce into the collection being read, so the results are merged in place.
    out_spec = {'reduce': keys_collection.name}
    meta_scope = {'parms': {'totalRecords': output_count}}
    return MRsimple(keys_collection, meta_map, FunReduce=meta_reduce,
                    query={'_id.type': 'fieldsGrp'}, out=out_spec,
                    scope=meta_scope, verbose=verbose)
Exemple #4
0
def MRorphans(colA,
              colAfld,
              colAq,
              colB,
              colBfld,
              colBq,
              out='mr_orphans',
              jsMode=False,
              verbose=3):
    """Set-like comparison of two collections on a common field.

    Map-reduces colA (filtered by colAq, keyed on colAfld) into the
    collection named `out` with out action 'replace', then map-reduces colB
    (filtered by colBq, keyed on colBfld) into the same collection with out
    action 'reduce'. Both passes use the 'MapOrphans' / 'ReduceOrphans'
    JavaScript loaded from PATH_JS and share a scope dict whose 'phase'
    entry is 1 for the first pass and 2 for the second.

    Per the original author's notes (the JS itself is not visible here),
    each result document holds value.A = count of documents in A,
    value.B = count of documents in B and value.sum = A + B, so that:

      documents missing from colA:
          resultCollection.find({'value.A': 0})
      documents present in both colA and colB:
          resultCollection.find({'value.A': {'$gt': 0}, 'value.B': {'$gt': 0}})
      or, if keys are unique within each collection:
          resultCollection.find({'value.sum': 2})

    Returns the tuple (result collection, MapReduce1 results,
    MapReduce2 results).

    Call example:
        MRorphans(bof.TstatusesSrc, '_id', {}, colB=ag13, colBfld='_id.vl',
                  colBq={'_id.kt': 'src', '_id.idsu': ''}, out='mr_join',
                  jsMode=False, verbose=3)
    """
    map_tpl = parseJSfunFromFile(PATH_JS, 'MapOrphans')
    reduce_fn = parseJSfunFromFile(PATH_JS, 'ReduceOrphans')
    shared_scope = {'phase': 1}

    def run_pass(col, fld, qry, out_action):
        # Sort on the key field only when the collection reports it as indexed.
        ordering = {fld: 1} if fld in collectionIndexedFields(col) else None
        return MRsimple(col,
                        map_tpl % fld,
                        FunReduce=reduce_fn,
                        query=qry,
                        out={
                            'db': 'local',
                            out_action: out
                        },
                        finalize=None,
                        scope=shared_scope,
                        sort=ordering,
                        verbose=verbose,
                        jsMode=jsMode)

    first = run_pass(colA, colAfld, colAq, 'replace')
    shared_scope['phase'] = 2
    second = run_pass(colB, colBfld, colBq, 'reduce')
    return second[1], first[0], second[0]
Exemple #5
0
def MRJoin(colA, colAfld, colAq, colB, colBfld, colBq, out='mr_join', jsMode=False, verbose=3):
    """Join two collections on a common field with two map-reduce passes.

    Pass 1 map-reduces colA (filtered by colAq, keyed on colAfld) with out
    action 'replace' on the collection named `out`; pass 2 map-reduces colB
    (filtered by colBq, keyed on colBfld) with out action 'reduce' on the
    same collection. Both passes use the 'MapJoin' / 'ReduceJoin' JavaScript
    loaded from PATH_JS and share a scope dict whose 'phase' entry is 1,
    then 2.

    Returns the tuple (mr2[1], mr1[0], mr2[0]): the second pass's result
    collection followed by the first element of each pass's MRsimple result.
    """
    join_map = parseJSfunFromFile(PATH_JS, 'MapJoin')
    join_reduce = parseJSfunFromFile(PATH_JS, 'ReduceJoin')
    phase_scope = {'phase': 1}

    def run_pass(col, fld, qry, out_action):
        # Sort on the key field only when the collection reports it as indexed.
        ordering = {fld: 1} if fld in collectionIndexedFields(col) else None
        return MRsimple(col, join_map % fld, FunReduce=join_reduce, query=qry,
                        out={'db': 'local', out_action: out},
                        finalize=None, scope=phase_scope, sort=ordering,
                        verbose=verbose, jsMode=jsMode)

    first = run_pass(colA, colAfld, colAq, 'replace')
    phase_scope['phase'] = 2
    second = run_pass(colB, colBfld, colBq, 'reduce')
    return second[1], first[0], second[0]
Exemple #6
0
def tmpEduMR(colA, query=None, verbose=2):
    """Run the 'map_closest' / 'red_closest' map-reduce over colA.

    Parameters:
        colA:    collection to process.
        query:   filter passed to MRsimple; None (default) means {}.
        verbose: forwarded to MRsimple.

    The out spec is {'db': 'edu', 'replace': 'mr_closest'}.
    Returns whatever MRsimple returns.
    """
    # Build the default per call instead of sharing one dict literal
    # from the signature across calls.
    if query is None:
        query = {}
    map_closest = parseJSfunFromFile(PATH_JS, 'map_closest')
    red_closest = parseJSfunFromFile(PATH_JS, 'red_closest')
    return MRsimple(colA,
                    map_closest,
                    FunReduce=red_closest,
                    query=query,
                    out={
                        'db': 'edu',
                        'replace': 'mr_closest'
                    },
                    finalize=None,
                    verbose=verbose)
Exemple #7
0
def MRfields(collection,
             query=None,
             out=None,
             verbose=2,
             doMeta=False,
             scope=None):
    """Identify all field names used by the documents of a collection.

    Runs the 'MapKeys' / 'ReduceKeys' JavaScript functions (loaded from
    PATH_JS) through MRsimple, then annotates every result document with
    value.percent = value.cnt / <number of input documents> * 100 and
    ensures a background index on '_id.type' in the result collection.

    Parameters:
        collection: collection to analyse.
        query:      filter passed to MRsimple; None (default) means {}.
        out:        map-reduce out spec; None (default) means
                    {"replace": 'mrfields.tmp'}.
        verbose:    progress messages are printed when > 0; also forwarded
                    to MRsimple (and to MRfieldsMeta when doMeta is set).
        doMeta:     when true, MRfieldsMeta is additionally run on the result.
        scope:      scope passed to MRsimple; None (default) means
                    {'parms': {'levelMax': -1, 'inclHeaderKeys': True,
                               'ExamplesMax': 2}}.

    Returns:
        The MRsimple result `rt` (rt[0] = statistics, rt[1] = result
        collection), or the tuple (rt, rtMeta) when doMeta is true.
    """
    # Build the defaults per call: dict literals in the signature would be
    # shared (and could be mutated) across calls.
    if query is None:
        query = {}
    if out is None:
        out = {"replace": 'mrfields.tmp'}
    if scope is None:
        scope = {
            'parms': {
                'levelMax': -1,
                'inclHeaderKeys': True,
                'ExamplesMax': 2
            }
        }
    FunMap = parseJSfunFromFile(PATH_JS, 'MapKeys')
    FunReduce = parseJSfunFromFile(PATH_JS, 'ReduceKeys')
    rt = MRsimple(collection,
                  FunMap,
                  FunReduce=FunReduce,
                  query=query,
                  out=out,
                  scope=scope,
                  verbose=verbose)
    results = rt[1]
    # float() keeps the division below from truncating on Python 2 integers.
    totalRecords = float(rt[0]['counts']['input'])
    if verbose > 0:
        print("calculating percentages")
    for doc in results.find():
        percent = (doc['value']['cnt'] / totalRecords) * 100
        # WARNING: match on '_id.fields', not on the whole {_id: ...} document;
        # matching the full _id did not work, possibly because of a different
        # sub-field order.
        results.update({'_id.fields': doc['_id']['fields']},
                       {"$set": {
                           "value.percent": percent
                       }},
                       safe=True,
                       multi=False)
    if verbose > 0:
        print("creating intexes")
    results.ensure_index('_id.type', background=True)
    if doMeta:
        rtMeta = MRfieldsMeta(rt, verbose=verbose)
        return rt, rtMeta
    return rt
Exemple #8
0
def MRfieldsMeta(mr_keys_results, verbose=2):
    """Run the 'MetaMapKeys' / 'MetaReduceKeys' pass over a field-keys result.

    mr_keys_results is indexed as [0] = statistics (its ['counts']['output']
    value is handed to the JS as scope parameter 'totalRecords') and
    [1] = result collection. Only documents with '_id.type' == 'fieldsGrp'
    are processed, and the output is reduced back into that same collection.

    Returns whatever MRsimple returns for this pass.
    """
    map_js = parseJSfunFromFile(PATH_JS, 'MetaMapKeys')
    reduce_js = parseJSfunFromFile(PATH_JS, 'MetaReduceKeys')
    n_output = mr_keys_results[0]['counts']['output']
    target = mr_keys_results[1]
    options = {
        'FunReduce': reduce_js,
        'query': {'_id.type': 'fieldsGrp'},
        # Reduce into the collection being read, merging results in place.
        'out': {'reduce': target.name},
        'scope': {'parms': {
            'totalRecords': n_output
        }},
        'verbose': verbose,
    }
    return MRsimple(target, map_js, **options)
Exemple #9
0
def MRJoin(colA,
           colAfld,
           colAq,
           colB,
           colBfld,
           colBq,
           out='mr_join',
           jsMode=False,
           verbose=3):
    """Join two collections on a common field with two map-reduce passes.

    Pass 1 map-reduces colA (filtered by colAq, keyed on colAfld) with out
    action 'replace' on the collection named `out`; pass 2 map-reduces colB
    (filtered by colBq, keyed on colBfld) with out action 'reduce' on the
    same collection. Both passes use the 'MapJoin' / 'ReduceJoin' JavaScript
    loaded from PATH_JS and share a scope dict whose 'phase' entry is 1,
    then 2.

    Returns the tuple (mr2[1], mr1[0], mr2[0]): the second pass's result
    collection followed by the first element of each pass's MRsimple result.
    """
    map_template = parseJSfunFromFile(PATH_JS, 'MapJoin')
    reduce_js = parseJSfunFromFile(PATH_JS, 'ReduceJoin')
    # A single scope object is handed to both passes; only 'phase' changes.
    scope = {'phase': 1}
    passes = (
        (colA, colAfld, colAq, 'replace'),
        (colB, colBfld, colBq, 'reduce'),
    )
    runs = []
    for phase, (col, fld, qry, out_action) in enumerate(passes, 1):
        scope['phase'] = phase
        if fld in collectionIndexedFields(col):
            sort = {fld: 1}
        else:
            sort = None
        runs.append(
            MRsimple(col,
                     map_template % fld,
                     FunReduce=reduce_js,
                     query=qry,
                     out={
                         'db': 'local',
                         out_action: out
                     },
                     finalize=None,
                     scope=scope,
                     sort=sort,
                     verbose=verbose,
                     jsMode=jsMode))
    mr1, mr2 = runs
    return mr2[1], mr1[0], mr2[0]
Exemple #10
0
def tmpEduMR(colA, query=None, verbose=2):
    """Run the 'map_closest' / 'red_closest' map-reduce over colA.

    Parameters:
        colA:    collection to process.
        query:   filter passed to MRsimple; None (default) means {}.
        verbose: forwarded to MRsimple.

    The out spec is {'db': 'edu', 'replace': 'mr_closest'}.
    Returns whatever MRsimple returns.
    """
    # Build the default per call instead of sharing one dict literal
    # from the signature across calls.
    if query is None:
        query = {}
    map_closest = parseJSfunFromFile(PATH_JS, 'map_closest')
    red_closest = parseJSfunFromFile(PATH_JS, 'red_closest')
    return MRsimple(colA, map_closest, FunReduce=red_closest, query=query,
                    out={'db': 'edu', 'replace': 'mr_closest'}, finalize=None, verbose=verbose)