def _get_code_w_scope(data, view, position, obj_end, opts, element_name):
    """Decode a BSON code_w_scope to bson.code.Code.

    The int unpacked at ``position`` is added to ``position`` to obtain the
    end offset of the element.  A string is then decoded starting four bytes
    past ``position``, followed by an object, both bounded by that end
    offset.  ``obj_end`` is accepted but not consulted here.

    Returns:
        A ``(Code, position)`` tuple; ``position`` is the offset reached
        after decoding the scope.

    Raises:
        InvalidBSON: if decoding the scope does not stop exactly at the
            computed end offset.
    """
    declared_size = _UNPACK_INT_FROM(data, position)[0]
    end = position + declared_size

    js_source, cursor = _get_string(
        data, view, position + 4, end, opts, element_name)
    js_scope, cursor = _get_object(
        data, view, cursor, end, opts, element_name)

    if cursor == end:
        return Code(js_source, js_scope), cursor
    raise InvalidBSON('scope outside of javascript code boundaries')
def test_code(self):
    """Exercise Code: str helpers, type identity and the mutable scope."""
    plain = "hello world"
    wrapped = Code("hello world")

    # The str prefix/suffix helpers work on a Code instance.
    for check in (wrapped.startswith("hello"), wrapped.endswith("world")):
        self.assertTrue(check)

    # Only the wrapped value is a Code; the plain string is not.
    self.assertTrue(isinstance(wrapped, Code))
    self.assertFalse(isinstance(plain, Code))

    # The scope starts out empty and can be filled in place.
    self.assertEqual(wrapped.scope, {})
    wrapped.scope["my_var"] = 5
    self.assertEqual(wrapped.scope, {"my_var": 5})
def analyze_tweets():
    """Run the tweets map/reduce when needed and dump the top 50 rows as JSON.

    If ``mapcollection`` is not among the database's collection names, the
    map and reduce JavaScript sources are read from the ``mapReduce/``
    directory and ``collection.map_reduce`` is run with ``mapcollection`` as
    the output.  The 50 documents of ``mapcollection`` with the highest
    ``value`` are then written to ``output`` as a JSON array.
    """
    if mapcollection not in db.collection_names():
        # Context managers close the script files even if reading fails.
        with open('mapReduce/mapFunctionTweets.js', 'r') as map_file:
            map_function = Code(map_file.read())
        with open('mapReduce/reduceFunctionTweets.js', 'r') as reduce_file:
            reduce_function = Code(reduce_file.read())
        collection.map_reduce(map_function, reduce_function, out=mapcollection)

    top_docs = db[mapcollection].find().sort([('value', -1)]).limit(50)

    # Text mode: everything written is str (json.dumps output), which a
    # binary-mode handle rejects on Python 3.
    with open(output, 'w') as json_file:
        json_file.write('[')
        json_file.write(','.join(
            json.dumps(doc, indent=2, default=json_util.default)
            for doc in top_docs
        ))
        json_file.write(']')
def get_statistics(self, request, **kwargs):
    """Build per-day submission statistics for one repository.

    Runs a map/reduce over ``db.data`` (restricted to the repository whose
    id is given as the ``mongo_id`` keyword argument) that counts documents
    per UTC day of their ``timestamp``, and returns a response containing
    the total count, the per-day counts and the first/last documents by
    ``_id``.
    """
    # Emits one {count: 1} per document, keyed by the UTC day of `timestamp`.
    day_map = Code(
        " function() {"
        " day = Date.UTC( this.timestamp.getFullYear(),"
        " this.timestamp.getMonth(),"
        " this.timestamp.getDate() );"
        " emit( { day: day }, { count: 1 } ) }"
    )
    # Adds up the `count` fields of all values sharing a key.
    day_reduce = Code(
        " function( key, values ) {"
        " var count = 0;"
        " values.forEach( function( v ) { count += v[ 'count' ]; });"
        " return {count: count}; }"
    )

    # Restrict every query below to the requested repository.
    repo_filter = {'repo': ObjectId(kwargs.get('mongo_id'))}
    cursor = db.data.find(repo_filter)
    first = db.data.find_one(repo_filter, sort=[('_id', pymongo.ASCENDING)])
    last = db.data.find_one(repo_filter, sort=[('_id', pymongo.DESCENDING)])

    result = db.data.map_reduce(
        day_map, day_reduce, "myresults", query=repo_filter)
    count_by_day = [
        {'day': doc['_id']['day'], 'value': doc['value']['count']}
        for doc in result.find()
    ]

    stats = {
        'total_count': cursor.count(),
        'count_by_day': count_by_day,
        'first_submission': first,
        'last_submission': last,
    }
    return self.create_response(request, stats)
def get_phrase_heading_counts(phrase, speakername=None, how_many=25,
                              from_date=None, to_date=None):
    """Yield headings ranked by number of occurrences of a phrase.

    Documents in ``db.phrases`` matching ``phrase`` (optionally narrowed by
    a date range, applied only when both ``from_date`` and ``to_date`` are
    given, and by ``speakername``) are grouped by the first 64 characters of
    their heading title plus date.  The ``how_many`` groups with the highest
    counts are yielded in descending order.
    """
    criteria = {"phrase": phrase}
    if from_date and to_date:
        criteria["date"] = {"$gte": from_date, "$lte": to_date}
    if speakername:
        criteria["speakername"] = speakername

    map_js = Code(
        "function () {"
        " emit(this.headingtitle.substring(0,64) + ' ('+ this.date + ')',1);"
        "}")
    reduce_js = Code("function (key, values) {"
                     " return Array.sum(values)"
                     "}")

    out_collection = db.phrases.map_reduce(
        map_js, reduce_js, "results", query=criteria)
    ranked = out_collection.find().sort("value", -1).limit(how_many)
    for row in ranked:
        yield row
def mapReduce():
    """Count tweets per ``via`` value and print the result.

    Runs a map/reduce over ``client.Corpus.tweets`` restricted to documents
    with ``sentiment == 2`` (at most 10 input documents), writing to the
    ``via_count`` collection, then prints the map_reduce return value and
    every document of ``via_count``.
    """
    db = client.Corpus
    tweets = db.tweets

    map_js = Code("function() {emit(this.via,1);}")
    # The reducer must return the same type the mapper emits (a number) and
    # must give the same answer when fed its own output; summing the values
    # satisfies both, whereas counting them and wrapping the count in an
    # object does not.
    reduce_js = Code("function(key, values) { return Array.sum(values); }")

    result = tweets.map_reduce(map_js, reduce_js, "via_count",
                               query={"sentiment": 2}, limit=10)
    # Function-call form works on both Python 2 and Python 3.
    print(result)
    for doc in db.via_count.find():
        print(doc)
def get_phrases_containing(fragment, how_many=25, from_date=None,
                           to_date=None, speakername=None):
    """Yield phrases containing some text, most frequent first.

    ``fragment`` must appear delimited by whitespace or by the start/end of
    the phrase; matching is case-insensitive.  The date filter is applied
    only when both ``from_date`` and ``to_date`` are given.  At most
    ``how_many`` result documents are yielded, ordered by descending count.
    """
    # Raw literals keep ``\s`` as a regex escape instead of relying on an
    # invalid string escape sequence.
    # NOTE(review): ``fragment`` is interpolated unescaped, so regex
    # metacharacters in it are interpreted -- confirm that callers rely on
    # this before wrapping it in re.escape().
    pattern = re.compile(r"(^|\s)(" + fragment + r")($|\s)", re.IGNORECASE)

    query = {"phrase": pattern}
    if from_date and to_date:
        query["date"] = {"$gte": from_date, "$lte": to_date}
    if speakername:
        query["speakername"] = speakername

    map_js = Code("function () {"
                  " emit(this.phrase,1);"
                  "}")
    reduce_js = Code("function (key, values) {"
                     " return Array.sum(values)"
                     "}")

    results = db.phrases.map_reduce(map_js, reduce_js, "results", query=query)
    for doc in results.find().sort("value", -1).limit(how_many):
        yield doc
def group(self, keys, initial, reduce, condition=None, finalize=None, **kwargs):
    """Issue a ``group`` command for this collection.

    ``reduce`` (and ``finalize`` when given) are wrapped in ``Code``.  When
    ``keys`` is a byte or unicode string it is sent as a ``$keyf`` Code;
    otherwise it is normalised into a field projection and sent as ``key``.
    ``condition`` is sent as ``cond`` when truthy.  Extra keyword arguments
    are forwarded to the database ``command`` call.

    NOTE(review): the body yields the command result and hands it to
    ``defer.returnValue`` -- presumably this runs under an inlineCallbacks
    decorator applied outside this view; confirm.
    """
    command_body = {
        "ns": self._collection_name,
        "initial": initial,
        "$reduce": Code(reduce),
    }

    if not isinstance(keys, (bytes, unicode)):
        command_body["key"] = self._normalize_fields_projection(keys)
    else:
        command_body["$keyf"] = Code(keys)

    if condition:
        command_body["cond"] = condition
    if finalize:
        command_body["finalize"] = Code(finalize)

    reply = yield self._database.command("group", command_body, **kwargs)
    defer.returnValue(reply)
def snippet(self, _input):
    """Wrap a JavaScript source string in a ``Code`` object.

    Args:
        _input(:obj:`str`): JavaScript string.

    Return:
        (:obj:`bson.code.Code`)
    """
    wrapped = Code(_input)
    return wrapped
def object_hook(dct, compile_re=True):
    """Convert an extended-JSON style dict into the matching Python value.

    The first recognised marker key wins, in this order: ``$oid``,
    ``$numberLong``, ``$decimal``, ``$ref``, ``$date``, ``$timestamp``,
    ``$regex``, ``$minKey``, ``$maxKey``, ``$binary``, ``$code``, ``$uuid``.
    A dict without any marker is returned unchanged.

    Args:
        dct: the decoded JSON object.
        compile_re: when true ``$regex`` yields a compiled ``re`` pattern,
            otherwise a ``Regex`` instance.
    """
    if "$oid" in dct:
        return ObjectId(str(dct["$oid"]))
    if "$numberLong" in dct:
        return int(dct["$numberLong"])
    if "$decimal" in dct:
        v = str(dct["$decimal"])
        if "$precision" in dct:
            precision = dct["$precision"][0]
            scale = dct["$precision"][1]
            return Decimal(v, precision, scale)
        return Decimal(v)
    if "$ref" in dct:
        return DBRef(dct["$ref"], dct["$id"], dct.get("$db", None))
    if "$date" in dct:
        try:
            # Numeric form: milliseconds relative to EPOCH_AWARE.
            secs = float(dct["$date"]) / 1000.0
            return EPOCH_AWARE + datetime.timedelta(seconds=secs)
        except ValueError:
            # Fallback: a plain "YYYY-MM-DD" string.
            return datetime.datetime.strptime(dct["$date"], "%Y-%m-%d")
    if "$timestamp" in dct:
        try:
            ms = long_type(dct["$timestamp"])
            # Floor division keeps the first argument an integer on
            # Python 3 as well (true division would produce a float).
            return Timestamp(ms // 1000, ms % 1000 * 1000)
        except ValueError:
            dt = datetime.datetime.strptime(dct["$timestamp"],
                                            "%Y-%m-%d-%H.%M.%S.%f")
            secs = long_type(time.mktime(dt.timetuple()))
            return Timestamp(secs, dt.microsecond)
    if "$regex" in dct:
        flags = 0
        # PyMongo always adds $options but some other tools may not.
        for opt in dct.get("$options", ""):
            flags |= _RE_OPT_TABLE.get(opt, 0)
        if compile_re:
            return re.compile(dct["$regex"], flags)
        return Regex(dct["$regex"], flags)
    if "$minKey" in dct:
        return MinKey()
    if "$maxKey" in dct:
        return MaxKey()
    if "$binary" in dct:
        # int() accepts both the integer and the decimal-string form, so the
        # caller's dict does not need to be rewritten first.
        subtype = int(dct["$type"])
        return Binary(base64.b64decode(dct["$binary"].encode()), subtype)
    if "$code" in dct:
        return Code(dct["$code"], dct.get("$scope"))
    if bson.has_uuid() and "$uuid" in dct:
        return bson.uuid.UUID(dct["$uuid"])
    return dct
def getEndangeredSpecies():
    """Show the endangered-species total (and examples) for a continent.

    The continent is read from ``endangeredEntry``.  When it is the empty
    string, the ``animals_endangered`` values of the whole
    ``endangered_species`` collection are summed with an inline map/reduce
    and only the total label is shown.  Otherwise the total and the example
    list of the matching document(s) are shown in two labels.
    """
    global client, db, continentEntry
    continent = endangeredEntry.get()
    total = 0
    example_names = []
    species = db.get_collection("endangered_species")

    projection = {"_id": 0, "animals_endangered": 1, "examples": 2}
    # If several documents match, the values of the last one are kept.
    for entry in species.find({"continent": continent}, projection):
        total = entry["animals_endangered"]
        example_names = entry["examples"]

    if continent == "":
        map_js = Code("function () {"
                      " emit('endangered', this.animals_endangered); "
                      "}")
        reduce_js = Code("function(key, values) {"
                         " return Array.sum(values);"
                         "}")
        inline = species.map_reduce(map=map_js, reduce=reduce_js,
                                    out=SON([('inline', 1)]))
        total = inline["results"][0]["value"]
        total_label = Label(window, text="Total endangered species: " + str(int(total)))
        total_label.grid(column=3, row=7)
        return

    examples_text = "counting: " + ", ".join(example_names)
    total_label = Label(window, text="Total endangered species: " + str(int(total)))
    total_label.grid(column=3, row=7)
    examples_label = Label(window, text=examples_text)
    examples_label.grid(column=3, row=8)
def rq1_mapred_handler(event, context):
    """Run the two "rq1" map/reduce jobs and return the timed result.

    The first job writes, per ``OwnerUserId``, the number of posts with
    ``PostTypeId == 1`` into the ``rq1`` collection.  The second job emits
    ``0`` for every user ``Id`` and reduces into the same collection.  The
    contents of ``rq1`` and the elapsed time are passed to
    ``make_response``.
    """
    started = timeit.default_timer()
    db = MongoClient(MONGO_SERVER, MONGO_PORT).stackoverflow

    posts_map = Code(" function () { if (this.PostTypeId == 1) { emit(this.OwnerUserId, 1); } } ")
    posts_reduce = Code(" function (key, values) { return Array.sum(values); } ")
    db.posts.map_reduce(posts_map, posts_reduce, out='rq1')

    users_map = Code(" function () { emit(this.Id, 0); } ")
    users_reduce = Code(" function (key, values) { return Array.sum(values); } ")
    # Output action "reduce": results are reduced into the existing rq1 rows.
    db.users.map_reduce(users_map, users_reduce, out={'reduce': 'rq1'})

    rows = list(db.rq1.find())
    elapsed = timeit.default_timer() - started
    return make_response(rows, elapsed)
def execute(trial=False):
    """Compute average earnings per postal code and report the run times.

    Authenticates against ``client.repo``, recreates the
    ``jzhou94_katerin.avg_earnings`` collection and fills it with a
    map/reduce whose mapper only emits for "Police Officer" rows with a
    postal code strictly between 2100 and 2300.  With ``trial`` equal to
    ``True`` the job runs over 100 documents copied from
    ``employee_earnings``; otherwise it runs over ``employee_earnings``
    itself.

    Returns:
        dict with the ``start`` and ``end`` datetimes of the run.
    """
    print("starting data retrieval")
    startTime = datetime.datetime.now()

    repo = dml.pymongo.MongoClient().repo
    print("repo: ", repo)
    repo.authenticate('jzhou94_katerin', 'jzhou94_katerin')

    mapper = Code(
        'function() {'
        ' if (this.postal*1 > 2100 && this.postal*1 < 2300 && this.title == "Police Officer")'
        ' emit(this.postal, {tot:this.total_earnings, n: 1, avg: this.total_earnings});'
        ' }'
    )
    reducer = Code(
        'function(k, vs) {'
        ' var total = 0;'
        ' var counts = 0;'
        ' for (var i = 0; i < vs.length; i++) total += (vs[i].tot*1);'
        ' for (var i = 0; i < vs.length; i++) counts += vs[i].n;'
        ' return {tot:total.toFixed(2), n: counts, avg: (total/counts).toFixed(2)};'
        ' }'
    )

    repo.dropPermanent('jzhou94_katerin.avg_earnings')
    repo.createPermanent('jzhou94_katerin.avg_earnings')

    if trial == True:
        # Trial run: seed the output collection with a 100-document sample
        # and map/reduce over that sample.
        sample = repo.jzhou94_katerin.employee_earnings.find().limit(100)
        repo['jzhou94_katerin.avg_earnings'].insert(sample)
        source = repo.jzhou94_katerin.avg_earnings
    else:
        source = repo.jzhou94_katerin.employee_earnings
    source.map_reduce(mapper, reducer, 'jzhou94_katerin.avg_earnings')

    repo.logout()
    endTime = datetime.datetime.now()
    return {"start": startTime, "end": endTime}
def get_online_report():
    """Get the online report.

    Counts, per user id, the ``gps_info`` records whose ``time`` is on or
    after the first day of the current month, then returns the flattened
    ``user_info`` documents of those users, each extended with a ``count``
    key and ordered by descending count.
    """
    gps_conn = get_conn("gps_info")
    today = datetime.datetime.now()
    month_start = get_datetime_from_str(
        "{}-{}-1 0:0:0".format(today.year, today.month))

    time_query = {"time": {"$gte": month_start}}
    time_sort = {"time": -1}
    # Collection that stores the results; every map_reduce run empties it
    # beforehand.
    out_name = "online_report_result"

    map_func = Code(" function(){ emit(this.user_id.$id, 1); } ")
    reduce_func = Code(" function(key, values){ return Array.sum(values); } ")
    result_conn = gps_conn.map_reduce(map=map_func, reduce=reduce_func,
                                      query=time_query, sort=time_sort,
                                      out=out_name, full_response=False)

    count_dict = {}
    for row in result_conn.find(filter=dict()):
        count_dict[row['_id']] = int(row['value'])

    user_conn = get_conn("user_info")
    users = user_conn.find(filter={"_id": {"$in": list(count_dict)}},
                           sort=[("last_update", -1)])

    report = []
    for user in users:
        uid = user['_id']
        flat = to_flat_dict(user)
        flat['count'] = count_dict[uid]
        report.append(flat)

    report.sort(key=lambda item: item['count'], reverse=True)
    return report
def get(self):
    """Return product counts grouped by creation month and nutrient levels.

    Runs a map/reduce over ``mongo.db.products`` into ``stats_products``.
    Each product is keyed by the year and month of ``created_t`` (taken as
    seconds since the epoch) together with its salt, fat, saturated-fat and
    sugars levels (``null`` when ``nutrient_levels`` is absent).

    Returns:
        list of dicts with keys ``dateyear``, ``datemonth``, ``count``,
        ``saltlevels``, ``saturatedfatlevels``, ``sugarslevels`` and
        ``fatlevels``.
    """
    map_js = Code(
        "function () {"
        " var date = new Date(this.created_t*1000);"
        " var parsedDateMonth = date.getMonth();"
        " var parsedDateYear = date.getFullYear();"
        " var saltLevelsValue = null;"
        " var fatLevelsValue = null;"
        " var saturatedfatLevelsValue = null;"
        " var sugarsLevelsValue = null;"
        # getMonth() is 0 for January, so a falsy test would drop every
        # January product; only skip documents whose date is invalid.
        " if(isNaN(parsedDateYear) || isNaN(parsedDateMonth)) return;"
        " if(this.hasOwnProperty('nutrient_levels')) {"
        " saltLevelsValue = this.nutrient_levels.salt;"
        " fatLevelsValue = this.nutrient_levels.fat;"
        " saturatedfatLevelsValue = this['nutrient_levels']['saturated-fat'];"
        " sugarsLevelsValue = this.nutrient_levels.sugars;"
        " } else { saltLevelsValue = null; fatLevelsValue = null; saturatedfatLevelsValue = null, sugarsLevelsValue = null}"
        " emit({year : parsedDateYear, month : parsedDateMonth, saltlevels : saltLevelsValue, fatlevels : fatLevelsValue, saturatedfatlevels : saturatedfatLevelsValue, sugarslevels : sugarsLevelsValue}, {count:1});"
        "};")
    reduce_js = Code(
        "function (key, values) {"
        " var count = 0;"
        " var ret = {count : 0};"
        " for (index in values) {"
        " ret.count += values[index].count;"
        " }"
        " return ret;"
        " };")

    result = mongo.db.products.map_reduce(map_js, reduce_js, "stats_products")

    rows = []
    for doc in result.find():
        key = doc['_id']
        rows.append({
            'dateyear': key['year'],
            'datemonth': key['month'],
            'count': doc['value']['count'],
            'saltlevels': key['saltlevels'],
            'saturatedfatlevels': key['saturatedfatlevels'],
            'sugarslevels': key['sugarslevels'],
            'fatlevels': key['fatlevels'],
        })
    return rows
def word_count(coll):
    """Count words in the ``text`` field of matching documents and print them.

    Runs a map/reduce over ``coll`` (restricted by the ``sender`` query
    below) into the ``word_cnt`` collection and prints each result document.
    """
    # NOTE(review): the three replace() patterns in the mapper are plain
    # strings, not regular expressions, so they look like they only match
    # those literal character sequences -- confirm the intended clean-up.
    mapper = Code(
        ' function () {'
        ' txt=this.text ;'
        ' wrds = txt.split(" ");'
        ' for ( i=0 ; i < wrds.length ; i++) {'
        ' if ( wrds[i].length >1 ) {'
        """ wrd=wrds[i].toLowerCase().replace('"|,',' ');"""
        " wrd=wrd.replace('(\\n)+','');"
        " wrd=wrd.replace('.|;|,)/gi','');"
        ' emit(wrd, 1 );'
        ' } } } '
    )
    # The reducer must return the same type the mapper emits (a number) and
    # stay correct when fed its own partial results, so sum the values
    # instead of counting them into a wrapper object.
    reducer = Code(" function (key, values) { return Array.sum(values); } ")

    print("\n\n")
    pprint.pprint(coll)
    result = coll.map_reduce(mapper, reducer, "word_cnt", query={
        "sender": "*****@*****.**"
    }).find()
    for doc in result:
        print(doc)
def _map_reduce(self, coll, mapreduce, spec=None):
    """
    Run a stored map/reduce pair over the given collection.

    The pair is looked up by name in ``self.mrcol``; ``spec``, when truthy,
    is passed to the map/reduce call as its ``query``.  The resulting
    collection is returned.  Raises ``Exception`` when no record with that
    name exists.
    """
    self.logger.debug("(%s, %s)" % (mapreduce, spec))

    record = find_one(self.mrcol, {'name': mapreduce})
    if not record:
        raise Exception("Map/reduce function '%s' not found" % mapreduce)

    fmap, freduce = record['map'], record['reduce']
    # Only forward a query when a spec was actually supplied.
    options = {'query': spec} if spec else {}
    result = coll.map_reduce(Code(fmap), Code(freduce), **options)

    self.logger.info("found %s records in %s" % (result.count(), result.name))
    for source in (fmap, freduce):
        self.logger.debug(source)
    return result
def photographer_shots_count():
    """Return the number of photos per photographer.

    Runs a map/reduce over ``mongo_client.photo.photo`` into
    ``photographer_phs`` and resolves each key to a photographer name via
    ``Photographer.find_by_id``.

    Returns:
        list of ``{'name': ..., 'count': ...}`` dicts.
    """
    mapper = Code(" function() { emit(this.photographer, 1); } ")
    reducer = Code(" function(k, vs) { return Array.sum(vs); } ")

    out_collection = mongo_client.photo.photo.map_reduce(
        mapper, reducer, "photographer_phs")
    return [
        {
            'name': Photographer.find_by_id(row['_id'])['name'],
            'count': row['value'],
        }
        for row in out_collection.find()
    ]
def updateDocFreq(patDB, outColName='corpusDict'):
    """Run the document-frequency map/reduce over ``patDB.patns``.

    The map, reduce and finalize JavaScript sources are read from files in
    the current working directory.  Every ``TOTALDOCS`` placeholder in the
    finalize script is replaced by the current document count of
    ``patDB.patns``, and the job output is reduced into ``outColName``.
    """
    # Context managers make sure each script file is closed again.
    with open('fresh_docFreqMap.js') as map_file:
        map_fn = Code(map_file.read())
    with open('fresh_docFreqReduce.js') as reduce_file:
        reduce_fn = Code(reduce_file.read())

    # The finalize step computes IDF scores and therefore needs the total
    # number of documents; it is passed in by textual substitution.
    size = patDB.patns.count()
    with open('docFreqFinalize.js') as finalize_file:
        finIDF = finalize_file.read().replace('TOTALDOCS', str(size))

    # NOTE(review): 'out' was switched from replacing to reducing into the
    # output collection; the original author was unsure this combines
    # correctly with the way docFreqFinalize works -- confirm.
    patDB.patns.map_reduce(map_fn, reduce_fn,
                           out={'reduce': outColName},
                           finalize=finIDF)
def setUp(self):
    """Prepare the encoder, the expected mapping and a matching Code."""
    from bson.code import Code

    self.encoder = GoodJSONEncoder()
    # Keys mirror the keyword arguments passed to Code below.
    self.expected = dict(
        code="console.log('HAAAAAAAAHHHH!!!')",
        scope=dict(data="test"),
    )
    self.data = Code(**self.expected)
def test__group3(self):
    """Compare ``group`` with an ``$in`` condition on ``foo``."""
    count_reducer = Code(" function(obj, result) {result.count+=1 } ")
    only_first_id = {'foo': {'$in': [self._id1]}}
    self.cmp.compare.group(
        key=['foo'],
        condition=only_first_id,
        initial={"count": 0},
        reduce=count_reducer,
    )
def cutoffSL(doc, cutoff):
    """Return per-length counts for a document as a list of strings.

    Counts the ``Pippies`` records belonging to ``doc`` grouped by their
    ``len`` value, then lists the count for every length from 0 up to the
    largest one seen ('0' where a length is missing) and returns the slice
    ``[4:cutoff]`` of that list.  Returns an empty list when nothing was
    counted.
    """
    map_js = Code("function(){ emit( this.len , { count : 1 } );}")
    reduce_js = Code(
        "function (key, values) { var count = 0; values.forEach(function (v) {count += v.count;}); return {count: count}; }"
    )

    lens = {}
    if Pippies.count() > 0:
        rows = Pippies.map_reduce(
            map_js, reduce_js, 'cutoff sparkline',
            query={'docs': doc._id}).find()
        for row in rows:
            lens[row['_id']] = int(row['value']['count'])

    if not lens:
        return []

    upper = int(max(lens) + 1)
    series = [str(lens.get(length, 0)) for length in xrange(upper)]
    return series[4:cutoff]
def sum(self, field):
    """Sum over the values of the specified field.

    :param field: the field to sum over; use dot-notation to refer to
        embedded document fields

    Returns a future that is resolved with the ``value`` of the first
    inline map/reduce result, or with ``0`` when there are no results.
    """
    map_template = (
        " function() {"
        " var path = '{{~%(field)s}}'.split('.'),"
        " field = this;"
        " for (p in path) {"
        " if (typeof field != 'undefined')"
        " field = field[path[p]];"
        " else"
        " break;"
        " }"
        " if (field && field.constructor == Array) {"
        " field.forEach(function(item) {"
        " emit(1, item||0);"
        " });"
        " } else if (typeof field != 'undefined') {"
        " emit(1, field||0);"
        " }"
        " } "
    )
    map_func = map_template % {"field": field}

    reduce_func = Code(
        " function(key, values) {"
        " var sum = 0;"
        " for (var i in values) {"
        " sum += values[i];"
        " }"
        " return sum;"
        " } "
    )

    mr_future = self.inline_map_reduce(map_func, reduce_func)
    future = get_future(self)

    def sum_cb(done):
        # Only the first result matters; no results means a sum of 0.
        nothing = object()
        head = next(iter(done.result()), nothing)
        future.set_result(0 if head is nothing else head.value)

    mr_future.add_done_callback(sum_cb)
    return future
def top_n_map(n):
    """Build the map function emitting the first ``n`` tf-idf values.

    The generated JavaScript reads at most ``n`` entries of the document's
    ``sorted_text`` array and emits their ``'tf-idf'`` values under the
    single key ``"tf-idf"``.

    Args:
        n: number of entries to take; formatted with ``%d``.

    Returns:
        The map function as a ``Code`` object.
    """
    template = (
        " function() {"
        " var to_return = Math.min(%d, this.sorted_text.length);"
        " var out_arr = [];"
        " for (var i = 0; i < to_return; i++) {"
        " out_arr[i] = this.sorted_text[i]['tf-idf'] };"
        ''' emit("tf-idf", {'vals':out_arr}) };'''
    )
    return Code(template % n)
def mapFunction(self):
    """Create the map function, store it on ``self.mapcode`` and return it.

    The JavaScript emits, keyed by ``nppes_provider_state``, an object with
    a count of 1 and the document's submitted-charge and Medicare-payment
    averages.
    """
    source = (
        "function() {"
        " var key = this.nppes_provider_state;"
        " var value = {"
        " count: 1,"
        " claim: this.average_submitted_chrg_amt,"
        " payment: this.average_Medicare_payment_amt"
        " };"
        " emit(key, value);"
        "};"
    )
    self.mapcode = Code(source)
    return self.mapcode
def mapTopDriver(self):
    """Count orders per driver name and print the result documents.

    Runs a map/reduce over ``self.db.order`` into the ``result`` collection,
    keyed by ``driver.name``, and prints the list of result documents.
    """
    mapper = Code(
        " function() {"
        " var key = this.driver.name;"
        " var value = {count : 1};"
        " emit(key, value); }; "
    )
    reducer = Code(
        " function (key, values) {"
        " var count = 0;"
        " for(var i in values){ count += values[i].count; }"
        " return {count: count}; }; "
    )
    result = self.db.order.map_reduce(mapper, reducer, "result")
    res = list(result.find())
    # Function-call form works on both Python 2 and Python 3.
    print(res)
def mapTopCompany(self):
    """Count flights per company name and print the result documents.

    Runs a map/reduce over ``self.db.Flights`` into the ``result``
    collection, keyed by ``Companies.Name``, and prints the list of result
    documents.
    """
    mapper = Code(
        " function() {"
        " var key = this.Companies.Name;"
        " var value = {count : 1};"
        " emit(key, value); }; "
    )
    reducer = Code(
        " function(key, values) {"
        " var count = 0;"
        " for(var i in values) { count += values[i].count; }"
        " return {count: count}; }; "
    )
    result = self.db.Flights.map_reduce(mapper, reducer, 'result')
    res = list(result.find())
    # Function-call form works on both Python 2 and Python 3.
    print(res)
def __init__(self):
    """Create the database handle and the tag-counting map/reduce pair."""
    self.mongo = Database()

    # Emits 1 for every entry of a document's `tags` array.
    map_source = (
        "function() { this.tags.forEach("
        " function(tag) { emit(tag, 1); }"
        " )}"
    )
    # Adds up all values emitted for one tag.
    reduce_source = (
        "function(key, values) { var total = 0;"
        " for (var i=0; i<values.length; i++) { total += values[i]; }"
        " return total;}"
    )

    self.map = Code(map_source)
    self.reduce = Code(reduce_source)
def reduceFunction(self):
    """Create the reduce function, store it on ``self.reducecode``, return it.

    The JavaScript adds up the ``count``, ``claim`` and ``payment`` fields
    of all values received for a key.
    """
    source = (
        "function(keyState, countStVals) {"
        " reduceVal = {count: 0, claim: 0, payment: 0};"
        " for (var provider = 0; provider < countStVals.length; provider++) {"
        " reduceVal.count += countStVals[provider].count;"
        " reduceVal.claim += countStVals[provider].claim;"
        " reduceVal.payment += countStVals[provider].payment;"
        " };"
        " return reduceVal;"
        "};"
    )
    self.reducecode = Code(source)
    return self.reducecode
def test_qop_ne_6(monty_find, mongo_find):
    """``$ne`` against a string must still match a nested Code value."""
    documents = [{"a": [{"b": Code("a")}]}]
    query = {"a.b": {"$ne": "a"}}

    monty_cursor = monty_find(documents, query)
    mongo_cursor = mongo_find(documents, query)

    # The reference result has one match and both backends must agree.
    assert mongo_cursor.count() == 1
    assert monty_cursor.count() == mongo_cursor.count()