def _open(input_description, task=None):
    """Return a :class:`~mongodisco.mongodb_input.MongoWrapper` which wraps
    a cursor selecting just those documents relevant to one particular map
    operation.

    `input_description` is a JSON string describing the documents to find,
    and looks like::

        {
            "inputURI": "mongodb://discomaster.zeroclues.net:27017/test.twitter",
            "keyField": null,
            "query": {
                "$query": {},
                "$min": {"_id": {"$oid": "4fae7a97fa22c41aeb5d78f8"}},
                "$max": {"_id": {"$oid": "4fae7b27fa22c41aeb5d96b5"}}
            },
            "fields": null,
            "sort": null,
            "limit": 0,
            "skip": 0,
            "timeout": false,
            "slave_ok": false
        }

    ``inputURI``, ``query``, ``fields``, ``skip``, ``limit``, ``sort`` and
    ``timeout`` are required (a missing one raises :class:`KeyError`).
    ``slave_ok`` is optional: when it is absent, ``slave_okay`` is not passed
    to ``find`` at all, so the driver's own default applies.

    `task` is accepted for interface compatibility and is not used here.
    """
    parsed = json.loads(input_description, object_hook=object_hook)
    collection = get_collection(parsed['inputURI'])

    find_args = dict(
        spec=parsed['query'],
        fields=parsed['fields'],
        skip=parsed['skip'],
        limit=parsed['limit'],
        sort=parsed['sort'],
        timeout=parsed['timeout'],
    )
    # Descriptions shaped like the documented example carry no "slave_ok"
    # key; indexing it unconditionally raised KeyError for them.
    if 'slave_ok' in parsed:
        find_args['slave_okay'] = parsed['slave_ok']

    return MongoWrapper(collection.find(**find_args))
def __init__(self, stream, params):
    """Configure the output target from the attributes of `params`.

    The instance attributes of `params` are snapshotted into a plain dict and
    the following keys are read from it:

    * ``output_uri`` -- stored as ``self.uri`` and handed to
      ``get_connection`` and ``get_collection``.
    * ``job_output_key`` -- stored as ``self.key_name``; ``"_id"`` if absent.
    * ``job_output_value`` -- stored as ``self.value_name``; ``None`` if
      absent.

    `stream` is part of the signature but is not used in this body.
    """
    # Copy so later lookups work on a plain dict, not the live object.
    settings = dict(params.__dict__)

    self.uri = settings.get("output_uri")
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)

    self.key_name = settings.get("job_output_key", "_id")
    self.value_name = settings.get("job_output_value")
def __init__(self, stream, params):
    """Read the output settings from `params` and open the target collection.

    Attributes of `params` consulted (via a copy of ``params.__dict__``):
    ``output_uri`` (-> ``self.uri``), ``job_output_key`` (-> ``self.key_name``,
    default ``'_id'``) and ``job_output_value`` (-> ``self.value_name``,
    default ``None``).

    ``self.conn`` and ``self.coll`` are whatever ``get_connection`` and
    ``get_collection`` return for that URI.  `stream` is not used here.
    """
    options = {}
    options.update(params.__dict__)

    self.uri = options.get('output_uri')
    # Connection first, then collection, both derived from the same URI.
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)
    self.key_name = options.get('job_output_key', '_id')
    self.value_name = options.get('job_output_value')
def __init__(self, stream, params):
    """Initialise the output writer from the attributes of `params`.

    A copy of ``params.__dict__`` supplies these settings:

    * ``output_uri`` -> ``self.uri``; also passed to ``get_connection`` and
      ``get_collection`` to populate ``self.conn`` and ``self.coll``.
    * ``job_output_key`` -> ``self.key_name`` (default ``'_id'``).
    * ``job_output_value`` -> ``self.value_name`` (default ``None``).
    * ``add_action`` -> ``self.add_action`` (default ``'insert'``).
    * ``add_upsert`` -> ``self.add_upsert`` (default ``False``).

    `stream` is accepted but not used in this body.
    """
    opts = params.__dict__.copy()
    lookup = opts.get

    self.uri = lookup('output_uri')
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)

    self.key_name = lookup('job_output_key', '_id')
    self.value_name = lookup('job_output_value')

    # How documents get written; interpreted elsewhere in the class.
    self.add_action = lookup('add_action', 'insert')
    self.add_upsert = lookup('add_upsert', False)
def __init__(self, params):
    """Initialise the output writer from the ``'mongodb'`` section of `params`.

    `params` must offer ``.get``; its ``'mongodb'`` entry (an empty mapping
    when absent) is copied and these keys are read from the copy:

    * ``output_uri`` -> ``self.uri``; also passed to ``get_connection`` and
      ``get_collection`` to populate ``self.conn`` and ``self.coll``.
    * ``job_output_key`` -> ``self.key_name`` (default ``'_id'``).
    * ``job_output_value`` -> ``self.value_name`` (default ``None``).
    * ``add_action`` -> ``self.add_action`` (default ``'insert'``).
    * ``add_upsert`` -> ``self.add_upsert`` (default ``False``).
    * ``base_doc`` -> ``self.base_doc`` (default: a new empty dict).
    """
    mongo_opts = dict(params.get('mongodb', {}).items())

    self.uri = mongo_opts.get('output_uri')
    self.conn = get_connection(self.uri)
    self.coll = get_collection(self.uri)

    self.key_name = mongo_opts.get('job_output_key', '_id')
    self.value_name = mongo_opts.get('job_output_value')

    self.add_action = mongo_opts.get('add_action', 'insert')
    self.add_upsert = mongo_opts.get('add_upsert', False)
    self.base_doc = mongo_opts.get('base_doc', {})
def test_traditional_way(): start = time.clock() col = get_collection(config['input_uri']) count = {} cur = col.find() for row in cur: age = row['age'] / 10 if age in count: count[age] += 1 else: count[age] = 1 end = time.clock() print "Time used: ", end - start for key in count: print key, count[key]
def test_traditional_way(): start = time.clock() col = get_collection(config['input_uri']) count = {} cur = col.find() for row in cur: age = row['age']/10 if age in count: count[age] += 1 else: count[age] = 1 end = time.clock() print "Time used: ", end-start for key in count: print key,count[key]
def calculate_unsharded_splits(config, uri, adminuri):
    """Compute input splits for an unsharded collection using ``splitVector``.

    Issues a ``splitVector`` command, through the database obtained from
    `adminuri`, for the collection obtained from `uri`.  Each pair of
    consecutive boundaries (``split_min``, the returned split keys in order,
    ``split_max``) becomes one split built by ``_split``.

    :param config: mapping read with ``.get`` for ``query`` (default ``{}``),
        ``split_key``, ``split_size``, ``split_min`` and ``split_max``; it is
        also passed through to ``_split``.
    :param uri: URI handed to ``get_collection`` to locate the collection.
    :param adminuri: URI handed to ``get_database``; the command runs there.
    :returns: list with the ``format_uri_with_query()`` result of every
        split.  There is always at least one split: when the server returns
        no split keys, a single split from ``split_min`` to ``split_max`` is
        produced, as the logged warning states.
    :raises Exception: if the command result carries ``err`` or its ``ok``
        field is not ``1.0``.

    Note: collection_name seems unnecessary --CW
    """
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)
    admindb = get_database(adminuri)

    query = config.get("query", {})

    # create the command to do the splits
    # command to split should look like this VV
    # SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #      ('force', True), ('keyPattern', {'x': 1})])
    split_key = config.get('split_key')
    split_size = config.get('split_size')
    full_name = coll.full_name
    logging.info(
        "Calculating unsharded splits on collection %s with Split Key %s",
        full_name, split_key)
    logging.info("Max split size :: %sMB", split_size)

    # SON keeps insertion order; the command name must be the first key.
    cmd = bson.son.SON()
    cmd["splitVector"] = full_name
    cmd["maxChunkSize"] = split_size
    cmd["keyPattern"] = split_key
    cmd["force"] = False

    split_max = config.get('split_max')
    split_min = config.get('split_min')
    # Explicit bounds are only sent when both ends are configured.
    if split_min is not None and split_max is not None:
        cmd["min"] = split_min
        cmd["max"] = split_max

    logging.debug("Issuing Command: %s", cmd)
    data = admindb.command(cmd)
    logging.debug("%r", data)

    # results should look like this
    # {u'ok': 1.0,
    #  u'splitKeys': [{u'_id': ObjectId('4f49775348d9846c5e582b00')},
    #                 {u'_id': ObjectId('4f49775548d9846c5e58553b')}]}
    if data.get("err"):
        raise Exception(data.get("err"))
    elif data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    split_data = data.get('splitKeys')
    if not split_data:
        # Adjacent literals instead of backslash continuations inside the
        # string, which leaked source whitespace into the message.
        logging.warning(
            "WARNING: No Input Splits were calculated by the split code. "
            "Proceeding with a *single* split. Data may be too small, try "
            "lowering 'mongo.input.split_size' if this is undesirable.")
    else:
        logging.info("Calculated %s splits", len(split_data))

    # Walk the boundaries; `or ()` tolerates a missing/None 'splitKeys'.
    splits = []
    last_key = split_min
    for bound in split_data or ():
        splits.append(_split(config, query, last_key, bound))
        last_key = bound
    # Closing split up to split_max; with no split keys this is the single
    # split covering the whole range.
    splits.append(_split(config, query, last_key, split_max))

    return [s.format_uri_with_query() for s in splits]
def calculate_unsharded_splits(config, uri):
    """Compute input splits for an unsharded collection using ``splitVector``.

    Issues a ``splitVector`` command on the database of the collection
    obtained from `uri`.  Each pair of consecutive boundaries (``split_min``,
    the returned split keys in order, ``split_max``) becomes one split built
    by ``_split``.

    :param config: mapping read with ``.get`` for ``query`` (default ``{}``),
        ``split_key``, ``split_size``, ``split_min`` and ``split_max``; it is
        also passed through to ``_split``.
    :param uri: URI handed to ``get_collection`` to locate the collection.
    :returns: list with the ``format_uri_with_query()`` result of every
        split.  There is always at least one split: when the server returns
        no split keys, a single split from ``split_min`` to ``split_max`` is
        produced, as the logged warning states.
    :raises Exception: if the command result carries ``err`` or its ``ok``
        field is not ``1.0``.

    Note: collection_name seems unnecessary --CW
    """
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)

    query = config.get("query", {})

    # create the command to do the splits
    # command to split should look like this VV
    # SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #      ('force', True), ('keyPattern', {'x': 1})])
    split_key = config.get('split_key')
    split_size = config.get('split_size')
    full_name = coll.full_name
    logging.info(
        "Calculating unsharded splits on collection %s with Split Key %s",
        full_name, split_key)
    logging.info("Max split size :: %sMB", split_size)

    # SON keeps insertion order; the command name must be the first key.
    cmd = bson.son.SON()
    cmd["splitVector"] = full_name
    cmd["maxChunkSize"] = split_size
    cmd["keyPattern"] = split_key
    cmd["force"] = False

    split_max = config.get('split_max')
    split_min = config.get('split_min')
    # Explicit bounds are only sent when both ends are configured.
    if split_min is not None and split_max is not None:
        cmd["min"] = split_min
        cmd["max"] = split_max

    logging.debug("Issuing Command: %s", cmd)
    data = coll.database.command(cmd)
    logging.debug("%r", data)

    # results should look like this
    # {u'ok': 1.0,
    #  u'splitKeys': [{u'_id': ObjectId('4f49775348d9846c5e582b00')},
    #                 {u'_id': ObjectId('4f49775548d9846c5e58553b')}]}
    if data.get("err"):
        raise Exception(data.get("err"))
    elif data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    split_data = data.get('splitKeys')
    if not split_data:
        # Adjacent literals instead of backslash continuations inside the
        # string, which leaked source whitespace into the message.
        logging.warning(
            "WARNING: No Input Splits were calculated by the split code. "
            "Proceeding with a *single* split. Data may be too small, try "
            "lowering 'mongo.input.split_size' if this is undesirable.")
    else:
        logging.info("Calculated %s splits", len(split_data))

    # Walk the boundaries; `or ()` tolerates a missing/None 'splitKeys'.
    splits = []
    last_key = split_min
    for bound in split_data or ():
        splits.append(_split(config, query, last_key, bound))
        last_key = bound
    # Closing split up to split_max; with no split keys this is the single
    # split covering the whole range.
    splits.append(_split(config, query, last_key, split_max))

    return [s.format_uri_with_query() for s in splits]