def run(self, **jobargs):
    """Run a map-reduce job whose input is read from a SQL database.

    The keyword arguments are validated, augmented, and then forwarded to
    the parent class's ``run``. Recognised keys::

        sqltype:   'mssql'   ## one of ('sqlite3', 'mysql', 'pg', 'mssql');
                             ## must be a key of ``sql_packages``
        connargs:  {'some_conn_parameter': 'some_conn_value'}
                             ## required dict, passed to connect(**kwargs)
        query:     "SELECT * FROM my_table"
        input_key: "id"
        split:     False     ## True is not implemented
        params:    {}        ## optional; must be a dict when supplied

    When ``query`` contains the substring ``'SELECT'`` the job's
    ``map_input_stream`` is set to ``sql_input_stream`` and its ``input``
    to ``calculate_splits(jobargs)``. ``'sqldisco.sqldb_io'`` is always
    appended to ``required_modules``, and ``params`` is always set
    (defaulting to an empty dict).

    Returns:
        ``self``, so calls can be chained.

    Raises:
        Exception: if ``sqltype`` is not a key of ``sql_packages``, if
            ``connargs`` is missing or not a dict, or if ``params`` is
            supplied but is not a dict. The message is also logged.
    """

    def _reject(msg):
        # Every validation failure is logged and then raised with the same
        # text; keeping that in one place keeps the three checks in step.
        logging.info(msg)
        raise Exception(msg)

    valid_sqltypes = list(sql_packages.keys())
    if jobargs.get('sqltype', '') not in valid_sqltypes:
        _reject('You must specify a valid sqltype from %r' % valid_sqltypes)

    # isinstance (rather than an exact type() comparison) so that dict
    # subclasses such as OrderedDict are accepted as well.
    if not isinstance(jobargs.get('connargs', None), dict):
        _reject('Parameter connargs must be a dictionary (passed to connect(**kwargs))')

    params = jobargs.get('params', {})
    if not isinstance(params, dict):
        _reject('You must have params as a dict object')

    # The substring test is case-sensitive: a lower-case "select ..." query
    # does not get the SQL input stream wired up.
    # NOTE(review): the original indentation was lost; both assignments are
    # assumed to belong to this branch -- confirm against the upstream file.
    if 'SELECT' in jobargs.get('query', ''):
        jobargs['map_input_stream'] = sql_input_stream
        jobargs['input'] = calculate_splits(jobargs)

    # TODO Output

    # Extend (not replace) any caller-supplied module list.
    jobargs.setdefault('required_modules', []).extend([
        'sqldisco.sqldb_io'
    ])
    # Guarantee the parent always receives a 'params' entry.
    jobargs['params'] = params

    super(SqlJob, self).run(**jobargs)
    return self
def runTest(self): #put 20000 objects in a database, call for a split by hand, then a split by the class conn = get_connection( "mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter" ) db = conn[config.get('db_name')] coll = db[config.get('collection_name')] #print db.command("collstats", coll.full_name) #NOTE: need to run this code once to populate the database, after that comment it out ''' for i in range(40000): post = {"name" : i, "date": datetime.datetime.utcnow()} coll.insert(post) ''' #print coll.count() command = bson.son.SON() command['splitVector'] = coll.full_name command['maxChunkSize'] = config.get('split_size') command['force'] = False command['keyPattern'] = {'_id': 1} #SON([('splitVector', u'test.twitter'), ('maxChunkSize', 1), ('keyPattern', {'_id': 1}), ('force', False)]) results = db.command(command) man_splits = results.get("splitKeys") assert results.get( 'ok') == 1.0, 'split command did not return with 1.0 ok' print results print 'man_splits = ', len(man_splits) assert man_splits, 'no splitKeys returned' #now do it through MongoSplit splits = calculate_splits(config) assert splits, "MongoSplitter did not return the right splits" logging.info("Calculated %s MongoInputSplits" % len(splits)) #assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits" '''
def runTest(self): # put 20000 objects in a database, call for a split by hand, then a split by the class conn = get_connection("mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter") db = conn[config.get("db_name")] coll = db[config.get("collection_name")] # print db.command("collstats", coll.full_name) # NOTE: need to run this code once to populate the database, after that comment it out """ for i in range(40000): post = {"name" : i, "date": datetime.datetime.utcnow()} coll.insert(post) """ # print coll.count() command = bson.son.SON() command["splitVector"] = coll.full_name command["maxChunkSize"] = config.get("split_size") command["force"] = False command["keyPattern"] = {"_id": 1} # SON([('splitVector', u'test.twitter'), ('maxChunkSize', 1), ('keyPattern', {'_id': 1}), ('force', False)]) results = db.command(command) man_splits = results.get("splitKeys") assert results.get("ok") == 1.0, "split command did not return with 1.0 ok" print results print "man_splits = ", len(man_splits) assert man_splits, "no splitKeys returned" # now do it through MongoSplit splits = calculate_splits(config) assert splits, "MongoSplitter did not return the right splits" logging.info("Calculated %s MongoInputSplits" % len(splits)) # assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits" """