Example #1
0
    def run(self, **jobargs):
        """
        Run a map-reduce job with SQL data.

        Validates the SQL-specific job arguments, installs the SQL input
        stream and input splits when the query contains ``SELECT``, makes
        sure ``sqldisco.sqldb_io`` is listed in ``required_modules`` and then
        delegates to the parent class' ``run`` with the updated arguments.

        Args = {
          sqltype: 'mssql' ## must be a key of sql_packages, e.g. one of ('sqlite3', 'mysql', 'pg', 'mssql')
          connargs: {'some_conn_parameter':'some_conn_value'} ## passed to connect(**kwargs)
          query: "SELECT * FROM my_table"
          input_key: "id"
          split: False ## True is not implemented
          params: {} ## optional, must be a dict when given
          required_modules: [] ## optional, 'sqldisco.sqldb_io' is added to it
        }

        Returns:
            This job instance (``self``).

        Raises:
            ValueError: if ``sqltype`` is missing or not a key of
                ``sql_packages``.
            TypeError: if ``connargs`` is missing or not a dict, or if
                ``params`` is given and is not a dict.
        """
        valid_sqltypes = list(sql_packages.keys())
        if jobargs.get('sqltype', '') not in valid_sqltypes:
            msg = 'You must specify a valid sqltype from %r' % valid_sqltypes
            logging.error(msg)
            raise ValueError(msg)

        # isinstance (rather than an exact type comparison) also accepts dict
        # subclasses such as OrderedDict.
        if not isinstance(jobargs.get('connargs'), dict):
            msg = 'Parameter connargs must be a dictionary (passed to connect(**kwargs))'
            logging.error(msg)
            raise TypeError(msg)

        params = jobargs.get('params', {})
        if not isinstance(params, dict):
            msg = 'You must have params as a dict object'
            logging.error(msg)
            raise TypeError(msg)

        # NOTE(review): the check is case-sensitive, so a lower-case "select"
        # query does not get the SQL input stream -- confirm this is intended.
        if 'SELECT' in jobargs.get('query', ''):
            jobargs['map_input_stream'] = sql_input_stream
            jobargs['input'] = calculate_splits(jobargs)

        # TODO Output

        # Work on a copy so the list object passed in by the caller is not
        # mutated, and so running twice with the same list does not pile up
        # duplicate entries.
        required_modules = list(jobargs.get('required_modules') or [])
        if 'sqldisco.sqldb_io' not in required_modules:
            required_modules.append('sqldisco.sqldb_io')
        jobargs['required_modules'] = required_modules

        # Always hand a params dict to the parent, even when none was given.
        jobargs['params'] = params

        super(SqlJob, self).run(**jobargs)
        return self
Example #2
0
    def runTest(self):
        #put 20000 objects in a database, call for a split by hand, then a split by the class
        conn = get_connection(
            "mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter"
        )
        db = conn[config.get('db_name')]
        coll = db[config.get('collection_name')]
        #print db.command("collstats", coll.full_name)

        #NOTE: need to run this code once to populate the database, after that comment it out
        '''
        for i in range(40000):
            post = {"name" : i, "date": datetime.datetime.utcnow()}
            coll.insert(post)
        '''

        #print coll.count()

        command = bson.son.SON()
        command['splitVector'] = coll.full_name
        command['maxChunkSize'] = config.get('split_size')
        command['force'] = False
        command['keyPattern'] = {'_id': 1}
        #SON([('splitVector', u'test.twitter'), ('maxChunkSize', 1), ('keyPattern', {'_id': 1}), ('force', False)])
        results = db.command(command)

        man_splits = results.get("splitKeys")
        assert results.get(
            'ok') == 1.0, 'split command did not return with 1.0 ok'
        print results
        print 'man_splits = ', len(man_splits)
        assert man_splits, 'no splitKeys returned'

        #now do it through MongoSplit
        splits = calculate_splits(config)

        assert splits, "MongoSplitter did not return the right splits"
        logging.info("Calculated %s MongoInputSplits" % len(splits))
        #assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits"
        '''
Example #3
0
    def runTest(self):
        # put 20000 objects in a database, call for a split by hand, then a split by the class
        conn = get_connection("mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter")
        db = conn[config.get("db_name")]
        coll = db[config.get("collection_name")]
        # print db.command("collstats", coll.full_name)

        # NOTE: need to run this code once to populate the database, after that comment it out
        """
        for i in range(40000):
            post = {"name" : i, "date": datetime.datetime.utcnow()}
            coll.insert(post)
        """

        # print coll.count()

        command = bson.son.SON()
        command["splitVector"] = coll.full_name
        command["maxChunkSize"] = config.get("split_size")
        command["force"] = False
        command["keyPattern"] = {"_id": 1}
        # SON([('splitVector', u'test.twitter'), ('maxChunkSize', 1), ('keyPattern', {'_id': 1}), ('force', False)])
        results = db.command(command)

        man_splits = results.get("splitKeys")
        assert results.get("ok") == 1.0, "split command did not return with 1.0 ok"
        print results
        print "man_splits = ", len(man_splits)
        assert man_splits, "no splitKeys returned"

        # now do it through MongoSplit
        splits = calculate_splits(config)

        assert splits, "MongoSplitter did not return the right splits"
        logging.info("Calculated %s MongoInputSplits" % len(splits))
        # assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits"

        """