Ejemplo n.º 1
0
def fetch_splits_from_shards(config, uri, slave_ok):
    """Internal method to fetch splits from a sharded db.

    Reads every document of the ``shards`` collection in the ``config``
    database reachable through ``uri``, then calls
    ``calculate_unsharded_splits`` once per shard host and concatenates the
    results.

    Side effect: ``config['input_uri']`` is overwritten for each shard and is
    left pointing at the URI of the last shard visited.

    :param config: job configuration mapping; mutated as described above
    :param uri: MongoDB URI used to reach the ``config`` database
    :param slave_ok: passed through to ``get_new_URI`` and
        ``calculate_unsharded_splits``
    :returns: The splits
    """
    logging.warning(
        "WARNING getting splits that connect directly to the backend "
        "mongods is risky and might not produce correct results")
    connection = get_connection(uri)
    shards_coll = connection["config"]["shards"]

    shard_hosts = []
    cur = shards_coll.find()
    try:
        for row in cur:
            host = row.get('host')
            # Keep only what follows the first "/" (presumably this drops a
            # "<replicaSetName>/" prefix -- confirm against the shard docs).
            slash_index = host.find("/")
            if slash_index > 0:
                host = host[slash_index + 1:]
            shard_hosts.append(host)
    finally:
        # Always release the server-side cursor, even if a row is malformed.
        cur.close()

    splits = []
    for host in shard_hosts:
        new_uri = get_new_URI(uri, host, slave_ok)
        config['input_uri'] = new_uri
        splits.extend(calculate_unsharded_splits(config, slave_ok, new_uri))

    return splits

    '''
Ejemplo n.º 2
0
    def __init__(self, stream, params):
        """Set up the MongoDB output target from the job parameters.

        Opens a connection and a collection handle for the URI found in
        ``params.output_uri`` and records which fields hold the output key
        and value.

        :param stream: not used by this constructor
        :param params: object whose instance attributes form the job
            configuration; ``output_uri`` is read, plus the optional
            ``job_output_key`` (defaults to ``'_id'``) and
            ``job_output_value`` (defaults to ``None``)
        """
        from mongo_util import get_connection, get_collection

        # Snapshot the instance attributes into a plain dict so that missing
        # settings can be read with .get() defaults. vars() + dict() works on
        # both Python 2 and 3, unlike dict.iteritems().
        config = dict(vars(params))

        self.uri = config.get('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)
        self.key_name = config.get('job_output_key', '_id')
        self.value_name = config.get('job_output_value')
Ejemplo n.º 3
0
    def runTest(self):
        """Check that a manual splitVector and calculate_splits both yield splits.

        Runs the ``splitVector`` command by hand against the collection named
        in the module-level ``config``, then calls ``calculate_splits(config)``
        and asserts that both produced a non-empty result.

        NOTE: connects to a hard-coded remote MongoDB URI, so the test needs
        that server to be reachable and the collection to be populated.
        """
        conn = get_connection(
            "mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter"
        )
        db = conn[config.get('db_name')]
        coll = db[config.get('collection_name')]

        # NOTE: the collection has to be populated once before this test can
        # pass; run the following a single time, then leave it disabled:
        #     for i in range(40000):
        #         post = {"name": i, "date": datetime.datetime.utcnow()}
        #         coll.insert(post)

        # SON preserves insertion order, so 'splitVector' stays the first key
        # of the command document.
        command = bson.son.SON()
        command['splitVector'] = coll.full_name
        command['maxChunkSize'] = config.get('split_size')
        command['force'] = False
        command['keyPattern'] = {'_id': 1}
        results = db.command(command)

        assert results.get(
            'ok') == 1.0, 'split command did not return with 1.0 ok'
        print(results)

        # Validate splitKeys before taking its length: a missing key would
        # otherwise surface as a TypeError instead of this assertion message.
        man_splits = results.get("splitKeys")
        assert man_splits, 'no splitKeys returned'
        print('man_splits =  %s' % len(man_splits))

        # now do it through MongoSplit
        splits = calculate_splits(config)

        assert splits, "MongoSplitter did not return the right splits"
        logging.info("Calculated %s MongoInputSplits" % len(splits))
        #assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits"
        '''
Ejemplo n.º 4
0
    def runTest(self):
        """Check that a manual splitVector and calculate_splits both yield splits.

        Runs the ``splitVector`` command by hand against the collection named
        in the module-level ``config``, then calls ``calculate_splits(config)``
        and asserts that both produced a non-empty result.

        NOTE: connects to a hard-coded remote MongoDB URI, so the test needs
        that server to be reachable and the collection to be populated.
        """
        conn = get_connection("mongodb://ec2-23-20-75-24.compute-1.amazonaws.com:27020/test.twitter")
        db = conn[config.get("db_name")]
        coll = db[config.get("collection_name")]

        # NOTE: the collection has to be populated once before this test can
        # pass; run the following a single time, then leave it disabled:
        #     for i in range(40000):
        #         post = {"name": i, "date": datetime.datetime.utcnow()}
        #         coll.insert(post)

        # SON preserves insertion order, so "splitVector" stays the first key
        # of the command document.
        command = bson.son.SON()
        command["splitVector"] = coll.full_name
        command["maxChunkSize"] = config.get("split_size")
        command["force"] = False
        command["keyPattern"] = {"_id": 1}
        results = db.command(command)

        assert results.get("ok") == 1.0, "split command did not return with 1.0 ok"
        print(results)

        # Validate splitKeys before taking its length: a missing key would
        # otherwise surface as a TypeError instead of this assertion message.
        man_splits = results.get("splitKeys")
        assert man_splits, "no splitKeys returned"
        print("man_splits =  %s" % len(man_splits))

        # now do it through MongoSplit
        splits = calculate_splits(config)

        assert splits, "MongoSplitter did not return the right splits"
        logging.info("Calculated %s MongoInputSplits" % len(splits))
        # assert len(man_splits) + 1 == len(splits) , "MongoSplitter returned a different number of splits than manual splits"

        """
Ejemplo n.º 5
0
def init_databases(config_file):
    """Bind the module-level database handles used by this module.

    Sets the globals ``userdata_db``, ``plog_db`` and ``report_db`` from
    ``mongo_util``, passing ``config_file`` through to every call.

    :param config_file: forwarded unchanged to ``mongo_util.get_db`` and
        ``mongo_util.get_connection``
    """
    global userdata_db, plog_db, report_db

    userdata_db = mongo_util.get_db('entities_main', config_file)

    # plog_db is looked up by name on a connection rather than via get_db.
    datastore_connection = mongo_util.get_connection('datastore', config_file)
    plog_db = datastore_connection['kadb_pl']

    report_db = mongo_util.get_db('reporting', config_file)
Ejemplo n.º 6
0
def fetch_splits_via_chunks(config, uri, useShards, slaveOk):
    """Retrieves split objects based on chunks in mongo.

    One split is built for every document of ``config.chunks`` whose ``ns``
    equals ``<database>.<collection>`` as parsed from ``uri``. Each split
    wraps the configured query in a ``$query``/``$min``/``$max`` document
    bounded by the chunk's ``min`` and ``max`` values.

    :param config: job configuration mapping; reads ``query``, ``input_uri``,
        ``input_key``, ``fields``, ``sort``, ``limit`` (default 0), ``skip``
        (default 0) and ``timeout`` (default True)
    :param uri: MongoDB URI used to reach the ``config`` database and to
        derive the namespace of the chunks to look up
    :param useShards: when true, the URI of each split is rewritten with
        ``get_new_URI`` to target the host of the shard owning the chunk
    :param slaveOk: passed through to ``get_new_URI``
    :returns: list with the result of ``format_uri_with_query()`` for each
        split
    """
    originalQuery = config.get("query")
    if useShards:
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the logged message.
        logging.warning(
            "WARNING getting splits that connect directly to the "
            "backend mongods is risky and might not produce correct results")

    logging.debug("fetch_splits_via_chunks: originalQuery: %s" % originalQuery)

    connection = get_connection(uri)
    configDB = connection["config"]

    # Maps shard _id -> host string; only filled in when useShards is set.
    shardMap = {}
    if useShards:
        cur = configDB["shards"].find()
        try:
            for row in cur:
                host = row.get('host')
                # Keep only what follows the first "/" (presumably this drops
                # a "<replicaSetName>/" prefix -- confirm against shard docs).
                slashIndex = host.find("/")
                if slashIndex > 0:
                    host = host[slashIndex + 1:]
                shardMap[row.get('_id')] = host
        finally:
            # Always release the server-side cursor.
            cur.close()

    logging.debug("MongoInputFormat.getSplitsUsingChunks(): shard map is: %s" % shardMap)

    chunksCollection = configDB["chunks"]
    logging.info(configDB.collection_names())

    uri_info = uri_parser.parse_uri(uri)
    query = bson.son.SON()
    query["ns"] = uri_info['database'] + '.' + uri_info['collection']

    cur = chunksCollection.find(query)
    logging.info("query is %s" % (query,))
    logging.info(cur.count())
    logging.info(chunksCollection.find().count())

    # The same query object is shared by every split, as before; default it
    # once here instead of re-checking on each chunk.
    if originalQuery is None:
        originalQuery = bson.son.SON()

    baseInputURI = config.get("input_uri")
    splits = []

    try:
        for row in cur:
            minObj = row.get('min')
            maxObj = row.get('max')
            lowerBound = bson.son.SON()
            upperBound = bson.son.SON()

            # Copy the bounds key by key, following the key order of 'min'.
            #@to-do do type comparison first?
            for key in minObj:
                lowerBound[key] = minObj[key]
                upperBound[key] = maxObj[key]

            shardKeyQuery = bson.son.SON()
            shardKeyQuery["$query"] = originalQuery
            shardKeyQuery["$min"] = lowerBound
            shardKeyQuery["$max"] = upperBound

            inputURI = baseInputURI
            if useShards:
                host = shardMap[row.get('shard')]
                inputURI = get_new_URI(inputURI, host, slaveOk)

            splits.append(MongoInputSplit(
                inputURI,
                config.get("input_key"),
                shardKeyQuery,
                config.get("fields"),
                config.get("sort"),
                config.get("limit", 0),
                config.get("skip", 0),
                config.get("timeout", True)))
    finally:
        # Always release the server-side cursor.
        cur.close()

    # return splits in uri format for disco
    return [s.format_uri_with_query() for s in splits]