コード例 #1
0
ファイル: mongodb_input.py プロジェクト: sajal/MongoDisco
def open(url=None, task=None):
    """Open a MongoDB cursor described by a JSON-encoded query document.

    ``url`` is decoded as JSON (using ``json_util.object_hook``) into a SON
    document.  The keys ``inputURI``, ``query``, ``fields``, ``skip``,
    ``limit``, ``timeout``, ``sort`` and ``slave_ok`` are all looked up with
    ``[]``, so a missing key raises ``KeyError``.  ``task`` is accepted but
    not used.

    Returns a ``MongoWrapper`` around the cursor produced by ``find``.
    """
    from mongo_util import get_collection

    params = son.SON(json.loads(url, object_hook=json_util.object_hook))
    input_uri = params['inputURI']

    # The parsed result is not used below; the call is kept as in the
    # original (presumably so a malformed URI is rejected here -- TODO confirm).
    uri_parser.parse_uri(input_uri)

    # Keys are read in the same order as before, so a missing key fails at
    # the same point.
    find_kwargs = {
        'spec': params['query'],
        'fields': params['fields'],
        'skip': params['skip'],
        'limit': params['limit'],
        'timeout': params['timeout'],
        'sort': params['sort'],
        'slave_okay': params['slave_ok'],
    }

    # Workaround: obtain the collection from the URI via get_collection
    # (connect to the connection, then choose the db by ['dbname']).
    source = get_collection(input_uri)
    return MongoWrapper(source.find(**find_kwargs))
コード例 #2
0
ファイル: mongodb_output.py プロジェクト: dcrosta/mongo-disco
    def __init__(self, stream, params):
        """Set up the MongoDB output target from the attributes of ``params``.

        ``stream`` is accepted but not used by this constructor.  The
        instance attributes of ``params`` are read for ``output_uri``,
        ``job_output_key`` (defaults to ``'_id'``) and ``job_output_value``;
        absent attributes yield ``None`` (or the stated default).

        Sets ``self.uri``, ``self.conn``, ``self.coll``, ``self.key_name``
        and ``self.value_name``.
        """
        from mongo_util import get_connection, get_collection

        # Shallow copy of the attribute dict.  dict(...) replaces the manual
        # iteritems() loop, which only exists on Python 2.
        config = dict(params.__dict__)

        self.uri = config.get('output_uri')
        self.conn = get_connection(self.uri)
        self.coll = get_collection(self.uri)
        self.key_name = config.get('job_output_key', '_id')
        self.value_name = config.get('job_output_value')
コード例 #3
0
ファイル: test_shards.py プロジェクト: dcrosta/mongo-disco
def test_traditional_way():
    """Tally documents per ``age/10`` bucket on the client and print timing.

    Iterates every document of the collection at ``config['input_uri']``,
    counts occurrences of each ``row['age']/10`` value, then prints the
    elapsed ``time.clock()`` interval followed by one ``bucket count`` line
    per bucket.
    """
    started = time.clock()

    tally = {}
    for row in get_collection(config['input_uri']).find():
        bucket = row['age']/10
        tally[bucket] = tally.get(bucket, 0) + 1

    finished = time.clock()

    # Each print receives one pre-joined string, which produces the same
    # text as the comma-separated print statements it replaces.
    print(" ".join(("Time used: ", str(finished - started))))
    for bucket in tally:
        print(" ".join((str(bucket), str(tally[bucket]))))
コード例 #4
0
ファイル: mongodb_input.py プロジェクト: dcrosta/mongo-disco
def open(url=None, task=None):
    """Return a ``MongoWrapper`` over the cursor described by ``url``.

    ``url`` is a JSON string decoded (with ``json_util.object_hook``) into a
    SON document that must contain ``inputURI``, ``query``, ``fields``,
    ``skip``, ``limit``, ``timeout`` and ``sort``; a missing key raises
    ``KeyError``.  ``task`` is accepted but not used.
    """
    from mongo_util import get_collection

    doc = son.SON(json.loads(url, object_hook=json_util.object_hook))
    target_uri = doc['inputURI']

    # Return value unused; call retained from the original (presumably it
    # rejects malformed URIs early -- TODO confirm).
    uri_parser.parse_uri(target_uri)

    # Pull the find() arguments in the original lookup order.
    wanted = ('query', 'fields', 'skip', 'limit', 'timeout', 'sort')
    spec, fields, skip, limit, timeout, sort = [doc[name] for name in wanted]

    # Workaround: obtain the collection from the URI via get_collection
    # (connect to the connection, then choose the db by ['dbname']).
    cursor = get_collection(target_uri).find(
        spec=spec,
        fields=fields,
        skip=skip,
        limit=limit,
        sort=sort,
        timeout=timeout,
    )
    return MongoWrapper(cursor)
コード例 #5
0
ファイル: splitter.py プロジェクト: dcrosta/mongo-disco
def calculate_unsharded_splits(config, slaveOk, uri):
    """Compute input splits for an unsharded collection via ``splitVector``.

    Issues the ``splitVector`` database command for the collection obtained
    from ``uri`` and turns each returned boundary key into a split.  When the
    command returns no boundary keys, a single unbounded split covering the
    whole collection is produced.

    :param config: mapping read for ``query`` (filter, defaults to ``{}``),
        ``split_key`` (sent as ``keyPattern``) and ``split_size`` (sent as
        ``maxChunkSize``); it is also handed to ``_split``.
    :param slaveOk: accepted for interface compatibility; not used here.
    :param uri: passed to ``get_collection`` to obtain the collection.
    :returns: list with one ``format_uri_with_query()`` result per split.
    :raises Exception: if the command reply carries ``err`` or its ``ok``
        field is not ``1.0``.
    """
    splits = []  # will return this
    logging.info("Calculating unsharded splits")

    coll = get_collection(uri)

    q = config.get("query", {})

    # create the command to do the splits
    # command to split should look like this VV
    # SON([('splitVector', u'test.test_data'), ('maxChunkSize', 2),
    #    ('force', True), ('keyPattern', {'x': 1})])

    split_key = config.get('split_key')
    split_size = config.get('split_size')
    full_name = coll.full_name
    logging.info("Calculating unsharded splits on collection %s with Split Key %s" %
            (full_name, split_key))
    logging.info("Max split size :: %sMB" % split_size)

    cmd = bson.son.SON()
    cmd["splitVector"] = full_name
    cmd["maxChunkSize"] = split_size
    cmd["keyPattern"] = split_key
    cmd["force"] = False

    logging.debug("Issuing Command: %s" % cmd)
    data = coll.database.command(cmd)
    logging.debug("%r" % data)

    # results should look like this
    # {u'ok': 1.0, u'splitKeys': [{u'_id': ObjectId('4f49775348d9846c5e582b00')},
    # {u'_id': ObjectId('4f49775548d9846c5e58553b')}]}

    if data.get("err"):
        raise Exception(data.get("err"))
    elif data.get("ok") != 1.0:
        raise Exception("Unable to calculate splits")

    split_data = data.get('splitKeys')
    if not split_data:
        # Adjacent literals instead of backslash continuations, so the
        # logged message no longer contains runs of indentation spaces.
        logging.warning(
            "WARNING: No Input Splits were calculated by the split code. "
            "Proceeding with a *single* split. Data may be too small, try "
            "lowering 'mongo.input.split_size' if this is undesirable.")
    else:
        logging.info("Calculated %s splits" % len(split_data))

    # Each boundary closes the split that started at the previous boundary.
    last_key = None
    for bound in split_data or ():
        splits.append(_split(config, q, last_key, bound))
        last_key = bound

    # The final split has no upper bound.  It is appended unconditionally so
    # that an empty splitKeys reply still yields the single split promised by
    # the warning above, rather than an empty result.
    splits.append(_split(config, q, last_key, None))

    return [s.format_uri_with_query() for s in splits]