def create_mongo_fetch_generator(collection, query_filters):
    """Yield (offset, fetch_limit) pairs that page through every record
    matching query_filters, one batch of _FETCH_LIMIT at a time."""
    fetch_limit = _FETCH_LIMIT
    records = collection.find(query_filters, {'_id': 0}).count()
    offset = 0
    while records > 0:
        yield offset, fetch_limit
        offset += fetch_limit
        records -= fetch_limit
        LOGGER.debug("Fetching another limit of {0}".format(fetch_limit))
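
# Minimal usage sketch (not part of the original module): each yielded
# (offset, fetch_limit) pair is meant to drive one paged Mongo query via
# skip()/limit(). The consumer function name here is hypothetical.
def _example_consume_fetch_generator(collection, query_filters):
    for offset, limit in create_mongo_fetch_generator(collection, query_filters):
        # Fetch one batch; _id is projected away to match the query above.
        batch = list(collection.find(query_filters, {'_id': 0})
                     .skip(offset).limit(limit))
        if not batch:
            break
        yield batch
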
def push_data_file_to_es():
    """Read the locally staged bulk file and POST it to the ES _bulk endpoint."""
    with open(_ES_READ_FILE, 'rb') as bulk_file:
        data = bulk_file.read()
    if not data:
        return
    LOGGER.debug("{0}: pushing data to es".format(_ES_READ_FILE))
    req = urllib2.Request(es_conf[_ES_HOST] + '_bulk', data)
    req.add_header('Content-Length', '%d' % len(data))
    req.add_header('Content-Type', 'application/octet-stream')
    urllib2.urlopen(req)
    LOGGER.debug("pushing data to es success")
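
# Illustrative sketch of how _ES_READ_FILE could be staged (an assumption,
# not the original writer): the ES _bulk API consumes newline-delimited JSON,
# alternating an action line with the document itself. The index and type
# names below are hypothetical placeholders.
def _example_stage_bulk_file(records, path):
    import json
    with open(path, 'wb') as bulk_file:
        for record in records:
            bulk_file.write(json.dumps({'index': {'_index': 'raw_data',
                                                  '_type': 'record'}}) + '\n')
            bulk_file.write(json.dumps(record) + '\n')
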
def push_incremental_data_to_es():
    """Fetch the raw records created since the last run and push them to ES."""
    params_collection = mongo_helpers.get_mongo_db_con(
        database=_MONGO_PARAMS_DB)[mongo_conf[_MONGO_PARAMS_COLLECTION]]
    timestamp_range = obtain_time_ranges(params_collection)
    LOGGER.debug(
        "Started river to push data to ES for {0}".format(timestamp_range))
    # Translate the timestamp window into an ObjectId window, since _id
    # encodes each record's creation time.
    mongo_object_ids_range = mongo_helpers.get_server_object_ids(
        timestamp_range=timestamp_range)
    raw_data_collection = mongo_helpers.get_mongo_db_con(
        database=_MONGO_RAW_DATA_DB)[mongo_conf[_MONGO_RAW_DATA_COLLECTION]]
    query_filters = {
        '_id': {'$gte': mongo_object_ids_range[0],
                '$lte': mongo_object_ids_range[1]},
        'category': {'$in': _categories},
    }
    mongo_fetch_generator = mongo_helpers.create_mongo_fetch_generator(
        raw_data_collection, query_filters)
    process_pipeline(raw_data_collection, mongo_fetch_generator, query_filters)
    # Record the upper bound of the processed window so the next run resumes
    # where this one left off.
    params_collection.update(
        {'elasticsearch.lastUpdated': {'$exists': True}},
        {'$set': {'elasticsearch.lastUpdated': str(timestamp_range[1])}})
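
# Hedged sketch of what mongo_helpers.get_server_object_ids presumably does
# (the real helper lives in mongo_helpers and is not shown here): ObjectIds
# embed their creation timestamp, so a pair of datetimes maps to an inclusive
# _id range via bson's ObjectId.from_datetime.
def _example_object_id_bounds(timestamp_range):
    from bson.objectid import ObjectId  # ships with pymongo
    start, end = timestamp_range
    return ObjectId.from_datetime(start), ObjectId.from_datetime(end)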