def create_mongo_fetch_generator(collection, query_filters):
    """Yield (offset, fetch_limit) pages covering every document matching query_filters.

    Pages are sized by the module-level _FETCH_LIMIT. The generator yields
    enough (offset, limit) pairs to cover the matched document count at the
    time of the initial count; documents inserted afterwards are not covered.

    :param collection: pymongo collection to page over
    :param query_filters: mongo query dict passed to collection.find
    :yields: (offset, fetch_limit) tuples for use with skip()/limit()
    """
    fetch_limit = _FETCH_LIMIT
    # BUG FIX: the original added fetch_limit to the count, which always
    # produced one extra (offset, limit) pair pointing past the end of the
    # data and triggered a wasted empty fetch.
    records = collection.find(query_filters, {'_id': 0}).count()
    offset = 0
    while records > 0:
        yield offset, fetch_limit
        offset += fetch_limit
        records -= fetch_limit
        LOGGER.debug("Fetching another limit of {0}".format(str(fetch_limit)))
def push_data_file_to_es():
    """POST the prepared bulk file (_ES_READ_FILE) to Elasticsearch's _bulk endpoint.

    Reads the whole file into memory and sends it as a single bulk request.
    Does nothing if the file is empty. Raises urllib2.URLError/HTTPError on
    transport or HTTP failure.
    """
    # BUG FIX: the file handle was previously opened and never closed.
    with open(_ES_READ_FILE, 'rb') as data_file:
        data = data_file.read()
    if not data:
        return
    # BUG FIX: original logged the literal "{0} pushing data to es" — the
    # format placeholder was never filled in.
    LOGGER.debug("pushing data to es")
    req = urllib2.Request(es_conf[_ES_HOST] + '_bulk', data)
    req.add_header('Content-Length', '%d' % len(data))
    req.add_header('Content-Type', 'application/octet-stream')
    urllib2.urlopen(req)
    LOGGER.debug("pushing data to es success")
def fetch_mongo_data_for_fetch_range(collection, offset, fetch_limit, query_filters):
    """Fetch one page of documents from mongo, keeping only those with a 'category'.

    :param collection: pymongo collection to read from
    :param offset: number of matching documents to skip
    :param fetch_limit: maximum number of documents to return
    :param query_filters: mongo query dict passed to collection.find
    :returns: list of documents (without '_id') that carry a 'category' key
    """
    # BUG FIX: the original accumulated the documents but never returned
    # them, so every fetched page was silently discarded.
    fetched_documents = []
    data_cursor = collection.find(query_filters, {'_id': 0}).skip(offset).limit(fetch_limit)
    for document in data_cursor:
        try:
            # Documents without a category cannot be routed to an ES type;
            # skip them.
            if 'category' not in document:
                continue
            fetched_documents.append(document)
        except Exception as e:
            # BUG FIX: the original referenced an undefined name 'doc' here,
            # which would have raised NameError inside the handler.
            LOGGER.error(
                "For document {0} encountered error {1} ".format(document, e))
    return fetched_documents
def dump_data_dict_to_es_readable_file(mongo_fetched_data_dict):
    """Write documents to _ES_READ_FILE in Elasticsearch bulk-API format.

    Each document is written as an action line ({"index": ...} with _type set
    to the document's 'category') followed by the document itself, one JSON
    object per line, as the _bulk endpoint expects.

    :param mongo_fetched_data_dict: iterable of mongo documents, each
        expected to carry a 'category' key (TODO confirm upstream filtering
        guarantees this)
    """
    put = {"index": {"_index": es_conf[_ES_INDEX], "_type": "wevent"}}
    # BUG FIX: initialize so the error log below cannot hit NameError when
    # the failure happens before the first iteration.
    document = None
    try:
        # BUG FIX: the original opened the file without ever closing it; a
        # context manager guarantees flush + close even on error.
        with open(_ES_READ_FILE, 'w') as events_dump_file:
            for document in mongo_fetched_data_dict:
                # Route each document to an ES type named after its category.
                put['index']['_type'] = document['category']
                json.dump(put, events_dump_file)
                events_dump_file.write('\n')
                json.dump(document, events_dump_file)
                events_dump_file.write('\n')
    except Exception as e:
        LOGGER.error(
            "For document {0} encountered dumping data error {1} ".format(document, e))
def push_incremental_data_to_es():
    """Run one incremental sync of raw mongo data into Elasticsearch.

    Determines the time window to sync from the params collection, converts
    it to an ObjectId window over the raw-data collection, drives the
    fetch/dump/push pipeline over that window, then records the window's
    upper bound back into the params collection as the new checkpoint.
    """
    params_coll = mongo_helpers.get_mongo_db_con(
        database=_MONGO_PARAMS_DB)[mongo_conf[_MONGO_PARAMS_COLLECTION]]
    time_range = obtain_time_ranges(params_coll)
    LOGGER.debug(
        "Started river to push data to ES for {0}".format(time_range))
    # Map the timestamp window onto ObjectId bounds so the raw collection
    # can be range-queried by _id.
    id_bounds = mongo_helpers.get_server_object_ids(timestamp_range=time_range)
    raw_coll = mongo_helpers.get_mongo_db_con(
        database=_MONGO_RAW_DATA_DB)[mongo_conf[_MONGO_RAW_DATA_COLLECTION]]
    filters = {
        'category': {'$in': _categories},
        '_id': {'$gte': id_bounds[0], '$lte': id_bounds[1]},
    }
    fetch_ranges = mongo_helpers.create_mongo_fetch_generator(raw_coll, filters)
    process_pipeline(raw_coll, fetch_ranges, filters)
    # Persist the upper bound of the processed window as the next run's
    # starting checkpoint.
    params_coll.update(
        {'elasticsearch.lastUpdated': {'$exists': 'true'}},
        {'$set': {'elasticsearch.lastUpdated': str(time_range[1])}})