def retrieve_for_timestamp(self, timestamp, unprocessed_only):
     """ method iterates thru all objects in timetable collections and load them into timetable"""
     resp = dict()
     resp.update(self._search_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_HOURLY),
                                       timestamp, unprocessed_only))
     resp.update(self._search_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_DAILY),
                                       timestamp, unprocessed_only))
     resp.update(self._search_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_MONTHLY),
                                       timestamp, unprocessed_only))
     resp.update(self._search_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_YEARLY),
                                       timestamp, unprocessed_only))
     return resp
 def _get_timetable_collection(self, process_name):
     """timetable stores timeperiod in 4 collections: hourly, daily, monthly and yearly; method looks for the
     proper timetable_collection base on process TIME_QUALIFIER"""
     qualifier = ProcessContext.get_time_qualifier(process_name)
     if qualifier == ProcessContext.QUALIFIER_HOURLY:
         collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_HOURLY)
     elif qualifier == ProcessContext.QUALIFIER_DAILY:
         collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_DAILY)
     elif qualifier == ProcessContext.QUALIFIER_MONTHLY:
         collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_MONTHLY)
     elif qualifier == ProcessContext.QUALIFIER_YEARLY:
         collection = CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_YEARLY)
     else:
         raise ValueError('unknown time qualifier: %s for %s' % (qualifier, process_name))
     return collection
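A minimal alternative sketch of the same qualifier dispatch, expressed as a lookup table; it assumes the same ProcessContext / CollectionContext API and the QUALIFIER_* / COLLECTION_TIMETABLE_* constants used above, and is an illustration rather than part of the codebase:

# hypothetical rewrite of the if/elif chain above as a mapping;
# all constant and context names are assumed from the snippet above
QUALIFIER_TO_COLLECTION = {
    ProcessContext.QUALIFIER_HOURLY: COLLECTION_TIMETABLE_HOURLY,
    ProcessContext.QUALIFIER_DAILY: COLLECTION_TIMETABLE_DAILY,
    ProcessContext.QUALIFIER_MONTHLY: COLLECTION_TIMETABLE_MONTHLY,
    ProcessContext.QUALIFIER_YEARLY: COLLECTION_TIMETABLE_YEARLY,
}

def _get_timetable_collection(self, process_name):
    qualifier = ProcessContext.get_time_qualifier(process_name)
    if qualifier not in QUALIFIER_TO_COLLECTION:
        raise ValueError('unknown time qualifier: %s for %s' % (qualifier, process_name))
    return CollectionContext.get_collection(self.logger, QUALIFIER_TO_COLLECTION[qualifier])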
def retrieve_configuration(logger, box_id):
    """ method reads box configuration from the MongoDB"""
    collection = CollectionContext.get_collection(logger, COLLECTION_BOX_CONFIGURATION)
    document = collection.find_one( { BoxConfigurationEntry.BOX_ID : box_id } )
    if document is None:
        raise LookupError('MongoDB has no process list for box_id = %r' % box_id)
    return BoxConfigurationEntry(document)
def clean_session_entries():
    connection = CollectionContext.get_collection(logging, COLLECTION_SINGLE_SESSION)
    for i in range(base_fixtures.TOTAL_ENTRIES):
        key = generate_session_composite_key(i, base_fixtures.TOTAL_ENTRIES)
        connection.remove({
                AbstractModel.DOMAIN_NAME : key[0],
                AbstractModel.TIMESTAMP : key[1],
                AbstractModel.FAMILY_USER_PROFILE + '.' + AbstractModel.SESSION_ID : 'session_id_' + str(i)})
    def _mq_callback(self, message):
        """ wraps call of abstract method with try/except 
        in case exception breaks the abstract method, this method:
        - catches the exception
        - logs the exception
        - marks unit of work as INVALID"""
        try:
            single_session_collection = CollectionContext.get_collection(self.logger, COLLECTION_SINGLE_SESSION)
            raw_data = RawData(message.body)
            query = {AbstractModel.DOMAIN_NAME: raw_data.get_key()[0],
                     AbstractModel.FAMILY_USER_PROFILE + '.' + AbstractModel.SESSION_ID: raw_data.get_session_id()}
            document = single_session_collection.find_one(query)

            if document is None:
                # insert the record
                session = SingleSessionStatistics()

                # input data constraints - both session_id and user_id must be present in MQ message
                session.composite_key(raw_data.get_key()[0], time_helper.raw_to_session(raw_data.get_key()[1]))
                session.set_session_id(raw_data.get_session_id())
                session.set_ip(raw_data.get_ip())
                session.set_total_duration(0)

                session = self.update_session_body(raw_data, session)
                self.add_entry(session, 0, raw_data)
                self.performance_ticker.increment_insert()
            else:
                # update the click_xxx info
                session = SingleSessionStatistics(document)

                session = self.update_session_body(raw_data, session)
                duration = raw_data.get_key()[1] - time_helper.session_to_epoch(session.get_key()[1])
                session.set_total_duration(duration)

                index = session.get_number_of_entries()
                self.add_entry(session, index, raw_data)
                self.performance_ticker.increment_update()

            if time.time() - self._last_safe_save_time < self.SAFE_SAVE_INTERVAL:
                is_safe = False
            else:
                is_safe = True
                self._last_safe_save_time = time.time()

            single_session_collection.save(session.get_document(), safe=is_safe)
            self.consumer.acknowledge(message.delivery_tag)
        except AutoReconnect as e:
            self.logger.error('MongoDB connection error: %r\nRe-queueing message & exiting the worker' % e)
            self.consumer.reject(message.delivery_tag)
            raise e
        except (KeyError, IndexError) as e:
            self.logger.error('Error is considered Unrecoverable: %r\nCancelled message: %r' % (e, message.body))
            self.consumer.cancel(message.delivery_tag)
        except Exception as e:
            self.logger.error('Error is considered Recoverable: %r\nRe-queueing message: %r' % (e, message.body))
            self.consumer.reject(message.delivery_tag)
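The is_safe toggle above throttles write acknowledgement: most saves are fire-and-forget, and at most one save per SAFE_SAVE_INTERVAL seconds waits for MongoDB confirmation. A self-contained sketch of that throttling pattern (class and method names are illustrative, not from the codebase):

import time

class SafeSaveThrottle(object):
    """ illustrative helper: returns True at most once per `interval` seconds """
    def __init__(self, interval=30):
        self.interval = interval
        self._last_safe_save_time = 0.0

    def should_save_safely(self):
        now = time.time()
        if now - self._last_safe_save_time < self.interval:
            return False
        self._last_safe_save_time = now
        return True

# usage sketch: collection.save(document, safe=throttle.should_save_safely())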
def retrieve_by_id(logger, object_id):
    """ method finds unit_of_work record and returns it to the caller"""
    query = {"_id": object_id}
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    db_entry = collection.find_one(query)
    if db_entry is None:
        msg = "Unit_of_work with ID=%s was not found" % str(object_id)
        logger.warning(msg)
        raise LookupError(msg)
    return UnitOfWorkEntry(db_entry)
def retrieve(logger, process_name):
    """ method finds scheduler_configuration record and returns it to the caller"""
    query = {"process_name": process_name}
    collection = CollectionContext.get_collection(logger, COLLECTION_SCHEDULER_CONFIGURATION)
    db_entry = collection.find_one(query)
    if db_entry is None:
        msg = "SchedulerConfigurationEntry for process=%s was not found" % str(process_name)
        logger.warning(msg)
        raise LookupError(msg)
    return SchedulerConfigurationEntry(db_entry)
def retrieve_by_params(logger, process_name, timestamp, start_obj_id, end_obj_id):
    """ method finds unit_of_work record and returns it to the caller"""
    query = {
        UnitOfWorkEntry.PROCESS_NAME: process_name,
        UnitOfWorkEntry.TIMESTAMP: timestamp,
        UnitOfWorkEntry.START_OBJ_ID: start_obj_id,
        UnitOfWorkEntry.END_OBJ_ID: end_obj_id,
    }
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    db_entry = collection.find_one(query)
    if db_entry is None:
        msg = "Unit_of_work satisfying query %r was not found" % query
        logger.warning(msg)
        raise LookupError(msg)
    return UnitOfWorkEntry(db_entry)
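retrieve_by_id, retrieve and retrieve_by_params above all follow the same find_one-or-raise pattern; a hedged sketch of a shared helper (the helper name and signature are illustrative and not part of the codebase):

def _find_one_or_raise(logger, collection_name, query, entry_klass, description):
    # illustrative helper: wraps find_one, logging and raising LookupError on a miss
    collection = CollectionContext.get_collection(logger, collection_name)
    db_entry = collection.find_one(query)
    if db_entry is None:
        msg = '%s satisfying query %r was not found' % (description, query)
        logger.warning(msg)
        raise LookupError(msg)
    return entry_klass(db_entry)

# usage sketch:
# return _find_one_or_raise(logger, COLLECTION_UNITS_OF_WORK, {'_id': object_id},
#                           UnitOfWorkEntry, 'Unit_of_work')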
    def update_scope_of_processing(self, process_name, unit_of_work, start_time, end_time, time_record):
        """method reads collection and refine slice upper bound for processing"""
        source_collection_name = unit_of_work.get_source_collection()
        source_collection = CollectionContext.get_collection(self.logger, source_collection_name)

        query = { AbstractModel.TIMESTAMP : { '$gte' : start_time, '$lt' : end_time } }
        dec_search = source_collection.find(spec=query, fields='_id').sort('_id', DESCENDING).limit(1)
        last_object_id = dec_search[0]['_id']
        unit_of_work.set_end_id(str(last_object_id))
        unit_of_work_helper.update(self.logger, unit_of_work)

        msg = 'Updated range to process for %s in timeperiod %s for collection %s: [%s : %s]'\
                % (process_name, time_record.get_timestamp(), source_collection_name,
                   unit_of_work.get_start_id(), str(last_object_id))
        self._log_message(INFO, process_name, time_record, msg)
def create_site_stats(collection, composite_key_function, statistics_klass, seed='RANDOM_SEED_OBJECT'):
    connection = CollectionContext.get_collection(logging, collection)
    random.seed(seed)
    object_ids = []
    for i in range(TOTAL_ENTRIES):
        key = composite_key_function(i, TOTAL_ENTRIES)
        site_stat = statistics_klass()
        site_stat.composite_key(key[0], key[1])
        site_stat.set_number_of_visits(random.randint(1, 1000))
        site_stat.set_total_duration(random.randint(0, 100))
        
        items = _generate_entries('os_', 5, i)
        site_stat.set_os(items)
        
        items = _generate_entries('browser_', 5, i)
        site_stat.set_browsers(items)
        
        items = dict() 
        items['(320, 240)'] = 3
        items['(640, 480)'] = 5
        items['(1024, 960)'] = 7
        items['(1280, 768)'] = 9
        site_stat.set_screen_res(items)
        
        items = dict() 
        items['ca_en'] = 3
        items['ca_fr'] = 5
        items['ua_uk'] = 7
        items['us_en'] = 9
        site_stat.set_languages(items)
        
        items = dict() 
        items['ca'] = 3
        items['fr'] = 5
        items['uk'] = 7
        items['us'] = 9
        site_stat.set_countries(items)

        stat_id = connection.insert(site_stat.get_document(), safe=True)
        object_ids.append(stat_id)
                
    return object_ids 
    def compute_scope_of_processing(self, process_name, start_time, end_time, time_record):
        """method reads collection and identify slice for processing"""
        source_collection_name = ProcessContext.get_source_collection(process_name)
        target_collection_name = ProcessContext.get_target_collection(process_name)
        source_collection = CollectionContext.get_collection(self.logger, source_collection_name)

        query = { AbstractModel.TIMESTAMP : { '$gte' : start_time, '$lt' : end_time } }
        asc_search = source_collection.find(spec=query, fields='_id').sort('_id', ASCENDING).limit(1)
        if asc_search.count() == 0:
            raise LookupError('No messages in timeperiod: %s:%s in collection %s'
                                % (start_time, end_time, source_collection_name))
        first_object_id = asc_search[0]['_id']

        dec_search = source_collection.find(spec=query, fields='_id').sort('_id', DESCENDING).limit(1)
        last_object_id = dec_search[0]['_id']

        unit_of_work = UnitOfWorkEntry()
        unit_of_work.set_timestamp(start_time)
        unit_of_work.set_start_id(str(first_object_id))
        unit_of_work.set_end_id(str(last_object_id))
        unit_of_work.set_start_timestamp(start_time)
        unit_of_work.set_end_timestamp(end_time)
        unit_of_work.set_created_at(datetime.utcnow())
        unit_of_work.set_source_collection(source_collection_name)
        unit_of_work.set_target_collection(target_collection_name)
        unit_of_work.set_state(unit_of_work.STATE_REQUESTED)
        unit_of_work.set_process_name(process_name)
        unit_of_work.set_number_of_retries(0)

        try:
            uow_id = unit_of_work_helper.insert(self.logger, unit_of_work)
        except DuplicateKeyError as e:
            e.first_object_id = str(first_object_id)
            e.last_object_id = str(last_object_id)
            e.process_name = process_name
            e.timestamp = start_time
            raise e

        self.publishers.get_publisher(process_name).publish(str(uow_id))
        msg = 'Published: UOW %r for %r in timeperiod %r.' % (uow_id, process_name, start_time)
        self._log_message(INFO, process_name, time_record, msg)
        return unit_of_work
def create_session_stats(composite_key_function, seed='RANDOM_SEED_OBJECT'):
    time_array = ['20010303102210', '20010303102212', '20010303102215', '20010303102250']
    connection = CollectionContext.get_collection(logging, COLLECTION_SINGLE_SESSION)
    random.seed(seed)
    object_ids = []
    for i in range(TOTAL_ENTRIES):
        key = composite_key_function(i, TOTAL_ENTRIES)
        session = SingleSessionStatistics()
        session.composite_key(key[0], key[1])
        session.set_session_id('session_id_' + str(i))
        session.set_ip('192.168.0.2')
        if i % 3 == 0:
            session.set_screen_res(240, 360)
        elif i % 5 == 0:
            session.set_screen_res(360, 480)
        else:
            session.set_screen_res(760, 980)

        if i % 2 == 0:
            session.set_os('Linux')
            session.set_browser('FF ' + str(i % 4))
            session.set_language('en_ca')
            session.set_country('ca')
        else:
            session.set_os('Windows')
            session.set_browser('IE ' + str(i % 9))
            session.set_language('ua_uk')
            session.set_country('eu')

        session.set_total_duration(random.randint(0, 200))
        session.set_number_of_pageviews(random.randint(1, 5))

        for index in range(random.randint(1, 4)):
            session.set_number_of_entries(index + 1)
            session.set_entry_timestamp(index, time_array[index])

        sess_id = connection.insert(session.get_document(), safe=True)
        object_ids.append(sess_id)
        
    return object_ids 
    def start(self):
        """ reading scheduler configurations and starting timers to trigger events """
        collection = CollectionContext.get_collection(self.logger, COLLECTION_SCHEDULER_CONFIGURATION)
        cursor = collection.find({})
        if cursor.count() == 0:
            raise LookupError('MongoDB has no scheduler configuration entries')

        for entry in cursor:
            document = SchedulerConfigurationEntry(entry)
            interval = document.get_interval()
            is_active = document.get_process_state() == SchedulerConfigurationEntry.STATE_ON
            process_type = ProcessContext.get_type(document.get_process_name())
            parameters = [document.get_process_name(), document]

            if process_type == TYPE_ALERT:
                function = self.fire_alert
            elif process_type in (TYPE_HORIZONTAL_AGGREGATOR, TYPE_VERTICAL_AGGREGATOR):
                function = self.fire_worker
            elif process_type == TYPE_GARBAGE_COLLECTOR:
                function = self.fire_garbage_collector
            else:
                self.logger.error('Cannot start scheduler for %s since it has no processing function' % process_type)
                continue

            handler = RepeatTimer(interval, function, args=parameters)
            self.thread_handlers[document.get_process_name()] = handler

            if is_active:
                handler.start()
                self.logger.info('Started scheduler for %s:%s, triggering every %d seconds'
                                 % (process_type, document.get_process_name(), interval))
            else:
                self.logger.info('Handler for %s:%s registered in Scheduler. Idle until activated.'
                                 % (process_type, document.get_process_name()))

        # as Scheduler is now initialized and running - we can safely start its MX
        self.start_mx()
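RepeatTimer is used above but not shown in this listing; a minimal, self-contained stand-in with the same constructor shape (interval, function, args) built on threading.Timer - an assumption about its behaviour, not the project's actual implementation:

import threading

class RepeatTimer(object):
    # illustrative stand-in: calls function(*args) every `interval` seconds until cancelled
    def __init__(self, interval, function, args=None):
        self.interval = interval
        self.function = function
        self.args = args if args is not None else []
        self._timer = None

    def _run(self):
        self.function(*self.args)
        self.start()  # re-arm the timer for the next interval

    def start(self):
        self._timer = threading.Timer(self.interval, self._run)
        self._timer.daemon = True
        self._timer.start()

    def cancel(self):
        if self._timer is not None:
            self._timer.cancel()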
def update(logger, scheduler_configuration):
    """ method finds scheduler_configuration record and update its DB representation"""
    w_number = CollectionContext.get_w_number(logger, COLLECTION_SCHEDULER_CONFIGURATION)
    collection = CollectionContext.get_collection(logger, COLLECTION_SCHEDULER_CONFIGURATION)
    collection.save(scheduler_configuration.get_document(), safe=True, w=w_number)
def remove(logger, uow_id):
    w_number = CollectionContext.get_w_number(logger, COLLECTION_UNITS_OF_WORK)
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    collection.remove(uow_id, safe=True, w=w_number)
def insert(logger, unit_of_work):
    """ inserts unit of work to MongoDB. @throws DuplicateKeyError is such record already exists """
    w_number = CollectionContext.get_w_number(logger, COLLECTION_UNITS_OF_WORK)
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    uow_id = collection.insert(unit_of_work.get_document(), safe=True, w=w_number)
    return uow_id
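A hedged usage sketch of the insert/DuplicateKeyError contract documented above; it assumes the matching getter methods (get_process_name, get_timestamp, get_start_id, get_end_id) exist on UnitOfWorkEntry and that DuplicateKeyError is pymongo's, as in the scope-computation snippet earlier:

from pymongo.errors import DuplicateKeyError

try:
    uow_id = insert(logger, unit_of_work)
except DuplicateKeyError:
    # an identical unit_of_work already exists - fall back to the stored record
    existing = retrieve_by_params(logger,
                                  unit_of_work.get_process_name(),
                                  unit_of_work.get_timestamp(),
                                  unit_of_work.get_start_id(),
                                  unit_of_work.get_end_id())
    uow_id = existing.get_document()['_id']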
def update(logger, unit_of_work):
    """ method finds unit_of_work record and change its status"""
    w_number = CollectionContext.get_w_number(logger, COLLECTION_UNITS_OF_WORK)
    collection = CollectionContext.get_collection(logger, COLLECTION_UNITS_OF_WORK)
    collection.save(unit_of_work.get_document(), safe=True, w=w_number)
 def _save_time_record(self, process_name, time_record):
     collection = self._get_timetable_collection(process_name)
     w_number = CollectionContext.get_w_number(self.logger, COLLECTION_TIMETABLE_YEARLY)
     return collection.save(time_record.get_document(), safe=True, w=w_number)
 def load_tree(self):
     """ method iterates thru all objects in timetable collections and load them into timetable"""
     self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_HOURLY))
     self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_DAILY))
     self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_MONTHLY))
     self._build_tree_by_level(CollectionContext.get_collection(self.logger, COLLECTION_TIMETABLE_YEARLY))
 def _get_target_collection(self):
     """collection to store aggregated documents"""
     return CollectionContext.get_collection(self.logger, ProcessContext.get_target_collection(self.process_name))
def update_configuration(logger, box_configuration):
    """ method updates box configuration in the MongoDB"""
    w_number = CollectionContext.get_w_number(logger, COLLECTION_BOX_CONFIGURATION)
    collection = CollectionContext.get_collection(logger, COLLECTION_BOX_CONFIGURATION)
    collection.save(box_configuration.get_document(), safe=True, w=w_number)
 def __init__(self, process_name):
     super(GarbageCollectorWorker, self).__init__(process_name)
     self.publishers = PublishersPool(self.logger)
     self.collection = CollectionContext.get_collection(self.logger, COLLECTION_UNITS_OF_WORK)
     self.lock = Lock()
 def _get_source_collection(self):
     """collection with data for processing"""
     return CollectionContext.get_collection(self.logger, ProcessContext.get_source_collection(self.process_name))