Example #1
0
    def process(self, user_handle, **kwargs):
        """
            Gather bytes-added metric data using multiprocessing.

            - Parameters:
                - **user_handle** - value or iterable of values identifying
                    user(s).  Falsy => all distinct users in the configured
                    time range are pulled from the data source.

            Returns *self* (fluent interface); aggregated rows are stored in
            ``self._results``.
        """

        self.apply_default_kwargs(kwargs,'process')

        # Cast for consistency with the other metric classes -- kwargs
        # values may arrive as strings.
        k = int(kwargs['num_threads'])
        log_progress = bool(kwargs['log_progress'])
        log_frequency = int(kwargs['log_frequency'])

        # Ensure the handle is iterable
        if user_handle and not hasattr(user_handle, '__iter__'):
            user_handle = [user_handle]

        if not user_handle:
            # No users supplied - fetch every distinct user in the window
            sql = bytes_added_rev_user_query(self._start_ts_, self._end_ts_)

            if log_progress: logging.info(
                __name__ + '::Getting all distinct users: " %s "' % sql)
            user_handle = [str(row[0]) for row in
                           self._data_source_.execute_SQL(sql)]
            if log_progress: logging.info(
                __name__ + '::Retrieved %s users.' % len(user_handle))

        # get revisions
        args = [log_progress, self._start_ts_,
                self._end_ts_, self._project_, self._namespace_]
        revs = mpw.build_thread_pool(user_handle, _get_revisions, k, args)

        # Start worker threads and aggregate results for bytes added
        args = [log_progress, log_frequency, self._project_]
        self._results = agg.list_sum_by_group(
            mpw.build_thread_pool(revs, _process_help, k, args), 0)

        # Add any missing users - O(n).  Membership test uses the idiomatic
        # `in` operator on a pre-built set rather than calling __contains__.
        tallied_users = {str(r[0]) for r in self._results}
        for user in user_handle:
            if str(user) not in tallied_users:
                # Add a row indicating no activity for that user
                self._results.append([user, 0, 0, 0, 0, 0])
        return self
Example #2
0
    def process(self, user_handle, **kwargs):
        """ Compute the metric for the given user(s) via a worker pool. """

        self.apply_default_kwargs(kwargs,'process')

        # Normalize a scalar handle into a one-element list
        if not hasattr(user_handle, '__iter__'):
            user_handle = [user_handle]

        thread_count = int(kwargs['num_threads'])
        log = bool(kwargs['log'])

        if log:
            logging.info(__name__ + "::parameters = " + str(kwargs))

        # Fan the users out across the worker pool
        pool_args = [self._project_, log, self._start_ts_, self._end_ts_]
        self._results = mpw.build_thread_pool(user_handle, _process_help,
                                              thread_count, pool_args)

        return self
Example #3
0
    def process(self, user_handle, **kwargs):
        """ Run the metric computation for the given user(s) in parallel. """

        self.apply_default_kwargs(kwargs,'process')

        # A scalar handle becomes a single-element list
        if not hasattr(user_handle, '__iter__'):
            user_handle = [user_handle]

        outer_threads = int(kwargs['num_threads'])
        rev_threads = int(kwargs['rev_threads'])
        log_progress = bool(kwargs['log_progress'])

        # Bundle per-thread state: project, logging flag, look-ahead/back
        # windows, the timeframe, and the inner revision-thread count.
        pool_args = [self._project_, log_progress, self.look_ahead,
                     self.look_back, self._start_ts_, self._end_ts_,
                     rev_threads]
        self._results = mpw.build_thread_pool(user_handle, _process_help,
                                              outer_threads, pool_args)

        return self
Example #4
0
def _process_help(args):
    """ Used by Threshold::process() for forking.
        Should not be called externally. """

    user_data, state = args[0], args[1]
    thread_args = RevertRateArgsClass(state[0], state[1], state[2],
                                      state[3], state[4], state[5],
                                      state[6])

    if thread_args.log_progress:
        logging.info(__name__ +
                    '::Computing reverts on %s users in thread %s.'
                    % (len(user_data), str(os.getpid())))

    results_agg = list()
    for user in user_data:
        # Pull this user's revisions from the slave DB; a fresh connection
        # is opened per user and released as soon as rows are materialized.
        conn = dl.Connector(instance='slave')
        conn._cur_.execute(
            revert_rate_user_revs_query(thread_args.project, user,
                thread_args.date_start,
                thread_args.date_end)
        )
        revisions = [rev for rev in conn._cur_]
        del conn

        # Tally revision / revert counts across the revision worker pool
        results_thread = mpw.build_thread_pool(revisions, _revision_proc,
                                               thread_args.rev_threads,
                                               state)
        total_revisions = 0.0
        total_reverts = 0.0
        for outcome in results_thread:
            total_revisions += outcome[0]
            total_reverts += outcome[1]

        # Guard the division for users with no revisions
        if total_revisions:
            rate = total_reverts / total_revisions
        else:
            rate = 0.0
        results_agg.append([user, rate, total_revisions])

    if thread_args.log_progress:
        logging.info(__name__ + '::PID %s complete.' % (str(os.getpid())))

    return results_agg
Example #5
0
    def process(self, user_handle, **kwargs):
        """
            This function gathers threshold (survival) metric data by: ::

                1. selecting all new user registrations within the timeframe
                    and in the user list (empty means select all within the
                    timeframe.)
                2. For each user id find the number of revisions before (after)
                    the threshold (survival) cut-off time t

            - Parameters:
                - **user_handle** - String or Integer (optionally lists).
                    Value or list of values representing user handle(s).

            **NOTA BENE** - the kwarg "survival" determines whether a
                survival metric is computed rather than a threshold metric
        """

        self.apply_default_kwargs(kwargs,'process')

        # Cast thread count for consistency with the other metric classes
        k = int(kwargs['num_threads'])
        log_progress = bool(kwargs['log_progress'])
        survival = bool(kwargs['survival'])
        restrict = bool(kwargs['restrict'])

        # Format condition on user ids.  if no user handle exists there is no
        # condition.
        if not hasattr(user_handle, '__iter__'):
            user_handle = [user_handle]
        if not user_handle:
            user_handle.append(-1)  # Sentinel: matches no user

        reg_query = threshold_reg_query(user_handle, self._project_)
        self._data_source_._cur_.execute(reg_query)

        # Process results
        user_data = [r for r in self._data_source_._cur_]
        args = [self._project_, self._namespace_, self._n_,
                self._t_, log_progress, survival, restrict,
                self._start_ts_, self._end_ts_]
        self._results = mpw.build_thread_pool(user_data, _process_help, k, args)

        return self