def _get_revisions(args):
    """
    Retrieve the total set of revision records for users within a timeframe.

    - Parameters:
        - **args** - Two-item sequence ``(users, state)``.  ``users`` is
            passed to the ``UMP_MAP`` entry selected by the metric
            parameters' ``group``; ``state`` is unpacked into the metric
            parameters with ``um.UserMetric._unpack_params``.

    - Return:
        - List.  The concatenated rows produced by ``query_mod.rev_query``
            for every item yielded by the ``UMP_MAP`` object.  An empty
            list is returned (and the error logged) if a query raises
            ``UMQueryCallError``; partial results are discarded.
    """
    users, state = args[0], args[1]
    um.log_pool_worker_start(__name__, _get_revisions.__name__,
                             users, state)

    metric_params = um.UserMetric._unpack_params(state)
    query_args_type = namedtuple('QueryArgs', 'date_start date_end namespace')

    revs = list()
    umpd_obj = UMP_MAP[metric_params.group](users, metric_params)

    try:
        for t in umpd_obj:
            query_args = query_args_type(t.start, t.end,
                                         metric_params.namespace)
            revs.extend(query_mod.rev_query(t.user, metric_params.project,
                                            query_args))
    except query_mod.UMQueryCallError as e:
        logging.error('{0}:: {1}. PID={2}'.format(__name__, e.message,
                                                  os.getpid()))
        return []

    # Report the end of *this* worker; the original logged the name of
    # _process_help here, mislabelling the log entry.
    um.log_pool_worker_end(__name__, _get_revisions.__name__)
    return revs
def _process_help(args):
    """
    Determine the bytes added over a number of revisions for user(s).

    The flow of the request is as follows:

        #. For each revision row read the user, the revision length and
            the parent revision id
        #. For each parent revision get its length (0 when the parent id
            is 0, i.e. there is no parent record)
        #. Compute the difference in length between each revision and
            its parent
        #. Record edit count, raw bytes added (with sign and absolute),
            amount of positive bytes added, amount of negative bytes added

    Rows that are malformed, or for which either length cannot be
    determined, are skipped and counted as missed; the processed/total
    counts are reported in the worker-end log entry.

    - Parameters:
        - **args** - Two-item sequence ``(revs, state)``.  ``revs`` is a
            sized collection of rows indexed as ``row[0]`` = user,
            ``row[1]`` = revision length, ``row[2]`` = parent revision id.
            ``state`` is unpacked into the metric parameters with
            ``um.UserMetric._unpack_params``.

    - Return:
        - List of lists, one entry per user:
            ``[user, net bytes, absolute bytes, positive bytes,
            negative bytes, edit count]``
    """
    revs, state = args[0], args[1]
    um.log_pool_worker_start(__name__, _process_help.__name__, revs, state)

    metric_params = um.UserMetric._unpack_params(state)

    # user -> [net, absolute, positive, negative, edit count]
    bytes_added = dict()
    missed_records = 0
    total_rows = len(revs)

    # Get the difference for each revision length from the parent
    # to compute bytes added
    for row in revs:
        try:
            user = str(row[0])
            rev_len_total = int(row[1])
            parent_rev_id = row[2]
        except (IndexError, TypeError, ValueError):
            # ValueError covers a non-numeric length, which previously
            # escaped and aborted the whole worker.
            missed_records += 1
            continue

        # Produce the revision length of the parent.  In case of a new
        # article, parent_rev_id = 0, no record in the db
        if parent_rev_id == 0:
            parent_rev_len = 0
        else:
            try:
                parent_rev_len = query_mod.rev_len_query(
                    parent_rev_id, metric_params.project)
            except query_mod.UMQueryCallError:
                missed_records += 1
                logging.error(
                    '%s::Could not produce rev diff for %s on rev_id %s.',
                    __name__, user, parent_rev_id)
                continue

        # Ignore the revision if the parent length is undetermined
        try:
            bytes_added_bit = rev_len_total - int(parent_rev_len)
        except (TypeError, ValueError):
            missed_records += 1
            continue

        # Create the per-user totals on first sight of the user
        totals = bytes_added.setdefault(user, [0] * 5)
        totals[0] += bytes_added_bit
        totals[1] += abs(bytes_added_bit)
        if bytes_added_bit > 0:
            totals[2] += bytes_added_bit
        else:
            totals[3] += bytes_added_bit
        totals[4] += 1

    results = [[user] + bytes_added[user] for user in bytes_added]

    extra = 'Processed {0} out of {1} records.'.format(
        total_rows - missed_records, total_rows)
    um.log_pool_worker_end(__name__, _process_help.__name__, extra=extra)
    return results