def _process_help(args):
    """ Used by Threshold::process() for forking. Should not be called
    externally.

    Parameters:
        args: two-element sequence where args[0] is the list of users to
            process and args[1] is the shared metric state tuple.

    Returns a list of [user, revert_rate, total_revisions] triples, one
    per user that could be queried successfully.
    """
    state = args[1]
    # Rebuild the per-thread argument object from the positional state
    # tuple.  NOTE(review): index 5 is deliberately skipped here
    # (state[4] then state[6]) -- confirm against the producer of this
    # tuple that slot 5 is genuinely unused by RevertRateArgsClass.
    thread_args = RevertRateArgsClass(state[0], state[1], state[2],
                                      state[3], state[4], state[6],
                                      state[7], state[8])
    users = args[0]
    if thread_args.log_progress:
        logging.info(__name__ + ' :: Computing reverts on %s users (PID %s)'
                                % (len(users), str(os.getpid())))
    results_agg = list()
    dropped_users = 0  # count of users skipped due to query failures

    # Map users to their measurement periods according to the group mode.
    umpd_obj = UMP_MAP[thread_args.group](users, thread_args)
    for user_data in umpd_obj:
        total_revisions = 0.0
        total_reverts = 0.0

        # Call query on revert rate for each user
        #
        # 1. Obtain user registration date
        # 2. Compute end date based on 't'
        # 3. Get user revisions in time period
        query_args = namedtuple('QueryArgs', 'date_start date_end')\
            (format_mediawiki_timestamp(user_data.start),
             format_mediawiki_timestamp(user_data.end))

        try:
            revisions = query_mod.\
                revert_rate_user_revs_query(user_data.user,
                                            thread_args.project,
                                            query_args)
        except query_mod.UMQueryCallError as e:
            # Drop the user rather than aborting the whole batch.
            # NOTE: `e.message` is Python 2 only.
            logging.error(__name__ + ' :: Failed to '
                                     'get revisions: {0}'.format(e.message))
            dropped_users += 1
            continue

        # Fan the user's revisions out to a thread pool; each result is a
        # (revision_count, revert_count) pair which is summed here.
        results_thread = mpw.build_thread_pool(revisions, _revision_proc,
                                               thread_args.rev_threads,
                                               state)
        for r in results_thread:
            total_revisions += r[0]
            total_reverts += r[1]

        # Guard the division: a user with no revisions gets rate 0.0.
        if not total_revisions:
            results_agg.append([user_data.user, 0.0, total_revisions])
        else:
            results_agg.append([user_data.user,
                                total_reverts / total_revisions,
                                total_revisions])

    if thread_args.log_progress:
        logging.debug(__name__ + ' :: PID {0} complete. Dropped users = {1}'.
                      format(str(os.getpid()), dropped_users))
    return results_agg
def get(users, metric):
    """
    Yield one USER_METRIC_PERIOD_DATA record per user, all covering the
    metric's fixed [datetime_start, datetime_end] window.

    Parameters:
        users: iterable of user identifiers.
        metric: object exposing ``datetime_start`` and ``datetime_end``.
    """
    # The window is identical for every user, so format the two
    # timestamps once instead of twice per user (the original recomputed
    # both on every iteration).
    start_ts = format_mediawiki_timestamp(metric.datetime_start)
    end_ts = format_mediawiki_timestamp(metric.datetime_end)
    for user in users:
        yield USER_METRIC_PERIOD_DATA(user, start_ts, end_ts)
def get(users, metric):
    """
    Yield one USER_METRIC_PERIOD_DATA record per user spanning the user's
    registration date through registration + t hours.

    Parameters:
        users: iterable of user identifiers.
        metric: object exposing ``project`` and ``t`` (hours).
    """
    registration_rows = query_mod.user_registration_date(
        users, metric.project, None)
    for record in registration_rows:
        # record[0] is the user id; record[1] the raw registration stamp.
        reg_time = date_parse(record[1])
        end_time = reg_time + timedelta(hours=int(metric.t))
        yield USER_METRIC_PERIOD_DATA(
            record[0],
            format_mediawiki_timestamp(reg_time),
            format_mediawiki_timestamp(end_time))
def process(self, user_handle, **kwargs):
    """
        Determine the edit rate of user(s).  The parameter *user_handle*
        can be either a string or an integer or a list of these types.
        When the *user_handle* type is integer it is interpreted as a user
        id, and as a user_name for string input.  If a list of users is
        passed to the *process* method then a dict object with edit rates
        keyed by user handles is returned.

        - Parameters:
            - **user_handle** - String or Integer (optionally lists).
                Value or list of values representing user handle(s).
        - Return:
            - Dictionary. key(string): user handle, value(Float): edit
                counts
    """
    edit_rate = list()

    # Extract edit count for given parameters.  A deep copy of this
    # instance's state seeds the EditCount metric so the two do not share
    # mutable attributes.
    ec_kwargs = deepcopy(self.__dict__)
    e = ec.EditCount(**ec_kwargs).process(user_handle, **kwargs)

    # Compute time difference between datetime objects and get the
    # integer number of seconds
    if self.group == umpt.REGISTRATION:
        # Fixed window of 't' hours from each user's registration.
        time_diff_sec = self.t * 3600.0
    elif self.group == umpt.INPUT:
        # Explicit window: parse the configured start/end timestamps.
        try:
            start_ts_obj = date_parse(
                format_mediawiki_timestamp(self.datetime_start))
            end_ts_obj = date_parse(
                format_mediawiki_timestamp(self.datetime_end))
        except (AttributeError, ValueError):
            raise um.UserMetricError()
        time_diff_sec = (end_ts_obj - start_ts_obj).total_seconds()
    else:
        raise um.UserMetricError('group parameter not specified.')

    # Normalize the time interval based on the measure
    if self.time_unit == self.TIME_UNIT_TYPE.DAY:
        time_diff = time_diff_sec / (24 * 60 * 60)
    elif self.time_unit == self.TIME_UNIT_TYPE.HOUR:
        time_diff = time_diff_sec / (60 * 60)
    else:
        time_diff = time_diff_sec

    # Build the list of edit rate metrics.  Iterate the metric object
    # directly instead of calling e.__iter__() explicitly.
    for i in e:
        new_i = i[:]  # Make a copy of the edit count element
        new_i.append(new_i[1] / (time_diff * self.time_unit_count))
        new_i.append(time_diff)
        edit_rate.append(new_i)
    self._results = edit_rate
    return self
def get(users, metric):
    """
    Yield one USER_METRIC_PERIOD_DATA record per user spanning that
    user's registration date through registration + t hours.
    """
    # For each registration date build time interval.  Use distinct names
    # for the row source and the parsed date (the original reused `reg`
    # for both).
    registration_rows = get_registration_dates(users, metric.project)
    for record in registration_rows:
        reg_time = date_parse(record[1])
        end_time = reg_time + timedelta(hours=int(metric.t))
        yield USER_METRIC_PERIOD_DATA(
            record[0],
            format_mediawiki_timestamp(reg_time),
            format_mediawiki_timestamp(end_time))
def _get_timeseries(date_start, date_end, interval):
    """
        Generates a series of timestamps given a start date,
        end date, and interval (in hours).
    """
    step = datetime.timedelta(hours=int(interval))

    # Normalize both bounds through the MediaWiki formatter, then parse.
    # The cursor begins one step *before* the start so the first yielded
    # value is the start itself.
    cursor = date_parse(format_mediawiki_timestamp(date_start)) - step
    stop = date_parse(format_mediawiki_timestamp(date_end))

    while cursor < stop:
        cursor += step
        yield cursor
def get_users(self, date_start, date_end, project='enwiki'):
    """
        Returns a Generator for MediaWiki user IDs.
    """
    # NOTE(review): the SQL text is assembled with % interpolation rather
    # than driver parameters -- if date_start/date_end/project can carry
    # untrusted input this is an injection risk; prefer the parameterized
    # execute(query, params) form used elsewhere in this codebase.
    sql_params = {
        'date_start': format_mediawiki_timestamp(date_start),
        'date_end': format_mediawiki_timestamp(date_end),
        'project': project,
    }
    conn = Connector(instance=settings.PROJECT_DB_MAP[project])
    statement = self.QUERY_TYPES[self._query_type] % sql_params
    conn._cur_.execute(statement)
    for record in conn._cur_:
        yield record[0]
def generate_test_cohort_name(project):
    """
        Generates a name for a test cohort to be inserted into
        usertags[_meta].
    """
    timestamp = format_mediawiki_timestamp(datetime.now())
    return 'testcohort_{0}_{1}'.format(project, timestamp)
def get_users(self, date_start, date_end, project='enwiki'):
    """
        Returns a Generator for MediaWiki user IDs.
    """
    # @TODO MOVE DB REFS INTO QUERY MODULE
    # Date bounds are passed as driver parameters; only the db token is
    # substituted (and escaped) into the SQL text.
    query_params = {
        'date_start': format_mediawiki_timestamp(date_start),
        'date_end': format_mediawiki_timestamp(date_end),
    }
    conn = Connector(instance=settings.PROJECT_DB_MAP[project])
    statement = sub_tokens(self.QUERY_TYPES[self._query_type],
                           db=escape_var(project))
    conn._cur_.execute(statement, query_params)
    for record in conn._cur_:
        yield record[0]
def format_request_params(request_meta):
    """
        Formats request data and ensures that it is clean using Flask
        escape functionality.

            Parameters
            ~~~~~~~~~~

            request_meta : recordtype:
                Stores the request data.
    """
    # Handle any datetime fields passed - raise an exception if the
    # formatting is incorrect.  Both bounds get identical treatment, so
    # validate them in one loop instead of two copy-pasted branches.
    for ts_attr in ('start', 'end'):
        ts_value = getattr(request_meta, ts_attr)
        if ts_value:
            try:
                setattr(request_meta, ts_attr,
                        format_mediawiki_timestamp(escape(ts_value)))
            except ValueError:
                # Pass the value of the error code in `error_codes`
                raise MetricsAPIError(error_code=1)

    # Fall back to defaults for missing/unrecognized values.
    if not request_meta.project:
        request_meta.project = DEFAULT_PROJECT
    # Idiom fix: `x not in y` rather than `not x in y`.
    if request_meta.group not in REQUEST_VALUE_MAPPING:
        request_meta.group = DEFAULT_GROUP

    # set the aggregator if there is one
    agg_key = get_agg_key(request_meta.aggregator, request_meta.metric)
    request_meta.aggregator = escape(request_meta.aggregator)\
        if agg_key else None
    # @TODO Escape remaining input

    # MAP request values.
    _map_request_values(request_meta)
def get(users, metric):
    """
    Yield one USER_METRIC_PERIOD_DATA record per user whose registration
    date falls inside the metric's [datetime_start, datetime_end] window;
    the record spans registration through registration + t hours.  Users
    registered outside the window are skipped.
    """
    # The window bounds and the t-hour offset are loop-invariant; the
    # original re-formatted and re-parsed them for every row.
    window_start = date_parse(
        format_mediawiki_timestamp(metric.datetime_start))
    window_end = date_parse(
        format_mediawiki_timestamp(metric.datetime_end))
    offset = timedelta(hours=int(metric.t))

    for row in get_registration_dates(users, metric.project):
        user = row[0]
        reg = date_parse(row[1])
        # Only emit users registered within the metric window (the
        # original's `else: continue` was redundant).
        if window_start <= reg <= window_end:
            yield USER_METRIC_PERIOD_DATA(
                user,
                format_mediawiki_timestamp(reg),
                format_mediawiki_timestamp(reg + offset))
def create_cohort(cohort, project, notes="", owner=1, group=3):
    """
    Insert a new cohort definition row into ``usertags_meta``.

    Raises UMQueryCallError on bad parameter types or on database
    errors (the transaction is rolled back first).
    """
    db_conn = Connector(instance=conf.__cohort_data_instance__)
    touched = format_mediawiki_timestamp(datetime.now())

    # TODO: ALLOW THE COHORT DEF TO BE REFRESHED IF IT ALREADY EXISTS
    logging.debug(__name__ + ' :: Adding new cohort "{0}".'.
        format(cohort))
    if not notes:
        notes = 'Generated by: ' + __name__

    # Create an entry in ``usertags_meta``
    meta_query = query_store[create_cohort.__query_name__]
    try:
        meta_params = {
            'utm_name': str(cohort),
            'utm_project': str(project),
            'utm_notes': str(notes),
            'utm_group': int(group),
            'utm_owner': int(owner),
            'utm_touched': touched,
            'utm_enabled': 0
        }
    except ValueError as e:
        # Bad owner/group values surface as a query-call error.
        raise UMQueryCallError(__name__ + ' :: ' + str(e))

    meta_query = sub_tokens(meta_query,
                            db=conf.__cohort_meta_instance__,
                            table=conf.__cohort_meta_db__)
    try:
        db_conn._cur_.execute(meta_query, meta_params)
        db_conn._db_.commit()
    except (ProgrammingError, OperationalError) as e:
        db_conn._db_.rollback()
        raise UMQueryCallError(__name__ + ' :: ' + str(e))
    del db_conn
def add_cohort_data(cohort, users, project,
                    notes="", owner=1, group=3,
                    add_meta=True):
    """
        Adds a new cohort to backend.

            Parameters
            ~~~~~~~~~~

            cohort : string
                Name of cohort (must be unique).

            users : list
                List of user ids to add to cohort.

            project : string
                Project of cohort.

            notes : string
                Optional free-text note; defaults to a generated marker.

            owner : int
                Cohort owner id.

            group : int
                Cohort group id.

            add_meta : bool
                When True, also insert the ``usertags_meta`` row.
    """
    conn = Connector(instance=conf.__cohort_data_instance__)
    now = format_mediawiki_timestamp(datetime.now())

    # TODO: ALLOW THE COHORT DEF TO BE REFRESHED IF IT ALREADY EXISTS
    if add_meta:
        logging.debug(__name__ + ' :: Adding new cohort "{0}".'.
            format(cohort))
        if not notes:
            notes = 'Generated by: ' + __name__

        # Create an entry in ``usertags_meta``
        # NOTE(review): the statement is assembled with % interpolation;
        # `notes` and `now` are interpolated without escape_var, so this
        # path is SQL-injection prone.  Prefer the parameterized
        # execute(query, params) form used by the sibling implementation.
        utm_query = query_store[add_cohort_data.__query_name__ + '_meta'] % {
            'cohort_meta_instance': conf.__cohort_meta_instance__,
            'cohort_meta_db': conf.__cohort_meta_db__,
            'utm_name': escape_var(cohort),
            'utm_project': escape_var(project),
            'utm_notes': notes,
            'utm_group': escape_var(str(group)),
            'utm_owner': escape_var(str(owner)),
            'utm_touched': now,
            'utm_enabled': '0'
        }
        conn._cur_.execute(utm_query)
        try:
            conn._db_.commit()
        except (ProgrammingError, OperationalError):
            # Best effort: roll back the meta insert and fall through to
            # the user-tag insert below.
            conn._db_.rollback()

    # add data to ``user_tags``
    if users:
        # get uid for cohort
        usertag = get_cohort_id(cohort)

        logging.debug(__name__ + ' :: Adding cohort {0} users.'.
            format(len(users)))

        # Build the raw "(project, uid, tag), ..." VALUES list by
        # stripping the enclosing brackets from the list's repr.
        value_list_ut = [('{0}'.format(project), int(uid), int(usertag))
                         for uid in users]
        value_list_ut = str(value_list_ut)[1:-1]

        ut_query = query_store[add_cohort_data.__query_name__] % {
            'cohort_meta_instance': conf.__cohort_meta_instance__,
            'cohort_db': conf.__cohort_db__,
            'value_list': value_list_ut
        }
        conn._cur_.execute(ut_query)
        try:
            conn._db_.commit()
        except (ProgrammingError, OperationalError):
            conn._db_.rollback()
    del conn
def generate_test_cohort(project,
                         max_size=10,
                         write=False,
                         user_interval_size=1,
                         rev_interval_size=7,
                         rev_lower_limit=0):
    """
        Build a test cohort (list of UIDs) for the given project.

        Parameters
        ~~~~~~~~~~

        project : str
           Wikipedia project e.g. 'enwiki'.

        max_size : uint
           Maximum number of users to include in the cohort.

        write: boolean
           Flag indicating whether to write the cohort to
           settings.__cohort_meta_db__ and settings.__cohort_db__.

        user_interval_size: uint
           Number of days within which to take registered users.

        rev_interval_size: uint
           Number of days within which to count revisions.

        rev_lower_limit: int
           Minimum number of revisions a user must have between
           registration and the end of the revision interval.

        Returns the list of UIDs from the corresponding project that
        defines the test cohort.
    """
    # Determine the time bounds that define the cohort acceptance criteria
    ts_start_o = datetime.now() + timedelta(days=-60)
    ts_end_user_o = ts_start_o + timedelta(days=int(user_interval_size))
    ts_end_revs_o = ts_start_o + timedelta(days=int(rev_interval_size))

    ts_start = format_mediawiki_timestamp(ts_start_o)
    ts_end_user = format_mediawiki_timestamp(ts_end_user_o)
    ts_end_revs = format_mediawiki_timestamp(ts_end_revs_o)

    # Synthesize query and execute
    logging.info(__name__ + ' :: Getting users from {0}.\n\n'
                            '\tUser interval: {1} - {2}\n'
                            '\tRevision interval: {1} - {3}\n'
                            '\tMax users = {4}\n'
                            '\tMin revs = {5}\n'.
                 format(project, ts_start, ts_end_user,
                        ts_end_revs, max_size, rev_lower_limit))

    query = sub_tokens(SELECT_PROJECT_IDS, db=escape_var(str(project)))
    # @TODO MOVE DB REFS INTO QUERY MODULE

    try:
        params = {
            'ts_start': str(ts_start),
            'ts_end_user': str(ts_end_user),
            'ts_end_revs': str(ts_end_revs),
            'max_size': int(max_size),
            'rev_lower_limit': int(rev_lower_limit),
        }
    except ValueError as e:
        raise Exception(__name__ + ' :: Bad params ' + str(e))

    conn = Connector(instance=settings.PROJECT_DB_MAP[project])
    conn._cur_.execute(query, params)

    users = [row for row in conn._cur_]
    del conn

    # get latest cohort id & cohort name
    utm_name = generate_test_cohort_name(project)

    # add new ids to usertags & usertags_meta
    if write:
        # BUG FIX: this message previously used indices {2}/{3} with only
        # three format arguments (max index 2), raising IndexError
        # whenever write=True.
        logging.info(__name__ + ' :: Inserting records...\n\n'
                                '\tCohort name - {0}\n'
                                '\t{1} - {2} record(s)\n'.
                     format(utm_name, settings.__cohort_db__, len(users)))
        query_mod.add_cohort_data(utm_name, users, project)

    return users
def add_cohort_data(cohort, users, project,
                    notes="", owner=1, group=3,
                    add_meta=True):
    """
        Adds a new cohort to backend.

            Parameters
            ~~~~~~~~~~

            cohort : string
                Name of cohort (must be unique).

            users : list
                List of user ids to add to cohort.

            project : string
                Project of cohort.

            notes : string
                Optional free-text note; defaults to a generated marker.

            owner : int
                Cohort owner id.

            group : int
                Cohort group id.

            add_meta : bool
                When True, also insert the ``usertags_meta`` row.

        Raises UMQueryCallError on bad parameter types or database
        errors (transactions are rolled back first).
    """
    conn = Connector(instance=conf.__cohort_data_instance__)
    now = format_mediawiki_timestamp(datetime.now())

    # TODO: ALLOW THE COHORT DEF TO BE REFRESHED IF IT ALREADY EXISTS
    if add_meta:
        logging.debug(__name__ + ' :: Adding new cohort "{0}".'.
            format(cohort))
        if not notes:
            notes = 'Generated by: ' + __name__

        # Create an entry in ``usertags_meta``
        utm_query = query_store[add_cohort_data.__query_name__ + '_meta']
        try:
            params = {
                'utm_name': str(cohort),
                'utm_project': str(project),
                'utm_notes': str(notes),
                'utm_group': int(group),
                'utm_owner': int(owner),
                'utm_touched': now,
                'utm_enabled': 0
            }
        except ValueError as e:
            raise UMQueryCallError(__name__ + ' :: ' + str(e))

        utm_query = sub_tokens(utm_query,
                               db=conf.__cohort_meta_instance__,
                               table=conf.__cohort_meta_db__)
        try:
            conn._cur_.execute(utm_query, params)
            conn._db_.commit()
        except (ProgrammingError, OperationalError) as e:
            conn._db_.rollback()
            raise UMQueryCallError(__name__ + ' :: ' + str(e))

    # add data to ``user_tags``
    if users:
        # get uid for cohort
        usertag = get_cohort_id(cohort)

        logging.debug(__name__ + ' :: Adding cohort {0} users.'.
            format(len(users)))
        try:
            value_list_ut = [('{0}'.format(project), int(uid), int(usertag))
                             for uid in users]
        except ValueError as e:
            raise UMQueryCallError(__name__ + ' :: ' + str(e))

        # BUG FIX: the original wrote `' %s,' * len(value_list_ut)[:-1]`,
        # which slices the *integer* returned by len() and raises
        # TypeError at runtime.  The repetition must be parenthesized
        # before stripping the trailing comma.
        placeholders = (' %s,' * len(value_list_ut))[:-1]
        ut_query = query_store[add_cohort_data.__query_name__] + \
            '(' + placeholders + ')'
        ut_query = sub_tokens(ut_query,
                              db=conf.__cohort_meta_instance__,
                              table=conf.__cohort_db__)
        try:
            # NOTE(review): each parameter is a 3-tuple matched to one %s;
            # this relies on the driver rendering tuples as parenthesized
            # value lists -- confirm against the DB driver in use.
            conn._cur_.execute(ut_query, value_list_ut)
            conn._db_.commit()
        except (ProgrammingError, OperationalError) as e:
            conn._db_.rollback()
            raise UMQueryCallError(__name__ + ' :: ' + str(e))
    del conn
def build_time_series(start, end, interval, metric, aggregator, cohort,
                      **kwargs):
    """
        Builds a timeseries dataset for a given metric.

            Parameters:
                start: str or datetime.
                    date + time indicating start of time series

                end : str or datetime.
                    date + time indicating end of time series

                interval : int.
                    integer value in hours that defines the amount of
                    time between data-points

                metric : class object.
                    Metrics class (derived from UserMetric)

                aggregator : method.
                    Aggregator method used to aggregate data for time
                    series data points

                cohort : list(str).
                    list of user IDs

        e.g.

        >>> cohort = ['156171','13234584']
        >>> metric = ba.BytesAdded
        >>> aggregator = agg.list_sum_indices

        >>> build_time_series('20120101000000', '20120112000000', 24,
                metric, aggregator, cohort,
                num_threads=4, num_threads_metric=2, log=True)
    """
    log = bool(kwargs['log']) if 'log' in kwargs else False

    # Get datetime types, and the number of threads
    start = date_parse(format_mediawiki_timestamp(start))
    end = date_parse(format_mediawiki_timestamp(end))
    # 'kt_' optionally overrides the worker-process count.
    k = kwargs['kt_'] if 'kt_' in kwargs else MAX_THREADS

    # Compute window size and ensure that all the conditions
    # necessary to generate a proper time series are met
    num_intervals = int((end - start).total_seconds() / (3600 * interval))
    # NOTE: Python 2 integer division -- intervals_per_thread is floored.
    intervals_per_thread = num_intervals / k

    # Compose the sets of time series lists.  f(t, i) maps a thread index
    # to the start of that thread's sub-window.
    f = lambda t, i: t + datetime.timedelta(
        hours=int(intervals_per_thread * interval * i))
    time_series = [_get_timeseries(f(start, i), f(start, i + 1), interval)
                   for i in xrange(k)]
    # Any remainder left by the floored division becomes one extra slice.
    if f(start, k) < end:
        time_series.append(_get_timeseries(f(start, k), end, interval))

    event_queue = Queue()
    process_queue = list()

    if log:
        logging.info(__name__ + ' :: Spawning procs\n'
                                '\t%s - %s, interval = %s\n'
                                '\tthreads = %s ... ' % (str(start),
                                                         str(end),
                                                         interval, k))
    # One worker process per time-series slice; results flow back through
    # event_queue.
    for i in xrange(len(time_series)):
        p = Process(target=time_series_worker,
                    args=(time_series[i], metric, aggregator,
                          cohort, event_queue, kwargs))
        p.start()
        process_queue.append(p)

    # Call the listener
    return time_series_listener(process_queue, event_queue)