Code Example #1
File: revert_rate.py Project: dartar/user_metrics
def _process_help(args):
    """ Used by Threshold::process() for forking.
        Should not be called externally. """

    state = args[1]
    thread_args = RevertRateArgsClass(state[0], state[1], state[2],
                                      state[3], state[4], state[6],
                                      state[7], state[8])
    users = args[0]

    if thread_args.log_progress:
        logging.info(__name__ +
                    ' :: Computing reverts on %s users (PID %s)'
                    % (len(users), str(os.getpid())))
    results_agg = list()
    dropped_users = 0

    umpd_obj = UMP_MAP[thread_args.group](users, thread_args)
    for user_data in umpd_obj:

        total_revisions = 0.0
        total_reverts = 0.0

        # Call query on revert rate for each user
        #
        # 1. Obtain user registration date
        # 2. Compute end date based on 't'
        # 3. Get user revisions in time period
        query_args = namedtuple('QueryArgs', 'date_start date_end')\
            (format_mediawiki_timestamp(user_data.start),
             format_mediawiki_timestamp(user_data.end))

        try:
            revisions = query_mod.\
                revert_rate_user_revs_query(user_data.user,
                                            thread_args.project,
                                            query_args)
        except query_mod.UMQueryCallError as e:
            logging.error(__name__ + ' :: Failed to '
                                     'get revisions: {0}'.format(e.message))
            dropped_users += 1
            continue

        results_thread = mpw.build_thread_pool(revisions, _revision_proc,
                                               thread_args.rev_threads, state)

        for r in results_thread:
            total_revisions += r[0]
            total_reverts += r[1]
        if not total_revisions:
            results_agg.append([user_data.user, 0.0, total_revisions])
        else:
            results_agg.append([user_data.user, total_reverts / total_revisions,
                                total_revisions])

    if thread_args.log_progress:
        logging.debug(__name__ + ' :: PID {0} complete. Dropped users = {1}'.
            format(str(os.getpid()), dropped_users))

    return results_agg
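
A quick sketch of the aggregation step above, with hypothetical per-thread counts: each entry of results_thread is a (revisions, reverts) pair, summed per user before the rate is taken, with a guard for users who have no revisions.

# Hypothetical worker output, mirroring the loop over results_thread.
results_thread = [(10.0, 1.0), (5.0, 2.0)]
total_revisions = sum(r[0] for r in results_thread)   # 15.0
total_reverts = sum(r[1] for r in results_thread)     # 3.0
# Zero-revision users get a rate of 0.0 rather than a ZeroDivisionError.
rate = total_reverts / total_revisions if total_revisions else 0.0  # 0.2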
Code Example #2
File: users.py Project: kl07/wikipedia_user_metrics
def get(users, metric):
    for user in users:
        yield USER_METRIC_PERIOD_DATA(
            user,
            format_mediawiki_timestamp(metric.datetime_start),
            format_mediawiki_timestamp(metric.datetime_end))
Code Example #3
File: users.py Project: sudeepdas/E3_analysis
def get(users, metric):
    for row in query_mod.user_registration_date(users, metric.project,
                                                None):
        reg = date_parse(row[1])
        end = reg + timedelta(hours=int(metric.t))
        yield USER_METRIC_PERIOD_DATA(row[0],
                                      format_mediawiki_timestamp(reg),
                                      format_mediawiki_timestamp(end))
Code Example #4
File: edit_rate.py Project: dartar/user_metrics
    def process(self, user_handle, **kwargs):
        """
            Determine the edit rate of user(s). The parameter *user_handle*
            may be a string, an integer, or a list of either. An integer
            *user_handle* is interpreted as a user id and a string as a
            user_name. If a list of users is passed to the *process* method,
            a dict of edit rates keyed by user handle is returned.

            - Parameters:
                - **user_handle** - String or Integer (optionally lists).
                    Value or list of values representing user handle(s).

            - Return:
                - Dictionary. key(string): user handle, value(Float):
                edit counts
        """

        # Extract edit count for given parameters
        edit_rate = list()
        ec_kwargs = deepcopy(self.__dict__)
        e = ec.EditCount(**ec_kwargs).process(user_handle, **kwargs)

        # Compute time difference between datetime objects and get the
        # integer number of seconds

        if self.group == umpt.REGISTRATION:
            time_diff_sec = self.t * 3600.0
        elif self.group == umpt.INPUT:
            try:
                start_ts_obj = date_parse(
                    format_mediawiki_timestamp(self.datetime_start))
                end_ts_obj = date_parse(
                    format_mediawiki_timestamp(self.datetime_end))
            except (AttributeError, ValueError):
                raise um.UserMetricError()

            time_diff_sec = (end_ts_obj - start_ts_obj).total_seconds()
        else:
            raise um.UserMetricError('group parameter not specified.')

        # Normalize the time interval based on the measure
        if self.time_unit == self.TIME_UNIT_TYPE.DAY:
            time_diff = time_diff_sec / (24 * 60 * 60)
        elif self.time_unit == self.TIME_UNIT_TYPE.HOUR:
            time_diff = time_diff_sec / (60 * 60)
        else:
            time_diff = time_diff_sec

        # Build the list of edit rate metrics
        for i in e:
            new_i = i[:]  # Make a copy of the edit count element
            new_i.append(new_i[1] / (time_diff * self.time_unit_count))
            new_i.append(time_diff)
            edit_rate.append(new_i)
        self._results = edit_rate
        return self
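
A worked instance of the normalization above, using hypothetical numbers: 90 edits over a 30-day INPUT window, reported per day with a time_unit_count of 1.

time_diff_sec = 30 * 24 * 60 * 60            # 30-day window in seconds
time_diff = time_diff_sec / (24 * 60 * 60)   # TIME_UNIT_TYPE.DAY -> 30 days
edit_rate = 90 / float(time_diff * 1)        # 3.0 edits per day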
Code Example #5
File: users.py Project: dartar/user_metrics
def get(users, metric):
    # For each registration date build a time interval
    rows = get_registration_dates(users, metric.project)
    for row in rows:
        reg = date_parse(row[1])
        end = reg + timedelta(hours=int(metric.t))
        yield USER_METRIC_PERIOD_DATA(row[0],
                                      format_mediawiki_timestamp(reg),
                                      format_mediawiki_timestamp(end))
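
A sketch of the interval construction above, assuming date_parse is dateutil's parser and registration timestamps arrive as 14-digit MediaWiki strings: a user registered 2012-06-01 with metric.t = 24 gets a one-day measurement window.

from datetime import timedelta
from dateutil.parser import parse as date_parse

reg = date_parse('20120601000000')   # registration: 2012-06-01 00:00:00
end = reg + timedelta(hours=24)      # window end:   2012-06-02 00:00:00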
Code Example #6
def _get_timeseries(date_start, date_end, interval):
    """
        Generates a series of timestamps given a start date,
        end date, and interval
    """

    # Ensure the dates are string representations
    date_start = format_mediawiki_timestamp(date_start)
    date_end = format_mediawiki_timestamp(date_end)

    c = date_parse(date_start) + datetime.timedelta(hours=-int(interval))
    e = date_parse(date_end)
    while c < e:
        c += datetime.timedelta(hours=int(interval))
        yield c
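
A usage sketch, assuming format_mediawiki_timestamp accepts 'YYYYMMDDHHMMSS' strings. The cursor starts one interval before date_start so the first value yielded is date_start itself, and date_end is included when it lands on an interval boundary.

# Yields 2012-01-01, 2012-01-02, 2012-01-03 at daily (24-hour) spacing.
for ts in _get_timeseries('20120101000000', '20120103000000', 24):
    print(ts)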
Code Example #7
File: users.py Project: sudeepdas/E3_analysis
    def get_users(self, date_start, date_end, project='enwiki'):
        """
            Returns a Generator for MediaWiki user IDs.
        """
        param_dict = {
            'date_start': format_mediawiki_timestamp(date_start),
            'date_end': format_mediawiki_timestamp(date_end),
            'project': project,
        }
        conn = Connector(instance=settings.PROJECT_DB_MAP[project])
        sql = self.QUERY_TYPES[self._query_type] % param_dict
        conn._cur_.execute(sql)

        for row in conn._cur_:
            yield row[0]
Code Example #8
File: users.py Project: kl07/wikipedia_user_metrics
def generate_test_cohort_name(project):
    """
        Generates a name for a test cohort to be inserted into usertags[_meta]
    """
    return 'testcohort_{0}_{1}'.\
        format(project,
               format_mediawiki_timestamp(datetime.now()))
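
For instance, assuming format_mediawiki_timestamp renders 14-digit 'YYYYMMDDHHMMSS' strings, a call made at 2013-02-15 10:30:00 would yield:

generate_test_cohort_name('enwiki')   # -> 'testcohort_enwiki_20130215103000'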
Code Example #9
File: users.py Project: kl07/wikipedia_user_metrics
    def get_users(self, date_start, date_end, project='enwiki'):
        """
            Returns a Generator for MediaWiki user IDs.
        """

        # @TODO MOVE DB REFS INTO QUERY MODULE

        params = {
            'date_start': format_mediawiki_timestamp(date_start),
            'date_end': format_mediawiki_timestamp(date_end),
        }
        conn = Connector(instance=settings.PROJECT_DB_MAP[project])
        query = sub_tokens(self.QUERY_TYPES[self._query_type],
            db=escape_var(project))
        conn._cur_.execute(query, params)

        for row in conn._cur_:
            yield row[0]
Code Example #10
def format_request_params(request_meta):
    """
        Formats request data and ensures that it is clean using Flask escape
        functionality.

            Parameters
            ~~~~~~~~~~

            request_meta : recordtype:
                Stores the request data.
    """

    # Handle any datetime fields passed - raise an exception if the
    # formatting is incorrect
    if request_meta.start:
        try:
            request_meta.start = format_mediawiki_timestamp(
                escape(request_meta.start))
        except ValueError:
            # Pass the value of the error code in `error_codes`
            raise MetricsAPIError(error_code=1)

    if request_meta.end:
        try:
            request_meta.end = format_mediawiki_timestamp(
                escape(request_meta.end))
        except ValueError:
            # Pass the value of the error code in `error_codes`
            raise MetricsAPIError(error_code=1)

    if not request_meta.project:
        request_meta.project = DEFAULT_PROJECT

    if request_meta.group not in REQUEST_VALUE_MAPPING:
        request_meta.group = DEFAULT_GROUP

    # set the aggregator if there is one
    agg_key = get_agg_key(request_meta.aggregator, request_meta.metric)
    request_meta.aggregator = escape(request_meta.aggregator)\
        if agg_key else None
    # @TODO Escape remaining input

    # MAP request values.
    _map_request_values(request_meta)
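
A minimal sketch of the validation pattern above; the helper name is hypothetical, but it reuses escape, format_mediawiki_timestamp, and MetricsAPIError from this module's context. The point is that a malformed timestamp surfaces as API error code 1 instead of a raw ValueError.

def _format_timestamp_field(value):
    # Hypothetical helper: normalize one datetime field, mapping
    # formatting failures to the API's error code 1.
    try:
        return format_mediawiki_timestamp(escape(value))
    except ValueError:
        raise MetricsAPIError(error_code=1)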
Code Example #11
File: users.py Project: kl07/wikipedia_user_metrics
    def get(users, metric):
        rows = get_registration_dates(users, metric.project)
        for row in rows:

            user = row[0]
            reg = date_parse(row[1])

            start = format_mediawiki_timestamp(metric.datetime_start)
            end = format_mediawiki_timestamp(metric.datetime_end)

            # Only users who registered within the metric period qualify
            if date_parse(start) <= reg <= date_parse(end):
                reg_plus_t = reg + timedelta(hours=int(metric.t))
                yield USER_METRIC_PERIOD_DATA(
                    user,
                    format_mediawiki_timestamp(reg),
                    format_mediawiki_timestamp(reg_plus_t))
Code Example #12
def create_cohort(cohort, project,
                    notes="", owner=1, group=3):
    conn = Connector(instance=conf.__cohort_data_instance__)
    now = format_mediawiki_timestamp(datetime.now())

    # TODO: ALLOW THE COHORT DEF TO BE REFRESHED IF IT ALREADY EXISTS

    logging.debug(__name__ + ' :: Adding new cohort "{0}".'.
                  format(cohort))
    if not notes:
        notes = 'Generated by: ' + __name__

    # Create an entry in ``usertags_meta``
    utm_query = query_store[create_cohort.__query_name__]

    try:
        params = {
            'utm_name': str(cohort),
            'utm_project': str(project),
            'utm_notes': str(notes),
            'utm_group': int(group),
            'utm_owner': int(owner),
            'utm_touched': now,
            'utm_enabled': 0
        }
    except ValueError as e:
        raise UMQueryCallError(__name__ + ' :: ' + str(e))

    utm_query = sub_tokens(utm_query, db=conf.__cohort_meta_instance__,
                           table=conf.__cohort_meta_db__)
    try:
        conn._cur_.execute(utm_query, params)
        conn._db_.commit()
    except (ProgrammingError, OperationalError) as e:
        conn._db_.rollback()
        raise UMQueryCallError(__name__ + ' :: ' + str(e))
    del conn
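
The try/except around the params dict above exists because the int() and str() coercions can fail before any SQL runs; a small illustration with a hypothetical bad value:

try:
    utm_group = int('not-a-number')   # raises ValueError at build time
except ValueError as e:
    print('rejected cohort group: %s' % e)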
Code Example #13
def add_cohort_data(cohort, users, project,
                    notes="", owner=1, group=3,
                    add_meta=True):
    """
        Adds a new cohort to backend.

        Parameters
        ~~~~~~~~~~

            cohort : string
                Name of cohort (must be unique).

            users : list
                List of user ids to add to cohort.

            project : string
                Project of cohort.
    """
    conn = Connector(instance=conf.__cohort_data_instance__)
    now = format_mediawiki_timestamp(datetime.now())

    # TODO: ALLOW THE COHORT DEF TO BE REFRESHED IF IT ALREADY EXISTS

    if add_meta:
        logging.debug(__name__ + ' :: Adding new cohort "{0}".'.
            format(cohort))
        if not notes:
            notes = 'Generated by: ' + __name__

        # Create an entry in ``usertags_meta``
        utm_query = query_store[add_cohort_data.__query_name__ + '_meta'] % {
            'cohort_meta_instance': conf.__cohort_meta_instance__,
            'cohort_meta_db': conf.__cohort_meta_db__,
            'utm_name': escape_var(cohort),
            'utm_project': escape_var(project),
            'utm_notes': notes,
            'utm_group': escape_var(str(group)),
            'utm_owner': escape_var(str(owner)),
            'utm_touched': now,
            'utm_enabled': '0'
        }
        conn._cur_.execute(utm_query)
        try:
            conn._db_.commit()
        except (ProgrammingError, OperationalError):
            conn._db_.rollback()

    # add data to ``user_tags``
    if users:

        # get uid for cohort
        usertag = get_cohort_id(cohort)

        logging.debug(__name__ + ' :: Adding cohort {0} users.'.
            format(len(users)))
        value_list_ut = [('{0}'.format(project),
                          int(uid),
                          int(usertag))
                         for uid in users]
        value_list_ut = str(value_list_ut)[1:-1]

        ut_query = query_store[add_cohort_data.__query_name__] % {
            'cohort_meta_instance': conf.__cohort_meta_instance__,
            'cohort_db': conf.__cohort_db__,
            'value_list': value_list_ut
        }
        conn._cur_.execute(ut_query)
        try:
            conn._db_.commit()
        except (ProgrammingError, OperationalError):
            conn._db_.rollback()

    del conn
Code Example #14
File: users.py Project: kl07/wikipedia_user_metrics
def generate_test_cohort(project,
                         max_size=10,
                         write=False,
                         user_interval_size=1,
                         rev_interval_size=7,
                         rev_lower_limit=0):
    """
        Build a test cohort (list of UIDs) for the given project.

        Parameters
        ~~~~~~~~~~

        project : str
           Wikipedia project e.g. 'enwiki'.

        max_size : uint
            Maximum number of users to include in the cohort.

        write : boolean
            Flag indicating whether to write the cohort to
            settings.__cohort_meta_db__ and settings.__cohort_db__.

        user_interval_size : uint
            Number of days within which to take registered users.

        rev_interval_size : uint
            Number of days within which to count revisions.

        rev_lower_limit : int
            Minimum number of revisions a user must have between registration
            and the end of the revision interval.

        Returns the list of UIDs from the corresponding project that defines
        the test cohort.
    """

    # Determine the time bounds that define the cohort acceptance criteria

    ts_start_o = datetime.now() + timedelta(days=-60)
    ts_end_user_o = ts_start_o + timedelta(days=int(user_interval_size))
    ts_end_revs_o = ts_start_o + timedelta(days=int(rev_interval_size))

    ts_start = format_mediawiki_timestamp(ts_start_o)
    ts_end_user = format_mediawiki_timestamp(ts_end_user_o)
    ts_end_revs = format_mediawiki_timestamp(ts_end_revs_o)

    # Synthesize query and execute
    logging.info(__name__ + ' :: Getting users from {0}.\n\n'
                            '\tUser interval: {1} - {2}\n'
                            '\tRevision interval: {1} - {3}\n'
                            '\tMax users = {4}\n'
                            '\tMin revs = {5}\n'.
                            format(project,
                                   ts_start,
                                   ts_end_user,
                                   ts_end_revs,
                                   max_size,
                                   rev_lower_limit
                                   )
                 )
    query = sub_tokens(SELECT_PROJECT_IDS, db=escape_var(str(project)))

    # @TODO MOVE DB REFS INTO QUERY MODULE

    try:
        params = {
            'ts_start': str(ts_start),
            'ts_end_user': str(ts_end_user),
            'ts_end_revs': str(ts_end_revs),
            'max_size': int(max_size),
            'rev_lower_limit': int(rev_lower_limit),
        }
    except ValueError as e:
        raise Exception(__name__ + ' :: Bad params ' + str(e))

    conn = Connector(instance=settings.PROJECT_DB_MAP[project])
    conn._cur_.execute(query, params)

    users = [row for row in conn._cur_]
    del conn

    # get latest cohort id & cohort name
    utm_name = generate_test_cohort_name(project)

    # add new ids to usertags & usertags_meta
    if write:
        logging.info(__name__ + ' :: Inserting records...\n\n'
                                '\tCohort name - {0}\n'
                                '\t{1} - {2} record(s)\n'.
                                format(utm_name,
                                       settings.__cohort_db__,
                                       len(users)))
        query_mod.add_cohort_data(utm_name, users, project)

    return users
Code Example #15
def add_cohort_data(cohort, users, project,
                    notes="", owner=1, group=3,
                    add_meta=True):
    """
        Adds a new cohort to backend.

        Parameters
        ~~~~~~~~~~

            cohort : string
                Name of cohort (must be unique).

            users : list
                List of user ids to add to cohort.

            project : string
                Project of cohort.
    """
    conn = Connector(instance=conf.__cohort_data_instance__)
    now = format_mediawiki_timestamp(datetime.now())

    # TODO: ALLOW THE COHORT DEF TO BE REFRESHED IF IT ALREADY EXISTS

    if add_meta:
        logging.debug(__name__ + ' :: Adding new cohort "{0}".'.
                      format(cohort))
        if not notes:
            notes = 'Generated by: ' + __name__

        # Create an entry in ``usertags_meta``
        utm_query = query_store[add_cohort_data.__query_name__ + '_meta']

        try:
            params = {
                'utm_name': str(cohort),
                'utm_project': str(project),
                'utm_notes': str(notes),
                'utm_group': int(group),
                'utm_owner': int(owner),
                'utm_touched': now,
                'utm_enabled': 0
            }
        except ValueError as e:
            raise UMQueryCallError(__name__ + ' :: ' + str(e))

        utm_query = sub_tokens(utm_query, db=conf.__cohort_meta_instance__,
                               table=conf.__cohort_meta_db__)
        try:
            conn._cur_.execute(utm_query, params)
            conn._db_.commit()
        except (ProgrammingError, OperationalError) as e:
            conn._db_.rollback()
            raise UMQueryCallError(__name__ + ' :: ' + str(e))

    # add data to ``user_tags``
    if users:
        # get uid for cohort
        usertag = get_cohort_id(cohort)

        logging.debug(__name__ + ' :: Adding cohort {0} users.'.
                      format(len(users)))

        try:
            value_list_ut = [('{0}'.format(project),
                              int(uid),
                              int(usertag))
                             for uid in users]
        except ValueError as e:
            raise UMQueryCallError(__name__ + ' :: ' + str(e))

        # Build one '%s' placeholder per row; parenthesize the repetition
        # before slicing off the trailing comma.
        ut_query = query_store[add_cohort_data.__query_name__] + '(' + \
                   (' %s,' * len(value_list_ut))[:-1] + ')'
        ut_query = sub_tokens(ut_query, db=conf.__cohort_meta_instance__,
                              table=conf.__cohort_db__)
        try:
            conn._cur_.execute(ut_query, value_list_ut)
            conn._db_.commit()
        except (ProgrammingError, OperationalError) as e:
            conn._db_.rollback()
            raise UMQueryCallError(__name__ + ' :: ' + str(e))
    del conn
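
Note the parenthesization in the placeholder expression above: the ' %s,' repetition must happen before the trailing comma is sliced off. A small check with hypothetical rows:

rows = [('enwiki', 1, 7), ('enwiki', 2, 7)]
placeholders = (' %s,' * len(rows))[:-1]   # ' %s, %s'
# Without the parentheses, len(rows)[:-1] raises TypeError:
# an int is not subscriptable.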
Code Example #16
def build_time_series(start, end, interval, metric, aggregator, cohort,
                      **kwargs):
    """
        Builds a timeseries dataset for a given metric.

        Parameters:

            start: str or datetime.
                date + time indicating start of time series

            end : str or datetime.
                date + time indicating end of time series

            interval : int.
                integer value in hours that defines the amount of
                time between data-points

            metric : class object.
                Metrics class (derived from UserMetric)

            aggregator : method.
                Aggregator method used to aggregate data for time
                series data points

            cohort : list(str).
                list of user IDs

        e.g.

        >>> cohort = ['156171','13234584']
        >>> metric = ba.BytesAdded
        >>> aggregator = agg.list_sum_indices

        >>> build_time_series('20120101000000', '20120112000000', 24, metric,
                aggregator, cohort,
            num_threads=4, num_threads_metric=2, log=True)

    """

    log = bool(kwargs['log']) if 'log' in kwargs else False

    # Get datetime types, and the number of threads
    start = date_parse(format_mediawiki_timestamp(start))
    end = date_parse(format_mediawiki_timestamp(end))
    k = kwargs['kt_'] if 'kt_' in kwargs else MAX_THREADS

    # Compute window size and ensure that all the conditions
    # necessary to generate a proper time series are met
    num_intervals = int((end - start).total_seconds() / (3600 * interval))
    intervals_per_thread = num_intervals / k

    # Compose the sets of time series lists
    f = lambda t, i:  t + datetime.timedelta(
        hours=int(intervals_per_thread * interval * i))
    time_series = [_get_timeseries(f(start, i),
                   f(start, i+1), interval) for i in xrange(k)]
    if f(start, k) < end:
        time_series.append(_get_timeseries(f(start, k), end, interval))

    event_queue = Queue()
    process_queue = list()

    if log:
        logging.info(__name__ + ' :: Spawning procs\n'
                                '\t%s - %s, interval = %s\n'
                                '\tthreads = %s ... ' % (str(start), str(end),
                                                       interval, k))
    for i in xrange(len(time_series)):
        p = Process(target=time_series_worker,
                    args=(time_series[i], metric, aggregator,
                          cohort, event_queue, kwargs))
        p.start()
        process_queue.append(p)

    # Call the listener
    return time_series_listener(process_queue, event_queue)
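
A sketch of how the window function f above splits the series across workers, assuming interval = 24 and intervals_per_thread = 3: worker i begins its sub-series 72 hours after worker i - 1.

import datetime
from dateutil.parser import parse as date_parse

start = date_parse('20120101000000')
f = lambda t, i: t + datetime.timedelta(hours=int(3 * 24 * i))
print(f(start, 0))   # 2012-01-01 00:00:00 -- worker 0's window start
print(f(start, 1))   # 2012-01-04 00:00:00 -- worker 1's window start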