Exemple #1
0
    def get_users(self, date_start, date_end, project='enwiki'):
        """
            Returns a Generator for MediaWiki user IDs.
        """

        # @TODO MOVE DB REFS INTO QUERY MODULE

        params = {
            'date_start': format_mediawiki_timestamp(date_start),
            'date_end': format_mediawiki_timestamp(date_end),
        }
        conn = Connector(instance=settings.PROJECT_DB_MAP[project])
        query = sub_tokens(self.QUERY_TYPES[self._query_type],
            db=escape_var(project))
        conn._cur_.execute(query, params)

        for row in conn._cur_:
            yield row[0]
Exemple #2
0
def generate_test_cohort(project,
                         max_size=10,
                         write=False,
                         user_interval_size=1,
                         rev_interval_size=7,
                         rev_lower_limit=0):
    """
        Build a test cohort (list of UIDs) for the given project.

        Parameters
        ~~~~~~~~~~

        project : str
           Wikipedia project e.g. 'enwiki'.

        size : uint
           Number of users to include in the cohort.

        write: boolean
           Flag indicating whether to write the cohort to
           settings.__cohort_meta_db__ and settings.__cohort_db__.

        user_interval_size: uint
            Number of days within which to take registered users

        rev_lower_limit: int
            Minimum number of revisions a user must have between registration
            and the

        Returns the list of UIDs from the corresponding project that defines
        the test cohort.
    """

    # Determine the time bounds that define the cohort acceptance criteria

    ts_start_o = datetime.now() + timedelta(days=-60)
    ts_end_user_o = ts_start_o + timedelta(days=int(user_interval_size))
    ts_end_revs_o = ts_start_o + timedelta(days=int(rev_interval_size))

    ts_start = format_mediawiki_timestamp(ts_start_o)
    ts_end_user = format_mediawiki_timestamp(ts_end_user_o)
    ts_end_revs = format_mediawiki_timestamp(ts_end_revs_o)

    # Synthesize query and execute
    logging.info(__name__ + ' :: Getting users from {0}.\n\n'
                            '\tUser interval: {1} - {2}\n'
                            '\tRevision interval: {1} - {3}\n'
                            '\tMax users = {4}\n'
                            '\tMin revs = {5}\n'.
                            format(project,
                                   ts_start,
                                   ts_end_user,
                                   ts_end_revs,
                                   max_size,
                                   rev_lower_limit
                                   )
                 )
    query = sub_tokens(SELECT_PROJECT_IDS, db=escape_var(str(project)))

    # @TODO MOVE DB REFS INTO QUERY MODULE

    try:
        params = {
            'ts_start': str(ts_start),
            'ts_end_user': str(ts_end_user),
            'ts_end_revs': str(ts_end_revs),
            'max_size': int(max_size),
            'rev_lower_limit': int(rev_lower_limit),
        }
    except ValueError as e:
        raise Exception(__name__ + ' :: Bad params ' + str(e))

    conn = Connector(instance=settings.PROJECT_DB_MAP[project])
    conn._cur_.execute(query, params)

    users = [row for row in conn._cur_]
    del conn

    # get latest cohort id & cohort name
    utm_name = generate_test_cohort_name(project)

    # add new ids to usertags & usertags_meta
    if write:
        logging.info(__name__ + ' :: Inserting records...\n\n'
                                '\tCohort name - {0}\n'
                                '\t{2} - {3} record(s)\n'.
                                format(utm_name,
                                       settings.__cohort_db__,
                                       len(users)))
        query_mod.add_cohort_data(utm_name, users, project)

    return users