Beispiel #1
0
    def __init__(self):
        '''
        Set up a RecentChangesDaemon: create its database object and
        clear its state flags.
        '''

        # Database object used by the daemon.
        self.db = db.SuggestBotDatabase()

        # Both state flags start out cleared:
        #
        # shutdown:     set by our signal handler if we receive SIGUSR1;
        #               the daemon then shuts down cleanly on the next
        #               iteration of its infinite loop.
        # dailyRunning: defines whether we're running while SuggestBot is
        #               processing subscribers (in any language).
        self.shutdown = self.dailyRunning = False
Beispiel #2
0
    def __init__(self, lang='en',
                 templates=None,
                 ignoreList=None, verbose=False):
        """
        Initialise an object that will handle one-time requests that are added to
        a user-page.

        @param lang: What language Wikipedia we are working on
        @type lang: unicode

        @param templates: Dictionary where each key is the title of a main template
                          that we'll look for, and its corresponding value is a list
                          of synonyms (through redirects).  Defaults to
                          {"User:SuggestBot/suggest": []} when None.
        @type templates: dict (unicode to list of unicode)

        @param ignoreList: List of page titles we'll ignore when looking for
                           references.  Defaults to an empty list when None.
        @type ignoreList: list (of unicode strings)

        @param verbose: Write informational output?
        @type verbose: bool
        """

        # The defaults are built per call rather than in the signature:
        # a mutable default argument is created once and then shared (and
        # possibly mutated) across every instance that relies on it.
        if templates is None:
            templates = {"User:SuggestBot/suggest": []}
        if ignoreList is None:
            ignoreList = []

        self.lang = lang
        self.templates = templates
        self.db = db.SuggestBotDatabase()
        self.site = pywikibot.Site(self.lang)

        # For each defined template, create the set of lower-cased titles
        # (the template itself plus its synonyms) we'll be looking for.
        self.template_pages = {
            template: {title.lower() for title in [template, *synonyms]}
            for (template, synonyms) in self.templates.items()
        }

        self.ignoreList = ignoreList
        self.verbose = verbose

        # Compile the regular expressions used to match section headings
        # in the language this script handles.
        self.reqHeaderReList = [re.compile(regex)
                                for regex in config.request_head_re[self.lang]]

        self.shutdown = False # time to shut down?
Beispiel #3
0
    def __init__(self, lang, task_def=None):
        '''
        Set up an extractor for one language edition.

        :param lang: Language code of the Wikipedia we're updating data for.
                     A false value selects the configured default language.
        :type lang: str

        :param task_def: Dictionary of tasks and their associated categories
                         to fetch articles from and traverse, as well as
                         regular expressions for inclusion and exclusion.
                         A false value selects the configured tasks for
                         the chosen language.
        :type task_def: dict
        '''

        # The configured language code is the fallback when none is given.
        default_lang = config.wp_langcode
        self.lang = lang if lang else default_lang

        # Nothing has been seen yet.
        self.seen_categories, self.seen_titles = set(), set()

        self.task_def = task_def if task_def else config.tasks[self.lang]

        self.site = pywikibot.Site(self.lang)

        # Patterns used for proper quoting of single quotes in SQL queries,
        # and for escaping '\' (because there _is_ an article named
        # 'Control-\').
        self.quote_re = re.compile(r"[']")
        self.backslash_re = re.compile(r"\\")

        # Database object; connection and cursor are not set up here.
        self.db = db.SuggestBotDatabase()
        self.db_conn = self.db_cursor = None

        self.queue_size = 1000
Beispiel #4
0
    def post_suggestions(self):
        """
        Post suggestions to every subscriber who is due to receive them.

        Finds all subscribers in the SuggestBot database for the current
        language version of Wikipedia, checks which of them are due, and
        posts suggestions to their user talk page (or userspace subpage if
        that is set).

        The `daily_running` flag in the status table guards against
        concurrent runs.  It is set for the duration of the run and is
        always cleared again on the way out, also when the run is aborted
        or fails with an exception; otherwise a single failed run would
        block all later runs.  The database connection is likewise always
        closed before returning.

        :returns: False if the database is unavailable, if there is no
                  status row for this language, or if the time since the
                  last run is negative; True if another run is already in
                  progress; None after a completed run.
        """

        # today is?
        # Note: We use UTC as the basis for our calculations, because
        # the Wikipedia API also returns timestamps as UTC, thus allowing
        # us to correctly post suggestions to new subscribers who saw
        # SuggestBot post to their user talk page earlier.
        now = datetime.utcnow()

        # Query to set (or reset) the busy bit in the status info table
        updateStatusTableQuery = r"""UPDATE {status}
                                      SET daily_running=%(status)s
                                      WHERE lang=%(lang)s""".format(status=config.status_table)

        # Query to check the busy bit in the status info table, so that
        # multiple updates don't run at the same time (otherwise we'll get
        # double-posts (how do we know that?  we tested it!))
        checkStatusTableQuery = r"""SELECT daily_running FROM {status}
                                     WHERE lang=%(lang)s""".format(status=config.status_table)

        # instantiate the database object, and connect
        myDb = db.SuggestBotDatabase()
        # if connection fails, fail too.
        if not myDb.connect():
            logging.error('unable to connect to the SuggestBot database')
            return(False)

        try:
            (dbconn, dbcursor) = myDb.getConnection()

            # Check if a job is already running
            dbcursor.execute(checkStatusTableQuery, {'lang': self._lang})
            row = dbcursor.fetchone()
            dbcursor.fetchall() # flush cursor

            # Without a status row there is no busy bit to check or set,
            # so we cannot guard against double-posting.
            if row is None:
                logging.error("no status row found for {0}wiki, unable to check for running jobs".format(self._lang))
                return(False)

            if ord(row['daily_running']):
                logging.warning("SuggestBot is already posting to users on {0}wiki, exiting!".format(self._lang))
                return(True)

            ## Instantiating bot so we can get suggestions
            sbot = suggestbot.SuggestBot(lang=self._lang)

            # Update the status of busyness to pretty busy...
            dbcursor.execute(updateStatusTableQuery, {'status': 1,
                                                      'lang': self._lang})
            dbconn.commit()

            try:
                lookaheadTime = self._get_lookahead_time(dbcursor, now)
                if lookaheadTime is None:
                    logging.error("Time since last set of recs posted is negative, aborting!")
                    return(False)

                logging.info("looking {0} seconds ahead for due recs.".format(lookaheadTime))

                userQueue = self._get_due_subscribers(dbcursor, now,
                                                      lookaheadTime)
                logging.info("Checked subscribers, found {n} users to post to.".format(
                    n=len(userQueue)))

                # (We shuffle the user list so it doesn't necessarily get
                # processed in alphabetical order, IIRC the results of the
                # SELECT is in sorted order because we use a primary key)
                if userQueue:
                    shuffle(userQueue)

                self._post_to_subscribers(sbot, dbconn, dbcursor, userQueue)
            finally:
                # Update the status of busyness to pretty unbusy, no matter
                # how we leave the run.
                dbcursor.execute(updateStatusTableQuery, {'status': 0,
                                                          'lang': self._lang})
                dbconn.commit()
        finally:
            # disconnect from database
            myDb.disconnect()

        # ok, done
        return

    def _get_lookahead_time(self, dbcursor, now):
        """
        Calculate how many seconds into the future we look for due users.

        :param dbcursor: Cursor of an open SuggestBot database connection
        :param now: Start time of this run (UTC)
        :type now: datetime.datetime

        :returns: Number of seconds to look ahead, or None if the time
                  since the last recommendation was posted is negative.
        """

        # Query to get the time of the last suggestion posted
        getLastRecQuery = r"""SELECT MAX(last_rec) AS last_rec
                               FROM {}
                               WHERE lang=%(lang)s
                               AND active=1""".format(config.regulars_table)

        # Figure out how long since we last ran.
        dbcursor.execute(getLastRecQuery, {'lang': self._lang})
        row = dbcursor.fetchone()
        dbcursor.fetchall() # flush cursor

        # Check that we got a row and that it's something...
        if row and row['last_rec']:
            timeSinceLastRun = now - row['last_rec']
            # If tSLR.days < 0, something's not right:
            if timeSinceLastRun.days < 0:
                return(None)
        else:
            # We might see this branch the first time we're running...
            timeSinceLastRun = timedelta(0)

        # If it's more than one day since we last ran, we don't look
        # into the future, instead we'll just catch up.  Otherwise,
        # we look half the distance into the future.
        # FIXME: this will bump people if one run runs a little long,
        # and the user is at the end of the next run.  We should instead
        # store the start and end-time of the last run somewhere, perhaps
        # actually have a log, and then use the last start-time from the log.
        if timeSinceLastRun.days == 0:
            return(timeSinceLastRun.seconds / 2)
        return(0)

    def _get_due_subscribers(self, dbcursor, now, lookaheadTime):
        """
        Build the list of subscribers who should get suggestions now.

        :param dbcursor: Cursor of an open SuggestBot database connection
        :param now: Start time of this run (UTC)
        :type now: datetime.datetime
        :param lookaheadTime: Seconds past `now` within which a user who
                              is not yet due is posted to anyway.

        :returns: List of dicts with the keys 'username', 'page',
                  'replace', and 'template', one per due subscriber.
        """

        # Query to get all regular users of the current language versions
        getRegularsQuery = r"""SELECT *
                                FROM {}
                                WHERE lang=%(lang)s
                                AND active=1
                                AND retired=0""".format(config.regulars_table)

        # Store users who should get recs in this list:
        userQueue = []

        dbcursor.execute(getRegularsQuery, {'lang': self._lang})
        # fetchone() returns None when there are no more rows
        for row in iter(dbcursor.fetchone, None):
            # The values of the row we currently use:
            lastRec = row['last_rec']
            period = row['period']
            username = row['username'].decode('utf-8')
            pagetitle = row['page_title']
            if pagetitle:
                pagetitle = pagetitle.decode('utf-8')
            design = row['design']

            recTemplate = config.templates[self._lang]['regulars']
            # If the user has chosen to use a different design from the
            # default, check if we have a template and possibly use that.
            if design:
                try:
                    recTemplate = config.templates[self._lang][design]
                except KeyError:
                    pass

            # If the user wants recs replaced, do so.
            replace = bool(ord(row['replace_recs']))

            # FIXME: better to use the Subscriber object now, since it is
            # here and has slots for all the variables. Makes more sense.

            # if lastRec is None (NULL), they didn't receive any recs
            # earlier, which means it's definitely time to post.
            if lastRec:
                # Use last rec and period to check if it's time to post.
                # A period of 0 means 28 days.  This is strictly not always
                # "once a month", but it's a lot easier than trying to
                # handle overflow when the last recommendation occurred
                # near the end of the previous month (e.g. Jan to Feb).
                # It also has the added feature that recommendations
                # usually happen on the same day of the week.
                days = 28 if period == 0 else period
                timelapse = now - (lastRec + timedelta(days=days))

                # It's time to post recommendations if we're past this
                # user's due date, or if it's less than lookaheadTime
                # seconds ahead.  This makes sure that we don't always bump
                # users to the next day's recommendations, which would
                # otherwise mean we'd consistently post a day late.
                due = timelapse.days >= 0 \
                    or (timelapse.days == -1
                        and (86400 - timelapse.seconds) < lookaheadTime)
                if not due:
                    continue

            userQueue.append({'username': username,
                              'page': pagetitle,
                              'replace': replace,
                              'template': recTemplate,
                              })

        return(userQueue)

    def _post_to_subscribers(self, sbot, dbconn, dbcursor, userQueue):
        """
        Post recommendations to each queued user and record it in the database.

        :param sbot: The bot used to generate and post recommendations
        :param dbconn: Open SuggestBot database connection
        :param dbcursor: Cursor of that connection
        :param userQueue: Users to post to, as built by
                          `_get_due_subscribers`
        :type userQueue: list of dict
        """

        # Query to update a specific user's status (to processing|idle|ready)
        setStatusQuery = r"""UPDATE {} SET status=%(status)s
                              WHERE lang=%(lang)s
                              AND username=%(username)s""".format(config.regulars_table)

        # Query to update a specific user's last recommendation time
        setLastrecQuery = r"""UPDATE {}
                               SET last_rec=%(rectime)s
                               WHERE lang=%(lang)s
                               AND username=%(username)s""".format(config.regulars_table)

        # query to increment the number of recommendations count
        incRecCountQuery = r'''UPDATE {}
                                SET n_recs=n_recs+1
                                WHERE lang=%(lang)s
                                AND username=%(user)s'''.format(config.regulars_table)

        for user in userQueue:
            username = user['username']
            encodedName = username.encode('utf-8')

            # update database to processing
            dbcursor.execute(setStatusQuery,
                             {'status': 'processing',
                              'lang': self._lang,
                              'username': encodedName})
            dbconn.commit()

            logging.info("now getting recs for User:{username}".format(
                username=username))

            # Get recommendations and post...
            # Design and template is passed along based on what we looked
            # up earlier.
            success = sbot.recommend(username=username,
                                     userGroup='suggest',
                                     filterMinor=True,
                                     filterReverts=True,
                                     page=user['page'],
                                     recTemplate=user['template'],
                                     replace=user['replace'])
            if not success:
                continue

            # update database to idle, and update last_rec
            dbcursor.execute(setStatusQuery,
                             {'status': 'idle',
                              'lang': self._lang,
                              'username': encodedName})

            # we don't update the rec time on a test run...
            if not config.testrun:
                # Note: we call utcnow() to store the closest last
                # recommendation time in the database.  If some slack is
                # needed with regards to posting time, we can instead alter
                # the scheduling.
                dbcursor.execute(setLastrecQuery,
                                 {'rectime': datetime.utcnow(),
                                  'lang': self._lang,
                                  'username': encodedName})
                # update count of number of recommendations for this user
                dbcursor.execute(incRecCountQuery,
                                 {'lang': self._lang,
                                  'user': encodedName})

            dbconn.commit()
            logging.info("Posted recs to User:{username}".format(
                username=username))
Beispiel #5
0
    def update_subscribers(self):
        '''
        Update the list of subscribers based on the current configuration.

        Clears the `seen` flag of all active users for this language, then
        walks every page that transcludes one of the configured templates
        ('userbox' and 'config'), parses the template parameters found on
        the page into a `Subscriber` and stores it through
        `Subscriber.update`.  Finally, every user of this language whose
        `seen` flag is still 0 is set as inactive.  Pages that are
        redirects, and pages with problems, are appended to warnings files.

        :returns: False if the database connection fails; otherwise the
                  empty tuple produced by the final `return()`.
        '''
        # reset all seen-values of users of the current wiki,
        # and who are currently active 
        reset_query = r"""UPDATE {}
                          SET seen=0
                          WHERE lang=%(lang)s
                          AND active=1""".format(config.regulars_table)

        # query to set all unseen users as inactive, because it means
        # they no longer use the template
        inactive_query = r"""UPDATE {}
                             SET active=0
                             WHERE lang=%(lang)s
                             AND seen=0""".format(config.regulars_table)

        ## Connect to the database
        sbdb = db.SuggestBotDatabase()
        if not sbdb.connect():
            logging.error("Unable to connect to the suggestbot database")
            return(False)

        (dbconn, dbcursor) = sbdb.getConnection()

        ## Reset the `seen` bit for all active users
        dbcursor.execute(reset_query,
                         {'lang': self._lang})
        dbconn.commit()
        logging.info('number of rows with updated seen-values: {}'.format(dbcursor.rowcount))

        # Build the set of pages that we'll ignore when we find links to
        # our templates.
        ignorePages = set()
        for page_title in config.template_stoplist[self._lang]:
            ignorePages.add(pywikibot.Page(self._site, page_title))

        # Grab the config templates for this language Wikipedia
        configTemplates = config.config_templates[self._lang]
        # Lower-cased titles of the config templates handled so far; a
        # template found on a user's page is only used if its name is in
        # this set.
        configPages = set()

        # Regular expression for splitting into username + subpage-name.
        # It only matches titles that contain a "/".
        subpageSplitRe = re.compile(r'(?P<username>[^/]+)(?P<subname>/.*)')

        # Loop over them, userbox first as any settings in the config template
        # is to take priority.
        for temp_nick in ['userbox', 'config']:
            configPage = pywikibot.Page(self._site,
                                        configTemplates[temp_nick])
            configPages.add(configPage.title().strip().lower())

            # Grab all links to the config template that are redirects
            warningsList = list(configPage.getReferences(
                onlyTemplateInclusion=True,
                redirectsOnly=True))

            # Output all of them to a file so we know which users might
            # have changed usernames.
            # NOTE(review): this file name has no language suffix, unlike
            # the per-user warnings file written further down -- confirm
            # that is intended.
            if len(warningsList) > 0:
                logging.info('writing {n} pages that are redirects to warnings file.'.format(n=len(warningsList)))

                with codecs.open(config.userlist_warnings, 'a',
                                 'utf-8') as warningsFile:
                    warningsFile.write("The following pages are redirects:\n")
                    for page in warningsList:
                        warningsFile.write(page.title())
                        warningsFile.write("\n")

            # warningsList is now used as a list of pages that contain errors
            # that need fixing.  Values are tuples where the first item is the
            # pywikibot.Page object, and the second is a short description of
            # the problem.
            warningsList = []

            # For each page, that we're preloading 10 of at a time to
            # speed things up:
            for page in PreloadingGenerator(
                    configPage.getReferences(
                        onlyTemplateInclusion=True,
                        redirectsOnly=False),
                    step=10):
                # Is this one of our own pages?
                if page in ignorePages:
                    continue

                logging.info('now processing {}'.format(page.title()))

                #   figure out what user this page belongs to
                #   1: check that the page namespace is user or user talk
                if page.namespace() not in [2, 3]:
                    warningsList.append((page,
                                         "namespace not user or user talk"))
                    continue

                #   2: fetch the title without namespace
                page_title = page.title(withNamespace=False,
                                        withSection=False)

                # split the page title on first "/" in case it's a subpage.
                subpageTitle = None
                username = ''
                matchObj = subpageSplitRe.match(page_title)
                if matchObj:
                    # we have a subpage
                    # store subpage title in user object
                    # (the full title, as returned by page.title())
                    subpageTitle = page.title()
                    username = matchObj.group('username')
                    logging.info('found subpage {subtitle} of user {username}'.format(
                        subtitle=matchObj.group('subname'), username=username))
                else:
                    username = page_title

                subscriber = Subscriber(self._lang, username, site=self._site)

                # check the timestamp of the user's last contribution,
                # set the retired bit if the user's no longer active.
                # Only the first item yielded by contributions() is used.
                lastEditTuple = None
                try:
                    lastEditTuple = next(subscriber.contributions(total=5))
                except StopIteration:
                    # User apparently has made no edits, so there's no tuple
                    pass
                except KeyError:
                    # pywikibot had a bug that made it fail with a KeyError
                    # if a revision's comment was deleted.  That's fixed now,
                    # but we'll capture the exception just in case something
                    # else goes wrong and triggers it.
                    pass

                if lastEditTuple is not None:
                    # Item 2 of the tuple is used as the time of the edit;
                    # presumably a naive UTC datetime, since it is
                    # subtracted from utcnow() -- TODO confirm.
                    lastEditTime = lastEditTuple[2]
                    logging.info('user last edited at {}'.format(lastEditTime))
                    timeSinceLastEdit = datetime.utcnow() - lastEditTime
                    if timeSinceLastEdit.days >= config.retired_days:
                        subscriber._retired = 1

                # NOTE: Don't add "if not subscriber.retired:" to skip
                # the template checking if the user is retired.  Don't do that.
                # It'll lead to us storing default values for our users in
                # the database, and since we've already fetched the page text,
                # this is cheap processing.

                parsed_page = mwp.parse(page.get(), skip_style_tags=True)
                # Go through every template on the page (including nested
                # ones) and pick out the ones that are ours.
                for template in parsed_page.filter_templates(recursive=True):
                    ## logging.info('checking template {}'.format(template.name))
                    template_name = template.name.strip().lower()
                    if not template_name in configPages:
                        continue

                    ## logging.info('checking parameters to known template {}'.format(template_name))

                    # This accounts for the case where a user has a subpage for
                    # their userboxes.  We'll post to their user talk page.
                    if subpageTitle is not None and template_name \
                       == configTemplates['userbox'].strip().lower():
                        subpageTitle = None

                    # for each parameter...
                    for param in template.params:
                        ## True if this is a key/value pair
                        if param.showkey:
                            # translate the key (e.g. Norwegian -> English)
                            translatedKey = self._translate_key(
                                param.name.strip().lower())
                        else:
                             # A parameter without a key: its value is
                             # what gets translated.
                             translatedKey = self._translate_key(
                                 param.value.strip().lower())

                        # _translate_key returning None is treated as an
                        # unknown parameter: record it and move on.
                        if translatedKey is None:
                            warningsList.append((page, "unaccepted parameter"))
                            continue

                        ## logging.info("using parameter {} with value {}".format(translatedKey, param.value))

                        if param.showkey:
                            # parameter is OK, use it:
                            subscriber.useParam(translatedKey, param.value.strip().lower())
                        else:
                            ## Note: This works because the methods behave
                            ## sensibly if the value evaluates to False
                            subscriber.useParam(translatedKey, "")

                # Always updating this ensures that we capture users who return
                # and do not specify where they want it posted.
                subscriber._page_title = subpageTitle

                ## FIXME: if we've gone through all the templates on a page
                ## and not found SuggestBot's template, we have a parsing error.
                ## In that case, we shouldn't update the database?

                logging.info('updating database for this user')

                # update or store values for this user
                # (the result of update() is not checked here)
                subscriber.update(sbdb)

            # Append the problems found for this template to a warnings
            # file specific to this language.
            if len(warningsList) > 0:
                logging.info("writing {n} users that have errors to warnings file".format(n=len(warningsList)))

                warningFilename = "{base}.{lang}".format(
                    base=config.userlist_warnings,
                    lang=self._lang)
                with codecs.open(warningFilename, 'a', 'utf-8') as \
                        warningsFile:
                    warningsFile.write("The following users had errors in their configuration:\n")
                    for (page, reason) in warningsList:
                        warningsFile.write(page.title())
                        warningsFile.write(" - %s" % (reason,))
                        warningsFile.write("\n")

        # Every user whose seen-value is still 0 was not found on any page
        # above, so set them as inactive.
        dbcursor.execute(inactive_query,
                         {'lang': self._lang})
        dbconn.commit()
        logging.info("number of users set as inactive: {}".format(dbcursor.rowcount))
        sbdb.disconnect()
        # NOTE: `return()` returns an empty tuple, not None.
        return()
Beispiel #6
0
    def update(self, sbdb=None):
        '''
        Update the values for this user in the table for regular users in
        the suggestbot database.

        If the user is not in the table yet, the work is handed over to
        `_insert`.  Otherwise the user's row is updated, retrying up to
        three times and reconnecting if the connection to the server
        was lost.

        :param sbdb: Existing connection to the database.  If not given,
                     a connection is opened here and closed again before
                     returning.
        :type sbdb: suggestbot.db.SuggestBotDatabase

        :returns: True if the user's data was stored, False otherwise.
        '''

        # Does the user exist?
        user_exists_query = '''SELECT lang, username FROM {}
                               WHERE lang=%(lang)s
                               AND username=%(username)s'''.format(
                                   config.regulars_table)

        # NOTE: this also sets them as active if they've been inactive
        update_query = '''UPDATE {}
                          SET seen=1, active=1, page_title=%(page)s,
                          period=%(period)s, replace_recs=%(replace)s,
                          retired=%(retired)s, headlevel=%(headlevel)s
                          WHERE lang=%(lang)s
                          AND username=%(username)s'''.format(
                              config.regulars_table)

        # Error codes for "CR_SERVER_GONE_ERROR" and "CR_SERVER_LOST"
        connection_lost = (2006, 2013)
        max_retries = 3

        # Only close the connection on the way out if we opened it here.
        own_connection = False
        if not sbdb:
            sbdb = db.SuggestBotDatabase()
            if not sbdb.connect():
                logging.error("Unable to connect to the SuggestBot database")
                return(False)
            own_connection = True

        try:
            (dbconn, dbcursor) = sbdb.getConnection()

            # Encode once, into locals.  Encoding self._page_title in place
            # inside the retry loop would fail on the second attempt,
            # because the attribute would then already be bytes.
            username = self._username.encode('utf-8')
            page_title = self._page_title
            if page_title and not isinstance(page_title, bytes):
                page_title = page_title.encode('utf-8')

            logging.info('checking if {}:User:{} is new'.format(self._lang,
                                                                self._username))
            try:
                dbcursor.execute(user_exists_query,
                                 {'lang': self._lang,
                                  'username': username})
                if dbcursor.fetchone() is None:
                    # a new user, yay!
                    logging.info('user is new')
                    return(self._insert(sbdb))
            except MySQLdb.Error as e:
                logging.error("Unable to query database")
                logging.error("MySQL Error {}: {}".format(e.args[0], e.args[1]))
                return(False)

            done = False
            for attempt in range(1, max_retries + 1):
                try:
                    # Update userinfo.
                    dbcursor.execute(update_query,
                                     {'page': page_title,
                                      'period': self._period,
                                      'replace': self._replace,
                                      'headlevel': self._headlevel,
                                      'retired': self._retired,
                                      'lang': self._lang,
                                      'username': username})
                    # ok, done
                    dbconn.commit()
                    done = True
                    logging.info('committed on attempt {}'.format(attempt))
                    logging.info("Committed the following user data:\n{}".format(self))
                    break
                except MySQLdb.Error as e:
                    logging.error(
                        "Unable to update User:{username} in database.".format(
                            username=self._username))
                    logging.error("MySQL Error %d: %s\n" % (e.args[0], e.args[1]))
                    if e.args[0] in connection_lost:
                        # The connection is gone, so there is nothing to
                        # roll back (trying to would just raise again).
                        # Reconnect and retry if possible.
                        if not sbdb.connect():
                            logging.error("Unable to reconnect to the SuggestBot database")
                            break
                        (dbconn, dbcursor) = sbdb.getConnection()
                    else:
                        dbconn.rollback()

            # Report whether the update was committed.  (The number of
            # attempts does not tell us that: the last attempt can succeed.)
            return(done)
        finally:
            if own_connection:
                sbdb.disconnect()
Beispiel #7
0
    def _insert(self, sbdb=None):
        '''
        Insert a new row into the database with this user's information.

        Before inserting, looks for earlier posts by SuggestBot to the
        user's subpage (if set) and user talk page, and uses the most
        recent one as the time of the user's last recommendation.  The
        insert is retried up to three times, reconnecting if the connection
        to the server was lost.

        :param sbdb: Connection to the database.  If not given, a connection
                     is opened here and closed again before returning.
        :type sbdb: suggestbot.db.SuggestBotDatabase

        :returns: True if the user was inserted, False otherwise.
        '''

        # NOTE: default values of the 'active', 'retired', 'design',
        # and 'withdrawn' columns makes it unnecessary to specify
        # the values of those columns.
        insert_query = '''INSERT INTO {}
                          (lang, username, last_rec, page_title, period,
                           replace_recs, headlevel)
                          VALUES (%(lang)s, %(username)s, %(last_rec)s,
                          %(page)s, %(period)s, %(replace)s,
                          %(headlevel)s)'''.format(config.regulars_table)

        # Error codes for "CR_SERVER_GONE_ERROR" and "CR_SERVER_LOST"
        connection_lost = (2006, 2013)
        max_retries = 3

        # Only close the connection on the way out if we opened it here.
        own_connection = False
        if not sbdb:
            sbdb = db.SuggestBotDatabase()
            if not sbdb.connect():
                logging.error("Unable to connect to the SuggestBot database")
                return(False)
            own_connection = True

        try:
            (dbconn, dbcursor) = sbdb.getConnection()

            # go look for posts by SuggestBot on:
            # 1: a userspace sub-page, if they've got the template there
            # 2: their user talk page
            subpage_edit = None
            usertalkpage_edit = None

            # Note: the subpage will always have a history because otherwise
            # the user couldn't have put the SuggestBot template there.
            if self._page_title is not None:
                subpage_edit = self._sbot_edited(
                    pywikibot.Page(self._site, self._page_title))

            usertalkpage = self.getUserTalkPage()
            if usertalkpage.exists():
                usertalkpage_edit = self._sbot_edited(usertalkpage)

            # The "both found" case has to be tested first; otherwise the
            # single-edit branches catch it and the more recent edit is
            # never considered.
            last_edit = None
            if subpage_edit is not None and usertalkpage_edit is not None:
                # If both are not None, then use the more recent one:
                logging.debug("using the more recent edit to either user talk page or sub page as last rec timestamp.")
                if usertalkpage_edit >= subpage_edit:
                    last_edit = usertalkpage_edit
                else:
                    last_edit = subpage_edit
            elif usertalkpage_edit is not None:
                logging.debug("using edit to user talk page as last rec timestamp.")
                last_edit = usertalkpage_edit
            elif subpage_edit is not None:
                logging.debug("using edit to {} as last rec timestamp".format(
                    self._page_title))
                last_edit = subpage_edit

            # If no edit was found there's no need for anything else, we'll
            # then store NULL, and it will be populated when the regular
            # user update runs.
            if last_edit is not None:
                timestamp = last_edit.strftime("%Y%m%d%H%M%S")
                # The insert below reads self._last_rec, so that is the
                # attribute that has to carry the timestamp.  `last_rec` is
                # assigned as well (and last), as it always was, in case
                # other code relies on it.
                self._last_rec = timestamp
                self.last_rec = timestamp

            # Encode once, into a local.  Encoding self._page_title in place
            # inside the retry loop would fail on the second attempt,
            # because the attribute would then already be bytes.
            page_title = self._page_title
            if page_title and not isinstance(page_title, bytes):
                page_title = page_title.encode('utf-8')

            done = False
            for attempt in range(1, max_retries + 1):
                try:
                    # Store user info.
                    dbcursor.execute(insert_query,
                                     {'lang': self._lang,
                                      'username': self._username.encode('utf-8'),
                                      'last_rec': self._last_rec,
                                      'page': page_title,
                                      'period': self._period,
                                      'replace': self._replace,
                                      'headlevel': self._headlevel})
                    if dbcursor.rowcount != 1:
                        logging.warning("insert of User:{username} resulted in {n} updated rows".format(
                            username=self._username,
                            n=dbcursor.rowcount))
                        dbconn.rollback()
                    else:
                        dbconn.commit()
                        done = True
                        break
                except MySQLdb.Error as e:
                    # Log the message as text; encoding it first would log
                    # the repr of a bytes object.
                    logging.error("unable to store User:{username}' in database".format(username=self._username))
                    logging.error("MySQL Error {}: {}".format(e.args[0], e.args[1]))
                    ## If "CR_SERVER_GONE_ERROR" or "CR_SERVER_LOST",
                    ## reconnect and retry if possible
                    if e.args[0] in connection_lost:
                        if not sbdb.connect():
                            logging.error("Unable to reconnect to the SuggestBot database")
                            break
                        (dbconn, dbcursor) = sbdb.getConnection()

            if done:
                logging.info("inserted the following new user:\n{}".format(self))
            else:
                logging.error("giving up on inserting User:{username}".format(
                    username=self._username))

            # Report whether the row was actually stored.
            return(done)
        finally:
            if own_connection:
                sbdb.disconnect()
Beispiel #8
0
    def get_recs_at_coedit_threshold(self, lang, username, contribs, params):
        # NOTE: because rev_user and rev_title currently are VARCHAR(255) and
        # UTF-8, they're assumed to consume ~765 bytes in memory, and
        # therefore MySQL chooses to use a temp file table rather than
        # a temp memory table.  Because the queries to get users by article
        # are each only run once per article a user edited, we can live with
        # the temp file being created to move less data.

        # First query gets users who made non-minor, non-reverting edits
        # to this article.  These are _always_ potential neighbours.
        get_users_by_article_query = """
            SELECT DISTINCT rev_user 
            FROM {}
            WHERE rev_title=%(title)s
            AND rev_is_minor=0
            AND rev_comment_is_revert=0""".format(config.revision_table[lang])

        # Second query gets the other users (either minor or reverting),
        # these are only interesting if they're below the threshold for total
        # number of edits, as they otherwise know what they were doing.
        get_minor_users_by_article_query = """
            SELECT DISTINCT rev_user
            FROM {}
            WHERE rev_title=%(title)s
	    AND (rev_is_minor=1
                 OR rev_comment_is_revert=1)""".format(
            config.revision_table[lang])

        # Query to get edited articles for a given user if the user is
        # below the edit threshold.
        self.get_articles_by_user_query = """
            SELECT rev_title
	    FROM {}
	    WHERE rev_user=%(username)s""".format(config.revision_table[lang])

        # Query to get edited articles for a user who is above the threshold,
        # we then disregard minor edits and reverts.
        self.get_articles_by_expert_user_query = """
            SELECT rev_title
	    FROM {}
            WHERE rev_user=%(username)s
	    AND rev_is_minor=0
	    AND rev_comment_is_revert=0""".format(config.revision_table[lang])

        # Query to get the number of edits a user has made (in our dataset)
        self.get_editcount_query = """
            SELECT count(*) AS num_edits
	    FROM {}
	    WHERE rev_user=%(username)s""".format(config.revision_table[lang])

        # Return this many recs
        N = params['nrecs']

        # Exclude items edited by this user.
        user_for_query = username

        # Neighbours must have at least this much association.
        association_threshold = params['association-threshold']

        # Recommendations we found
        recs = []

        sbdb = db.SuggestBotDatabase()
        if not sbdb.connect():
            logging.error("Unable to connect to the SuggestBot database")
            return (recs)

        (self.dbconn, self.dbcursor) = sbdb.getConnection()

        rec_map = {}

        # How many different users have coedited a given item with something
        # in the basket
        coedit_count = {}

        # Find users who rated the given items
        coeditor_map = {}
        user_assoc = {}
        user_shared = {}

        for item in contribs:
            # For each article the user has edited, find other editors.
            other_editors = {}
            # sys.stderr.write("Looking for contributors to {}\n".format(item))

            # First we get major stakeholders in the article
            # (non-minor/non-reverting edits)
            self.dbcursor.execute(get_users_by_article_query,
                                  {'title': item.encode('utf-8')})
            for row in self.dbcursor:
                # Only compute each thing once
                user = row['rev_user'].decode('utf-8')
                if user in coeditor_map:
                    continue

                # User can't be their own neighbour
                if user == user_for_query:
                    continue

                # OK, add user to hash
                other_editors[user] = 1

# Then we check minor edits and reverts, and keep those users
# who are not in the top 10% of users (see param filter-threshold
# defined earlier).

# Users we've seen (so we don't re-run SQL queries all the time)...
            seen_minors = {}
            self.dbcursor.execute(get_minor_users_by_article_query,
                                  {'title': item.encode('utf-8')})

            # Note: using fetchall() to allow us to execute further queries
            for row in self.dbcursor.fetchall():
                user = row['rev_user'].decode('utf-8')

                # If user has already been seen, move along...
                if user in coeditor_map:
                    continue

                # If user is a major stakeholder, move along...
                if user in other_editors:
                    continue

                # If we tested this user already...
                if user in seen_minors:
                    continue

                # User can't be their own neighbour
                if user == user_for_query:
                    continue

                # Passed tests, add as a minor user
                seen_minors[user] = 1

                # Is user above threshold?  If so, skip...
                self.dbcursor.execute(self.get_editcount_query,
                                      {'username': user.encode('utf-8')})
                nedit_row = self.dbcursor.fetchone()
                self.dbcursor.fetchall()  # flush cursor
                if nedit_row['num_edits'] >= params['filter-threshold']:
                    continue

                # Passed all criteria, adding the user
                other_editors[user] = 1

# Now we have all relevant stakeholders in the article, and can
# compute the appropriate association.
            for user in other_editors.keys():
                # Add user to coeditor-map so we'll skip this user later
                coeditor_map[user] = 1

                (assoc,
                 shared) = self.user_association(user, contribs,
                                                 params['filter-threshold'])
                if assoc < association_threshold:
                    continue

                user_assoc[user] = assoc
                user_shared[user] = shared

        sys.stderr.write("Found {} pre-neighbours\n".format(len(user_assoc)))

        # Find nhood of top k users
        k = 250  # Larger nhood for more recs, hopefully
        nhood = sorted(user_assoc, key=itemgetter(1), reverse=True)[:k]

        # Gather up preds
        for user in nhood:
            # sys.stderr.write("user {} assoc {} shared {}\n".format(user, user_assoc[user], user_shared[user]))

            # Find other items they've rated
            self.dbcursor.execute(self.get_articles_by_user_query,
                                  {'username': user.encode('utf-8')})
            for row in self.dbcursor:
                new_item = row['rev_title'].decode('utf-8')
                rec_map[new_item] = rec_map.get(new_item, 0) + \
                                    user_assoc[user]
                coedit_count[new_item] = coedit_count.get(new_item, 0) + 1

        # sys.stderr.write("Gathered predictions from neighbourhood, now have {} recs\n".format(len(rec_map)))

        # Take out items already given
        for item in contribs:
            if item in rec_map:
                del (rec_map[item])

        # sys.stderr.write("Took out existing contribs, now {} recs\n".format(len(rec_map)))

        # Take out items from user
        self.dbcursor.execute(self.get_articles_by_user_query,
                              {'username': user_for_query.encode('utf-8')})
        for row in self.dbcursor:
            page_title = row['rev_title'].decode('utf-8')
            if page_title in rec_map:
                del (rec_map[page_title])

        # sys.stderr.write("Took out all known articles by user, now {} recs\n".format(len(rec_map)))

        # Filter by coedit thresh
        rec_map = {
            k: v
            for k, v in rec_map.items()
            if coedit_count[k] >= params['threshold']
        }

        # sys.stderr.write("Filtered by coedit threshold, now {} recs\n".format(len(rec_map)))

        # Done with the database, disconnect
        self.dbconn = None
        self.dbcursor = None
        sbdb.disconnect()

        # Rank 'em and spit out 'nrecs' of them
        for (item, value) in sorted(rec_map.items(),
                                    key=itemgetter(1),
                                    reverse=True)[:params['nrecs']]:
            recs.append({'item': item, 'value': value})
        return (recs)
Beispiel #9
0
    def recommend(self,
                  contribs,
                  username,
                  lang,
                  nrecs=100,
                  threshold=3,
                  backoff=0):
        '''
        Find `nrecs` number of neighbours for a given user based on
        the overlap between their contributions.

        Stores the supplied settings and the formatted SQL queries on
        `self`, connects to the SuggestBot database, and asks
        `self.get_recs_at_coedit_threshold()` for neighbours. If `backoff`
        is set and too few neighbours were found, the co-edit threshold
        (`self.thresh`) is lowered one step at a time and the search
        repeated for as long as `self.thresh >= self.min_thresh`.

        :param contribs: The user's contributions
        :type contribs: list

        :param username: Username of the user we're recommending for
        :type username: str

        :param lang: Language code of the Wikipedia we're working on
        :type lang: str

        :param nrecs: Number of recommendations we seek
        :type nrecs: int

        :param threshold: Number of articles in common to be determined a neighbour
        :type threshold: int

        :param backoff: Do we apply a backoff strategy on the threshold?
        :type backoff: int

        :returns: At most `nrecs` dicts with keys 'item' (the neighbour's
                  username) and 'value' (the neighbour's association).
                  Empty if we could not connect to the database.
        :rtype: list of dict
        '''

        # Override default variables with supplied parameters
        self.lang = lang
        self.nrecs = nrecs
        self.thresh = threshold
        self.backoff = backoff

        # SQL queries are defined here so as to not perform the string
        # formatting multiple times.
        self.get_articles_by_user_query = """SELECT rev_title
             FROM {revision_table}
             WHERE rev_user = %(username)s""".format(
            revision_table=config.revision_table[lang])

        # Query to get edited articles for a user who is above the threshold,
        # we then disregard minor edits and reverts.
        self.get_articles_by_expert_user_query = """SELECT rev_title
             FROM {revision_table}
             WHERE rev_user = %(username)s
             AND rev_is_minor = 0
             AND rev_comment_is_revert = 0""".format(
            revision_table=config.revision_table[lang])

        # Query to get the number of edits a user has made (in our dataset)
        self.get_edit_count_query = """SELECT count(*) AS numedits
             FROM {revision_table}
             WHERE rev_user = %(username)s""".format(
            revision_table=config.revision_table[lang])

        logging.info(
            "Got request for user {0}:{1} to recommend based on {2} edits!".
            format(lang, username, len(contribs)))

        # Recommendations we'll be returning
        recs = []

        database = db.SuggestBotDatabase()
        if not database.connect():
            logging.error("Failed to connect to SuggestBot database")
            return recs

        (self.dbconn, self.dbcursor) = database.getConnection()

        try:
            # Turn contributions into a set, as we'll only use it that way
            contribs = set(contribs)

            # Get some recs.
            recs = self.get_recs_at_coedit_threshold(username, contribs)

            # If we're allowed to back off on the coedit threshold and don't
            # have enough recs, ease off on the threshold and try again.
            # `needed` is compared against zero so that a surplus of recs
            # (negative value) is not mistaken for a shortage.
            # NOTE(review): because the test is `>=`, the last iteration
            # runs with self.thresh == self.min_thresh - 1 -- confirm that
            # is the intended lower bound.
            needed = nrecs - len(recs)
            while backoff and self.thresh >= self.min_thresh and needed > 0:
                self.thresh -= 1
                logging.info(
                    'Co-edit threshold is now {0}'.format(self.thresh))
                recs = self.get_recs_at_coedit_threshold(username, contribs)
                needed = nrecs - len(recs)
        finally:
            # Done with the database; release the connection opened above
            # whether or not the search succeeded.
            self.dbconn = None
            self.dbcursor = None
            database.disconnect()

        # Return truncated to nrecs, switched from list of objects to list
        # of dicts
        return [{
            'item': rec.username,
            'value': rec.assoc
        } for rec in recs[:nrecs]]
Beispiel #10
0
 def __init__(self):
     # Set up the database
     self.db = db.SuggestBotDatabase()
     self.dbconn = None
     self.dbcursor = None