class ProcessWebsiteSegment(BaseApiaryTask):
    """Bot function to retrieve a segment and issue the tasks."""

    def check_timer(self, site_id, token, expire_timer):
        """Determine if this task should be run again."""

        my_token = "wikiapiary_%d_%s_lastcheck" % (site_id, token)
        LOGGER.debug("Checking cache token %s" % my_token)

        current_time = int(datetime.datetime.now().strftime("%s"))
        try:
            stored_time = int(self.redis_db.get(my_token))
        except:
            stored_time = 0
        LOGGER.debug("Stored time for %s of %d" % (my_token, stored_time))

        time_since = current_time - stored_time
        if time_since > expire_timer:
            # Update the timer regardless of success
            self.redis_db.setex(
                my_token,
                60*60*24*7,    # Expire these tokens after 1 week
                int(datetime.datetime.now().strftime("%s"))
            )
            LOGGER.debug("check_timer %d %s is %d old, check it." % (site_id, token, time_since))
            return True
        else:
            LOGGER.debug("check_timer %d %s is %d old, skip." % (site_id, token, time_since))
            return False

    def check_if_true(self, site_obj, key_name):
        """Get a boolean out of Smeantic MediaWikis unhelpful return data structure."""

        try:
            if site_obj['printouts'][key_name][0] == "t":
                return True
            else:
                return False
        except:
            return False

    def run(self, segment):
        """Run a segment for processing."""
        LOGGER.info("Processing segment %d", segment)

        # TODO: It is very expensive to get the segment everytime. It would be better
        # to cache the segment in redis and just check the most recent "Modification date"
        # from the wiki. If that date hasn't changed, use the cached version from redis.

        my_query = ''.join([
            '[[Category:Website]]',
            '[[Is defunct::False]]',
            '[[Is active::True]]',
            "[[Has bot segment::%d]]" % segment,
            '|?Has API URL',
            '|?Has statistics URL',
            '|?Check every',
            '|?Creation date',
            '|?Page ID',
            '|?Collect general data',
            '|?Collect extension data',
            '|?Collect skin data',
            '|?Collect statistics',
            '|?Collect semantic statistics',
            '|?Collect logs',
            '|?Collect recent changes',
            '|?Collect statistics stats',
            '|sort=Creation date',
            '|order=asc',
            '|limit=1000'])
        
        LOGGER.debug("Query: %s", my_query)

        sites = self.bumble_bee.call({
            'action': 'ask',
            'query': my_query
        })

        i = 0
        if len(sites['query']['results']) > 0:
            for pagename, site in sites['query']['results'].items():
                i += 1
                LOGGER.info("Processing %s", pagename)

                site_id = int(site['printouts']['Page ID'][0])
                try:
                    stats_url = site['printouts']['Has statistics URL'][0]
                except Exception, e:
                    stats_url = None

                try:
                    api_url = site['printouts']['Has API URL'][0]
                except Exception, e:
                    api_url = None

                try:
                    check_every = int(site['printouts']['Check every'][0])
                except Exception, e:
                    check_every = 60*60*4

                # Get the statistical data
                if self.check_if_true(site,'Collect semantic statistics'):
                    if self.check_timer(site_id, 'smwinfo', check_every):
                        GetSMWInfoTask.delay(site_id, pagename, api_url)

                if self.check_if_true(site,'Collect statistics'):
                    if self.check_timer(site_id, 'statistics', check_every):
                        GetStatisticsTask.delay(site_id, pagename, 'API', api_url, stats_url)

                if self.check_if_true(site,'Collect statistics stats'):
                    if self.check_timer(site_id, 'statistics', check_every):
                        GetStatisticsTask.delay(site_id, pagename, 'Statistics', api_url, stats_url)

                # Get the metadata
                if self.check_if_true(site,'Collect general data'):
                    if self.check_timer(site_id, 'general', 24*60*60):
                        RecordGeneralTask.delay(site_id, pagename, api_url)
                        # TODO: Interwikimap and Namespaces should be moved to their own sections
                        # (see below) but the wiki needs to have fields added for those.
                        RecordInterwikimapTask.delay(site_id, pagename, api_url)
                        RecordNamespacesTask.delay(site_id, pagename, api_url)
                        RecordWhoisTask.delay(site_id, pagename, api_url)
                        MaxmindTask.delay(site_id, pagename, api_url)

                if self.check_if_true(site,'Collect extension data'):
                    if self.check_timer(site_id, 'extension', 24*60*60):
                        RecordExtensionsTask.delay(site_id, pagename, api_url)

                if self.check_if_true(site,'Collect skin data'):
                    if self.check_timer(site_id, 'skin', 3*24*60*60):
                        RecordSkinsTask.delay(site_id, pagename, api_url)
Example #2
0
 def test_record_general_fake(self):
     """Get general site information from non-existent website."""
     task = RecordGeneralTask()
     with self.assertRaises(Exception):
         task.run(18, 'Foo', 'https://foo.bar.com')
Example #3
0
 def test_record_general_fake(self):
     """Get general site information from non-existent website."""
     task = RecordGeneralTask()
     with self.assertRaises(Exception):
         task.run(18, 'Foo', 'https://foo.bar.com')
Example #4
0
 def test_record_general(self):
     """Get general site information from WikiApiary."""
     task = RecordGeneralTask()
     retval = task.run(18, 'WikiApiary', 'https://wikiapiary.com/w/api.php')
     if 'edit' not in retval:
         raise Exception(retval)
Example #5
0
 def test_record_general(self):
     """Get general site information from WikiApiary."""
     task = RecordGeneralTask()
     retval = task.run(18, 'WikiApiary', 'https://wikiapiary.com/w/api.php')
     if 'edit' not in retval:
         raise Exception(retval)