Example #1
    def test_sleep_for_rate_limit(self):
        """Test whether the time to reset is zero if the sleep time is negative"""

        client = MockedClient(CLIENT_API_URL, sleep_time=0.1, max_retries=1,
                              min_rate_to_sleep=100,
                              sleep_for_rate=True)
        client.rate_limit = 50
        client.rate_limit_reset_ts = -1

        before = datetime_utcnow().replace(microsecond=0).timestamp()
        client.sleep_for_rate_limit()
        after = datetime_utcnow().replace(microsecond=0).timestamp()

        self.assertEqual(before, after)
Example #2
    def fetch(self, from_date=DEFAULT_DATETIME, to_date=None):
        """Fetch tests data from the server.

        This method fetches the tests data from the server that was
        updated since the given date.

        :param from_date: obtain data updated since this date
        :param to_date: obtain data updated before this date

        :returns: a generator of items
        """
        from_date = datetime_to_utc(from_date)
        to_date = datetime_to_utc(to_date) if to_date else datetime_utcnow()

        logger.info("Fetching tests data of '%s' group from %s to %s",
                    self.url, str(from_date),
                    str(to_date) if to_date else '--')

        pages = self.client.results(from_date=from_date, to_date=to_date)
        ndata = 0

        for raw_page in pages:
            parsed_data = self.parse_json(raw_page)

            for test_data in parsed_data:
                yield test_data
                ndata += 1

        logger.info("Fetch process completed: %s tests data fetched", ndata)
Example #3
    def __get_headers(self):
        """Set header for request"""

        headers = {'Content-type': 'application/json',
                   'date': datetime_utcnow().strftime('%Y-%m-%d %H:%M:%S')}

        return headers
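A hypothetical usage sketch of how a client could attach these headers to an outgoing request (the send_json helper, endpoint and payload are illustrative assumptions, not part of the original backend):

    import requests

    def send_json(api_url, payload, headers):
        """POST a JSON payload using headers built as in __get_headers() above (sketch)."""
        response = requests.post(api_url, json=payload, headers=headers)
        response.raise_for_status()
        return response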
Example #4
    def test_archived_after(self):
        """Test if only those items archived after a date are returned"""

        manager = ArchiveManager(self.test_path)

        args = {
            'origin': 'http://example.com/',
            'category': 'mock_item',
            'tag': 'test',
            'subtype': 'mocksubtype',
            'from-date': str_to_datetime('2015-01-01')
        }

        items = fetch(CommandBackend, args, manager=manager)
        items = [item for item in items]
        self.assertEqual(len(items), 5)

        archived_dt = datetime_utcnow()

        items = fetch(CommandBackend, args, manager=manager)
        items = [item for item in items]
        self.assertEqual(len(items), 5)

        # Fetch items from the archive
        items = fetch_from_archive(CommandBackend, args, manager, 'mock_item',
                                   str_to_datetime('1970-01-01'))
        items = [item for item in items]
        self.assertEqual(len(items), 10)

        # Fetch items archived after the given date
        items = fetch_from_archive(CommandBackend, args, manager, 'mock_item',
                                   archived_dt)
        items = [item for item in items]
        self.assertEqual(len(items), 5)
Example #5
    def fetch(self):
        """Fetch data from a Docker Hub repository.

        The method retrieves, from a repository stored in Docker Hub,
        its data, which includes the number of pulls, stars, and
        description, among other fields.

        :returns: a generator of data
        """
        logger.info("Fetching data from '%s' repository of '%s' owner",
                    self.repository, self.owner)

        self._purge_cache_queue()

        raw_data = self.client.repository(self.owner, self.repository)
        fetched_on = datetime_utcnow().timestamp()

        self._push_cache_queue({'fetched_on': fetched_on, 'data': raw_data})

        data = self.parse_json(raw_data)
        data['fetched_on'] = fetched_on
        yield data

        self._flush_cache_queue()

        logger.info("Fetch process completed")
Example #6
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the mbox files from the remote archiver.

        This method stores the archives in the path given during the
        initialization of this object.

        HyperKitty archives are accessed month by month and stored following
        the schema year-month. Archives are fetched from the given month
        till the current month.

        :param from_date: fetch archives that store messages
            equal or after the given date; only year and month values
            are compared

        :returns: a list of tuples, storing the links and paths of the
            fetched archives
        """
        logger.info("Downloading mboxes from '%s' to since %s", self.url,
                    str(from_date))
        logger.debug("Storing mboxes in '%s'", self.dirpath)

        # Check mailing list URL
        r = requests.get(self.url)
        r.raise_for_status()

        from_date = datetime_to_utc(from_date)
        to_end = datetime_utcnow()
        to_end += dateutil.relativedelta.relativedelta(months=1)

        months = months_range(from_date, to_end)

        fetched = []

        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)

        tmbox = 0

        for dts in months:
            tmbox += 1
            start, end = dts[0], dts[1]
            filename = start.strftime("%Y-%m.mbox.gz")
            filepath = os.path.join(self.dirpath, filename)

            url = urijoin(self.url, 'export', filename)

            params = {
                'start': start.strftime("%Y-%m-%d"),
                'end': end.strftime("%Y-%m-%d")
            }

            success = self._download_archive(url, params, filepath)

            if success:
                fetched.append((url, filepath))

        logger.info("%s/%s MBoxes downloaded", len(fetched), tmbox)

        return fetched
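The loop above iterates over the month boundaries produced by months_range. A minimal sketch of that helper, assuming it behaves like grimoirelab_toolkit.datetime.months_range and yields consecutive (month_start, next_month_start) pairs between the two dates:

    import dateutil.relativedelta

    def months_range(from_date, to_date):
        """Yield consecutive (month_start, next_month_start) pairs between two dates (sketch)."""
        # Assumption: boundaries are truncated to the first day of the month,
        # matching the "%Y-%m.mbox.gz" file names used by fetch() above.
        start = from_date.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
        while start < to_date:
            end = start + dateutil.relativedelta.relativedelta(months=1)
            yield start, end
            start = end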
Example #7
    def __fetch_summary(self):
        """Fetch summary"""

        raw_summary = self.client.summary()
        summary = json.loads(raw_summary)
        summary['fetched_on'] = str(datetime_utcnow())

        yield summary
Example #8
    def test_datetime_utcnow(self):
        """Check whether timezone information is added"""

        now = datetime_utcnow()
        timezone = str(now.tzinfo)
        expected = "UTC+00:00"

        self.assertEqual(timezone, expected)
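Every example on this page relies on the same datetime_utcnow helper. A minimal sketch of it, assuming it matches grimoirelab_toolkit.datetime.datetime_utcnow and returns the current time with UTC timezone information attached (str(now.tzinfo) is exactly what the test above inspects):

    import datetime

    def datetime_utcnow():
        """Return the current time as a timezone-aware UTC datetime (sketch)."""
        return datetime.datetime.now(datetime.timezone.utc)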
Example #9
    def calculate_time_to_reset(self):
        """Number of seconds to wait. They are contained in the rate limit reset header"""

        time_to_reset = self.rate_limit_reset_ts - (
            datetime_utcnow().replace(microsecond=0).timestamp() + 1)
        time_to_reset = 0 if time_to_reset < 0 else time_to_reset

        return time_to_reset
Example #10
    def test_search_archived_after(self):
        """Check whether only the archives created after a given date are found"""

        archive_mng_path = os.path.join(self.test_path, ARCHIVE_TEST_DIR)
        manager = ArchiveManager(archive_mng_path)

        # First set of archives to create
        metadata = [
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.8',
                'category': 'commit',
                'backend_params': {},
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'gerrit',
                'backend_version': '0.1',
                'category': 'changes',
                'backend_params': {}
            },
        ]

        for meta in metadata:
            archive = manager.create_archive()
            archive.init_metadata(**meta)

        # Second set, archived after the date we'll use to search
        after_dt = datetime_utcnow()
        metadata = [
            {
                'origin': 'https://example.org',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            }
        ]

        for meta in metadata:
            archive = manager.create_archive()
            archive.init_metadata(**meta)
            meta['filepath'] = archive.archive_path

        archives = manager.search('https://example.com', 'git', 'commit',
                                  after_dt)

        expected = [metadata[1]['filepath']]
        self.assertListEqual(archives, expected)
Example #11
    def calculate_time_to_reset(self):
        """Calculate the seconds to reset the token requests, by obtaining the different
        between the current date and the next date when the token is fully regenerated.
        """

        time_to_reset = self.rate_limit_reset_ts - (
            datetime_utcnow().replace(microsecond=0).timestamp() + 1)
        time_to_reset = 0 if time_to_reset < 0 else time_to_reset

        return time_to_reset
Example #12
    def fetch_items(self, **kwargs):
        """Fetch the messages"""

        from_date = kwargs['from_date']

        logger.info("Fetching messages of '%s' channel from %s",
                    self.channel, str(from_date))

        raw_info = self.client.channel_info(self.channel)
        channel_info = self.parse_channel_info(raw_info)

        oldest = datetime_to_utc(from_date).timestamp()
        latest = datetime_utcnow().timestamp()

        # Minimum value supported by Slack is 0 not 0.0
        if oldest == 0.0:
            oldest = 0

        # Slack does not include in its result the lower limit
        # of the search when it has the same date as 'oldest'. To get
        # these messages too, we subtract a small value to be sure
        # the dates are not the same. To avoid precision problems,
        # five decimal places are subtracted instead of six.
        if oldest > 0.0:
            oldest -= .00001

        fetching = True
        nmsgs = 0

        while fetching:
            raw_history = self.client.history(self.channel,
                                              oldest=oldest, latest=latest)
            messages, fetching = self.parse_history(raw_history)

            for message in messages:
                # Fetch user data
                user_id = None
                if 'user' in message:
                    user_id = message['user']
                elif 'comment' in message:
                    user_id = message['comment']['user']

                if user_id:
                    message['user_data'] = self.__get_or_fetch_user(user_id)

                message['channel_info'] = channel_info
                yield message

                nmsgs += 1

                if fetching:
                    latest = float(message['ts'])

        logger.info("Fetch process completed: %s message fetched", nmsgs)
Example #13
    def init_metadata(self, origin, backend_name, backend_version, category,
                      backend_params):
        """Init metadata information.

        Metadata is composed of basic information needed to identify
        where archived data came from and how it can be retrieved
        and built into Perceval items.

        :param origin: identifier of the repository
        :param backend_name: name of the backend
        :param backend_version: version of the backend
        :param category: category of the items fetched
        :param backend_params: dict representation of the fetch parameters

        :raises ArchiveError: when an error occurs initializing the metadata
        """
        created_on = datetime_to_utc(datetime_utcnow())
        created_on_dumped = created_on.isoformat()
        backend_params_dumped = pickle.dumps(backend_params, 0)

        metadata = (
            origin,
            backend_name,
            backend_version,
            category,
            backend_params_dumped,
            created_on_dumped,
        )

        try:
            cursor = self._db.cursor()
            insert_stmt = "INSERT INTO " + self.METADATA_TABLE + " "\
                          "(origin, backend_name, backend_version, " \
                          "category, backend_params, created_on) " \
                          "VALUES (?, ?, ?, ?, ?, ?)"
            cursor.execute(insert_stmt, metadata)

            self._db.commit()
            cursor.close()
        except sqlite3.DatabaseError as e:
            msg = "metadata initialization error; cause: %s" % str(e)
            raise ArchiveError(cause=msg)

        self.origin = origin
        self.backend_name = backend_name
        self.backend_version = backend_version
        self.category = category
        self.backend_params = backend_params
        self.created_on = created_on

        logger.debug("Metadata of archive %s initialized to %s",
                     self.archive_path, metadata)
Example #14
    def test_init_metadata(self):
        """Test whether metadata information is properly initialized"""

        archive_path = os.path.join(self.test_path, 'myarchive')
        archive = Archive.create(archive_path)

        before_dt = datetime_to_utc(datetime_utcnow())
        archive.init_metadata('marvel.com', 'marvel-comics-backend', '0.1.0',
                              'issue', {'from_date': before_dt})
        after_dt = datetime_to_utc(datetime_utcnow())

        archive_copy = Archive(archive_path)

        # Both copies should have the same parameters
        for arch in [archive, archive_copy]:
            self.assertEqual(arch.origin, 'marvel.com')
            self.assertEqual(arch.backend_name, 'marvel-comics-backend')
            self.assertEqual(arch.backend_version, '0.1.0')
            self.assertEqual(arch.item_category, 'issue')
            self.assertGreaterEqual(arch.created_on, before_dt)
            self.assertLessEqual(arch.created_on, after_dt)
            self.assertDictEqual(arch.backend_params, {'from_date': before_dt})
Example #15
    def fetch_items(self, **kwargs):
        """Fetch items from a Docker Hub repository"""

        logger.info("Fetching data from '%s' repository of '%s' owner",
                    self.repository, self.owner)

        raw_data = self.client.repository(self.owner, self.repository)
        fetched_on = datetime_utcnow().timestamp()

        data = self.parse_json(raw_data)
        data['fetched_on'] = fetched_on
        yield data

        logger.info("Fetch process completed")
Example #16
    def calculate_time_to_reset(self):
        """Number of seconds to wait.

        The time is obtained as the difference between the current date
        and the next date when the token is fully regenerated.
        """
        current_epoch = datetime_utcnow().replace(
            microsecond=0).timestamp() + 1
        time_to_reset = self.rate_limit_reset_ts - current_epoch

        if time_to_reset < 0:
            time_to_reset = 0

        return time_to_reset
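The value computed by calculate_time_to_reset is typically consumed by a sleep-for-rate handler such as the one exercised in Example #1. A hypothetical sketch of that flow (rate_limit, min_rate_to_sleep and calculate_time_to_reset mirror the examples above; the handler body itself is illustrative, not the project's actual implementation):

    import logging
    import time

    logger = logging.getLogger(__name__)

    def sleep_for_rate_limit(client):
        """Sleep until the rate limit resets when the client is about to exhaust it (sketch)."""
        if client.rate_limit is not None and client.rate_limit <= client.min_rate_to_sleep:
            seconds = client.calculate_time_to_reset()
            if seconds > 0:
                logger.debug("Sleeping %s seconds until the rate limit resets", seconds)
                time.sleep(seconds)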
Example #17
    def test_decorator(self):
        backend = MockedBackend('test', 'mytag')
        before = datetime_utcnow().timestamp()
        items = [item for item in backend.fetch()]
        after = datetime_utcnow().timestamp()

        for x in range(5):
            item = items[x]

            expected_uuid = uuid('test', str(x))

            self.assertEqual(item['data']['item'], x)
            self.assertEqual(item['backend_name'], 'MockedBackend')
            self.assertEqual(item['backend_version'], '0.2.0')
            self.assertEqual(item['perceval_version'], __version__)
            self.assertEqual(item['origin'], 'test')
            self.assertEqual(item['uuid'], expected_uuid)
            self.assertEqual(item['updated_on'], '2016-01-01')
            self.assertEqual(item['category'], 'mock_item')
            self.assertEqual(item['tag'], 'mytag')
            self.assertGreater(item['timestamp'], before)
            self.assertLess(item['timestamp'], after)

            before = item['timestamp']
Example #18
    def test_search(self):
        """Test if a set of archives is found based on the given criteria"""

        archive_mng_path = os.path.join(self.test_path, ARCHIVE_TEST_DIR)
        manager = ArchiveManager(archive_mng_path)

        dt = datetime_utcnow()
        metadata = [
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.8',
                'category': 'commit',
                'backend_params': {},
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'gerrit',
                'backend_version': '0.1',
                'category': 'changes',
                'backend_params': {}
            },
            {
                'origin': 'https://example.org',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            }
        ]

        for meta in metadata:
            archive = manager.create_archive()
            archive.init_metadata(**meta)
            meta['filepath'] = archive.archive_path

        archives = manager.search('https://example.com', 'git', 'commit', dt)

        expected = [metadata[0]['filepath'], metadata[3]['filepath']]
        self.assertListEqual(archives, expected)
Example #19
    def test_search_no_match(self):
        """Check if an empty set of archives is returned when none match the criteria"""

        archive_mng_path = os.path.join(self.test_path, ARCHIVE_TEST_DIR)
        manager = ArchiveManager(archive_mng_path)

        dt = datetime_utcnow()
        metadata = [
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.8',
                'category': 'commit',
                'backend_params': {},
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'gerrit',
                'backend_version': '0.1',
                'category': 'changes',
                'backend_params': {}
            },
            {
                'origin': 'https://example.org',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            },
            {
                'origin': 'https://example.com',
                'backend_name': 'git',
                'backend_version': '0.1',
                'category': 'commit',
                'backend_params': {}
            }
        ]

        for meta in metadata:
            archive = manager.create_archive()
            archive.init_metadata(**meta)
            meta['filepath'] = archive.archive_path

        archives = manager.search('https://example.com', 'bugzilla', 'commit', dt)
        self.assertListEqual(archives, [])
Example #20
    def test_calculate_time_to_reset(self):
        """Test whether the time to reset is zero if the sleep time is negative"""

        httpretty.register_uri(
            httpretty.GET,
            GITLAB_URL_PROJECT,
            body='',
            status=200,
            forcing_headers={
                'RateLimit-Remaining': '20',
                'RateLimit-Reset': int(datetime_utcnow().replace(microsecond=0).timestamp())
            })

        client = GitLabClient("fdroid", "fdroiddata", "your-token")
        time_to_reset = client.calculate_time_to_reset()

        self.assertEqual(time_to_reset, 0)
Example #21
    def process(self, items_block):
        """Return items as they come, updating their metadata__enriched_on field.

        :param items_block: block of items (hits) to be processed
        :return: hits blocks as they come, updating their metadata__enriched_on field. Namedtuple containing:
            - processed: number of processed hits
            - out_items: a list containing items ready to be written.
        """

        out_items = []

        for hit in items_block:
            hit['_source']['metadata__enriched_on'] = datetime.datetime_utcnow().isoformat()
            out_items.append(hit)

        return self.ProcessResults(processed=0, out_items=out_items)
Example #22
    def fetch_items(self, category, **kwargs):
        """Fetch the Dockher Hub items

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """
        logger.info("Fetching data from '%s' repository of '%s' owner",
                    self.repository, self.owner)

        raw_data = self.client.repository(self.owner, self.repository)
        fetched_on = datetime_utcnow().timestamp()

        data = self.parse_json(raw_data)
        data['fetched_on'] = fetched_on
        yield data

        logger.info("Fetch process completed")
Example #23
    def fetch(self, category=CATEGORY_MESSAGE, from_date=DEFAULT_DATETIME):
        """Fetch the messages from the channel.

        This method fetches the messages stored on the channel that were
        sent since the given date.

        :param category: the category of items to fetch
        :param from_date: obtain messages sent since this date

        :returns: a generator of messages
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        from_date = datetime_to_utc(from_date)
        latest = datetime_utcnow().timestamp()

        kwargs = {'from_date': from_date, 'latest': latest}
        items = super().fetch(category, **kwargs)

        return items
Example #24
    def metadata(self, item):
        """Add metadata to an item.

        It adds metadata to a given item such as how and
        when it was fetched. The contents from the original item will
        be stored under the 'data' keyword.

        :param item: an item fetched by a backend
        """
        item = {
            'backend_name': self.__class__.__name__,
            'backend_version': self.version,
            'perceval_version': __version__,
            'timestamp': datetime_utcnow().timestamp(),
            'origin': self.origin,
            'uuid': uuid(self.origin, self.metadata_id(item)),
            'updated_on': self.metadata_updated_on(item),
            'category': self.metadata_category(item),
            'tag': self.tag,
            'data': item,
        }

        return item
Example #25
    def test_job_archive(self):
        """Execute a Bugzilla backend job to fetch data from the archive"""

        after_dt = datetime_utcnow()
        http_requests = setup_mock_bugzilla_server()

        expected = ['5a8a1e25dfda86b961b4146050883cbfc928f8ec',
                    '1fd4514e56f25a39ffd78eab19e77cfe4dfb7769',
                    '6a4cb244067c3cfa38f9f563e2ab3cd8ac21762f',
                    '7e033ed0110032ead6b918be43c1f3f88cd98fd7',
                    'f90d12b380ffdb47f2b6e96b321f08000181a9d6',
                    '4b166308f205121bc57704032acdc81b6c9bb8b1',
                    'b4009442d38f4241a4e22e3e61b7cd8ef5ced35c']

        q = rq.Queue('queue', is_async=False)

        # First, we fetch the bugs from the server, storing them
        # in an archive
        backend_args = {
            'url': BUGZILLA_SERVER_URL,
            'max_bugs': 5
        }
        archive_args = {
            'archive_path': self.tmp_path,
            'fetch_from_archive': False,
            'archived_after': after_dt
        }

        job = q.enqueue(execute_perceval_job,
                        backend='bugzilla', backend_args=backend_args, category='bug',
                        qitems='items', task_id='mytask',
                        archive_args=archive_args)

        bugs = self.conn.lrange('items', 0, -1)
        bugs = [pickle.loads(b) for b in bugs]
        bugs = [bug['uuid'] for bug in bugs]
        self.conn.ltrim('items', 1, 0)

        result = job.return_value
        self.assertEqual(result.job_id, job.get_id())
        self.assertEqual(result.task_id, 'mytask')
        self.assertEqual(result.backend, 'bugzilla')
        self.assertEqual(result.last_uuid, 'b4009442d38f4241a4e22e3e61b7cd8ef5ced35c')
        self.assertEqual(result.max_date, 1439404330.0)
        self.assertEqual(result.nitems, 7)
        self.assertEqual(result.nresumed, 0)

        self.assertEqual(len(http_requests), 13)
        self.assertListEqual(bugs, expected)

        # Now, we get the bugs from the archive.
        # The contents should be the same and there won't be
        # any new request to the server

        archive_args['fetch_from_archive'] = True
        job = q.enqueue(execute_perceval_job,
                        backend='bugzilla', backend_args=backend_args,
                        qitems='items', task_id='mytask', category='bug',
                        archive_args=archive_args)

        archived_bugs = self.conn.lrange('items', 0, -1)
        archived_bugs = [pickle.loads(b) for b in archived_bugs]
        archived_bugs = [bug['uuid'] for bug in archived_bugs]
        self.conn.ltrim('items', 1, 0)

        result = job.return_value
        self.assertEqual(result.job_id, job.get_id())
        self.assertEqual(result.task_id, 'mytask')
        self.assertEqual(result.backend, 'bugzilla')
        self.assertEqual(result.last_uuid, 'b4009442d38f4241a4e22e3e61b7cd8ef5ced35c')
        self.assertEqual(result.max_date, 1439404330.0)
        self.assertEqual(result.nitems, 7)
        self.assertEqual(result.nresumed, 0)

        self.assertEqual(len(http_requests), 13)

        self.assertListEqual(archived_bugs, bugs)
Example #26
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the messages from the channel.

        This method fetches the messages stored on the channel that were
        sent since the given date.

        :param from_date: obtain messages sent since this date

        :returns: a generator of messages
        """
        logger.info("Fetching messages of '%s' channel from %s",
                    self.channel, str(from_date))

        self._purge_cache_queue()

        raw_info = self.client.channel_info(self.channel)
        self._push_cache_queue(raw_info)
        channel_info = self.parse_channel_info(raw_info)

        oldest = datetime_to_utc(from_date).timestamp()
        latest = datetime_utcnow().timestamp()

        # Minimum value supported by Slack is 0 not 0.0
        if oldest == 0.0:
            oldest = 0

        # Slack does not include in its result the lower limit
        # of the search when it has the same date as 'oldest'. To get
        # these messages too, we subtract a small value to be sure
        # the dates are not the same
        if oldest > 0.0:
            oldest -= .000001

        fetching = True
        nmsgs = 0

        while fetching:
            raw_history = self.client.history(self.channel,
                                              oldest=oldest, latest=latest)
            messages, fetching = self.parse_history(raw_history)

            self._push_cache_queue(raw_history)

            for message in messages:
                if 'user' in message:
                    message['user_data'] = self.__get_or_fetch_user(message['user'])
                message['channel_info'] = channel_info
                yield message

                nmsgs += 1

                if fetching:
                    latest = float(message['ts'])

            # Checkpoint. A set of messages ends here.
            self._push_cache_queue('{}')
            self._flush_cache_queue()

        # Checkpoint for batch. A batch ends here.
        self._push_cache_queue('{END}')
        self._flush_cache_queue()

        logger.info("Fetch process completed: %s message fetched", nmsgs)
Example #27
    def __fetch_pre1_27(self, from_date=None):
        """Fetch the pages from the backend url.

        The method retrieves, from a MediaWiki url, the
        wiki pages.

        :returns: a generator of pages
        """

        def fetch_incremental_changes(namespaces_contents):
            # Use recent changes API to get the pages from date
            npages = 0  # number of pages processed
            rccontinue = ''
            hole_created = True  # To detect that incremental is not complete
            while rccontinue is not None:
                raw_pages = self.client.get_recent_pages(namespaces_contents, rccontinue)
                data_json = json.loads(raw_pages)
                if 'query-continue' in data_json:
                    # < 1.27
                    rccontinue = data_json['query-continue']['recentchanges']['rccontinue']
                elif 'continue' in data_json:
                    # >= 1.27
                    rccontinue = data_json['continue']['rccontinue']
                else:
                    rccontinue = None
                pages_json = data_json['query']['recentchanges']
                for page in pages_json:
                    page_ts = dateutil.parser.parse(page['timestamp'])
                    if from_date >= page_ts:
                        # The rest of recent changes are older than from_date
                        logger.debug("All recent changes newer than %s processed.", from_date)
                        rccontinue = None
                        hole_created = False
                        break
                    page_reviews = self.__get_page_reviews(page)
                    if not page_reviews:
                        # Pages without reviews are not managed
                        continue
                    yield page_reviews
                    npages += 1
            if hole_created:
                logger.error("Incremental update NOT completed. Hole in history created.")
            logger.info("Total number of pages: %i", npages)

        def fetch_all_pages(namespaces_contents):
            # Use get all pages API to get pages
            npages = 0  # number of pages processed

            for ns in namespaces_contents:
                apcontinue = ''  # pagination for getting pages
                logger.debug("Getting pages for namespace: %s", ns)
                while apcontinue is not None:
                    raw_pages = self.client.get_pages(ns, apcontinue)
                    data_json = json.loads(raw_pages)
                    if 'query-continue' in data_json:
                        # < 1.27
                        apcontinue = data_json['query-continue']['allpages']['apcontinue']
                    elif 'continue' in data_json:
                        # >= 1.27
                        apcontinue = data_json['continue']['apcontinue']
                    else:
                        apcontinue = None
                    pages_json = data_json['query']['allpages']
                    for page in pages_json:
                        yield self.__get_page_reviews(page)
                        npages += 1
            logger.info("Total number of pages: %i", npages)

        logger.info("Looking for pages at url '%s'", self.url)

        # from_date can not be older than MAX_RECENT_DAYS days ago
        if from_date:
            if (datetime_utcnow() - from_date).days >= MAX_RECENT_DAYS:
                cause = "Can't get incremental pages older than %i days." % MAX_RECENT_DAYS
                cause += " Do a complete analysis without from_date for older changes."
                raise BackendError(cause=cause)

        namespaces_contents = self.__get_namespaces_contents()

        if not from_date:
            return fetch_all_pages(namespaces_contents)
        else:
            return fetch_incremental_changes(namespaces_contents)