def fetch(self, from_date=DEFAULT_DATETIME, to_date=None):
        """Fetch tests data from the server.

        This method fetches the tests data stored in the server
        that were updated since the given date.

        :param from_date: obtain data updated since this date
        :param to_date: obtain data updated before this date

        :returns: a generator of items
        """
        from_date = datetime_to_utc(from_date)
        to_date = datetime_to_utc(to_date) if to_date else datetime_utcnow()

        logger.info("Fetching tests data of '%s' group from %s to %s",
                    self.url, str(from_date),
                    str(to_date) if to_date else '--')

        pages = self.client.results(from_date=from_date, to_date=to_date)
        ndata = 0

        for raw_page in pages:
            parsed_data = self.parse_json(raw_page)

            for test_data in parsed_data:
                yield test_data
                ndata += 1

        logger.info("Fetch process completed: %s tests data fetched", ndata)
Example #2
    def __fetch_commits_from_repo(self, repo, from_date, to_date, branches):
        if branches is None:
            branches_text = "all"
        elif len(branches) == 0:
            branches_text = "no"
        else:
            branches_text = ", ".join(branches)

        logger.info(
            "Fetching commits: '%s' git repository from %s to %s; %s branches",
            self.uri, str(from_date), str(to_date), branches_text)

        # Ignore the default datetimes to avoid problems with git;
        # otherwise, convert them to UTC
        if to_date == DEFAULT_LAST_DATETIME:
            to_date = None
        else:
            to_date = datetime_to_utc(to_date)

        if from_date == DEFAULT_DATETIME:
            from_date = None
        else:
            from_date = datetime_to_utc(from_date)

        repo.update()

        gitlog = repo.log(from_date, to_date, branches)
        return self.parse_git_log_from_iter(gitlog)
Example #3
    def fetch(self, from_date=DEFAULT_DATETIME, to_date=None):
        """Fetch the events from the server.

        This method fetches those events of a group stored on the server
        that were updated since the given date. Comments and RSVPs data
        are included within each event.

        :param from_date: obtain events updated since this date
        :param to_date: obtain events updated before this date

        :returns: a generator of events
        """
        logger.info("Fetching events of '%s' group from %s to %s", self.group,
                    str(from_date),
                    str(to_date) if to_date else '--')

        self._purge_cache_queue()

        from_date = datetime_to_utc(from_date)
        to_date_ts = datetime_to_utc(to_date).timestamp() if to_date else None

        nevents = 0
        stop_fetching = False

        ev_pages = self.client.events(self.group, from_date=from_date)

        for evp in ev_pages:
            self._push_cache_queue(evp)

            events = list(self.parse_json(evp))

            for event in events:
                event_id = event['id']

                event['comments'] = self.__fetch_and_parse_comments(event_id)
                event['rsvps'] = self.__fetch_and_parse_rsvps(event_id)

                # Check events updated before 'to_date'
                event_ts = self.metadata_updated_on(event)

                if to_date_ts and event_ts >= to_date_ts:
                    # Comments and RSVPs of the remaining items in the
                    # current page must still be fetched to avoid cache
                    # problems, so continue instead of breaking here
                    stop_fetching = True
                    continue

                yield event
                nevents += 1

            self._flush_cache_queue()

            if stop_fetching:
                break

        logger.info("Fetch process completed: %s events fetched", nevents)
Example #4
    def test_conversion(self):
        """Check if it converts some timestamps to timestamps with UTC+0."""

        date = datetime.datetime(2001, 12, 1, 23, 15, 32,
                                 tzinfo=dateutil.tz.tzoffset(None, -21600))
        expected = datetime.datetime(2001, 12, 2, 5, 15, 32,
                                     tzinfo=dateutil.tz.tzutc())
        utc = datetime_to_utc(date)
        self.assertIsInstance(utc, datetime.datetime)
        self.assertEqual(utc, expected)

        date = datetime.datetime(2001, 12, 1, 23, 15, 32,
                                 tzinfo=dateutil.tz.tzutc())
        expected = datetime.datetime(2001, 12, 1, 23, 15, 32,
                                     tzinfo=dateutil.tz.tzutc())
        utc = datetime_to_utc(date)
        self.assertIsInstance(utc, datetime.datetime)
        self.assertEqual(utc, expected)

        date = datetime.datetime(2001, 12, 1, 23, 15, 32)
        expected = datetime.datetime(2001, 12, 1, 23, 15, 32,
                                     tzinfo=dateutil.tz.tzutc())
        utc = datetime_to_utc(date)
        self.assertIsInstance(utc, datetime.datetime)
        self.assertEqual(utc, expected)
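The assertions above pin down the behaviour of datetime_to_utc: timezone-aware datetimes are shifted to UTC+0, while naive datetimes are assumed to already be in UTC. A minimal sketch consistent with this test, not necessarily the real implementation; the invalid-offset handling exercised in Example #18 would need extra logic:

import dateutil.tz


def datetime_to_utc_sketch(ts):
    """Shift a datetime to UTC+0, treating naive values as UTC."""
    if not ts.tzinfo:
        # Naive datetimes are assumed to be UTC already (third assertion)
        ts = ts.replace(tzinfo=dateutil.tz.tzutc())
    # Aware datetimes are converted to UTC+0 (first assertion)
    return ts.astimezone(dateutil.tz.tzutc())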
Example #5
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the questions from the site.

        The method retrieves, from a StackExchange site, the
        questions updated since the given date.

        :param from_date: obtain questions updated since this date

        :returns: a generator of questions
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        logger.info(
            "Looking for questions at site '%s', with tag '%s' and updated from '%s'",
            self.site, self.tagged, str(from_date))

        self._purge_cache_queue()

        from_date = datetime_to_utc(from_date)

        whole_pages = self.client.get_questions(from_date)

        for whole_page in whole_pages:
            self._push_cache_queue(whole_page)
            self._flush_cache_queue()
            questions = self.parse_questions(whole_page)
            for question in questions:
                yield question
Example #6
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the topics from the Discurse board.

        The method retrieves, from a Discourse board the topics
        updated since the given date.

        :param from_date: obtain topics updated since this date

        :returns: a generator of topics
        """
        if not from_date:
            from_date = DEFAULT_DATETIME
        else:
            from_date = datetime_to_utc(from_date)

        logger.info("Looking for topics at '%s', updated from '%s'", self.url,
                    str(from_date))

        self._purge_cache_queue()

        ntopics = 0

        topics_ids = self.__fetch_and_parse_topics_ids(from_date)

        for topic_id in topics_ids:
            topic = self.__fetch_and_parse_topic(topic_id)
            ntopics += 1
            yield topic
            self._flush_cache_queue()

        logger.info("Fetch process completed: %s topics fetched", ntopics)
Example #7
    def events(self, group, from_date=DEFAULT_DATETIME):
        """Fetch the events pages of a given group."""

        date = datetime_to_utc(from_date)
        date = date.strftime("since:%Y-%m-%dT%H:%M:%S.000Z")

        resource = urijoin(group, self.REVENTS)

        # Hack required because the Meetup API does not support list
        # values with the format `?param=value1&param=value2`.
        # It only works with `?param=value1,value2`.
        # Moreover, urllib3 encodes comma characters when values
        # are given using the params dict, which does not work
        # with Meetup either.
        fixed_params = '?' + self.PFIELDS + '=' + ','.join(self.VEVENT_FIELDS)
        fixed_params += '&' + self.PSTATUS + '=' + ','.join(self.VSTATUS)
        resource += fixed_params

        params = {
            self.PORDER: self.VUPDATED,
            self.PSCROLL: date,
            self.PPAGE: self.max_items
        }

        for page in self._fetch(resource, params):
            yield page
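The encoding problem the comment describes is easy to reproduce: standard query-string encoding percent-escapes commas, producing a value Meetup cannot parse. A quick check; the field names are illustrative, not the backend's actual VEVENT_FIELDS:

from urllib.parse import urlencode

# The comma is escaped, which Meetup rejects
print(urlencode({'fields': 'group_topics,plain_text_description'}))
# fields=group_topics%2Cplain_text_description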
Example #8
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the issues from a project (distribution/package).

        The method retrieves, from a Launchpad project, the issues
        updated since the given date.

        :param from_date: obtain issues updated since this date

        :returns: a generator of issues
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        logger.info("Fetching issues of '%s' distribution from %s",
                    self.distribution, str(from_date))

        self._purge_cache_queue()

        from_date = datetime_to_utc(from_date)
        nissues = 0

        for issue in self._fetch(from_date):
            yield issue
            nissues += 1

        logger.info("Fetch process completed: %s issues fetched", nissues)
Example #9
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the messages from the Supybot IRC logger.

        The method parses and returns the messages saved on the
        IRC log files and stored by Supybot in `dirpath`.

        :param from_date: obtain messages since this date

        :returns: a generator of messages
        """
        logger.info("Fetching messages of '%s' from %s",
                    self.uri, str(from_date))

        from_date = datetime_to_utc(from_date)

        nmessages = 0
        archives = self.__retrieve_archives(from_date)

        for archive in archives:
            logger.debug("Parsing supybot archive %s", archive)

            for message in self.parse_supybot_log(archive):
                dt = str_to_datetime(message['timestamp'])

                if dt < from_date:
                    logger.debug("Message %s sent before %s; skipped",
                                 str(dt), str(from_date))
                    continue

                yield message
                nmessages += 1

        logger.info("Fetch process completed: %s messages fetched",
                    nmessages)
Example #10
    def __fetch_crates(self, from_date):
        """Fetch crates"""

        from_date = datetime_to_utc(from_date)

        crates_groups = self.client.crates()

        for raw_crates in crates_groups:
            crates = json.loads(raw_crates)

            for crate_container in crates['crates']:

                if str_to_datetime(crate_container['updated_at']) < from_date:
                    continue

                crate_id = crate_container['id']

                crate = self.__fetch_crate_data(crate_id)
                crate['owner_team_data'] = \
                    self.__fetch_crate_owner_team(crate_id)
                crate['owner_user_data'] = \
                    self.__fetch_crate_owner_user(crate_id)
                crate['version_downloads_data'] = \
                    self.__fetch_crate_version_downloads(crate_id)
                crate['versions_data'] = self.__fetch_crate_versions(crate_id)

                yield crate
Example #11
    def fetch_items(self, category, **kwargs):
        """Fetch the messages

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """
        from_date = kwargs['from_date']
        latest = kwargs['latest']

        logger.info("Fetching messages of '%s' channel from %s", self.channel,
                    str(from_date))

        raw_info = self.client.channel_info(self.channel)
        channel_info = self.parse_channel_info(raw_info)

        oldest = datetime_to_utc(from_date).timestamp()

        # The minimum value supported by Slack is the integer 0, not 0.0
        if oldest == 0.0:
            oldest = 0

        # Slack does not include in its results the lower limit
        # of the search when it has the same date as 'oldest'. To get
        # these messages too, we subtract a small value to make sure
        # the dates are not the same. To avoid precision problems,
        # the value has five decimal places rather than six.
        if oldest > 0.0:
            oldest -= .00001

        fetching = True
        nmsgs = 0

        while fetching:
            raw_history = self.client.history(self.channel,
                                              oldest=oldest,
                                              latest=latest)
            messages, fetching = self.parse_history(raw_history)

            for message in messages:
                # Fetch user data
                user_id = None
                if 'user' in message:
                    user_id = message['user']
                elif 'comment' in message:
                    user_id = message['comment']['user']

                if user_id:
                    message['user_data'] = self.__get_or_fetch_user(user_id)

                message['channel_info'] = channel_info
                yield message

                nmsgs += 1

                if fetching:
                    latest = float(message['ts'])

        logger.info("Fetch process completed: %s message fetched", nmsgs)
Example #12
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the issues from the repository.

        The method retrieves, from a GitHub repository, the issues
        updated since the given date.

        :param from_date: obtain issues updated since this date

        :returns: a generator of issues
        """

        self._purge_cache_queue()

        from_date = datetime_to_utc(from_date)

        issues_groups = self.client.get_issues(from_date)

        for raw_issues in issues_groups:
            self._push_cache_queue(raw_issues)
            self._flush_cache_queue()
            issues = json.loads(raw_issues)
            for issue in issues:
                for field in ['user', 'assignee']:
                    if issue[field]:
                        issue[field + "_data"] = self.__get_user(issue[field]['login'])
                    else:
                        issue[field + "_data"] = {}
                yield issue
Example #13
    def issues(self,
               from_date=DEFAULT_DATETIME,
               offset=None,
               max_issues=MAX_ISSUES):
        """Get the information of a list of issues.

        :param from_date: retrieve issues that were updated from that date;
            dates are converted to UTC
        :param offset: starting position for the search
        :param max_issues: maximum number of issues to return per query
        """
        resource = self.RISSUES + self.CJSON

        ts = datetime_to_utc(from_date)
        ts = ts.strftime("%Y-%m-%dT%H:%M:%SZ")

        # By default, Redmine returns open issues only.
        # Parameter 'status_id' is set to get all the statuses.
        params = {
            self.PSTATUS_ID: '*',
            self.PSORT: self.PUPDATED_ON,
            self.PUPDATED_ON: '>=' + ts,
            self.PLIMIT: max_issues
        }

        if offset is not None:
            params[self.POFFSET] = offset

        response = self._call(resource, params)

        return response
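Redmine's 'updated_on' filter receives the '%Y-%m-%dT%H:%M:%SZ' layout built above; the same format appears in the MediaWiki test of Example #16. A quick check of the formatting step:

import datetime

import dateutil.tz

dt = datetime.datetime(2016, 1, 1, tzinfo=dateutil.tz.tzutc())
print(dt.strftime("%Y-%m-%dT%H:%M:%SZ"))   # 2016-01-01T00:00:00Z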
Example #14
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the contents by version from the server.

        This method fetches the different historical versions (or
        snapshots) of the contents stored in the server that were
        updated since the given date. Only those snapshots created
        or updated after `from_date` will be returned.

        Take into account that the seconds of the `from_date` parameter
        will be ignored because the Confluence REST API only accepts
        the date, hours and minutes in timestamp values.

        :param from_date: obtain historical versions of contents updated
            since this date

        :returns: a generator of historical versions
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        from_date = datetime_to_utc(from_date)

        kwargs = {'from_date': from_date}
        items = super().fetch("historical content", **kwargs)

        return items
Example #15
    def fetch(self,
              category=CATEGORY_EVENT,
              from_date=DEFAULT_DATETIME,
              to_date=None):
        """Fetch the events from the server.

        This method fetches those events of a group stored on the server
        that were updated since the given date. Comments and RSVPs data
        are included within each event.

        :param category: the category of items to fetch
        :param from_date: obtain events updated since this date
        :param to_date: obtain events updated before this date

        :returns: a generator of events
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        from_date = datetime_to_utc(from_date)

        kwargs = {"from_date": from_date, "to_date": to_date}
        items = super().fetch(category, **kwargs)

        return items
Example #16
    def test_get_pages_from_allrevisions_from_date(self):
        HTTPServer.routes()
        body = read_file('data/mediawiki/mediawiki_pages_allrevisions.json')
        client = MediaWikiClient(MEDIAWIKI_SERVER_URL)
        namespaces = ['0']
        str_date = '2016-01-01 00:00'
        dt = str_to_datetime(str_date)
        from_date = datetime_to_utc(dt)
        response = client.get_pages_from_allrevisions(namespaces, from_date)
        req = HTTPServer.requests_http[-1]
        self.assertEqual(response, body)
        self.assertEqual(req.method, 'GET')
        self.assertRegex(req.path, '/api.php')
        # Check request params
        expected = {
            'action': ['query'],
            'list': ['allrevisions'],
            'arvnamespace': ['0'],
            'arvdir': ['newer'],
            'arvlimit': ['max'],
            'format': ['json'],
            'arvprop': ['ids'],
            'arvstart': ['2016-01-01T00:00:00Z']
        }
        self.assertDictEqual(req.querystring, expected)

        from_date = datetime.datetime(2016, 1, 1, 0, 0, 0)

        with self.assertRaises(ValueError):
            _ = client.get_pages_from_allrevisions(namespaces, from_date)
Example #17
    def tasks(self, from_date=DEFAULT_DATETIME):
        """Retrieve tasks.

        :param from_date: retrieve tasks that were updated from that date;
            dates are converted to epoch time.
        """
        # Convert 'from_date' to epoch timestamp.
        # Zero value (1970-01-01 00:00:00) is not allowed for
        # 'modifiedStart' so it will be set to 1, by default.
        ts = int(datetime_to_utc(from_date).timestamp()) or 1

        consts = {
            self.PMODIFIED_START: ts
        }

        attachments = {
            self.PPROJECTS: True
        }

        params = {
            self.PCONSTRAINTS: consts,
            self.PATTACHMENTS: attachments,
            self.PORDER: self.VOUTDATED,
        }

        while True:
            r = self._call(self.MANIPHEST_TASKS, params)
            yield r
            j = json.loads(r)
            after = j['result']['cursor']['after']
            if not after:
                break
            params[self.PAFTER] = after
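The 'or 1' on the timestamp line does real work: the default datetime (1970-01-01 00:00:00 UTC) converts to epoch 0, which is falsy and not allowed for 'modifiedStart', so it is bumped to 1. A quick check:

print(int(0.0) or 1)             # 1 (the default datetime maps to epoch 0)
print(int(1483228800.0) or 1)    # 1483228800 (2017-01-01 UTC passes through)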
Example #18
    def test_invalid_timezone(self):
        """Check whether an invalid timezone is converted to UTC+0"""

        # Python 3.6 does not put any restriction on the offset range.
        # Thus, this test is valid only for prior Python versions.
        if sys.version_info.major == 3 and sys.version_info.minor == 6:
            return

        date = datetime.datetime(2001, 12, 1, 23, 15, 32,
                                 tzinfo=dateutil.tz.tzoffset(None, -3407))
        expected = datetime.datetime(2001, 12, 1, 23, 15, 32,
                                     tzinfo=dateutil.tz.tzutc())
        utc = datetime_to_utc(date)

        self.assertIsInstance(utc, datetime.datetime)
        self.assertEqual(utc, expected)
Example #19
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the issues from the site.

        The method retrieves, from a JIRA site, the
        issues updated since the given date.

        :param from_date: retrieve issues updated from this date

        :returns: a generator of issues
        """
        if not from_date:
            from_date = DEFAULT_DATETIME

        logger.info(
            "Looking for issues at site '%s', in project '%s' and updated from '%s'",
            self.url, self.project, str(from_date))

        self._purge_cache_queue()

        from_date = datetime_to_utc(from_date)

        whole_pages = self.client.get_issues(from_date)

        fields = json.loads(self.client.get_fields())
        custom_fields = filter_custom_fields(fields)

        for whole_page in whole_pages:
            self._push_cache_queue(whole_page)
            self._flush_cache_queue()
            issues = self.parse_issues(whole_page)
            for issue in issues:
                mapping = map_custom_field(custom_fields, issue['fields'])
                for k, v in mapping.items():
                    issue['fields'][k] = v
                yield issue
Example #20
    def _fetch_gerrit(self, from_date=DEFAULT_DATETIME):
        last_item = self.client.next_retrieve_group_item()
        reviews = self._get_reviews(last_item)
        last_nreviews = len(reviews)

        # Convert date to Unix time
        from_ut = datetime_to_utc(from_date)
        from_ut = from_ut.timestamp()

        while reviews:
            review = reviews.pop(0)
            try:
                last_item += 1
            except Exception:
                pass  # last_item is a string in old gerrits
            updated = review['lastUpdated']
            if updated <= from_ut:
                logger.debug("No more updates for %s" % (self.url))
                break
            else:
                yield review

            if not reviews and last_nreviews >= self.max_reviews:
                logger.debug("GETTING MORE REVIEWS %i >= %i " % (last_nreviews, self.max_reviews))
                last_item = self.client.next_retrieve_group_item(last_item, review)
                reviews = self._get_reviews(last_item)
                last_nreviews = len(reviews)
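The refill logic above, like its 2.8 variant in Example #26, follows one pattern: pop reviews from the current page and request another page only when the previous one came back full, since a short page means the server has run out of results. A distilled, hypothetical sketch of that pattern (the from_date filtering is omitted):

def paginated_reviews(get_reviews, next_marker, max_reviews):
    """Hypothetical helper mirroring the refill pattern above."""
    marker = next_marker()
    reviews = get_reviews(marker)
    last_nreviews = len(reviews)

    while reviews:
        review = reviews.pop(0)
        yield review
        # Only a full previous page can have a successor; a short
        # page means the server has no more results
        if not reviews and last_nreviews >= max_reviews:
            marker = next_marker(marker, review)
            reviews = get_reviews(marker)
            last_nreviews = len(reviews)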
Example #21
    def events(self, group, from_date=DEFAULT_DATETIME):
        """Fetch the events pages of a given group."""

        date = datetime_to_utc(from_date)
        date = date.strftime("since:%Y-%m-%dT%H:%M:%S.000Z")

        resource = urijoin(group, self.REVENTS)

        # Hack required because the Meetup API does not support list
        # values with the format `?param=value1&param=value2`.
        # It only works with `?param=value1,value2`.
        # Moreover, urllib3 encodes comma characters when values
        # are given using the params dict, which does not work
        # with Meetup either.
        fixed_params = '?' + self.PFIELDS + '=' + ','.join(self.VEVENT_FIELDS)
        fixed_params += '&' + self.PSTATUS + '=' + ','.join(self.VSTATUS)
        resource += fixed_params

        params = {
            self.PORDER: self.VUPDATED,
            self.PSCROLL: date,
            self.PPAGE: self.max_items
        }

        try:
            for page in self._fetch(resource, params):
                yield page
        except requests.exceptions.HTTPError as error:
            if error.response.status_code == 410:
                msg = "Group is no longer accessible: {}".format(error)
                raise RepositoryError(cause=msg)
            else:
                raise error
Example #22
    def bugs(self, from_date=DEFAULT_DATETIME, offset=None, max_bugs=MAX_BUGS):
        """Get the information of a list of bugs.

        :param from_date: retrieve bugs that were updated from that date;
            dates are converted to UTC
        :param offset: starting position for the search; i.e. to return
            the 11th element, set this value to 10.
        :param max_bugs: maximum number of bugs to return per query
        """
        date = datetime_to_utc(from_date)
        date = date.strftime("%Y-%m-%dT%H:%M:%SZ")

        params = {
            self.PLAST_CHANGE_TIME: date,
            self.PLIMIT: max_bugs,
            self.PORDER: self.VCHANGE_DATE_ORDER,
            self.PINCLUDE_FIELDS: self.VINCLUDE_ALL
        }

        if offset:
            params[self.POFFSET] = offset

        response = self.call(self.RBUG, params)

        return response
Example #23
    def fetch_items(self, category, **kwargs):
        """Fetch the questions

        :param category: the category of items to fetch
        :param kwargs: backend arguments

        :returns: a generator of items
        """

        from_date = datetime_to_utc(kwargs['from_date']).timestamp()

        questions_groups = self.client.get_api_questions(AskbotClient.API_QUESTIONS)
        for questions in questions_groups:

            for question in questions['questions']:
                updated_at = int(question['last_activity_at'])
                if updated_at > from_date:
                    html_question = self.__fetch_question(question)
                    if not html_question:
                        continue

                    logger.debug("Fetching HTML question %s", question['id'])
                    comments = self.__fetch_comments(question)
                    question_obj = self.__build_question(html_question, question, comments)
                    question.update(question_obj)
                    yield question
Example #24
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the mbox files from the remote archiver.

        This method stores the archives in the path given during the
        initialization of this object.

        HyperKitty archives are accessed month by month and stored following
        the schema year-month. Archives are fetched from the given month
        till the current month.

        :param from_date: fetch archives that store messages
            equal or after the given date; only year and month values
            are compared

        :returns: a list of tuples, storing the links and paths of the
            fetched archives
        """
        logger.info("Downloading mboxes from '%s' to since %s", self.url,
                    str(from_date))
        logger.debug("Storing mboxes in '%s'", self.dirpath)

        # Check mailing list URL
        r = requests.get(self.url)
        r.raise_for_status()

        from_date = datetime_to_utc(from_date)
        to_end = datetime_utcnow()
        to_end += dateutil.relativedelta.relativedelta(months=1)

        months = months_range(from_date, to_end)

        fetched = []

        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)

        tmbox = 0

        for dts in months:
            tmbox += 1
            start, end = dts[0], dts[1]
            filename = start.strftime("%Y-%m.mbox.gz")
            filepath = os.path.join(self.dirpath, filename)

            url = urijoin(self.url, 'export', filename)

            params = {
                'start': start.strftime("%Y-%m-%d"),
                'end': end.strftime("%Y-%m-%d")
            }

            success = self._download_archive(url, params, filepath)

            if success:
                fetched.append((url, filepath))

        logger.info("%s/%s MBoxes downloaded", len(fetched), tmbox)

        return fetched
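Each month in the range maps to one archive file named after the year-month schema; adding one month to the current date keeps the in-progress month inside the range, assuming months_range is end-exclusive, which the loop suggests. The filename step itself:

import datetime

start = datetime.datetime(2017, 3, 1)
print(start.strftime("%Y-%m.mbox.gz"))    # 2017-03.mbox.gz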
Example #25
    def __get_max_date(self, reviews):
        """Get the max date in unixtime format from reviews."""
        max_ts = 0
        for review in reviews:
            ts = str_to_datetime(review['timestamp'])
            ts = datetime_to_utc(ts)
            if ts.timestamp() > max_ts:
                max_ts = ts.timestamp()
        return max_ts
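The same result can be written as a single max() expression. A sketch, assuming str_to_datetime and datetime_to_utc are the utilities used throughout these examples and that every review carries a parseable 'timestamp' field:

def get_max_date(reviews):
    """One-expression equivalent of __get_max_date (hypothetical)."""
    return max((datetime_to_utc(str_to_datetime(r['timestamp'])).timestamp()
                for r in reviews),
               default=0)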
Example #26
    def _fetch_gerrit28(self, from_date=DEFAULT_DATETIME):
        """ Specific fetch for gerrit 2.8 version.

        Get open and closed reviews in different queries.
        Take the newer review from both lists and iterate.
        """

        # Convert date to Unix time
        from_ut = datetime_to_utc(from_date)
        from_ut = from_ut.timestamp()

        filter_open = "status:open"
        filter_closed = "status:closed"

        last_item_open = self.client.next_retrieve_group_item()
        last_item_closed = self.client.next_retrieve_group_item()
        reviews_open = self._get_reviews(last_item_open, filter_open)
        reviews_closed = self._get_reviews(last_item_closed, filter_closed)
        last_nreviews_open = len(reviews_open)
        last_nreviews_closed = len(reviews_closed)

        while reviews_open or reviews_closed:
            if reviews_open and reviews_closed:
                if reviews_open[0]['lastUpdated'] >= \
                        reviews_closed[0]['lastUpdated']:
                    review_open = reviews_open.pop(0)
                    review = review_open
                else:
                    review_closed = reviews_closed.pop(0)
                    review = review_closed
            elif reviews_closed:
                review_closed = reviews_closed.pop(0)
                review = review_closed
            else:
                review_open = reviews_open.pop(0)
                review = review_open

            updated = review['lastUpdated']
            if updated <= from_ut:
                logger.debug("No more updates for %s" % (self.url))
                break
            else:
                yield review

            if not reviews_open and last_nreviews_open >= self.max_reviews:
                last_item_open = self.client.next_retrieve_group_item(
                    last_item_open, review_open)
                reviews_open = self._get_reviews(last_item_open, filter_open)
                last_nreviews_open = len(reviews_open)
            if not reviews_closed and last_nreviews_closed >= self.max_reviews:
                last_item_closed = self.client.next_retrieve_group_item(
                    last_item_closed, review_closed)
                reviews_closed = self._get_reviews(last_item_closed,
                                                   filter_closed)
                last_nreviews_closed = len(reviews_closed)
Example #27
    def fetch(self, from_date=DEFAULT_DATETIME):
        """Fetch the mbox files from the remote archiver.

        Stores the archives in the path given during the initialization
        of this object. Those archives without a valid extension will
        be ignored.

        Pipermail archives usually include in their file names the date
        of the messages they store, following the year-month schema. When
        `from_date` is given, only those mboxes whose year and month are
        equal to or later than that date will be fetched.

        :param from_date: fetch archives that store messages
            equal or after the given date; only year and month values
            are compared

        :returns: a list of tuples, storing the links and paths of the
            fetched archives
        """
        logger.info("Downloading mboxes from '%s' to since %s",
                    self.url, str(from_date))
        logger.debug("Storing mboxes in '%s'", self.dirpath)

        from_date = datetime_to_utc(from_date)

        r = requests.get(self.url)
        r.raise_for_status()

        links = self._parse_archive_links(r.text)

        fetched = []

        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)

        for link in links:
            filename = os.path.basename(link)

            mbox_dt = self._parse_date_from_filepath(filename)

            if ((from_date.year == mbox_dt.year and
                from_date.month == mbox_dt.month) or
                from_date < mbox_dt):

                filepath = os.path.join(self.dirpath, filename)
                success = self._download_archive(link, filepath)

                if success:
                    fetched.append((link, filepath))

        logger.info("%s/%s MBoxes downloaded", len(fetched), len(links))

        return fetched
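The two-part condition matters: an archive dated on the first day of from_date's own month sorts before from_date yet can still hold wanted messages, so a matching year and month also qualifies. A quick check, assuming _parse_date_from_filepath returns timezone-aware datetimes, as the '<' comparison with the UTC from_date requires:

import datetime

import dateutil.tz

utc = dateutil.tz.tzutc()
from_date = datetime.datetime(2016, 3, 15, tzinfo=utc)
mbox_dt = datetime.datetime(2016, 3, 1, tzinfo=utc)

same_month = (from_date.year == mbox_dt.year and
              from_date.month == mbox_dt.month)
print(same_month or from_date < mbox_dt)   # True, though mbox_dt < from_date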
Example #28
    def init_metadata(self, origin, backend_name, backend_version, category,
                      backend_params):
        """Init metadata information.

        Metadata is composed of the basic information needed to identify
        where the archived data came from and how it can be retrieved
        and built into Perceval items.

        :param origin: identifier of the repository
        :param backend_name: name of the backend
        :param backend_version: version of the backend
        :param category: category of the items fetched
        :param backend_params: dict representation of the fetch parameters

        :raises ArchiveError: when an error occurs initializing the metadata
        """
        created_on = datetime_to_utc(datetime_utcnow())
        created_on_dumped = created_on.isoformat()
        backend_params_dumped = pickle.dumps(backend_params, 0)

        metadata = (
            origin,
            backend_name,
            backend_version,
            category,
            backend_params_dumped,
            created_on_dumped,
        )

        try:
            cursor = self._db.cursor()
            insert_stmt = "INSERT INTO " + self.METADATA_TABLE + " "\
                          "(origin, backend_name, backend_version, " \
                          "category, backend_params, created_on) " \
                          "VALUES (?, ?, ?, ?, ?, ?)"
            cursor.execute(insert_stmt, metadata)

            self._db.commit()
            cursor.close()
        except sqlite3.DatabaseError as e:
            msg = "metadata initialization error; cause: %s" % str(e)
            raise ArchiveError(cause=msg)

        self.origin = origin
        self.backend_name = backend_name
        self.backend_version = backend_version
        self.category = category
        self.backend_params = backend_params
        self.created_on = created_on

        logger.debug("Metadata of archive %s initialized to %s",
                     self.archive_path, metadata)
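created_on is stored as an ISO 8601 string; for a UTC-aware datetime, isoformat() yields a '+00:00' suffix. A quick check:

import datetime

import dateutil.tz

created_on = datetime.datetime(2017, 5, 3, 12, 0, 0,
                               tzinfo=dateutil.tz.tzutc())
print(created_on.isoformat())   # 2017-05-03T12:00:00+00:00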
Example #29
    def test_init_metadata(self):
        """Test whether metadata information is properly initialized"""

        archive_path = os.path.join(self.test_path, 'myarchive')
        archive = Archive.create(archive_path)

        before_dt = datetime_to_utc(datetime_utcnow())
        archive.init_metadata('marvel.com', 'marvel-comics-backend', '0.1.0',
                              'issue', {'from_date': before_dt})
        after_dt = datetime_to_utc(datetime_utcnow())

        archive_copy = Archive(archive_path)

        # Both copies should have the same parameters
        for arch in [archive, archive_copy]:
            self.assertEqual(arch.origin, 'marvel.com')
            self.assertEqual(arch.backend_name, 'marvel-comics-backend')
            self.assertEqual(arch.backend_version, '0.1.0')
            self.assertEqual(arch.item_category, 'issue')
            self.assertGreaterEqual(arch.created_on, before_dt)
            self.assertLessEqual(arch.created_on, after_dt)
            self.assertDictEqual(arch.backend_params, {'from_date': before_dt})
Example #30
    def _fetch_and_parse_messages(self, mailing_list, from_date):
        """Fetch and parse the messages from a mailing list"""

        from_date = datetime_to_utc(from_date)

        nmsgs, imsgs, tmsgs = (0, 0, 0)

        for mbox in mailing_list.mboxes:
            tmp_path = None

            try:
                tmp_path = self._copy_mbox(mbox)

                for message in self.parse_mbox(tmp_path):
                    tmsgs += 1

                    if not self._validate_message(message):
                        imsgs += 1
                        continue

                    # Ignore those messages sent before the given date
                    dt = str_to_datetime(message[MBox.DATE_FIELD])

                    if dt < from_date:
                        logger.debug("Message %s sent before %s; skipped",
                                     message['unixfrom'], str(from_date))
                        tmsgs -= 1
                        continue

                    # Convert 'CaseInsensitiveDict' to dict
                    message = self._casedict_to_dict(message)

                    nmsgs += 1
                    logger.debug("Message %s parsed", message['unixfrom'])

                    yield message
            except (OSError, EOFError) as e:
                logger.warning("Ignoring %s mbox due to: %s", mbox.filepath,
                               str(e))
            except Exception as e:
                if tmp_path and os.path.exists(tmp_path):
                    os.remove(tmp_path)
                raise e
            finally:
                if tmp_path and os.path.exists(tmp_path):
                    os.remove(tmp_path)

        logger.info("Done. %s/%s messages fetched; %s ignored", nmsgs, tmsgs,
                    imsgs)