def fetch_items(self, path, payload):
    """Return the items from the GitHub API using links pagination.

    Requests `path` under `repos/<owner>/<repository>` and keeps
    following the `next` link of each response until there is none
    or a page comes back with an empty body.

    :param path: resource path, relative to the repository URL
    :param payload: parameters sent with every request

    :returns: a generator that yields the raw text of each page
    """
    from urllib.parse import parse_qs, urlsplit

    page = 0  # current page
    last_page = None  # last page; stays None when it cannot be determined

    url_next = urijoin(self.base_url, 'repos', self.owner, self.repository, path)
    logger.debug("Get GitHub paginated items from %s", url_next)

    response = self.fetch(url_next, payload=payload)
    items = response.text
    page += 1

    if 'last' in response.links:
        last_url = response.links['last']['url']

        # Parse the query string instead of splitting on '&page=',
        # which fails when 'page' is the first parameter of the URL.
        query = parse_qs(urlsplit(last_url).query)
        try:
            last_page = int(query['page'][0])
        except (KeyError, IndexError, ValueError):
            # The total is only used for logging; do not abort the fetch
            last_page = None
        else:
            logger.debug("Page: %s/%s", page, last_page)

    while items:
        yield items

        if 'next' not in response.links:
            break

        url_next = response.links['next']['url']
        response = self.fetch(url_next, payload=payload)
        page += 1
        items = response.text

        # A 'next' link may exist without a known last page; '%s' and
        # lazy formatting keep the log call from raising in that case.
        logger.debug("Page: %s/%s", page,
                     last_page if last_page is not None else '?')
def events(self, group, from_date=DEFAULT_DATETIME):
    """Fetch the events pages of a given group.

    :param group: name of the group, used as the first part of the resource
    :param from_date: date converted to UTC and sent as the scroll value

    :returns: a generator of the pages yielded by `_fetch`

    :raises RepositoryError: when the server answers with a 410 status
    """
    since = datetime_to_utc(from_date)
    since = since.strftime("since:%Y-%m-%dT%H:%M:%S.000Z")

    resource = urijoin(group, self.REVENTS)

    # Hack: according to the original note, the Meetup API does not
    # support list values in the form `?param=value1&param=value2`;
    # only `?param=value1,value2` works. Moreover, urllib3 encodes
    # commas when the values are given in the params dict, which
    # Meetup does not accept either. So these two parameters are
    # appended to the resource by hand.
    resource += ''.join([
        '?', self.PFIELDS, '=', ','.join(self.VEVENT_FIELDS),
        '&', self.PSTATUS, '=', ','.join(self.VSTATUS),
    ])

    params = {
        self.PORDER: self.VUPDATED,
        self.PSCROLL: since,
        self.PPAGE: self.max_items
    }

    try:
        yield from self._fetch(resource, params)
    except requests.exceptions.HTTPError as error:
        # Only a 410 answer is translated; anything else is re-raised as is
        if error.response.status_code != 410:
            raise error
        msg = "Group is no longer accessible: {}".format(error)
        raise RepositoryError(cause=msg)
def _fetch(self, resource, params):
    """Fetch a resource.

    Method to fetch and to iterate over the contents of a type of
    resource. It returns a generator of pages for that resource and
    parameters. The first request is sent with `params` plus the key
    and sign parameters; the following ones go to the URL of the
    `next` link with only the key and sign parameters.

    :param resource: type of the resource
    :param params: parameters to filter; this dict is not modified

    :returns: a generator of pages for the requested resource
    """
    url = urijoin(self.base_url, resource)

    # Work on a copy so the dict given by the caller is left untouched
    params = dict(params)
    params[self.PKEY] = self.api_key
    # Plain string (the original stored a one-element tuple because of a
    # stray trailing comma); this matches the value used for next pages.
    params[self.PSIGN] = 'true'

    while True:
        logger.debug("Meetup client calls resource: %s params: %s",
                     resource, str(params))

        self.sleep_for_rate_limit()
        r = self.fetch(url, payload=params)
        self.update_rate_limit(r)

        yield r.text

        if not (r.links and 'next' in r.links):
            break

        # The 'next' URL is used as given; only key and sign are re-sent
        url = r.links['next']['url']
        params = {
            self.PKEY: self.api_key,
            self.PSIGN: 'true'
        }
def _call(self, resource, params):
    """Retrieve the given resource, page by page.

    :param resource: resource to retrieve
    :param params: dict with the HTTP parameters needed to retrieve
        the given resource; only sent with the first request

    :returns: a generator that yields the raw text of each response
    """
    url = self.URL % {'base': self.base_url, 'resource': resource}

    logger.debug("Confluence client requests: %s params: %s",
                 resource, str(params))

    while True:
        response = self.fetch(url, payload=params)
        yield response.text

        # Pagination goes on only while a 'next' link exists
        body = response.json()
        if '_links' in body and 'next' in body['_links']:
            url = urijoin(self.base_url, body['_links']['next'])
            params = {}
        else:
            break
def events(self):
    """Collect the user events.

    :returns: the result of `fetch_items` for the path
        `users/<user>/events/public`, asking for 30 items per page
    """
    events_path = urijoin("users", self.user, "events", "public")
    return self.fetch_items(events_path, {'per_page': 30})
def fetch_items(self, category, **kwargs):
    """Fetch the contents.

    :param category: the category of items to fetch
    :param kwargs: backend arguments; `from_date` is required

    :returns: a generator of historical contents, each one tagged
        with the `content_url` of the content it belongs to
    """
    from_date = kwargs['from_date']

    logger.info("Fetching historical contents of '%s' from %s",
                self.url, str(from_date))

    fetched = 0

    # The whole summary is consumed before any historical content is fetched
    summaries = list(self.__fetch_contents_summary(from_date))

    for summary in summaries:
        content_id = summary['id']
        content_url = urijoin(self.origin, summary['_links']['webui'])

        for historical in self.__fetch_historical_contents(content_id, from_date):
            historical['content_url'] = content_url
            yield historical
            fetched += 1

    logger.info("Fetch process completed: %s historical contents fetched",
                fetched)
def get_issues(self, from_date):
    """Retrieve all the issues from a given date.

    The total number of issues and the page size are read from the
    first response and reused for the following requests.

    :param from_date: obtain issues updated since this date

    :returns: a generator that yields the raw text of each page
    """
    offset = 0
    url = urijoin(self.base_url, self.RESOURCE, self.VERSION_API, 'search')

    response = self.fetch(url, payload=self.__build_payload(offset, from_date))
    raw_page = response.text
    data = response.json()

    total = data['total']
    page_size = data['maxResults']

    offset += min(page_size, total)
    self.__log_status(offset, total)

    while raw_page:
        yield raw_page

        # Stop as soon as the page just yielded reaches the total
        if data['startAt'] + page_size >= total:
            break

        response = self.fetch(url, payload=self.__build_payload(offset, from_date))
        data = response.json()
        offset += page_size
        raw_page = response.text
        self.__log_status(offset, total)
def get_fields(self):
    """Retrieve all the fields available.

    :returns: the raw text of the response to the `field` resource
    """
    fields_url = urijoin(self.base_url, self.RESOURCE, self.VERSION_API, 'field')
    return self.fetch(fields_url).text
def _parse_archive_links(self, raw_html):
    """Extract the links to archives found in an HTML page.

    Every `href` of the page is a candidate. A candidate is kept when
    its last extension, or the one before it (e.g. `.txt` in
    `name.txt.gz`), is listed in `PIPERMAIL_TYPES`.

    :param raw_html: HTML contents to parse

    :returns: list of links, each one joined to the URL of this object
    """
    bs = bs4.BeautifulSoup(raw_html, 'html.parser')
    candidates = [a['href'] for a in bs.find_all('a', href=True)]
    links = []

    for candidate in candidates:
        # Links from Apache's 'mod_mbox' plugin contain
        # trailing "/thread" substrings. Remove them to get
        # the links where mbox files are stored.
        if candidate.endswith(MOD_MBOX_THREAD_STR):
            candidate = candidate[:-len(MOD_MBOX_THREAD_STR)]

        # Ignore links with not recognized extension. The second
        # extension is taken from the root returned by splitext;
        # str.rstrip() would strip a set of characters, not a suffix.
        root, ext1 = os.path.splitext(candidate)
        ext2 = os.path.splitext(root)[-1]

        if ext1 in PIPERMAIL_TYPES or ext2 in PIPERMAIL_TYPES:
            links.append(urijoin(self.url, candidate))
        else:
            logger.debug("Ignoring %s archive because its extension was not recognized",
                         candidate)

    logger.debug("%s archives found", len(links))

    return links
def summary(self):
    """Get Crates.io summary.

    :returns: whatever `self.fetch` returns for the summary URL
    """
    summary_url = urijoin(CRATES_API_URL, SUMMARY_CATEGORY)
    return self.fetch(summary_url)
def crate_attribute(self, crate_id, attribute):
    """Get crate attribute.

    :param crate_id: identifier of the crate
    :param attribute: name of the attribute, used as last part of the URL

    :returns: whatever `self.fetch` returns for the attribute URL
    """
    attribute_url = urijoin(CRATES_API_URL, CRATES_CATEGORY, crate_id, attribute)
    return self.fetch(attribute_url)
def crate(self, crate_id):
    """Get a crate by its ID.

    :param crate_id: identifier of the crate

    :returns: whatever `self.fetch` returns for the crate URL
    """
    crate_url = urijoin(CRATES_API_URL, CRATES_CATEGORY, crate_id)
    return self.fetch(crate_url)
def crates(self, from_page=1):
    """Get crates in alphabetical order.

    :param from_page: page passed on to the items fetcher (default 1)

    :returns: the result of the private items fetcher for the crates URL
    """
    crates_url = urijoin(CRATES_API_URL, CRATES_CATEGORY)
    return self.__fetch_items(crates_url, from_page)
def events(self, group, from_date=DEFAULT_DATETIME):
    """Fetch the events pages of a given group.

    :param group: name of the group, used as the first part of the resource
    :param from_date: date converted to UTC and sent as the scroll value

    :returns: a generator of the pages yielded by `_fetch`
    """
    since = datetime_to_utc(from_date)
    since = since.strftime("since:%Y-%m-%dT%H:%M:%S.000Z")

    resource = urijoin(group, self.REVENTS)

    # Hack: according to the original note, the Meetup API does not
    # support list values in the form `?param=value1&param=value2`;
    # only `?param=value1,value2` works. Moreover, urllib3 encodes
    # commas when the values are given in the params dict, which
    # Meetup does not accept either. So these two parameters are
    # appended to the resource by hand.
    resource += ''.join([
        '?', self.PFIELDS, '=', ','.join(self.VEVENT_FIELDS),
        '&', self.PSTATUS, '=', ','.join(self.VSTATUS),
    ])

    params = {
        self.PORDER: self.VUPDATED,
        self.PSCROLL: since,
        self.PPAGE: self.max_items
    }

    yield from self._fetch(resource, params)
def get_jobs(self):
    """Retrieve all jobs.

    :returns: the raw text of the response to `<base_url>/api/json`
    """
    jobs_url = urijoin(self.base_url, "api", "json")
    return self.fetch(jobs_url).text
def fetch(self, from_date=DEFAULT_DATETIME):
    """Fetch the mbox files from the remote archiver.

    The archives are stored under `self.dirpath`, which is created
    when it does not exist. One archive named `<year>-<month>.mbox.gz`
    is requested for each period produced by `months_range` between
    `from_date` and one month after the current date.

    :param from_date: fetch archives that store messages equal or
        after the given date

    :returns: a list of tuples, storing the links and paths of the
        archives that were downloaded successfully

    :raises requests.exceptions.HTTPError: when the check of the
        mailing list URL fails
    """
    logger.info("Downloading mboxes from '%s' to since %s",
                self.url, str(from_date))
    logger.debug("Storing mboxes in '%s'", self.dirpath)

    # Check mailing list URL
    response = requests.get(self.url)
    response.raise_for_status()

    from_date = datetime_to_utc(from_date)
    to_end = datetime_utcnow() + dateutil.relativedelta.relativedelta(months=1)

    months = months_range(from_date, to_end)

    if not os.path.exists(self.dirpath):
        os.makedirs(self.dirpath)

    fetched = []
    tmbox = 0  # number of periods tried; stays 0 when there are none

    for tmbox, period in enumerate(months, start=1):
        start = period[0]
        end = period[1]

        filename = start.strftime("%Y-%m.mbox.gz")
        filepath = os.path.join(self.dirpath, filename)
        url = urijoin(self.url, 'export', filename)

        params = {
            'start': start.strftime("%Y-%m-%d"),
            'end': end.strftime("%Y-%m-%d")
        }

        if self._download_archive(url, params, filepath):
            fetched.append((url, filepath))

    logger.info("%s/%s MBoxes downloaded", len(fetched), tmbox)

    return fetched
def issue_notes(self, issue_id):
    """Get the issue notes from pagination.

    :param issue_id: identifier of the issue

    :returns: the result of `fetch_items` for `issues/<issue_id>/notes`,
        ordered by `updated_at` in ascending order
    """
    notes_path = urijoin("issues", str(issue_id), "notes")
    sorting = {
        'order_by': 'updated_at',
        'sort': 'asc'
    }
    return self.fetch_items(notes_path, sorting)
def crate_attribute(self, crate_id, attribute):
    """Get crate attribute.

    :param crate_id: identifier of the crate
    :param attribute: name of the attribute, used as last part of the URL

    :returns: the result of the private request sender for the
        attribute URL, called with the headers built by this client
    """
    attribute_url = urijoin(CRATES_API_URL, CRATES_CATEGORY, crate_id, attribute)
    return self.__send_request(attribute_url, headers=self.__set_headers())
def get_jobs(self):
    """Retrieve all jobs.

    :returns: the raw text of the response

    :raises requests.exceptions.HTTPError: when the request fails
    """
    jobs_url = urijoin(self.url, "/api/json")

    response = requests.get(jobs_url)
    response.raise_for_status()

    return response.text
def issue(self, issue_id):
    """Get the issue data by its ID.

    :param issue_id: identifier of the issue

    :returns: the result of the private request sender for the
        URL built from `bugs/<issue_id>`
    """
    issue_url = self.__get_url(urijoin("bugs", str(issue_id)))
    return self.__send_request(issue_url)
def issue_emojis(self, issue_id):
    """Get emojis of an issue.

    :param issue_id: identifier of the issue

    :returns: the result of `fetch_items` for
        `issues/<issue_id>/award_emoji`, ordered by `updated_at`
        in ascending order
    """
    emojis_path = urijoin("issues", str(issue_id), "award_emoji")
    sorting = {
        'order_by': 'updated_at',
        'sort': 'asc'
    }
    return self.fetch_items(emojis_path, sorting)
def __init__(self, bot, bot_token, tag=None, cache=None, archive=None):
    """Initialize the backend for a given bot.

    The origin handed to the parent class is the Telegram URL joined
    with the bot name. The client is left unset (`None`) here.

    :param bot: name of the bot
    :param bot_token: token of the bot, stored as given
    :param tag: passed to the parent class
    :param cache: passed to the parent class
    :param archive: passed to the parent class
    """
    super().__init__(urijoin(TELEGRAM_URL, bot),
                     tag=tag, cache=cache, archive=archive)

    self.bot = bot
    self.bot_token = bot_token
    self.client = None
def pull_commits(self, pr_number):
    """Get pull request commits.

    :param pr_number: number of the pull request

    :returns: the result of `fetch_items` for
        `pulls/<pr_number>/commits`, asking for 30 items per page
    """
    commits_path = urijoin("pulls", str(pr_number), "commits")
    return self.fetch_items(commits_path, {'per_page': 30})
def comments(self, group, event_id):
    """Fetch the comments of a given event.

    :param group: name of the group, used as the first part of the resource
    :param event_id: identifier of the event

    :returns: a generator of the pages yielded by `_fetch`
    """
    resource = urijoin(group, self.REVENTS, event_id, self.RCOMMENTS)
    yield from self._fetch(resource, {self.PPAGE: self.max_items})
def __init__(self, channel, api_token, max_items=MAX_ITEMS, tag=None, cache=None):
    """Initialize the backend for a given channel.

    The origin handed to the parent class is the Slack URL joined
    with the channel. A `SlackClient` is created right away and the
    users table starts empty.

    :param channel: identifier of the channel
    :param api_token: token given to the client
    :param max_items: stored on the backend and given to the client
    :param tag: passed to the parent class
    :param cache: passed to the parent class
    """
    super().__init__(urijoin(SLACK_URL, channel), tag=tag, cache=cache)

    self.channel = channel
    self.max_items = max_items
    self.client = SlackClient(api_token, max_items=max_items)
    self._users = {}
def issue_collection(self, issue_id, collection_name):
    """Get a collection list of a given issue.

    :param issue_id: identifier of the issue
    :param collection_name: name of the collection, used as the last
        part of the path

    :returns: the result of the private items fetcher for the
        collection URL, starting at offset 0 and ordered by
        `date_last_updated`
    """
    collection_url = self.__get_url(
        urijoin("bugs", str(issue_id), collection_name))

    payload = {
        'ws.size': self.items_per_page,
        'ws.start': 0,
        'order_by': 'date_last_updated'
    }

    return self.__fetch_items(path=collection_url, payload=payload)
def __init__(self, owner, repository, tag=None, cache=None):
    """Initialize the backend for a given repository.

    When `owner` equals `DOCKER_SHORTCUT_OWNER` it is replaced by
    `DOCKER_OWNER` before building the origin and storing it.

    :param owner: owner of the repository
    :param repository: name of the repository
    :param tag: passed to the parent class
    :param cache: passed to the parent class
    """
    owner = DOCKER_OWNER if owner == DOCKER_SHORTCUT_OWNER else owner

    super().__init__(urijoin(DOCKERHUB_URL, owner, repository),
                     tag=tag, cache=cache)

    self.owner = owner
    self.repository = repository
    self.client = DockerHubClient()
def get_html_question(self, question_id, page=1):
    """Retrieve a raw HTML question and all its information.

    :param question_id: question identifier
    :param page: page to retrieve

    :returns: the result of the private call helper for the question
        path, sorted with `ORDER_HTML`
    """
    question_path = urijoin(self.HTML_QUESTION, question_id)
    query = {
        'page': page,
        'sort': self.ORDER_HTML
    }
    return self.__call(question_path, query)
def repository(self, owner, repository):
    """Fetch information about a repository.

    :param owner: owner of the repository
    :param repository: name of the repository

    :returns: the raw text of the response
    """
    repo_url = urijoin(self.base_url, self.RREPOSITORY, owner, repository)

    logger.debug("DockerHub client requests: %s", repo_url)

    return self.fetch(repo_url).text
def __init__(self, owner, repository, tag=None, archive=None):
    """Initialize the backend for a given repository.

    When `owner` equals `DOCKER_SHORTCUT_OWNER` it is replaced by
    `DOCKER_OWNER` before building the origin and storing it. The
    client is left unset (`None`) here.

    :param owner: owner of the repository
    :param repository: name of the repository
    :param tag: passed to the parent class
    :param archive: passed to the parent class
    """
    owner = DOCKER_OWNER if owner == DOCKER_SHORTCUT_OWNER else owner

    super().__init__(urijoin(DOCKERHUB_URL, owner, repository),
                     tag=tag, archive=archive)

    self.owner = owner
    self.repository = repository
    self.client = None