def _fetch_comments(self, delay=0):
    """Download and process every comment page of this submission.

    Starting from ``self.url``, fetch each page, extract its raw
    comments and hand them to ``self._process_comments``, then follow
    the "next page" link until there is none left.

    Args:
        delay: seconds to sleep between page requests, to be nice to
            the server (default 0: no wait).
    """
    link = self.url
    while True:
        logger.info('processing page: %s', link)
        raw = utils.get_raw_page(urljoin(self.HN_BASE_URL, link))
        logger.debug('Got raw page, parsing...')
        page = parse(raw)
        logger.debug('Extracting comments and next page...')
        # the last extracted element is a spacer, not a real comment
        comments = self._extract_raw_comments(page)[:-1]
        link = self._extract_next_url(page)
        logger.debug('Processing comments...')
        self._process_comments(comments)
        if not link:
            logger.info("Downloaded all comments under submission %s", self.title)
            break
        logger.info('Waiting %s seconds...', delay)
        time.sleep(delay)
def _prepare_listing_page(self, url=None):
    """Parse a single listing page and register each dated job submission.

    Args:
        url: url of the page to download listings from; defaults to
            ``self.SUBMISSION_URL``.

    Returns:
        The href of the last listing anchor when that anchor is the
        'More' link (i.e. the next page to fetch), otherwise ``None``.

    Raises:
        TypeError: if the raw page cannot be parsed.
        IndexError: if the page has no listing entries at all (you may
            be rate limited).
        ValueError: propagated from ``_decide_perm`` only via the
            per-item handler, where it is logged and the item skipped.
    """
    if not url:
        url = self.SUBMISSION_URL

    def _decide_perm(title):
        # Classify a (lowercased) title as 'permanent' or 'freelance'.
        if self.PERMANENT_TITLE in title:
            return 'permanent'
        elif self.FREELANCE_TITLE in title:
            return 'freelance'
        else:
            raise ValueError('no "permanent" or "freelance" string in title')

    rawpage = utils.get_raw_page(url)
    try:
        page = pq(rawpage.read())
    except TypeError:
        logger.error("Error parsing raw page")
        raise

    listing = page.find('.title a')
    # Track the last anchor's href separately instead of clobbering the
    # `url` parameter (the original shadowed it inside the loop).
    href = None
    for anchor in listing:
        href = pq(anchor).attr('href')
        title = anchor.text.strip()
        try:
            # Skip items without a date like "(January 2012)" in the title
            # (probably not a job listing): datere.match() returns None and
            # .group(1) raises AttributeError, caught below.
            item_date = date_parse(self.datere.match(title).group(1)).date()
            idate = date(item_date.year, item_date.month, 1)
            position = _decide_perm(title.lower())
            setattr(self[idate], position,
                    Item(title=title, permanent=position, url=href, date=idate))
        except (AttributeError, ValueError) as e:
            logger.info('SKIPPING: %s, error: %s', title, e)

    try:
        # The last anchor being 'More' means there is a next page; its
        # href (still in `href`) is that page's url.
        if listing[-1].text == 'More':
            return href
        return None
    except IndexError:
        # consistency fix: was root `logging.error`, use the module logger
        logger.error("Can't prepare submission, you may be rate limited.")
        raise