コード例 #1
0
ファイル: joblisting.py プロジェクト: benrifkind/whoishiring
    def _fetch_comments(self, delay=0):
        """Download and process every comment page of this submission.

        Starts at ``self.url`` and keeps following the link returned by
        ``_extract_next_url`` until it is falsy. The comments extracted from
        each page are handed to ``_process_comments``; nothing is returned.

        Args:
         delay: seconds to sleep between page requests (be nice to the server)
        """
        link = self.url

        while True:
            logger.info('processing page: %s', link)

            raw = utils.get_raw_page(urljoin(self.HN_BASE_URL, link))
            logger.debug('Got raw page, parsing...')
            page = parse(raw)
            logger.debug('Extracting comments and next page...')
            # The trailing entry is dropped because it is treated as a spacer,
            # not a comment.
            # NOTE(review): an earlier comment here said the last *two*
            # entries are spacers ([:-2]) while the code drops one -- confirm
            # which is correct against the current page markup.
            comments = self._extract_raw_comments(page)[:-1]
            # Resolve the next page before processing so `link` is ready for
            # the following iteration.
            link = self._extract_next_url(page)
            logger.debug('Processing comments...')
            self._process_comments(comments)

            if not link:
                logger.info("Downloaded all comments under submission %s", self.title)
                break

            logger.info('Waiting %s seconds...', delay)
            time.sleep(delay)
コード例 #2
0
ファイル: whlisting.py プロジェクト: benrifkind/whoishiring
    def _prepare_listing_page(self, url=None):
        """Download one page of submissions and record the job listings on it.

        Every ``.title a`` link whose title matches ``self.datere`` and
        contains either ``self.PERMANENT_TITLE`` or ``self.FREELANCE_TITLE``
        is stored as an ``Item`` on ``self[<first day of that month>]`` under
        the attribute ``permanent`` or ``freelance``. Links that do not match
        are logged and skipped.

        Args:
         url: url of page to download listings from, defaults to
              ``self.SUBMISSION_URL`` when falsy

        Returns:
         href of the last title link when its text is 'More' (i.e. the next
         page to fetch), otherwise None

        Raises:
         TypeError: re-raised after logging if building the page object fails
         IndexError: re-raised after logging if the page has no title links
        """
        if not url:
            url = self.SUBMISSION_URL

        def _decide_perm(title):
            """Map a lower-cased title to 'permanent' or 'freelance'."""
            if self.PERMANENT_TITLE in title:
                return 'permanent'
            elif self.FREELANCE_TITLE in title:
                return 'freelance'
            else:
                raise ValueError('no "permanent" or "freelance" string in title')

        rawpage = utils.get_raw_page(url)

        try:
            page = pq(rawpage.read())
        except TypeError:
            logger.error("Error parsing raw page")
            raise

        listing = page.find('.title a')
        for link in listing:
            # Use a dedicated name so the `url` argument is not clobbered.
            item_url = pq(link).attr('href')
            # An anchor without direct text has text == None; treat it as an
            # empty title so it is skipped below instead of crashing the page.
            title = (link.text or '').strip()

            # skip item if it has no date, like (January 2012) in title, probably not a job listing
            try:
                # A failed regex match yields None, so .group raises
                # AttributeError, which is handled as "skip".
                item_date = date_parse(self.datere.match(title).group(1)).date()
                idate = date(item_date.year, item_date.month, 1)
                position = _decide_perm(title.lower())
                setattr(self[idate], position, Item(title=title,
                                                  permanent=position,
                                                  url=item_url,
                                                  date=idate
                                               ))
            except (AttributeError, ValueError) as e:
                logger.info('SKIPPING: %s, error: %s', title, e)

        try:
            last_link = listing[-1]
        except IndexError:
            # No title links at all on the page.
            logger.error("Can't prepare submission, you may be rate limited.")
            raise

        # A trailing 'More' link points at the next page of submissions.
        if last_link.text == 'More':
            return pq(last_link).attr('href')
        return None