Code example
File: crawler.py  Project: mr-uuid/craigscrawler
# Imports needed by the snippet below; HtmlResponse and Selector come from
# Scrapy. MutableURL, MAX_RESULTS, and RESULTS_PER_PAGE are defined
# elsewhere in the craigscrawler project.
import itertools
from multiprocessing.pool import ThreadPool

import numpy
import requests
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

class CraigsCrawler(object):
    def __init__(self, search_url):
        self.base_url = MutableURL(MutableURL(search_url).baseurl())
        self.search_url = MutableURL(search_url)
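        # E.g., for a search_url like
        # "https://sfbay.craigslist.org/search/sss?query=x" (illustrative),
        # baseurl() presumably yields the scheme-and-host root, which is
        # kept as base_url for expanding relative post links later.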

    def find_all_posts(self,
                       max_records=MAX_RESULTS,
                       records_per_page=RESULTS_PER_PAGE):
        """
        Finds all craigslist links for the specified search parameters
        It iterates through all pages of the search result.
        The number of iterations of search pages is determined by the total
        count in the page. For example, If there are 2500 results, at 100 results
        per page, that 25 pages. Thus we need to pull 25 pages.
        """
        current_url = self.search_url
        start_at_record = 0
        links = []
        while True:
            # Set the starting offset for the records returned
            current_url.query['s'] = [str(start_at_record)]
            # Get the next page of records after that offset
            templinks = self.extract_links_from_search_page(str(current_url))
            # Chain results with the latest set of records
            links = itertools.chain(links, templinks)
            # Increment counters and break if there are no more records left
            start_at_record += records_per_page
            if start_at_record >= max_records or not templinks:
                break
        return links
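
    # Example: with max_records=2500 and records_per_page=100, the loop
    # requests offsets s=0, 100, ..., 2400 (25 pages in total) and stops
    # early as soon as a page yields no links.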

    def extract_links_from_search_page(self, url):
        """
        Extract all links from a url that is built to return search results for
        a specific product
        """
        # Fetch the page and build a response object from its content
        page = requests.get(url)
        response = HtmlResponse(url=url, body=page.content)
        page.close()
        # Build selector from response
        selector = Selector(response=response)
        # Select all the rows returned on that page.
        rows = selector.css('p[class=row]')
        # Extract all the links in the rows
        links = [
            row.css('a[class="hdrlnk"]::attr(href)').extract_first()
            for row in rows
        ]
        # Expand relative links against the base URL, skipping rows that
        # had no link
        links = [self.base_url.joinurl(link) for link in links if link]
        return links
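
    # For example, a result row such as
    #   <p class="row"><a class="hdrlnk" href="/bik/1234.html">...</a></p>
    # yields the relative href "/bik/1234.html", which joinurl() then
    # expands against base_url into an absolute link.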

    def extract_info_from_post(self, post):
        """
        Determines if the specified craigslist link has contact info. This is
        done based of the fact that there is a hyperlink created in the html
        page with the class showcontact when a page is hiding user contact info
        We are not interested in the users number, we only want to make sure
        that the listing is associated with a phone number
        """

        # Fetch the post and build a response object from its content
        page = requests.get(post)
        response = HtmlResponse(url=post, body=page.content)
        page.close()
        # Build selector from response
        selector = Selector(response=response)
        # Extract the price from the link
        price = selector.css('span[class="price"]').xpath('text()').re(
            r'[$]\d+')
        # Build the result for this post
        info = {'link': post}
        # Attach the link that reveals the full contact info, if any
        info['contact_info_link'] = selector.css(
            'a[class="showcontact"]::attr(href)').extract_first()
        # Expand the relative link against the base URL
        info['contact_info_link'] = self.base_url.joinurl(
            info['contact_info_link']) if info['contact_info_link'] else None
        # Parse "$<amount>" into an integer, if a price was found
        info['price'] = int(price[0][1:]) if price else None
        return info
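
    # Example result for a $250 listing that hides its contact info:
    #   {'link': <post url>,
    #    'contact_info_link': <absolute showcontact url>,
    #    'price': 250}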

    def scrape_average_value(self):
        """
        Extract all the urls for posts that search url returns.
        """
        pool = ThreadPool(100)
        post_links = self.find_all_posts()
        posts = pool.map(self.extract_info_from_post, post_links)
        pool.close()
        pool.join()
        # Extract the price from each of the posts
        prices = [
            post['price'] for post in posts
            if post.get('price', None) and post.get('contact_info_link', None)
        ]
        # Get some statistics on the prices
        stddev = numpy.std(prices)
        average = numpy.average(prices)
        # Construct and return the response
        response = {}
        response['prices'] = sorted(prices)
        response['url'] = str(self.search_url)
        response['used'] = average - stddev
        response['good'] = average
        response['new'] = average + stddev
        response['stddev'] = stddev
        return response
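
Usage sketch: a minimal, hypothetical driver for the class above. The search
URL is an illustrative assumption, not taken from the project.

if __name__ == '__main__':
    # Hypothetical Craigslist search URL; substitute your own query.
    crawler = CraigsCrawler(
        'https://sfbay.craigslist.org/search/sss?query=road+bike')
    stats = crawler.scrape_average_value()
    # 'used', 'good', and 'new' band the average price at minus one
    # standard deviation, the mean, and plus one standard deviation.
    print(stats['used'], stats['good'], stats['new'])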