import itertools
from multiprocessing.pool import ThreadPool

import numpy
import requests
from scrapy.http import HtmlResponse
from scrapy.selector import Selector

# Pagination limits. The values mirror the example in find_all_posts
# (2500 total results at 100 results per page); adjust if craigslist
# changes its paging.
MAX_RESULTS = 2500
RESULTS_PER_PAGE = 100
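# NOTE: `MutableURL` is used below but not defined in this module. The class
# here is a minimal sketch of the interface the crawler relies on -- a
# baseurl() accessor, a mutable `query` dict of value lists, joinurl(), and
# str() -- assuming a thin wrapper over urllib.parse. Swap in the project's
# real implementation if one exists.
from urllib import parse


class MutableURL(object):
    """Hypothetical minimal URL wrapper with a mutable query string."""

    def __init__(self, url):
        self._parts = parse.urlsplit(url)
        # parse_qs yields {name: [values]}, matching query['s'] = ['100']
        self.query = parse.parse_qs(self._parts.query)

    def baseurl(self):
        """Return scheme://netloc with no path, query, or fragment."""
        return parse.urlunsplit(
            (self._parts.scheme, self._parts.netloc, '', '', ''))

    def joinurl(self, link):
        """Resolve a (possibly relative) link against this URL."""
        return parse.urljoin(str(self), link)

    def __str__(self):
        query = parse.urlencode(self.query, doseq=True)
        return parse.urlunsplit(self._parts._replace(query=query))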
class CraigsCrawler(object):

    def __init__(self, search_url):
        self.base_url = MutableURL(MutableURL(search_url).baseurl())
        self.search_url = MutableURL(search_url)

    def find_all_posts(self, max_records=MAX_RESULTS,
                       records_per_page=RESULTS_PER_PAGE):
        """
        Find all craigslist links for the specified search parameters.

        Iterates through all pages of the search result. The number of
        search pages to pull is determined by the total count in the page.
        For example, if there are 2500 results at 100 results per page,
        that's 25 pages, so we need to pull 25 pages.
        """
        current_url = self.search_url
        start_at_record = 0
        links = []
        while True:
            # Set the starting point for the records returned
            current_url.query['s'] = [str(start_at_record)]
            # Get the page of records after the starting point
            templinks = self.extract_links_from_search_page(str(current_url))
            # Chain results with the latest set of records
            links = itertools.chain(links, templinks)
            # Advance the cursor; stop once the cap is reached or a page
            # comes back empty
            start_at_record += records_per_page
            if start_at_record >= max_records or not templinks:
                break
        return links

    def extract_links_from_search_page(self, url):
        """
        Extract all post links from a url that is built to return search
        results for a specific product.
        """
        # Build a response object from the url content
        body = requests.get(url)
        response = HtmlResponse(url=url, body=body.content)
        body.connection.close()
        # Build a selector from the response
        selector = Selector(response=response)
        # Select all the result rows returned on the page
        rows = selector.css('p[class=row]')
        # Extract the link from each row (extract_first() may return None)
        links = [
            row.css('a[class="hdrlnk"]::attr(href)').extract_first()
            for row in rows
        ]
        # Expand relative links against the base url, skipping empty ones
        return [self.base_url.joinurl(link) for link in links if link]

    def extract_info_from_post(self, post):
        """
        Determine whether the specified craigslist link has contact info.

        This is based on the fact that a hyperlink with the class
        "showcontact" is created in the html page when a page is hiding
        user contact info. We are not interested in the user's number; we
        only want to make sure the listing is associated with a phone
        number.
        """
        # Get the link content & build a response object from it
        body = requests.get(post)
        response = HtmlResponse(url=post, body=body.content)
        body.connection.close()
        # Build a selector from the response
        selector = Selector(response=response)
        # Extract the price from the page, e.g. "$250"
        price = selector.css('span[class="price"]').xpath('text()').re(
            r'[$]\d+')
        # Create the response
        post = {'link': post}
        # Attach the link that contains the full contact info of the page
        post['contact_info_link'] = selector.css(
            'a[class="showcontact"]::attr(href)').extract_first()
        # Expand the link
        post['contact_info_link'] = self.base_url.joinurl(
            post['contact_info_link']) if post['contact_info_link'] else None
        # Strip the leading "$" and convert to an integer
        post['price'] = int(price[0][1:]) if price else None
        return post

    def scrape_average_value(self):
        """
        Extract all the urls for posts that the search url returns, scrape
        each post concurrently, and compute summary statistics over the
        listing prices.
""" pool = ThreadPool(100) post_links = self.find_all_posts() posts = pool.map(self.extract_info_from_post, post_links) pool.close() pool.join() # Extract the price from each of the posts prices = [ post['price'] for post in posts if post.get('price', None) and post.get('contact_info_link', None) ] # Get some statistics on the prices stddev = numpy.std(prices) average = numpy.average(prices) # Construct and return the response response = {} # response['urls'] = posts response['prices'] = sorted(prices) response['url'] = str(self.search_url) response['used'] = average - stddev response['good'] = average response['new'] = average + stddev response['stddev'] = stddev return response