class iTunesScraper(scrapy.Spider):
    """Spider that scrapes an iTunes app page's star ratings and review counts
    (both "current version" and "all versions") and writes one CSV row per
    visited URL via CsvHelper.
    """

    name = 'iTunes Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None, *args, **kwargs):
        """Validate input/output files and queue the URLs to crawl.

        :param input_file: CSV file containing the iTunes URLs to scrape.
        :param output_file: CSV file the scraped rows are written to.
        """
        # Let scrapy.Spider complete its own setup (the original skipped this).
        super(iTunesScraper, self).__init__(*args, **kwargs)
        # Shadow the mutable class-level containers with per-instance copies so
        # two spider instances can never share (and corrupt) the same lists.
        self.start_urls = []
        self.urls_to_visit = []
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper("")  # no prefix
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, None, self.urls_to_visit)

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper instead of scrapy.Spider's default implementation.
        return UrlHelper.make_requests_from_url(url)

    @staticmethod
    def _count_stars(response, section, star_class):
        """Count star <span> elements of *star_class* in ratings block *section*
        (2 = current version, 4 = all versions) of the left-stack sidebar."""
        return len(response.xpath(
            "//div[@id='left-stack']/div[2]/div[%d]/div[1]/span[@class='%s']" % (section, star_class)))

    @staticmethod
    def _clean_review_count(raw):
        """Strip a review-count text node down to the number.

        The [:-8] slice drops a fixed-width trailing suffix — presumably
        " Ratings"; TODO confirm against a live page. Returns None when the
        element was not found (the original raised AttributeError on None).
        """
        return raw.strip()[:-8] if raw else None

    def parse(self, response):
        """Extract rating data from one app page and write a CSV row."""
        fullStars = self._count_stars(response, 2, 'rating-star')
        halfStars = self._count_stars(response, 2, 'rating-star half')
        ghostStars = self._count_stars(response, 2, 'rating-star ghost')
        reviewCount = self._clean_review_count(
            response.xpath("//div[@id='left-stack']/div[2]/div[2]/span[2]/text()").extract_first())
        fullStarsAll = self._count_stars(response, 4, 'rating-star')
        halfStarsAll = self._count_stars(response, 4, 'rating-star half')
        ghostStarsAll = self._count_stars(response, 4, 'rating-star ghost')
        reviewCountAll = self._clean_review_count(
            response.xpath("//div[@id='left-stack']/div[2]/div[4]/span[1]/text()").extract_first())
        message = None
        # A well-formed ratings widget renders exactly five stars per block;
        # any other total means the page layout changed and the data is unsafe.
        if fullStars + halfStars + ghostStars != 5 or fullStarsAll + halfStarsAll + ghostStarsAll != 5:
            message = "Error scraping page, scraping skipped."
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: response.meta['start_url'],
             FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[2]: fullStars + 0.5 * halfStars if not message else message,
             FIELD_NAMES[3]: reviewCount if not message else None,
             FIELD_NAMES[4]: fullStarsAll + 0.5 * halfStarsAll if not message else None,
             FIELD_NAMES[5]: reviewCountAll if not message else None})
        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
class PinterestScraper(scrapy.Spider):
    """Spider that scrapes a Pinterest profile page for its follower count
    and writes one CSV row per visited URL via CsvHelper.
    """

    name = "Pinterest Scraper"
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # This dictionary holds the mapping of the URLs to Pinterest handles, which is used when populating the output file.
    url_map = {}
    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # Matches the follower count embedded in the page source, e.g.
    # "pinterestapp:followers": "1234".  Raw string so \s and \d are
    # unambiguous regex escapes; compiled once instead of per response.
    _FOLLOWERS_RE = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')

    def __init__(self, input_file=None, output_file=None, *args, **kwargs):
        """Validate input/output files and queue the URLs to crawl.

        :param input_file: CSV file containing the Pinterest handles/URLs.
        :param output_file: CSV file the scraped rows are written to.
        """
        # Let scrapy.Spider complete its own setup (the original skipped this).
        super(PinterestScraper, self).__init__(*args, **kwargs)
        # Per-instance copies of the mutable class-level containers so two
        # spider instances never share state.
        self.start_urls = []
        self.url_map = {}
        self.urls_to_visit = []
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print(
                "\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                " -a input_file=<your input file> -a output_file=<your output file>\n"
            )
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(
            self.csv_helper.get_input_file_content(), self.start_urls, self.url_map, self.urls_to_visit
        )

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper instead of scrapy.Spider's default implementation.
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Extract the follower count from one profile page and write a CSV row.

        Scans the page line by line; if several lines match, the last match
        wins (preserved from the original implementation).
        """
        # NOTE(review): body_as_unicode() is deprecated in newer Scrapy in
        # favour of response.text — kept for compatibility with the pinned
        # Scrapy version; confirm before migrating.
        body = response.body_as_unicode().split("\n")
        followerCount = None
        for line in body:
            m = self._FOLLOWERS_RE.match(line)
            if m:
                followerCount = m.group(1)
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {
                FIELD_NAMES[0]: self.url_map[response.meta["start_url"]],
                FIELD_NAMES[1]: response.meta["start_url"],
                FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
                FIELD_NAMES[3]: followerCount,
            },
        )
        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
class FacebookScraper(scrapy.Spider):
    """Spider that scrapes a Facebook page for its like count and writes one
    CSV row per visited URL via CsvHelper.
    """

    name = 'Facebook Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # This dictionary holds the mapping of the URLs to Facebook handles, which is used when populating the output file.
    url_map = {}
    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None, *args, **kwargs):
        """Validate input/output files and queue the URLs to crawl.

        :param input_file: CSV file containing the Facebook handles/URLs.
        :param output_file: CSV file the scraped rows are written to.
        """
        # Let scrapy.Spider complete its own setup (the original skipped this).
        super(FacebookScraper, self).__init__(*args, **kwargs)
        # Per-instance copies of the mutable class-level containers so two
        # spider instances never share state.
        self.start_urls = []
        self.url_map = {}
        self.urls_to_visit = []
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, self.url_map, self.urls_to_visit)

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper instead of scrapy.Spider's default implementation.
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Extract the like count from one Facebook page and write a CSV row."""
        # The number of likes happens to live inside an HTML comment: extract
        # all comments and narrow down to the div carrying LIKES_ELEMENT_NAME.
        comment = response.xpath('//comment()').re(r'<div.*%s.*/div>' % LIKES_ELEMENT_NAME)
        # NOTE(review): comment[0] raises IndexError when the pattern is not
        # found — kept as-is so a page-layout change fails loudly; confirm
        # that is the intended failure mode.
        # Convert the text in the comment to an HTML DOM object.
        comment_sel = Selector(text=comment[0], type="html")
        # Use XPath to extract the final text with the number of likes,
        # dropping thousands separators and surrounding whitespace.
        likes_count = (comment_sel.xpath('//*[@id="%s"]/*/text()'
                                         % LIKES_ELEMENT_NAME).extract()[0]).replace(',', '').strip()
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
             FIELD_NAMES[1]: response.meta['start_url'],
             FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[3]: likes_count})
        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
class ChromeScraper(scrapy.Spider):
    """Spider that scrapes a Chrome Web Store extension page for its user
    count and writes one CSV row per visited URL via CsvHelper.
    """

    name = 'Chrome Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # Matches the user count in the page markup, e.g. name="user_count">1234<.
    # Raw string so \s and \d are unambiguous regex escapes; compiled once
    # instead of per response.
    _USER_COUNT_RE = re.compile(r'.*name\s*=\s*"user_count"\s*>\s*(\d+)\s*<')

    def __init__(self, input_file=None, output_file=None, *args, **kwargs):
        """Validate input/output files and queue the URLs to crawl.

        :param input_file: CSV file containing the extension URLs to scrape.
        :param output_file: CSV file the scraped rows are written to.
        """
        # Let scrapy.Spider complete its own setup (the original skipped this).
        super(ChromeScraper, self).__init__(*args, **kwargs)
        # Per-instance copies of the mutable class-level containers so two
        # spider instances never share state.
        self.start_urls = []
        self.urls_to_visit = []
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper("")  # no prefix
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, None, self.urls_to_visit)

    def make_requests_from_url(self, url):
        # Delegate to UrlHelper instead of scrapy.Spider's default implementation.
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Extract the user count from one extension page and write a CSV row.

        Scans the page line by line; if several lines match, the last match
        wins (preserved from the original implementation).
        """
        body = response.body_as_unicode().split('\n')
        userCount = None
        for line in body:
            m = self._USER_COUNT_RE.match(line)
            if m:
                userCount = m.group(1)
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: response.meta['start_url'],
             FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[2]: userCount})
        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
class TwitterScraper(scrapy.Spider):
    """Spider that scrapes a Twitter profile page for its follower count and
    writes one CSV row per visited URL via CsvHelper.
    """

    name = 'Twitter Scraper'
    # This variable is used by Scrapy to begin crawling.
    start_urls = []
    # This dictionary holds the mapping of the URLs to Twitter handles, which is used when populating the output file.
    url_map = {}
    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    def __init__(self, input_file=None, output_file=None, *args, **kwargs):
        """Validate input/output files and queue the URLs to crawl.

        :param input_file: CSV file containing the Twitter handles/URLs.
        :param output_file: CSV file the scraped rows are written to.
        """
        # Let scrapy.Spider complete its own setup (the original skipped this).
        super(TwitterScraper, self).__init__(*args, **kwargs)
        # Per-instance copies of the mutable class-level containers so two
        # spider instances never share state.
        self.start_urls = []
        self.url_map = {}
        self.urls_to_visit = []
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        # Renamed from the original 'urlHelper' to match the url_helper naming
        # used by every sibling spider in this project (internal attribute only).
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, self.url_map, self.urls_to_visit)

    # Here we override the method make_requests_from_url to use the one from
    # the UrlHelper instead of the one in scrapy.Spider.
    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        """Extract the follower count from one profile page and write a CSV row."""
        # The element whose data-nav attribute equals "followers" carries the
        # count in its title; drop the thousands separators.  Raw string so
        # \d is an unambiguous regex escape.
        followers_count = response.xpath('//*[@data-nav="followers"]/@title').re(r"[\d,]*")[0].replace(',', '')
        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
             FIELD_NAMES[1]: response.meta['start_url'],
             FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[3]: followers_count})
        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))