Example #1
class iTunesScraper(scrapy.Spider):
	name = 'iTunes Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("")  # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):

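		# Count the full, half and empty ("ghost") rating stars; this first block presumably covers
		# the current version's rating shown in the left stack.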
		fullStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star']"))
		halfStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star half']"))
		ghostStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star ghost']"))

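		# extract_first() presumably returns text like "123 Ratings"; the slice below drops the trailing
		# 8-character " Ratings" label, leaving just the number.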
		reviewCount = response.xpath("//div[@id='left-stack']/div[2]/div[2]/span[2]/text()").extract_first()
		reviewCount = reviewCount.strip()[:-8]

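		# The same star and review counts again, presumably for the "all versions" rating block
		# further down the left stack.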
		fullStarsAll = len(response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star']"))
		halfStarsAll = len(response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star half']"))
		ghostStarsAll = len(
			response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star ghost']"))

		reviewCountAll = response.xpath("//div[@id='left-stack']/div[2]/div[4]/span[1]/text()").extract_first()
		reviewCountAll = reviewCountAll.strip()[:-8]

		message = None
		if fullStars + halfStars + ghostStars != 5 or fullStarsAll + halfStarsAll + ghostStarsAll != 5:
			message = "Error scraping page, scraping skipped."

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: response.meta['start_url'],
			 FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[2]: fullStars + 0.5 * halfStars if not message else message,
			 FIELD_NAMES[3]: reviewCount if not message else None,
			 FIELD_NAMES[4]: fullStarsAll + 0.5 * halfStarsAll if not message else None,
			 FIELD_NAMES[5]: reviewCountAll if not message else None})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
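
CsvHelper and UrlHelper are project-specific helpers whose code is not shown in these examples. All of the spiders here assume that UrlHelper.make_requests_from_url returns a scrapy.Request carrying the original URL in response.meta['start_url'], and that process_urls_for_scrapy turns the rows of the input file into full URLs and seeds start_urls, url_map and urls_to_visit. A minimal sketch under those assumptions (not the project's actual helper):

import scrapy


class UrlHelper:
	"""Hypothetical sketch of the URL helper the spiders in these examples rely on."""

	def __init__(self, prefix):
		# Prefix (for example a site's base URL) prepended to each handle read from the input file.
		self.prefix = prefix

	@staticmethod
	def make_requests_from_url(url):
		# Keep the original URL in meta so parse() can read it back as response.meta['start_url'].
		return scrapy.Request(url, dont_filter=True, meta={'start_url': url})

	def process_urls_for_scrapy(self, rows, start_urls, url_map, urls_to_visit):
		# Turn each input row (a handle or a bare URL) into a URL to crawl.
		for row in rows:
			url = self.prefix + row
			if url_map is not None:
				url_map[url] = row  # remember which handle a URL came from
			if not start_urls:
				start_urls.append(url)  # Scrapy starts the crawl from the first URL...
			else:
				urls_to_visit.append(url)  # ...and parse() queues up the rest one by one
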
Example #2
class PinterestScraper(scrapy.Spider):
    name = "Pinterest Scraper"

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This dictionary holds the mapping of the URLs to Pinterest handles, which is used when populating the output file.
    url_map = {}

    # This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print(
                "\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__)
                + " -a input_file=<your input file> -a output_file=<your output file>\n"
            )
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(
            self.csv_helper.get_input_file_content(), self.start_urls, self.url_map, self.urls_to_visit
        )

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # This method parses each of the pages listed in urls_to_visit and extracts the follower
        # count from each of them.
        p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
        body = response.body_as_unicode().split("\n")

        followerCount = None
        for line in body:
            m = p.match(line)
            if m:
                followerCount = m.group(1)

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {
                FIELD_NAMES[0]: self.url_map[response.meta["start_url"]],
                FIELD_NAMES[1]: response.meta["start_url"],
                FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
                FIELD_NAMES[3]: followerCount,
            },
        )

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
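
The follower count is embedded in the old Pinterest page source as a "pinterestapp:followers" property, which the regular expression above picks out line by line. A quick standalone check of that pattern against a made-up line:

import re

p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
# Hypothetical line resembling what the spider scans for in the page source.
line = '{"pinterestapp:followers": "98765", "pinterestapp:following": "12"}'
m = p.match(line)
print(m.group(1))  # prints 98765
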
Example #3
class FacebookScraper(scrapy.Spider):
	name = 'Facebook Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This dictionary holds the mapping of the URLs to Facebook handles, which is used when populating the output file.
	url_map = {}

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper(PREFIX)
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, self.url_map, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# This is the method that performs the scraping. The XPath expression below extracts all HTML comments
		# from the Facebook page (the number of likes happens to live inside a comment)
		# and narrows them down to the div containing the like count.
		comment = response.xpath('//comment()').re(r'<div.*%s.*/div>' % LIKES_ELEMENT_NAME)

		# Convert the text in the comment to HTML DOM object.
		comment_sel = Selector(text=comment[0], type="html")

		# Use XPATH to extract the final text with the number of likes.
		likes_count = comment_sel.xpath(
			'//*[@id="%s"]/*/text()' % LIKES_ELEMENT_NAME).extract()[0].replace(',', '').strip()

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
			 FIELD_NAMES[1]: response.meta['start_url'],
			 FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[3]: likes_count})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
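
On the old Facebook page markup the like count sits inside an HTML comment, which is why the spider first pulls the comment text out with //comment() and only then parses it as HTML. A standalone illustration with a made-up comment body (the real element id is whatever LIKES_ELEMENT_NAME is set to):

from scrapy.selector import Selector

# Hypothetical comment body; "PagesLikesCountDOMID" stands in for LIKES_ELEMENT_NAME.
comment_body = '<div id="PagesLikesCountDOMID"><span>12,345</span></div>'
sel = Selector(text=comment_body, type="html")
likes = sel.xpath('//*[@id="PagesLikesCountDOMID"]/*/text()').extract()[0].replace(',', '').strip()
print(likes)  # prints 12345
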
class ChromeScraper(scrapy.Spider):
	name = 'Chrome Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("") # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# This method parses each of the pages listed in urls_to_visit and extracts the user
		# count from each of them.
		p = re.compile(r'.*name\s*=\s*"user_count"\s*>\s*(\d+)\s*<')
		body = response.body_as_unicode().split('\n')

		userCount = None
		for line in body:
			m = p.match(line)
			if m:
				userCount = m.group(1)

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: response.meta['start_url'],
			 FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[2]: userCount})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
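
CsvHelper is also project-specific. Judging from how the spiders above use it, it presumably validates the input_file/output_file arguments (setting stop when either is missing), exposes the input rows via get_input_file_content(), and wraps csv.DictWriter for write_row_to_output_file(). A rough sketch under those assumptions (not the project's actual helper):

import csv


class CsvHelper:
	"""Hypothetical sketch of the CSV helper used by the spiders in these examples."""

	def __init__(self, field_names, input_file=None, output_file=None):
		# Tell the spider to abort when either file name is missing.
		self.stop = not (input_file and output_file)
		if self.stop:
			return
		self.field_names = field_names
		self.input_file = input_file
		self.output_file = output_file
		self._header_written = False

	def get_input_file_content(self):
		# One entry per non-empty line of the input file.
		with open(self.input_file) as f:
			return [line.strip() for line in f if line.strip()]

	def write_row_to_output_file(self, field_names, row):
		# Append a single row, writing the header the first time around.
		with open(self.output_file, 'a', newline='') as f:
			writer = csv.DictWriter(f, fieldnames=field_names)
			if not self._header_written:
				writer.writeheader()
				self._header_written = True
			writer.writerow(row)
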
class TwitterScraper(scrapy.Spider):
	name = 'Twitter Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This dictionary holds the mapping of the URLs to Twitter handles, which is used when populating the output file.
	url_map = {}

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.urlHelper = UrlHelper(PREFIX)
		self.urlHelper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															self.start_urls, self.url_map, self.urls_to_visit)

	# Here we override make_requests_from_url to use the version from UrlHelper instead of the one in
	# scrapy.Spider.
	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# This is the method that performs the scraping. The XPath expression below extracts the follower
		# count from the element whose data-nav attribute equals "followers".
		followers_count = response.xpath('//*[@data-nav="followers"]/@title').re(r"[\d,]*")[0].replace(',', '')

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
			 FIELD_NAMES[1]: response.meta['start_url'],
			 FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[3]: followers_count})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
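
To see what the XPath-plus-regex chain in TwitterScraper.parse does, here is a standalone check against a made-up fragment of the old Twitter profile markup (real pages are far more complex):

from scrapy.selector import Selector

# Hypothetical element resembling the one the spider targets.
sel = Selector(text='<a data-nav="followers" title="1,234,567 Followers">Followers</a>')
followers = sel.xpath('//*[@data-nav="followers"]/@title').re(r"[\d,]*")[0].replace(',', '')
print(followers)  # prints 1234567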