Example #1
class iTunesScraper(scrapy.Spider):
	name = 'iTunes Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("")  # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):

		# Count the rating stars shown for the current version: full, half and "ghost" (empty) stars.
		fullStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star']"))
		halfStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star half']"))
		ghostStars = len(response.xpath("//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star ghost']"))

		# Extract the current-version review count and keep only the number by dropping the trailing label.
		reviewCount = response.xpath("//div[@id='left-stack']/div[2]/div[2]/span[2]/text()").extract_first()
		reviewCount = reviewCount.strip()[:-8]

		# Repeat the same extraction for the "all versions" rating block.
		fullStarsAll = len(response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star']"))
		halfStarsAll = len(response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star half']"))
		ghostStarsAll = len(
			response.xpath("//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star ghost']"))

		reviewCountAll = response.xpath("//div[@id='left-stack']/div[2]/div[4]/span[1]/text()").extract_first()
		reviewCountAll = reviewCountAll.strip()[:-8]

		# A valid rating block always shows exactly five stars; anything else means the page layout
		# changed or the extraction failed, so record an error message instead of a rating.
		message = None
		if fullStars + halfStars + ghostStars != 5 or fullStarsAll + halfStarsAll + ghostStarsAll != 5:
			message = "Error scraping page, scraping skipped."

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: response.meta['start_url'],
			 FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[2]: fullStars + 0.5 * halfStars if not message else message,
			 FIELD_NAMES[3]: reviewCount if not message else None,
			 FIELD_NAMES[4]: fullStarsAll + 0.5 * halfStarsAll if not message else None,
			 FIELD_NAMES[5]: reviewCountAll if not message else None})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
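
The spiders in these examples all hand request creation over to UrlHelper.make_requests_from_url and later read response.meta['start_url'] inside parse(). The UrlHelper class itself is not shown here; a minimal sketch of what that one method might look like, assuming it only needs to produce a Request that remembers the URL it started from, could be:

import scrapy

class UrlHelper(object):
	# Hypothetical sketch: build a Request that carries its originating URL in meta,
	# so that parse() can later retrieve it via response.meta['start_url'].
	@staticmethod
	def make_requests_from_url(url):
		return scrapy.Request(url, dont_filter=True, meta={'start_url': url})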
Example #2
class PinterestScraper(scrapy.Spider):
    name = "Pinterest Scraper"

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This dictionary holds the mapping of the URLs to Pinterest handles, which is used when populating the output file.
    url_map = {}

    # This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print(
                "\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__)
                + " -a input_file=<your input file> -a output_file=<your output file>\n"
            )
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(
            self.csv_helper.get_input_file_content(), self.start_urls, self.url_map, self.urls_to_visit
        )

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # This method parses each of the pages found under the urls_to_visit and extracts the number
        # of followers from each of them
        p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
        body = response.body_as_unicode().split("\n")

        followerCount = None
        for line in body:
            m = p.match(line)
            if m:
                followerCount = m.group(1)

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {
                FIELD_NAMES[0]: self.url_map[response.meta["start_url"]],
                FIELD_NAMES[1]: response.meta["start_url"],
                FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
                FIELD_NAMES[3]: followerCount,
            },
        )

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
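
Rather than using XPath, the Pinterest spider scans each line of the raw page body with a regular expression and captures the digits that follow the "pinterestapp:followers" key. Against a hypothetical metadata line (not captured from a live page), the pattern behaves like this:

import re

p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
sample = '"pinterestapp:followers": "12345",'  # made-up metadata line of the expected shape
m = p.match(sample)
print(m.group(1) if m else None)  # prints 12345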
Example #3
class FacebookScraper(scrapy.Spider):
	name = 'Facebook Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This dictionary holds the mapping of the URLs to Facebook handles, which is used when populating the output file.
	url_map = {}

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper(PREFIX)
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, self.url_map, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# This method performs the scraping. The xpath expression below extracts all HTML comments from
		# the Facebook page (it just so happens that the number of likes lives inside a comment) and
		# narrows them down to the div containing the number of likes.
		comment = response.xpath('//comment()').re(r'<div.*%s.*/div>' % LIKES_ELEMENT_NAME)

		# Convert the text in the comment to HTML DOM object.
		comment_sel = Selector(text=comment[0], type="html")

		# Use XPATH to extract the final text with the number of likes.
		likes_count = comment_sel.xpath('//*[@id="%s"]/*/text()' % LIKES_ELEMENT_NAME).extract()[0]
		likes_count = likes_count.replace(',', '').strip()

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
			 FIELD_NAMES[1]: response.meta['start_url'],
			 FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[3]: likes_count})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
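
Facebook serves the likes block inside an HTML comment, so the spider first pulls the raw comment text out with the //comment() expression and then re-parses that text with a fresh Selector before querying it. As a small illustration of that second step (the markup and the "likes-count" id are made-up stand-ins for whatever LIKES_ELEMENT_NAME actually refers to):

from scrapy.selector import Selector

fragment = '<div id="likes-count"><span>1,234</span></div>'  # hypothetical comment body
sel = Selector(text=fragment, type="html")
print(sel.xpath('//*[@id="likes-count"]/*/text()').extract()[0].replace(',', ''))  # prints 1234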
Example #4
    def optimize(self):
        # Merge routes that share the same path, variables and controller into a single entry by
        # combining their per-method dictionaries; conflicting definitions raise RoutingDefinitionException.
        routes = []
        for route in self:
            routes.append(route)
        i = 0
        to_remove = []
        to_insert = {}
        for route in routes:
            j = 0
            for other_route in routes:
                if i != j and i not in to_remove and j not in to_remove:
                    if route[0] == other_route[0] and\
                       route[1] == other_route[1] and\
                       route[2] == other_route[2]:
                        first_dict = route[3]
                        second_dict = other_route[3]
                        if i in to_insert:
                            first_dict = to_insert[i][3]
                        if j in to_insert:
                            second_dict = to_insert[j][3]
                        merge = dict(first_dict)
                        merge.update(second_dict)
                        if len(merge) == len(first_dict) + len(second_dict):
                            to_remove.append(j)
                            to_insert[i] = (route[0], route[1], route[2], merge, route[4])
                            if j in to_insert:
                                del to_insert[j]
                        else:
                            raise RoutingDefinitionException('Method defined twice')
                    elif route[0] == other_route[0] and\
                       route[1] == other_route[1] and\
                       route[2] != other_route[2]:
                        raise RoutingDefinitionException('Same routes with different controllers')
                    elif route[0] == other_route[0] and\
                       route[1] != other_route[1]:
                        raise RoutingDefinitionException('Same routes with different variables')
                j += 1
            i += 1
        for insert in to_insert:
            routes[insert] = to_insert[insert]
        # Delete merged-away routes from the highest index down so that earlier deletions
        # do not shift the positions of entries that still have to be removed.
        for remove in sorted(to_remove, reverse=True):
            del routes[remove]
        helper = UrlHelper()

        for route in routes:
            if route[4] is not None:
                helper.append(route[4], route[0])
        return (routes, helper)
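
The duplicate-method check in optimize() relies on a simple property of dict merging: when two routes define disjoint sets of HTTP methods, the merged dictionary has exactly as many entries as the two inputs combined, while any overlap makes it smaller. In isolation (the handler names below are placeholders):

first = {'GET': 'list_items'}     # placeholder handlers
second = {'POST': 'create_item'}

merged = dict(first)
merged.update(second)
# Disjoint methods: the lengths add up, so the two routes can be merged into one entry.
assert len(merged) == len(first) + len(second)

overlap = {'GET': 'other_handler'}
clash = dict(first)
clash.update(overlap)
# Overlapping methods: the merged dict is smaller than the sum, which optimize() treats
# as "Method defined twice" and reports with a RoutingDefinitionException.
assert len(clash) < len(first) + len(overlap)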
Example #5
class ChromeScraper(scrapy.Spider):
	name = 'Chrome Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("") # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)

	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# This method parses each of the pages found under the urls_to_visit and extracts the number
		# of users from each of them
		p = re.compile(r'.*name\s*=\s*"user_count"\s*>\s*(\d+)\s*<')
		body = response.body_as_unicode().split('\n')

		userCount = None
		for line in body:
			m = p.match(line)
			if m:
				userCount = m.group(1)

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: response.meta['start_url'],
			 FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[2]: userCount})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
Example #6
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.url_helper = UrlHelper("")  # no prefix
		self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															 self.start_urls, None, self.urls_to_visit)
Example #7
class TwitterScraper(scrapy.Spider):
	name = 'Twitter Scraper'

	# This variable is used by Scrapy to begin crawling.
	start_urls = []

	# This dictionary holds the mapping of the URLs to Twitter handles, which is used when populating the output file.
	url_map = {}

	# This list will contain all the URLs to visit and will pass them on to Scrapy in order, one by one.
	urls_to_visit = []

	# This method is the constructor of the spider-scraper. It takes in the names of the input and output files
	# and performs some pre-processing.
	def __init__(self, input_file=None, output_file=None):
		self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
		if self.csv_helper.stop:
			print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) + \
					" -a input_file=<your input file> -a output_file=<your output file>\n")
			return
		self.urlHelper = UrlHelper(PREFIX)
		self.urlHelper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
															self.start_urls, self.url_map, self.urls_to_visit)

	# Here we override the method make_requests_from_url to use the one from the UrlHelper instead of the one in
	# scrapy.Spider
	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)

	def parse(self, response):
		# Here we're in the method that performs the scraping. Below an xpath expression extracts the
		# number of followers from the element with attribute data-nav equal to "followers"
		followers_count = response.xpath('//*[@data-nav="followers"]/@title').re(r"[\d,]*")[0].replace(',', '')

		self.csv_helper.write_row_to_output_file(
			FIELD_NAMES,
			{FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
			 FIELD_NAMES[1]: response.meta['start_url'],
			 FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
			 FIELD_NAMES[3]: followers_count})

		# If there are still URLs to process, then yield more crawling.
		if self.urls_to_visit:
			yield self.make_requests_from_url(self.urls_to_visit.pop(0))
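
The follower count here comes from the title attribute of the element whose data-nav attribute equals "followers"; the chained .re() call then pulls out the leading run of digits and commas. The snippet below replays that xpath-plus-regex combination on a made-up piece of markup:

from scrapy.selector import Selector

html = '<a data-nav="followers" title="1,234 Followers">Followers</a>'  # hypothetical markup
sel = Selector(text=html)
print(sel.xpath('//*[@data-nav="followers"]/@title').re(r"[\d,]*")[0].replace(',', ''))  # prints 1234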
Example #8
	def make_requests_from_url(self, url):
		return UrlHelper.make_requests_from_url(url)
# Create CsvHelper which will aid in processing the CSV files
csv_helper = CsvHelper(FIELD_NAMES, sys.argv[1], sys.argv[2])
if csv_helper.stop:
	print_usage()
	exit(0)

# Take the system's current time as the number of seconds since the 'epoch', subtract the number of
# seconds in 24 hours, and use the result as the earliest time from which tweets should be counted.
current_time = time.mktime(time.localtime())
time_24_hours_ago = current_time - constants.SECONDS_IN_24H_COUNT
time_query_parameter = "&mintime=" + repr(int(time_24_hours_ago))
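# For example, with a current epoch time of 1,700,000,000 (a made-up value) and assuming
# constants.SECONDS_IN_24H_COUNT is 86400, this yields time_24_hours_ago = 1699913600 and
# the query parameter "&mintime=1699913600".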

# Create the UrlHelper which will aid in processing URLs
url_helper = UrlHelper(
	'http://otter.topsy.com/searchcount.js?dynamic=1&count_method=citation&' +
	'apikey=09C43A9B270A470B8EB8F2946A9369F3&%s&q=' % time_query_parameter)
url_helper.process_urls(csv_helper.get_input_file_content(), url_map, urls_to_visit)

print("Scraping number of tweets for phrases..")

# For each URL in the list of URLs to visit, connect to Topsy and fetch a response containing JSON
# with the tweet counts for the past periods. From it, extract the number of tweets in the past day
# and write a line to the output CSV file.
for url in urls_to_visit:
	topsy_response = urllib2.urlopen(url).read()
	json_response = re.search(r"\{.*\}", topsy_response).group(0)
	tweets_count = json.loads(json_response)['response']['windows']['d']['total']

	csv_helper.write_row_to_output_file(
		FIELD_NAMES,