class iTunesScraper(scrapy.Spider):
    name = 'iTunes Scraper'

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper("")  # no prefix
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, None, self.urls_to_visit)

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # Star counts (full, half and "ghost") plus the review count for the current version of the app.
        fullStars = len(response.xpath(
            "//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star']"))
        halfStars = len(response.xpath(
            "//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star half']"))
        ghostStars = len(response.xpath(
            "//div[@id='left-stack']/div[2]/div[2]/div[1]/span[@class='rating-star ghost']"))
        reviewCount = response.xpath("//div[@id='left-stack']/div[2]/div[2]/span[2]/text()").extract_first()
        reviewCount = reviewCount.strip()[:-8]

        # The same counts, aggregated over all versions of the app.
        fullStarsAll = len(response.xpath(
            "//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star']"))
        halfStarsAll = len(response.xpath(
            "//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star half']"))
        ghostStarsAll = len(response.xpath(
            "//div[@id='left-stack']/div[2]/div[4]/div[1]/span[@class='rating-star ghost']"))
        reviewCountAll = response.xpath("//div[@id='left-stack']/div[2]/div[4]/span[1]/text()").extract_first()
        reviewCountAll = reviewCountAll.strip()[:-8]

        # A correctly scraped rating always consists of exactly five stars; anything else means the markup changed.
        message = None
        if fullStars + halfStars + ghostStars != 5 or fullStarsAll + halfStarsAll + ghostStarsAll != 5:
            message = "Error scraping page, scraping skipped."

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: response.meta['start_url'],
             FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[2]: fullStars + 0.5 * halfStars if not message else message,
             FIELD_NAMES[3]: reviewCount if not message else None,
             FIELD_NAMES[4]: fullStarsAll + 0.5 * halfStarsAll if not message else None,
             FIELD_NAMES[5]: reviewCountAll if not message else None})

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
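The usage string in the constructor shows the intended command-line invocation. The same spider can also be started programmatically; a minimal sketch using Scrapy's CrawlerProcess (the file names here are placeholders):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
# Keyword arguments are passed to the spider's __init__, mirroring the -a flags.
process.crawl(iTunesScraper, input_file='apps.csv', output_file='ratings.csv')
process.start()  # blocks until crawling finishes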
class PinterestScraper(scrapy.Spider):
    name = "Pinterest Scraper"

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This dictionary holds the mapping of the URLs to Pinterest handles, which is used when populating the output file.
    url_map = {}

    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print(
                "\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__)
                + " -a input_file=<your input file> -a output_file=<your output file>\n"
            )
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(
            self.csv_helper.get_input_file_content(), self.start_urls, self.url_map, self.urls_to_visit
        )

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # This method parses each of the pages found under the urls_to_visit and extracts the number
        # of followers from each of them.
        p = re.compile(r'.*"pinterestapp:followers"\s*:\s*"(\d+)"')
        body = response.body_as_unicode().split("\n")
        followerCount = None
        for line in body:
            m = p.match(line)
            if m:
                followerCount = m.group(1)

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {
                FIELD_NAMES[0]: self.url_map[response.meta["start_url"]],
                FIELD_NAMES[1]: response.meta["start_url"],
                FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
                FIELD_NAMES[3]: followerCount,
            },
        )

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
class FacebookScraper(scrapy.Spider):
    name = 'Facebook Scraper'

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This dictionary holds the mapping of the URLs to Facebook handles, which is used when populating the output file.
    url_map = {}

    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper(PREFIX)
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, self.url_map, self.urls_to_visit)

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # Here we're in the method that performs the scraping. Below an xpath expression extracts all HTML comments
        # (it just so happens that the number of likes is in a comment)
        # from the Facebook page and narrows it down to the div containing the number of likes.
        comment = response.xpath('//comment()').re(r'<div.*%s.*/div>' % LIKES_ELEMENT_NAME)

        # Convert the text in the comment to an HTML DOM object.
        comment_sel = Selector(text=comment[0], type="html")

        # Use XPath to extract the final text with the number of likes.
        likes_count = (comment_sel.xpath('//*[@id="%s"]/*/text()'
                                         % LIKES_ELEMENT_NAME).extract()[0]).replace(',', '').strip()

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
             FIELD_NAMES[1]: response.meta['start_url'],
             FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[3]: likes_count})

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
def optimize(self):
    routes = []
    for route in self:
        routes.append(route)

    i = 0
    to_remove = []
    to_insert = {}
    for route in routes:
        j = 0
        for other_route in routes:
            if i != j and i not in to_remove and j not in to_remove:
                if (route[0] == other_route[0] and
                        route[1] == other_route[1] and
                        route[2] == other_route[2]):
                    first_dict = route[3]
                    second_dict = other_route[3]
                    if i in to_insert:
                        first_dict = to_insert[i][3]
                    if j in to_insert:
                        second_dict = to_insert[j][3]
                    merge = dict(first_dict)
                    merge.update(second_dict)
                    if len(merge) == len(first_dict) + len(second_dict):
                        to_remove.append(j)
                        to_insert[i] = (route[0], route[1], route[2], merge, route[4])
                        if j in to_insert:
                            del to_insert[j]
                    else:
                        raise RoutingDefinitionException('Method defined twice')
                elif (route[0] == other_route[0] and
                        route[1] == other_route[1] and
                        route[2] != other_route[2]):
                    raise RoutingDefinitionException('Same routes with different controllers')
                elif (route[0] == other_route[0] and
                        route[1] != other_route[1]):
                    raise RoutingDefinitionException('Same routes with different variables')
            j += 1
        i += 1

    for insert in to_insert:
        routes[insert] = to_insert[insert]

    # Delete in descending index order so earlier indices stay valid while the list shrinks
    # (removing by value could also drop a merely equal route at a different position).
    for remove in sorted(to_remove, reverse=True):
        del routes[remove]

    helper = UrlHelper()
    for route in routes:
        if route[4] is not None:
            helper.append(route[4], route[0])

    return (routes, helper)
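To make the merge condition concrete, here is a small illustration with hypothetical route tuples; the layout (pattern, variables, controller, method map, name) is inferred from how optimize() indexes them:

# Two definitions of the same pattern/variables/controller with disjoint HTTP methods...
a = ('/items', (), 'ItemController', {'GET': 'index'}, 'items')
b = ('/items', (), 'ItemController', {'POST': 'create'}, None)

merged = dict(a[3])
merged.update(b[3])
# ...merge cleanly: the length check proves no method key was overwritten.
assert len(merged) == len(a[3]) + len(b[3])

# Had both tuples defined 'GET', the merged dict would be shorter than the two inputs
# combined and optimize() would raise RoutingDefinitionException('Method defined twice').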
class ChromeScraper(scrapy.Spider):
    name = 'Chrome Scraper'

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.url_helper = UrlHelper("")  # no prefix
        self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                                self.start_urls, None, self.urls_to_visit)

    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # This method parses each of the pages found under the urls_to_visit and extracts the number
        # of users from each of them.
        p = re.compile(r'.*name\s*=\s*"user_count"\s*>\s*(\d+)\s*<')
        body = response.body_as_unicode().split('\n')
        userCount = None
        for line in body:
            m = p.match(line)
            if m:
                userCount = m.group(1)

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: response.meta['start_url'],
             FIELD_NAMES[1]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[2]: userCount})

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
def __init__(self, input_file=None, output_file=None):
    self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
    if self.csv_helper.stop:
        print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
              " -a input_file=<your input file> -a output_file=<your output file>\n")
        return
    self.url_helper = UrlHelper("")  # no prefix
    self.url_helper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                            self.start_urls, None, self.urls_to_visit)
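CsvHelper itself is not listed in this section. Judging purely from how the constructor uses it, its interface presumably looks something like the outline below (method bodies omitted, details are assumptions):

class CsvHelper(object):
    """Hypothetical outline of the interface the spiders rely on."""

    def __init__(self, field_names, input_file, output_file):
        # 'stop' is set when the file arguments are missing or unusable,
        # which makes the spider print the usage message and bail out.
        self.stop = input_file is None or output_file is None

    def get_input_file_content(self):
        """Return the rows (URLs/handles) read from the input CSV."""

    def write_row_to_output_file(self, field_names, row):
        """Append one row, given as a dict keyed by the field names, to the output CSV."""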
class TwitterScraper(scrapy.Spider):
    name = 'Twitter Scraper'

    # This variable is used by Scrapy to begin crawling.
    start_urls = []

    # This dictionary holds the mapping of the URLs to Twitter handles, which is used when populating the output file.
    url_map = {}

    # This list will contain all the URLs to visit and will pass them onto Scrapy in order, one by one.
    urls_to_visit = []

    # This method is the constructor of the spider-scraper. It takes in the names of the input and output files
    # and performs some pre-processing.
    def __init__(self, input_file=None, output_file=None):
        self.csv_helper = CsvHelper(FIELD_NAMES, input_file, output_file)
        if self.csv_helper.stop:
            print("\nINCORRECT INVOCATION, run as:\nscrapy runspider %s" % os.path.basename(__file__) +
                  " -a input_file=<your input file> -a output_file=<your output file>\n")
            return
        self.urlHelper = UrlHelper(PREFIX)
        self.urlHelper.process_urls_for_scrapy(self.csv_helper.get_input_file_content(),
                                               self.start_urls, self.url_map, self.urls_to_visit)

    # Here we override the method make_requests_from_url to use the one from the UrlHelper instead of the one in
    # scrapy.Spider.
    def make_requests_from_url(self, url):
        return UrlHelper.make_requests_from_url(url)

    def parse(self, response):
        # Here we're in the method that performs the scraping. Below an xpath expression extracts the
        # number of followers from the element with attribute data-nav equal to "followers".
        followers_count = response.xpath('//*[@data-nav="followers"]/@title').re(r"[\d,]*")[0].replace(',', '')

        self.csv_helper.write_row_to_output_file(
            FIELD_NAMES,
            {FIELD_NAMES[0]: self.url_map[response.meta['start_url']],
             FIELD_NAMES[1]: response.meta['start_url'],
             FIELD_NAMES[2]: datetime.datetime.fromtimestamp(time.time()).strftime(constants.TIME_FORMAT),
             FIELD_NAMES[3]: followers_count})

        # If there are still URLs to process, then yield more crawling.
        if self.urls_to_visit:
            yield self.make_requests_from_url(self.urls_to_visit.pop(0))
def make_requests_from_url(self, url):
    return UrlHelper.make_requests_from_url(url)
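UrlHelper's implementation is not shown here. Since every parse() method reads response.meta['start_url'], the helper presumably wraps each URL in a scrapy.Request that carries the original URL along; a minimal sketch under that assumption:

import scrapy

class UrlHelper(object):
    @staticmethod
    def make_requests_from_url(url):
        # Remember the URL we started from so parse() can still report it
        # (redirects may change response.url along the way).
        return scrapy.Request(url, meta={'start_url': url}, dont_filter=True)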
# Create CsvHelper which will aid in processing the CSV files
csv_helper = CsvHelper(FIELD_NAMES, sys.argv[1], sys.argv[2])
if csv_helper.stop:
    print_usage()
    exit(0)

# Here we take the system's current time and convert it to the number of seconds since the 'epoch'.
# Further on we subtract the number of seconds in 24 hours from it and specify the result
# as the time from which the number of tweets should be counted.
current_time = time.mktime(time.localtime())
time_24_hours_ago = current_time - constants.SECONDS_IN_24H_COUNT
time_query_parameter = "&mintime=" + repr(int(time_24_hours_ago))

# Create the UrlHelper which will aid in processing URLs
url_helper = UrlHelper(
    'http://otter.topsy.com/searchcount.js?dynamic=1&count_method=citation&' +
    'apikey=09C43A9B270A470B8EB8F2946A9369F3&%s&q=' % time_query_parameter)
url_helper.process_urls(csv_helper.get_input_file_content(), url_map, urls_to_visit)

print("Scraping number of tweets for phrases..")

# For each URL in the list of URLs to visit connect to topsy and fetch a response containing JSON
# with all the numbers of tweets in the past periods. From it we extract the number of tweets in
# the past day and write a line in the output CSV file.
for url in urls_to_visit:
    topsy_response = urllib2.urlopen(url).read()
    json_response = re.search(r"\{.*\}", topsy_response).group(0)
    tweets_count = json.loads(json_response)['response']['windows']['d']['total']
    csv_helper.write_row_to_output_file(
        FIELD_NAMES,