def spider_opened(self, spider):
    """Set up the per-property result dicts when the spider starts.

    For the AptratingsSpider, fetch the property URL map and seed
    ``self.d`` with one entry per property: its source URL, a ``totals``
    dict of rating categories (initialized to -1 meaning "not yet
    scraped"), and an empty ``comments`` list.
    """
    if spider.name == 'AptratingsSpider':
        _biz, prop_map = get_biz_urls("Apartment Ratings")
        rating_keys = (
            "overall_construction",
            "overall_grounds",
            "overall_maintenance",
            "overall_noise",
            "overall_office_staff",
            "overall_parking",
            "overall_safety",
            "recommended_by",
            "total_overall_rating",
        )
        for url, prop_id in prop_map.items():
            self.d[prop_id] = {
                "url": url,
                # NOTE: a "stats" entry (stats.get_stats()) is added at the end of the run
                "totals": {key: -1 for key in rating_keys},
                "comments": [],
            }
def spider_opened(self, spider):
    """Initialize exporter state when the spider first starts.

    Only acts for the AptratingsSpider: looks up the property URLs and,
    for each (url -> property key) pair, stores a fresh record in
    ``self.d`` keyed by property. Every rating total starts at -1 as a
    "no data yet" sentinel; comments accumulate into the list.
    """
    if spider.name != 'AptratingsSpider':
        return
    BIZ_DICT, PROPS = get_biz_urls("Apartment Ratings")
    for k, v in PROPS.items():
        record = {
            "url": k,
            # "stats": stats.get_stats() is appended when the spider closes
            "totals": {
                "overall_construction": -1,
                "overall_grounds": -1,
                "overall_maintenance": -1,
                "overall_noise": -1,
                "overall_office_staff": -1,
                "overall_parking": -1,
                "overall_safety": -1,
                "recommended_by": -1,
                "total_overall_rating": -1,
            },
            "comments": [],
        }
        self.d[v] = record
from scrapy.spider import BaseSpider from scrapy.selector import HtmlXPathSelector from scrapy import log #from scrapy.contrib.spiders import Rule from feverbot.items import Yelp from feverbot.fever_utils import now, get_biz_urls #----------------------------------------------------------------------------- # #----------------------------------------------------------------------------- BIZ_DICT, PROPS = get_biz_urls("Yelp") class YelpSpider(BaseSpider): '''Spider for crawling Yelp and extracting ratings data To use, first cd into your Scrapy project directory. then: >>> scrapy crawl YelpSpider ''' name = 'YelpSpider' allowed_domains = ['http://www.yelp.com'] start_urls = PROPS.keys() def parse(self, response): '''Parse the data out of Yelps HTML page ''' hxs = HtmlXPathSelector(response) ylp = Yelp() url = response.url if response.status == 302:
from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.exceptions import CloseSpider from scrapy.selector import HtmlXPathSelector from urlparse import urljoin from scrapy import log from feverbot.items import Apt, AptReview from feverbot.fever_utils import now, get_biz_urls, autoViv, replace_txt from feverbot.fever_utils import replace_escape_chars as esc_chars #----------------------------------------------------------------------------- # #----------------------------------------------------------------------------- # return a list of all apartment-ratings URL's you want to scrape BIZ_DICT, PROPS = get_biz_urls("Apartment Ratings") logline = 79*'*' class AptratingsSpider(CrawlSpider): ''' Spider for crawling Apartment Ratings and extracting ratings data http://readthedocs.org/docs/scrapy/en/latest/topics/spiders.html?highlight=crawlspider TODO - dump spider stats into json file in pipelines To activate the spider from the command line type: >>> cd fever >>> cd apps/webcrawlers/feverbot >>> scrapy crawl AptratingsSpider ''' name = 'AptratingsSpider'
from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.exceptions import CloseSpider from scrapy.selector import HtmlXPathSelector from urlparse import urljoin from scrapy import log from feverbot.items import Apt, AptReview from feverbot.fever_utils import now, get_biz_urls, autoViv, replace_txt from feverbot.fever_utils import replace_escape_chars as esc_chars #----------------------------------------------------------------------------- # #----------------------------------------------------------------------------- # return a list of all apartment-ratings URL's you want to scrape BIZ_DICT, PROPS = get_biz_urls("Apartment Ratings") logline = 79 * '*' class AptratingsSpider(CrawlSpider): ''' Spider for crawling Apartment Ratings and extracting ratings data http://readthedocs.org/docs/scrapy/en/latest/topics/spiders.html?highlight=crawlspider TODO - dump spider stats into json file in pipelines To activate the spider from the command line type: >>> cd fever >>> cd apps/webcrawlers/feverbot >>> scrapy crawl AptratingsSpider '''
from scrapy.spider import BaseSpider from scrapy.selector import HtmlXPathSelector from scrapy import log #from scrapy.contrib.spiders import Rule from feverbot.items import Yelp from feverbot.fever_utils import now, get_biz_urls #----------------------------------------------------------------------------- # #----------------------------------------------------------------------------- BIZ_DICT, PROPS = get_biz_urls("Yelp") class YelpSpider(BaseSpider): '''Spider for crawling Yelp and extracting ratings data To use, first cd into your Scrapy project directory. then: >>> scrapy crawl YelpSpider ''' name = 'YelpSpider' allowed_domains = ['http://www.yelp.com'] start_urls = PROPS.keys() def parse(self, response): '''Parse the data out of Yelps HTML page ''' hxs = HtmlXPathSelector(response) ylp = Yelp() url = response.url if response.status == 302: self.log('302: Redirected', level=log.INFO)