コード例 #1
0
    def spider_opened(self, spider):
        '''Initialize per-property result buckets when the spider starts.

        Populates ``self.d`` keyed by property id (the values of PROPS),
        each entry holding the source URL, a "totals" dict of rating
        categories (sentinel -1 = not scraped yet), and an empty list
        for comments.

        BUG FIX: the original bound PROPS only inside the
        ``spider.name == 'AptratingsSpider'`` branch but iterated it
        unconditionally, raising UnboundLocalError for every other
        spider. Guard with an early return instead.
        '''
        if spider.name != 'AptratingsSpider':
            return  # nothing to initialize for other spiders

        # BIZ_DICT is unpacked but unused here; kept so the tuple
        # returned by get_biz_urls() unpacks cleanly.
        BIZ_DICT, PROPS = get_biz_urls("Apartment Ratings")

        for k, v in PROPS.items():
            self.d[v] = {
              "url"    : k,
              # "stats"  : stats.get_stats(), # this goes at end
              "totals" : {
                  "overall_construction": -1,
                  "overall_grounds"     : -1,
                  "overall_maintenance" : -1,
                  "overall_noise"       : -1,
                  "overall_office_staff": -1,
                  "overall_parking"     : -1,
                  "overall_safety"      : -1,
                  "recommended_by"      : -1,
                  "total_overall_rating": -1
              },
              "comments" : []
            }
コード例 #2
0
    def spider_opened(self, spider):
        '''Set up one result bucket per property as soon as the spider starts.

        Each bucket records the property's source URL, a "totals" mapping
        of rating categories (initialized to the -1 "not scraped" sentinel),
        and an empty comment list.
        '''
        if spider.name == 'AptratingsSpider':
            BIZ_DICT, PROPS = get_biz_urls("Apartment Ratings")

        for prop_url, prop_key in PROPS.items():
            # "stats": stats.get_stats() gets attached at the very end,
            # not here.
            self.d[prop_key] = dict(
                url=prop_url,
                totals=dict(
                    overall_construction=-1,
                    overall_grounds=-1,
                    overall_maintenance=-1,
                    overall_noise=-1,
                    overall_office_staff=-1,
                    overall_parking=-1,
                    overall_safety=-1,
                    recommended_by=-1,
                    total_overall_rating=-1,
                ),
                comments=[],
            )
コード例 #3
0
ファイル: fever_yelp.py プロジェクト: hsd315/Feverbot
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy import log
#from scrapy.contrib.spiders import Rule
from feverbot.items import Yelp
from feverbot.fever_utils import now, get_biz_urls

#-----------------------------------------------------------------------------
#
#-----------------------------------------------------------------------------

# Module-level fetch of the Yelp business mapping; PROPS supplies the
# spider's start_urls below. NOTE(review): runs at import time -- confirm
# get_biz_urls has no heavy I/O side effects.
BIZ_DICT, PROPS = get_biz_urls("Yelp")


class YelpSpider(BaseSpider):
    '''Spider for crawling Yelp and extracting ratings data
    To use, first cd into your Scrapy project directory. then:
    >>> scrapy crawl YelpSpider
    '''
    name = 'YelpSpider'
    # NOTE(review): Scrapy's allowed_domains expects bare domain names
    # (e.g. 'www.yelp.com'), not full URLs with a scheme -- as written
    # the offsite filter likely matches nothing. Confirm intent.
    allowed_domains = ['http://www.yelp.com']
    # One start URL per property, from the module-level PROPS mapping.
    start_urls = PROPS.keys()

    def parse(self, response):
        '''Parse the data out of Yelps HTML page
        '''
        hxs = HtmlXPathSelector(response)
        ylp = Yelp()
        url = response.url

        # NOTE(review): seeing a 302 status here implies the spider opts
        # into non-200 responses (handle_httpstatus_list) -- cannot
        # confirm from this fragment; listing is truncated below.
        if response.status == 302:
コード例 #4
0
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.exceptions import CloseSpider
from scrapy.selector import HtmlXPathSelector
from urlparse import urljoin
from scrapy import log
from feverbot.items import Apt, AptReview
from feverbot.fever_utils import now, get_biz_urls, autoViv, replace_txt
from feverbot.fever_utils import replace_escape_chars as esc_chars

#-----------------------------------------------------------------------------
#
#-----------------------------------------------------------------------------

# return a list of all apartment-ratings URL's you want to scrape
# Module-level fetch of the Apartment Ratings URL mapping (runs at
# import time); PROPS feeds the spider defined below.
BIZ_DICT, PROPS = get_biz_urls("Apartment Ratings")

# 79-char separator used for visual grouping in log output.
logline = 79*'*'

class AptratingsSpider(CrawlSpider):
    '''
    Spider for crawling Apartment Ratings and extracting ratings data
    http://readthedocs.org/docs/scrapy/en/latest/topics/spiders.html?highlight=crawlspider
    TODO - dump spider stats into json file in pipelines

    To activate the spider from the command line type:
    >>> cd fever
    >>> cd apps/webcrawlers/feverbot
    >>> scrapy crawl AptratingsSpider
    '''
    # Spider id used by "scrapy crawl" and by pipelines keying on
    # spider.name. (Listing truncated here -- rules/callbacks not shown.)
    name = 'AptratingsSpider'
コード例 #5
0
ファイル: fever_aptratings.py プロジェクト: hsd315/Feverbot
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.exceptions import CloseSpider
from scrapy.selector import HtmlXPathSelector
from urlparse import urljoin
from scrapy import log
from feverbot.items import Apt, AptReview
from feverbot.fever_utils import now, get_biz_urls, autoViv, replace_txt
from feverbot.fever_utils import replace_escape_chars as esc_chars

#-----------------------------------------------------------------------------
#
#-----------------------------------------------------------------------------

# return a list of all apartment-ratings URL's you want to scrape
# Module-level fetch of the Apartment Ratings URL mapping (runs at
# import time); PROPS feeds the spider defined below.
BIZ_DICT, PROPS = get_biz_urls("Apartment Ratings")

# 79-char separator used for visual grouping in log output.
logline = 79 * '*'


class AptratingsSpider(CrawlSpider):
    '''
    Spider for crawling Apartment Ratings and extracting ratings data
    http://readthedocs.org/docs/scrapy/en/latest/topics/spiders.html?highlight=crawlspider
    TODO - dump spider stats into json file in pipelines

    To activate the spider from the command line type:
    >>> cd fever
    >>> cd apps/webcrawlers/feverbot
    >>> scrapy crawl AptratingsSpider
    '''
    # (Listing truncated here -- the class body beyond the docstring is
    # not shown in this excerpt.)
コード例 #6
0
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy import log
#from scrapy.contrib.spiders import Rule
from feverbot.items import Yelp
from feverbot.fever_utils import now, get_biz_urls

#-----------------------------------------------------------------------------
#
#-----------------------------------------------------------------------------

# Module-level fetch of the Yelp business mapping; PROPS supplies the
# spider's start_urls below. NOTE(review): runs at import time -- confirm
# get_biz_urls has no heavy I/O side effects.
BIZ_DICT, PROPS = get_biz_urls("Yelp")

class YelpSpider(BaseSpider):
    '''Spider for crawling Yelp and extracting ratings data
    To use, first cd into your Scrapy project directory. then:
    >>> scrapy crawl YelpSpider
    '''
    name            = 'YelpSpider'
    # NOTE(review): Scrapy's allowed_domains expects bare domain names
    # (e.g. 'www.yelp.com'), not full URLs with a scheme -- as written
    # the offsite filter likely matches nothing. Confirm intent.
    allowed_domains = ['http://www.yelp.com']
    # One start URL per property, from the module-level PROPS mapping.
    start_urls      = PROPS.keys()

    def parse(self, response):
        '''Parse the data out of Yelps HTML page
        '''
        hxs = HtmlXPathSelector(response)
        ylp = Yelp()
        url = response.url

        # NOTE(review): seeing a 302 status here implies the spider opts
        # into non-200 responses (handle_httpstatus_list) -- cannot
        # confirm from this fragment; listing is truncated below.
        if response.status == 302:
            self.log('302: Redirected', level=log.INFO)