class KijijiRideshareSpider(CrawlSpider):
    """
	*A scrapy crawler to extract rideshare information from kijiji
	*Crawls the kijiji wepages to find rideshare information
	*currently, only supports Ottawa
	"""
    name = "kijiji_rideshare_spider"
    allowed_domains = ['kijiji.ca']
    start_urls = ["http://www.kijiji.ca/b-rideshare-carpool/ottawa/c5l1700185"]
    rules = [
        Rule(LinkExtractor(
            allow=['http://www.kijiji.ca/v-rideshare-carpool/ottawa/.+']),
             callback='parse_rideshare'),
        Rule(
            LinkExtractor(allow=[
                "http://www.kijiji.ca/b-rideshare-carpool/ottawa/page-[0-9]/.+"
            ]), ),
    ]

    def parse_item(self, response):
        """
		*An earlier version of the code that uses hxs selector 
		*based on code from Github: mjhea0/Scrapy-Samples
		*Not used currently, but left alone for debugging and initial help purpose
		"""
        selection = scrapy.Selector(response)
        titles = selection.xpath("//td[@class='description']")
        result = []
        for title in titles:
            item = items.KijijiRideshareItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            result.append(item)
        return result

    def parse_rideshare(self, response):
        """
		Parses and stores the required rideshare information 
		"""
        rideshare_item = items.kijijiRideshareData()
        rideshare_item["url"] = response.url
        rideshare_item["title"] = self._extract_title(response)
        rideshare_item["date_listed"] = self._extract_field(
            response, "Date Listed")
        rideshare_item["address"] = self._extract_field(response, "Address")
        rideshare_item["phone_number"] = self._extract_phone_number(response)
        rideshare_item["full_text"] = self._extract_full_text(response)
        return rideshare_item

    def _extract_title(self, response):
        l = " ".join(response.xpath("//h1/text()").extract())
        return self._clean_string(l)

    def _extract_full_text(self, response):
        l = " ".join(
            response.xpath("//span[@itemprop='description']/text()").extract())
        return self._clean_string(l)

    def _extract_phone_number(self, response):
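        # Not implemented: returns Ottawa's area code ("613") as a placeholder.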
        return "613"

    def _extract_field(self, response, fieldname):
        l = response.xpath(
            "//th[contains(text(), '{0}')]/following::td[1]//./text()".format(
                fieldname)).extract()
        return l[0].strip() if l else None

    def _clean_string(self, string):
        for i in [",", "\n", "\r", ";", "\\"]:
            string = string.replace(i, "")
        return string.strip()
class XhamsterSpider(CrawlSpider):
    name = "xhamster"
    allowed_domains = ["xhamster.com"]
    start_urls = ["http://xhamster.com/channels.php"]

    rules = (
        Rule(SgmlLinkExtractor(allow=['/movies/\d+/.*'], ),
             callback='parse_video'),
        Rule(SgmlLinkExtractor(deny=[
            '/webcam(.*)',
            '/cam(.*)',
            '/start(.*)',
            '/games(.*)',
            '/stories(.*)',
            '/dating(.*)',
            '/photos(.*)',
            '/information(.*)',
        ],
                               allow_domains=["xhamster.com"]),
             follow=True),
    )

    def parse_video(self, response):
        hxs = HtmlXPathSelector(response)

        video = VideoItem()
        video['masturbator'] = self.name

        url_parsed = urlparse(response.url)
        video['remote_url'] = "{0}://{1}{2}".format(url_parsed.scheme,
                                                    url_parsed.netloc,
                                                    url_parsed.path)

        try:
            url_re_result = url_re.search(video['remote_url'])
            video['remote_id'] = int(url_re_result.group(1))
        except:
            return None

        video['title'] = first_or_none(
            hxs.select(
                "//div[@id='playerBox']//h2[@class='gr']/text()").extract())

        if not video['title']:
            return None
        else:
            video['title'] = video['title'].strip()

        remote_date = first_or_none(
            hxs.select(
                "//td[@id='videoUser']//span[@class='hint']/@hint").extract())

        if remote_date:
            video['remote_date'] = datetime.datetime.strptime(
                remote_date, "%Y-%m-%d %H:%M:%S %Z")

        duration = first_or_none(
            hxs.select(
                "//td[@id='videoUser']//div[span[text()='Runtime:']]/text()").
            extract())

        if duration:
            duration = duration.strip().split(":")
            video['duration'] = int(duration[0]) * 60 + int(duration[1])

        video['tags'] = set()
        video['thumbs'] = set()
        video['stars'] = set()

        for tag in hxs.select("//td[@id='channels']//a/text()").extract():
            video['tags'].add(tag.lower().strip())

        id_str_part = str(video['remote_id'])[-3:]

        thumb_pattern_url = "http://et1.xhamster.com/t/{id_part}/{number}_{id}.jpg"

        for i in range(1, 11):
            video['thumbs'].add(
                "http://et0.xhamster.com/t/{0}/{1}_{2}.jpg".format(
                    id_str_part, i, video['remote_id']))

        return video
Example #3
class FunTVSpider(CrawlSpider):
    """
    风行综艺爬取
    """

    name = 'fun_variety'
    allowed_domains = [
        'fun.tv',
    ]
    start_urls = [
        'http://www.fun.tv/retrieve/c-e7bbbce889ba.n-e5bdb1e78987.o-mf.pg-1'
    ]

    rules = [
        Rule(sle(
            allow=('/retrieve/c-e7bbbce889ba.n-e5bdb1e78987.o-mf.pg-\d+$', )),
             follow=True,
             callback='parse1'),
    ]

    def parse1(self, response):
        sel = Selector(response)

        tv_list = sel.css(
            'body div.mod-list.page-wrap div div.mod-wrap-in.mod-vd-lay.fix div.mod-vd-i'
        )

        for tv in tv_list:
            tv_id = tv.css('div.info h3 a::attr(data-id)').extract()[0]

            if db_session.query(FunVideo).filter(FunVideo.id == tv_id).first():
                continue

            name = tv.css('div.info h3 a::attr(title)').extract()[0]
            image = tv.css('div.pic a img::attr(_lazysrc)').extract()[0]
            description = tv.css('div.info p::text').extract()[0]
            point = tv.css('div.info h3 b::text').extract()[0]

            request = Request('http://www.fun.tv{}'.format(
                tv.css('div.pic a::attr(href)').extract()[0]),
                              callback=self.parse2)
            fv = FunVideo(id=tv_id,
                          name=name,
                          name_pinyin=pinyin.get_initials(name, splitter=''),
                          image=image,
                          description=description,
                          point=point)

            request.meta['tv'] = fv
            yield request

    def parse2(self, response):
        tv = response.meta['tv']

        sel = Selector(response)
        tv.origin_url = response.url

        p_dirsort = sel.css('div#main-rt div.mod-datum p.dirsort')
        for p in p_dirsort:
            p_type = p.css('::text').extract()[0]

            if u'导演' in p_type:
                tv.director = ''.join(p.css('span::text').extract())
            elif u'主演' in p_type:
                tv.starring = ''.join(p.css('span::text').extract())
            elif u'类型' in p_type:
                tv.category = ''.join(p.css('span::text').extract())

        tv.detail = sel.css(
            'div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text'
        ).extract()[0]

        print tv.name, '------->', tv.origin_url

        # type 3 marks this record as a variety show
        tv.type = 3

        db_session.add(tv)
        db_session.commit()
Example #4
class ZZSpider(CrawlSpider):
    name = "zz_prilepin"
    allowed_domains = ["livejournal.com"]
    start_urls = [
        #"http://prilepin.livejournal.com/2007/03/"
        #"http://prilepin.livejournal.com/2014/10/"
        "http://prilepin.livejournal.com/2015/07/"
    ]

    rules = (
        Rule(LinkExtractor(
            allow=('prilepin.livejournal.com/\d\d\d\d/\d\d/', ),
            deny=('prilepin.livejournal.com/\d\d\d\d/\d\d/\d\d', 'tag',
                  'reply', 'thread', 'page'),
        ),
             callback='parse_overview',
             follow=True),
        Rule(LinkExtractor(
            allow=('http://prilepin.livejournal.com/\d+\.html', ),
            deny=('tag', 'reply', 'thread', 'page'),
        ),
             callback='parse_page',
             follow=True),
    )

    def parse_start_url(self, response):
        return list(self.parse_overview(response))

    def parse_overview(self, response):
        urls = response.xpath("//dd/a/@href").extract()
        for url in urls:
            yield Request(url, callback=self.parse_page)

    def parse_page(self, response):

        # use scrapy shell to find xpath
        #from scrapy.shell import inspect_response
        #inspect_response(response)

        item = ScraperItem()

        item["url"] = response.url

        item["date"] = response.xpath(
            "//p[@class='entry-footer']/text()").extract()[0]

        item["text"] = " ".join(
            response.xpath(
                "//div[@class='entry-body']/child::node()").extract())

        try:
            item["title"] = response.xpath(
                "//h3[@class='entry-header']/text()").extract()[0]
        except IndexError:
            item["title"] = ""

        try:
            item["comment_count"] = response.xpath(
                "//p[@class='entry-footer']/a[3]/text()").extract()[0]
        except IndexError:
            item["comment_count"] = "0"

        yield item
Example #5
class Yy138Spider(CrawlSpider):
    name = 'yy138'
    
    allowed_domains = ['www.yy138.com']
    
    start_urls = [
      'http://www.yy138.com/android/youxi/',
      'http://www.yy138.com/android/ruanjian/',
      'http://www.yy138.com/wangyou/',
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=['/[a-zA-Z][a-zA-Z0-9]*/']), callback='parse_item', follow=False),

        Rule(SgmlLinkExtractor(allow=['/wangyou/']), callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/\d+/(\d+\.html)*']), callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/wangyou/zuixin/(\d+\.html)*']), callback='noapk', follow=True),

        Rule(SgmlLinkExtractor(allow=['/youxi/']), callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/youxi/zuixin/(\d+\.html)*']), callback='noapk', follow=True),

        Rule(SgmlLinkExtractor(allow=['/ruanjian/']), callback='noapk', follow=True),
        Rule(SgmlLinkExtractor(allow=['/ruanjian/zuixin/(\d+\.html)*']), callback='noapk', follow=True),
    )

    def noapk(self, response):
        print 'No apk: ', response.url    
   
    def parse_item(self, response):

        print 'There is a new apk: ',response.url
        
        hxs = HtmlXPathSelector(response)
        i = BaidumarketItem()
        print 'begin:'
        try:
        #    print '.....'

            i['app_name'] = ''.join(hxs.select('//div[@class="column download"]/div[1]/h1[1]/text()').extract())

            i['app_keywords'] = ''.join(hxs.select('//div[@class="intro"]/p[3]/a/text()').extract())

 
            i['app_url'] = response.url

            i['app_icon_url'] = ''.join(hxs.select('//div[@class="icon"]/img/@src').extract())
#            print 'zhongduan'
#i['icon_content'] =  
        
            i['app_size'] = ''.join(hxs.select('//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/div/a/span/text()').extract())
            if i['app_size'] =="":
                 i['app_size'] = ''.join(hxs.select('//*[@id="xiazai"]/div/div/div[2]/div/div/div[1]/div/a/span/text()').extract()) 
            i['app_version'] = ''.join(hxs.select('//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/p/text()[2]').extract())[5:]
            if i['app_version'] == "":
                 i['app_version'] = ''.join(hxs.select('//*[@id="xiazai"]/div/div/div[2]/div/div/div[1]/p/text()[2]').extract())[5:]

            i['download_times'] = '0'

            i['download_url'] = ''.join(hxs.select('//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/div/a/@href').extract())
            if i['download_url']=='':
                i['download_url']=''.join(hxs.select('//*[@id="xiazai"]/div/div/div[2]/div/div/div[1]/div/a/@href').extract())

            i['app_author'] =  'None'

            i['os_version'] = ''.join(hxs.select('//*[@id="xiazai"]/div/div[2]/div[2]/div/div/div[1]/p/text()[3]').extract())[5:]

            i['app_description'] = ''.join(hxs.select('//div[@class="column introduction"]/div[2]/p/text()').extract())

            if i['app_description'] == '':
                i['app_description'] = ''.join(hxs.select('//div[@class="column introduction"]/div[2]/text()').extract())

            i['last_update_date'] =  '1990-01-01'

            i['app_class'] = ''.join(hxs.select('//div[@class="intro"]/p[2]/a[2]/text()').extract())

      

            i['app_market'] = u'yy138.com'

            i['market_site'] = 'www.yy138.com'

            i['user_rate'] = ''.join(hxs.select('//div[@class="intro"]/p[1]/span[1]/span/@class').extract())[4]
 
            i['comments_num'] = '0'
            print i

            return i
      
        except Exception, e:

            print e
Example #6
class PlaystoreSpider(CrawlSpider):
    def gen_urls():
        for c in ('ARCADE', 'BRAIN', 'CARDS', 'CASUAL', 'GAME_WALLPAPER',
                  'RACING', 'SPORTS_GAMES', 'GAME_WIDGETS',
                  'BOOKS_AND_REFERENCE', 'BUSINESS', 'COMICS', 'COMMUNICATION',
                  'EDUCATION', 'ENTERTAINMENT', 'FINANCE', 'HEALTH',
                  'LIBRARIES_AND_DEMO', 'LIFESTYLE', 'APP_WALLPAPER',
                  'MEDIA_AND_VIDEO', 'MEDICAL', 'MUSIC_AND_AUDIO',
                  'NEWS_AND_MAGAZINES', 'PERSONALIZATION', 'PHOTOGRAPHY',
                  'PRODUCTIVITY', 'SHOPPING', 'SOCIAL', 'SPORTS', 'TOOLS',
                  'TRANSPORTATION', 'TRAVEL_AND_LOCAL', 'WEATHER',
                  'APP_WIDGETS'):
            yield 'https://play.google.com/store/apps/category/%s/collection/topselling_paid' % c
            yield 'https://play.google.com/store/apps/category/%s/collection/topselling_free' % c

    name = 'playstore'
    allowed_domains = ['play.google.com']
    start_urls = gen_urls()
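    # Note: start_urls is a generator here; Scrapy only iterates it once when
    # building the initial requests, which is sufficient, but it cannot be
    # re-read afterwards.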
    reg_start = re.compile('start=([\d]+)')

    rules = (
        #Rule(SgmlLinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=r'category/[A-Z_]+\?',
                               deny=r'/accounts/'),
             follow=True,
             callback='parse_app'),  #categories
        #        Rule(SgmlLinkExtractor(allow=r'start=[\d]+&num=[\d]+', deny=r'/accounts/'), follow=True), #categories
        Rule(SgmlLinkExtractor(allow=r'/collection/', deny=r'editors_choice'),
             follow=True),  #categories
        #parse_app
    )

    def parse(self, response):
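        # Caveat: defining parse() on a CrawlSpider overrides the method the
        # Rule machinery relies on, so the rules above are effectively bypassed
        # and every downloaded page is handled here instead.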
        hxs = HtmlXPathSelector(response)
        m = PlaystoreSpider.reg_start.search(response.url)
        start = 0
        if m:
            start = int(m.group(1))

        artworks = hxs.select(
            '//div[@class="thumbnail-wrapper goog-inline-block"]/a/img/@src'
        ).extract()
        ids = hxs.select(
            '//li[@class="goog-inline-block"]/@data-docid').extract()
        ids += hxs.select(
            '//li[@class="goog-inline-block z-last-child"]/@data-docid'
        ).extract()  #scary!
        names = hxs.select(
            '//div[@class="details goog-inline-block"]/div/a/text()').extract(
            )
        urls = hxs.select(
            '//div[@class="details goog-inline-block"]/div/a/@href').extract()
        reg_cat = re.compile('/category/([\w_]+)(/|\?|/)*')
        category = reg_cat.search(response.url).group(1).replace('_',
                                                                 ' ').title()
        sellers = hxs.select('//span[@class="attribution"]/div/a').extract()
        seller_links = hxs.select(
            '//span[@class="attribution"]/div/a/@href').extract()

        assert not "We're sorry" in response.body
        assert len(artworks) == len(ids) == len(names) == len(urls) == len(
            sellers) == len(seller_links), (len(artworks),
                                            len(ids), len(names), len(urls),
                                            len(sellers), len(seller_links))
        for artwork, id, name, url, seller, seller_link in zip(
                artworks, ids, names, urls, sellers, seller_links):
            i = AppStoreItem()
            i['store'] = 'play'
            i['id'] = id
            i['artwork'] = artwork
            i['category'] = category
            i['url'] = 'https://play.google.com' + url
            i['name'] = name
            i['last_update'] = datetime.date.today().isoformat()
            i['seller'] = seller
            i['seller_link'] = 'https://play.google.com' + seller_link
            yield i

        if start == 0:
            prefix = '?'
            if '?' in response.url:
                prefix = '&'
            for i in range(24, 480 + 1, 24):
                yield Request(response.url + prefix + 'start=%d&num=24' % i)
Example #7
class ZhihuSpider(CrawlSpider):
    name = "zhihu"
    allowed_domains = ["zhihu.com"]
    start_urls = [
        # "http://www.zhihu.com/",
        # "https://www.zhihu.com/people/hu-shi-wei-63",
        "https://www.zhihu.com/people/hu-shi-wei-63/followees",
    ]
    headers = {
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
        'Referer': 'http://www.zhihu.com/',
    }
    rules = [
        # Rule(LinkExtractor(allow=("/people/[^/]+/followees$", )),
        #      callback='parse_followees'),
        # Rule(LinkExtractor(allow=("/people/[^/]+/followers$", )),
        #      callback='parse_followers'),
        Rule(LinkExtractor(allow=("/people/[^/]+$", )),
             callback='parse_people_with_rules',
             follow=True),
        Rule(LinkExtractor(allow=('/question/\d+#.*?', )),
             callback='parse_question',
             follow=True),
        Rule(LinkExtractor(allow=('/question/\d+', )),
             callback='parse_question',
             follow=True),
    ]

    # need dfs/bfs
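    # Nested selector rules consumed by traversal()/dfs() below: each key is a
    # CSS selector; a '__use': 'dump' marker means the remaining keys map item
    # fields to value selectors, otherwise the walker recurses into the sub-dict.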
    all_css_rules = {
        '.zm-profile-header': {
            '.zm-profile-header-main': {
                '__use': 'dump',
                'name': '.title-section .name::text',
                'sign': '.title-section .bio::text',
                'location': '.location.item::text',
                'business': '.business.item::text',
                'employment': '.employment.item::text',
                'position': '.position.item::text',
                'education': '.education.item::text',
                'education_extra': '.education-extra.item::text',
            },
            '.zm-profile-header-operation': {
                '__use': 'dump',
                'agree': '.zm-profile-header-user-agree strong::text',
                'thanks': '.zm-profile-header-user-thanks strong::text',
            },
            '.profile-navbar': {
                '__use': 'dump',
                'asks': 'a[href*=asks] .num::text',
                'answers': 'a[href*=answers] .num::text',
                'posts': 'a[href*=posts] .num::text',
                'collections': 'a[href*=collections] .num::text',
                'logs': 'a[href*=logs] .num::text',
            },
        },
        '.zm-profile-side-following': {
            '__use': 'dump',
            'followees': 'a.item[href*=followees] strong::text',
            'followers': 'a.item[href*=followers] strong::text',
        }
    }

    def start_requests(self):
        return [
            Request("https://www.zhihu.com/login/email",
                    meta={'cookiejar': 1},
                    callback=self.post_login)
        ]

    def get_captcha(self):
        s = requests.session()
        captcha_url = 'http://www.zhihu.com/captcha.gif'
        captcha = s.get(captcha_url, stream=True)
        print captcha
        f = open('captcha.gif', 'wb')
        for line in captcha.iter_content(10):
            f.write(line)
        f.close()
        return s

    # FormRequest was causing problems, so the login uses a requests session instead
    def post_login(self, response):
        print 'Preparing login'
        # Extract the _xsrf token from the returned page; it is required for the login form to be accepted.
        xsrf = Selector(response).xpath(
            '//input[@name="_xsrf"]/@value').extract()[0]
        s = self.get_captcha()
        captcha_str = raw_input('Input captcha:')
        logindata = {
            '_xsrf': xsrf,
            'email': '*****@*****.**',
            'password': '******',
            'rememberme': 'true',
            'captcha': captcha_str
        }
        res = s.post('https://www.zhihu.com/login/email',
                     headers=self.headers,
                     data=logindata)
        cookies = dict(res.cookies)
        for url in self.start_urls:
            yield Request(url, cookies=cookies)

    def traversal(self, sel, rules, item):
        if '__use' in rules:
            for nk, nv in rules.items():
                if nk == '__use':
                    continue
                if nk not in item:
                    item[nk] = []
                if sel.css(nv):
                    item[nk] += [i.extract() for i in sel.css(nv)]
                else:
                    item[nk] = []
        else:
            for nk, nv in rules.items():
                for i in sel.css(nk):
                    self.traversal(i, nv, item)

    def dfs(self, sel, rules, item_class):
        if sel is None:
            return []
        item = item_class()
        self.traversal(sel, rules, item)
        return item

    def parse_with_rules(self, response, rules, item_class):
        return self.dfs(Selector(response), rules, item_class)

    def parse_people_with_rules(self, response):
        info('Parsed ' + response.url)
        item = self.parse_with_rules(response, self.all_css_rules,
                                     ZhihuPeopleItem)
        item['id'] = urlparse(response.url).path.split('/')[-1]
        yield item

    def parse_followers(self, response):
        return self.parse_people_with_rules(response)

    def parse_followees(self, response):
        return self.parse_people_with_rules(response)

    def parse_question(self, response):
        problem = Selector(response)
        item = QuestionItem()
        item['url'] = response.url
        item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
        item['title'] = problem.xpath(
            '//h2[@class="zm-item-title zm-editable-content"]/text()').extract(
            )
        item['description'] = problem.xpath(
            '//div[@class="zm-editable-content"]/text()').extract()
        item['answer'] = problem.xpath(
            '//div[@class="zm-editable-content clearfix"]/text()').extract()
        return item
Example #8
class MonitorSpider(RedisMixin, CrawlSpider):
    # class MonitorSpider( CrawlSpider ):
    # class MonitorSpider(BaseSpider):
    # class MonitorSpider(RedisSpider):

    name = "monitorspider"
    redis_key = 'monitorspider:start_urls'

    allowed_domains = [
        "tmall.com",
        "taobao.com",  # tmall
        "jd.com",
        "3.cn",  # jd
        "feifei.com",  # feifei
        "yhd.com",
        "yihaodian.com",  # yihaodian
        "yixun.com",  # yixun
        "amazon.cn"
    ]  # amazon

    start_urls = []

    pipeline = ['MongoPipeline']

    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r'detail.tmall.com'),
                restrict_xpaths=(
                    "//div[@id='J_ItemList']//p[@class='productTitle']"),
                unique=True),
            callback='parseTmall',
        ),
        Rule(SgmlLinkExtractor(
            allow=(r'list.tmall.com'),
            restrict_xpaths=("//a[@class='ui-page-s-next']"),
            unique=True),
             follow=True),
    )

    def set_crawler(self, crawler):

        CrawlSpider.set_crawler(self, crawler)
        RedisMixin.setup_redis(self)

    def parse_start_url(self, response):
        """ Main parse function
		"""
        url = response.url

        if url.find('detail.tmall.com') > -1:
            return self.parseTmall(response)
        elif url.find('jd.com') > -1:
            return self.parseJd(response)
        elif url.find('feifei.com') > -1:
            return self.parseFeifei(response)
        elif url.find('yhd.com') > -1:
            return self.parseYhd(response)
        elif url.find('yixun.com') > -1:
            return self.parseYixun(response)
        elif url.find('amazon.cn') > -1:
            return self.parseAmazon(response)

    def make_requests_from_url(self, url):
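        # Region-specific cookies: yhd.com gets a provinceId and yixun.com a
        # location/warehouse id, presumably so prices and stock are quoted for
        # one fixed region.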

        if url.find('yhd.com') > -1:
            return Request(url, dont_filter=True, cookies={'provinceId': 20})
        elif url.find('yixun.com') > -1:
            return Request(url,
                           dont_filter=True,
                           cookies={
                               'loc': '6_1001_440000_440100_440106_0',
                               'wsid': '1001'
                           })
        else:
            return Request(url, dont_filter=True)

    ######
    #
    # Tmall parser
    #

    def parseTmall(self, response):
        """ Tmall parser
		"""
        def _referer():
            referer = response.request.headers.get('Referer')
            if referer and referer.find('list.tmall.com') > -1:
                rto = 'http://list.tmall.com/search_product.htm?'
                resultC = re.compile('[\?&]cat=(\d+)').search(referer)
                if resultC: rto += 'cat=%s' % resultC.group(1)
                resultQ = re.compile('[\?&]q=([^&]+)').search(referer)
                if resultQ:
                    if resultC: rto += '&q=%s' % resultQ.group(1)
                    else: rto += 'q=%s' % resultQ.group(1)
                if not 'http://list.tmall.com/search_product.htm?' == rto:
                    return rto
            elif not referer and response.url.find('detail.tmall.com') > -1:
                return response.url
            return ''

        sel = Selector(response)
        item = ProductItem()

        item['source'] = 'tmall'
        item['name'] = self.get_product_name(sel)
        item['start_url'] = _referer()
        store = ''.join(
            sel.xpath('//input[@name="seller_nickname"]/@value').extract())
        item['tm_store'] = '[%s] %s' % (store[-3:],
                                        store) if len(store) > 3 else store

        try:
            # Grab the TShop setup string and normalize it so it can be evaluated
            TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
            # Strip comments; currently only Tmall Supermarket pages contain them, preceded by a comma
            regex = re.compile(',\s*\/\/[^\n]*')
            TShop_str = re.sub(regex, ',', TShop_str)
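            # eval() the JS object literal with a dict subclass whose
            # __getitem__ returns the looked-up name itself, so bare JS
            # identifiers resolve to strings instead of raising NameError.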
            TShop = eval(
                TShop_str,
                type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
        except SyntaxError:
            return

        item['itemId'] = TShop.get('itemDO').get('itemId', '')
        item['url'] = response.url

        initApi_url = TShop.get('initApi')

        yield Request(initApi_url,
                      headers={'Referer': 'http://www.google.com.hk/'},
                      meta={'item': item},
                      dont_filter=True,
                      callback=self.parse_initapi)

    def parse_initapi(self, response):
        """ 处理initApi的链接
		"""
        item = response.meta['item']
        try:
            initObj = eval(
                response.body.strip().decode('gbk'),
                type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
            priceInfo = initObj.get('defaultModel').get(
                'itemPriceResultDO').get('priceInfo')
            item['price'] = self.get_default_price(priceInfo)
            item['tm_moonSellCount'] = initObj.get('defaultModel').get(
                'sellCountDO').get('sellCount', 0)
        except:
            print response.body
        finally:
            yield Request(
                'http://dsr.rate.tmall.com/list_dsr_info.htm?itemId=' +
                item['itemId'],
                meta={'item': item},
                dont_filter=True,
                callback=self.parse_comment)

    def parse_comment(self, response):
        """ 处理获取评论数的链接
		"""
        item = response.meta['item']
        comment = re.findall('rateTotal\":(\d+)', response.body)[0]
        item['comment'] = int(comment) if comment.isdigit() else 0
        yield item

    def get_product_name(self, sel):
        """ 获取商品名
		"""
        name_node = sel.xpath('//div[@id="J_DetailMeta"]//h3')

        if len(name_node.xpath('./a')) > 0:
            return name_node.xpath('./a/text()').extract()[0]
        elif len(name_node.xpath('./a')) == 0:
            return name_node.xpath('./text()').extract()[0]
        else:
            return ''

    def get_default_price(self, priceInfo):
        """ 计算商品的默认价格
		"""
        def_obj = priceInfo.get('def', None)

        if def_obj:
            # a 'def' entry is present
            promotionList = def_obj.get('promotionList', None)
            if type(promotionList) == list and len(promotionList) > 0:
                # promotion info available
                min_price = sys.maxint
                for i in range(len(promotionList)):
                    if promotionList[i].get('price') and float(
                            promotionList[i].get('price')) < min_price:
                        min_price = float(promotionList[i].get('price'))
                return min_price
            else:
                # no promotion info
                return float(def_obj.get('price'))
        else:
            # no 'def' entry; fall back to the per-SKU prices
            for sku in priceInfo:
                promotionList = priceInfo[sku].get('promotionList', None)
                if type(promotionList) == list and len(promotionList) > 0:
                    # promotion info available
                    min_price = sys.maxint
                    for i in range(len(promotionList)):
                        if promotionList[i].get('price') and float(
                                promotionList[i].get('price')) < min_price:
                            min_price = float(promotionList[i].get('price'))
                    return min_price
                else:
                    # no promotion info
                    return float(priceInfo[sku].get('price'))

    ######
    #
    # Jd parser
    #

    def parseJd(self, response):
        """ Jd parser
		"""

        sel = Selector(response)
        item = ProductItem()

        item['source'] = 'jd'
        item['name'] = sel.xpath("//div[@id='name']//h1/text()").extract()[0]
        item['url'] = response.url
        item['itemId'] = self.getSku(response.url)

        # return item
        yield Request('http://p.3.cn/prices/get?skuid=J_' + item['itemId'],
                      meta={'item': item},
                      dont_filter=True,
                      callback=self.parsePrice)

    def parsePrice(self, response):
        item = response.meta['item']
        rto = json.loads(response.body)[0]
        item['price'] = float(rto.get('p', 0))
        yield Request(
            'http://club.jd.com/ProductPageService.aspx?method=GetCommentSummaryBySkuId&referenceId='
            + item['itemId'] + '&callback=getCommentCount',
            meta={'item': item},
            dont_filter=True,
            callback=self.parseComment)

    def parseComment(self, response):
        item = response.meta['item']
        regex = re.compile('\{.*\}')
        result = regex.search(response.body)
        if result:
            rto = json.loads(result.group(0))
            item['comment'] = int(rto.get('CommentCount', 0))
        else:
            item['comment'] = 0
        return item

    def getSku(self, url):
        regex = re.compile('\/(\d+)\.htm')
        result = regex.search(url)
        return result.group(1) if result else ''

    ######
    #
    # Feifei parser
    #

    def parseFeifei(self, response):
        """ Feifei parser
		"""
        sel = Selector(response)
        item = ProductItem()

        item['source'] = 'feifei'
        item['name'] = sel.xpath(
            "//h2[@class='np-intro-title']/text()").extract()[0]
        item['url'] = response.url
        price = sel.xpath("//dd[@class='price-m']/text()").extract()[0]
        item['price'] = float(price[1:])
        item['category'] = '|'.join(
            sel.xpath("//ul[@class='np-crumbs']//a/text()").extract())

        return item

    ######
    #
    # Yhd parser
    #

    def parseYhd(self, response):
        """ Yihaodian parser
		"""
        sel = Selector(response)
        item = ProductItem()

        item['source'] = 'yhd'
        item['name'] = sel.xpath(
            "//font[@id='productMainName']/text()").extract()[0]
        item['url'] = response.url

        # get pmId
        regex = re.compile('item\/(\d+)')
        result = regex.search(response.url)
        pmId = result.group(1) if result else 0

        yield Request(
            'http://e.yhd.com/front-pe/queryNumsByPm.do?pmInfoId=%s' % pmId,
            meta={
                'item': item,
                'pmId': pmId
            },
            callback=self.parse_yhd_comment)

    def parse_yhd_comment(self, response):
        item = response.meta['item']
        pmId = response.meta['pmId']
        rto = json.loads(response.body)
        item['comment'] = rto.get('experienceNum', -1)
        yield Request(
            'http://busystock.i.yihaodian.com/restful/detail?mcsite=1&provinceId=20&pmId=%s'
            % pmId,
            meta={'item': item},
            callback=self.parse_yhd_price)

    def parse_yhd_price(self, response):
        item = response.meta['item']
        rto = json.loads(response.body)
        item['price'] = rto.get('currentPrice', -1)
        return item

    ######
    #
    # Yixun parser
    #

    def parseYixun(self, response):
        """ Yixun parser
		"""
        sel = Selector(response)
        item = ProductItem()

        item['source'] = 'yixun'
        item['name'] = sel.xpath(
            "//div[@class='xbase']//h1[@class='xname']/text()").extract()[0]
        item['url'] = response.url

        price = ''.join(
            sel.xpath("//div[@class='xbase']//span[@itemprop='price']/text()").
            extract())
        lowPrice = ''.join(
            sel.xpath(
                "//div[@class='xbase']//span[@itemprop='lowPrice']/text()").
            extract())
        item['price'] = price or lowPrice

        return item

    ######
    #
    # Amazon parser
    #

    def parseAmazon(self, response):
        """ Amazon parser
		"""
        sel = Selector(response)
        item = ProductItem()

        item['source'] = 'amazon'
        item['name'] = ''.join(
            sel.xpath(
                '//span[@id="btAsinTitle"]/span/text()').extract()).strip()
        item['url'] = response.url
        price = ''.join(sel.xpath('//b[@class="priceLarge"]/text()').extract())
        item['price'] = price[2:] if len(price) > 2 else ''

        return item
class SpiderSpider(CrawlSpider):
    count = 0
    name = "pcconnection_camera"

    dic = set()

    allowed_domains = init_allowed_domains

    start_urls = init_start_urls

    rules = (
        #only extract links here
        Rule(SgmlLinkExtractor(allow=allowed_url), callback="parse"), )

    @property
    def sleep_time(self):
        return random.random() * MAX_SLEEP_TIME

    def parse(self, response):
        '''
        extract
        title
        content
        url
        '''
        print '>' * 50
        print 'response url: ', response.url
        hxs = HtmlXPathSelector(response)
        print '>>>> response.url: ', response.url
        #get urls
        content_urls = hxs.select(content_url_format).extract()

        list_urls = hxs.select(list_url_format).extract()
        list_urls = [up.urljoin(response.url, url) for url in list_urls]
        content_urls = [up.urljoin(response.url, url) for url in content_urls]

        print "@" * 60
        time.sleep(self.sleep_time)
        self.start_urls.extend(list_urls)

        for url in list_urls:
            yield Request(url, self.parse)

        #http://www.pcconnection.com/IPA/Shop/Product/Detail.htm?sku=16037879&cac=Result
        content_re = re.compile(
            r'http://www[.]pcconnection[.]com/.*cac=Result')
        for url in content_urls:
            if content_re.match(url):
                if len(self.dic) > 160:
                    self.start_urls = []
                    raise CloseSpider('reach pages limit, end the spider.')

                self.count += 1
                self.dic.add(hash(url))
                #extract data
                item = SpiderItem()
                item['url'] = url
                item['kind'] = self.name
                yield item
            else:
                print "!!!!!!! not match content url:"
                print url
Example #10
class GmaSpider(CrawlSpider):
    """
    scrapy crawl inquirer_spider -o gmanews.json
    """

    name = 'inquirer_spider'
    allowed_domains = [
        'inquirer.net',
        'newsinfo.inquirer.net',
        'sports.inquirer.net',
        'lifestyle.inquirer.net',
        'entertainment.inquirer.net',
        'business.inquirer.net',
        'technology.inquirer.net',
        'globalnation.inquirer.net',
    ]
    start_urls = [
        'http://www.inquirer.net',
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('', )),
                  process_links="link_filter",
                  callback="parse_items",
                  follow=True), )

    def parse_items(self, response):
        title = response.xpath('//div[@class="al-headline"]/\
                                div[@class="container"]/h1').extract()
        if len(title):
            item = None
            link = response.url

            title = strip_tags(title[0])

            # parse date
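            # The byline ends with an ordinal day ("1st", "2nd", ...) that
            # strptime cannot parse generically, so the matching suffix is
            # spliced into the format string as a literal below.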
            created = response.xpath('//h4[@class="byline"]').extract()[0]
            created = created.split('>')[-2].strip()[:-4]
            ord_str = None
            if 'st,' in created:
                ord_str = 'st'
            elif 'nd,' in created:
                ord_str = 'nd'
            elif 'rd,' in created:
                ord_str = 'rd'
            elif 'th,' in created:
                ord_str = 'th'
            created_format = '%H:%M %p | %A, %B %d' + ord_str + ', %Y'
            created = time.strptime(created, created_format)

            #content = response.xpath('/html/body/div[6]/div[8]/div/div[2]/div[2]').extract()
            content = response.xpath('//div[@class="main-article"]').extract()

            #tags = response.xpath('//div[@class="story"]\
            #            /div[@class="main"]/div[@class="tags"]\
            #            /a[@class="tag"]/text()').extract()

            item = NewsItem()
            item['link'] = link
            item['title'] = title
            item['created'] = strftime('%Y-%m-%d', created)
            item['content'] = content
            #item['tags'] = list(set(tags))
            item.save()

            return item

    def link_filter(self, links):
        ret = []
        for link in links:
            parsed_url = urlparse(link.url)
            if not News.objects.filter(link=parsed_url).count():
                ret.append(link)
        return ret

    def process_title(self, response):
        pass
Example #11
class DoubanSpider(CrawlSpider):

    name = "doubanmovie"
    allowed_domains = ["movie.douban.com"]
    # start_urls = ["http://movie.douban.com/tag/2016?start=0&type=T"]
    # start_urls = ["http://movie.douban.com"]
    start_urls = [
        "http://movie.douban.com/tag/1994", "http://movie.douban.com/tag/1995",
        "http://movie.douban.com/tag/1996", "http://movie.douban.com/tag/1997"
    ]

    rules = [
        # All grep rules here
        # Parse Movie Information
        Rule(SgmlLinkExtractor(allow=(r'tag/\d{4}\?start=\d+', ))),
        Rule(SgmlLinkExtractor(allow=(
            r'https://movie\.douban\.com/subject/\d+/collections\?start=[2468]0$',
        )),
             callback='parse_comment',
             follow=True),
        Rule(SgmlLinkExtractor(
            allow=(r'https://movie\.douban\.com/subject/\d+/$', )),
             callback='parse_page',
             follow=True),
        # Parse Movie Comments
        Rule(SgmlLinkExtractor(allow=(
            r'https://movie\.douban\.com/subject/\b\d+\b/collections$', )),
             callback='parse_comment',
             follow=True)
    ]

    def parse_page(self, response):
        sel = Selector(response)
        item = MovieItem()
        # print 'Crawl ' + response.url + ' start...'

        item['name'] = sel.xpath(
            '//h1/span[@property="v:itemreviewed"]/text()').extract()
        item['year'] = sel.xpath('//h1/span[@class="year"]/text()').extract()
        item['director'] = sel.xpath(
            '//a[@rel="v:directedBy"]/text()').extract()
        item['date'] = sel.xpath(
            '//span[@property="v:initialReleaseDate"]/text()').extract()
        item['time'] = sel.xpath(
            '//span[@property="v:runtime"]/text()').extract()
        item['description'] = sel.xpath(
            '//span[@property="v:summary"]/text()').extract()
        item['value'] = sel.xpath(
            '//strong[@property="v:average"]/text()').extract()
        item['people'] = sel.xpath(
            '//span[@property="v:votes"]/text()').extract()
        item['image_url'] = sel.xpath(
            '//a[contains(@href, "photos")]/img/@src').extract()
        item['star5'] = sel.xpath(
            '//span[@class="stars5 starstop"]/following-sibling::*[2]/text()'
        ).extract()
        item['star4'] = sel.xpath(
            '//span[@class="stars4 starstop"]/following-sibling::*[2]/text()'
        ).extract()
        item['star3'] = sel.xpath(
            '//span[@class="stars3 starstop"]/following-sibling::*[2]/text()'
        ).extract()
        item['star2'] = sel.xpath(
            '//span[@class="stars2 starstop"]/following-sibling::*[2]/text()'
        ).extract()
        item['star1'] = sel.xpath(
            '//span[@class="stars1 starstop"]/following-sibling::*[2]/text()'
        ).extract()
        item['movietype'] = sel.xpath(
            '//span[@property="v:genre"]/text()').extract()
        item['actor'] = sel.xpath(
            '//span/span[@class="attrs"]/a[@rel="v:starring"]/text()').extract(
            )
        item['writer'] = sel.xpath(
            u'//span/span[./text()="编剧"]/following-sibling::*/a/text()'
        ).extract()
        item['country'] = sel.xpath(
            u'//span[./text()="制片国家/地区:"]/following::text()[1]').extract()
        item['language'] = sel.xpath(
            u'//span[./text()="语言:"]/following::text()[1]').extract()
        item['othername'] = sel.xpath(
            u'//span[./text()="又名:"]/following::text()[1]').extract()
        item['movie_id'] = GetMovieOrUserID(response.url)
        item['movie_url'] = response.url
        #		item['site'] = sel.xpath('//div[@id="info"]/span[contains(@href, "http")]/text()').extract()

        print 'Crawl ' + response.url + ' done...'
        # print item
        return item

    def parse_comment(self, response):
        items = []
        sel = Selector(response)
        print 'Crawl ' + response.url + ' start...'

        comments = sel.xpath('//table[@width="100%"]')
        for comment in comments:
            # print comment
            item = CommentItem()

            item['user_img'] = comment.xpath('.//img/@src').extract()
            item['user_name'] = comment.xpath('.//img/@alt').extract()
            item['user_city'] = comment.xpath(
                './/span[@style="font-size:12px;"]/text()').extract()
            item['user_value'] = comment.xpath(
                './/p[@class="pl"]/span/@class').extract()
            item['comment'] = comment.xpath(
                './/p[@class="pl"]/following::*[1]/text()').extract()
            item['comment_date'] = comment.xpath(
                './/p[@class="pl"]/text()').extract()
            url = comment.xpath(
                './/td[@width="80"]/a[contains(@href, "people")]/@href'
            ).extract()
            print '======================'
            print 'Get This URL ID'
            print url
            print '======================'
            item['user_url'] = url
            item['user_id'] = GetMovieOrUserID(url)
            item['movie_id'] = GetMovieOrUserID(response.url)
            items.append(item)
            print item

        return items

    def parse_try(self, response):
        pass
class WikiSpider(CrawlSpider):
    pcounter = 0
    name = "wiki"
    allowed_domains = ["freeswitch.org"]
    # start_urls = [
    #     "http://wiki.freeswitch.org/"
    # ]
    start_urls = [
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=.1.3.6.1.4.1.27880&to=FS_weekly_2010_11_10",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=FS_weekly_2010_11_17&to=Java_ESL_Client",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Javascript&to=Mod_managed",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Mod_memcache&to=Report_Issue_Checklist",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Reporting_Bugs&to=Variable_execute_on_tone_detect",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Variable_export_vars&to=Variable_stream_prebuffer",
        "http://wiki.freeswitch.org/index.php?title=Special:AllPages&from=Variable_suppress-cng&to=Zeroconf.conf.xml"
    ]

    # rules = (
    #     Rule(SgmlLinkExtractor(), callback='parse_item', follow=True),
    # )

    # <a href="/wiki/Release_Notes" title="Release Notes">
    # wiki/Special:
    # wiki/User_talk:
    # wiki/User:
    # wiki/Talk:

    rules = [
        # Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),
        Rule(SgmlLinkExtractor(allow=[r'wiki/\w+']),
             callback='parse_item',
             follow=True),
        # Rule(SgmlLinkExtractor(allow=[r'wiki/\w+'], deny=[r'wiki/[Special\:|User_talk\:|User\:|Talk\:]\w+']), callback='parse_item', follow=True),
    ]

    # r'page/\d+' : regular expression for http://isbullsh.it/page/X URLs

    # rules = (
    #     # Extract links matching 'category.php' (but not matching 'subsection.php')
    #     # and follow links from them (since no callback means follow=True by default).
    #     Rule(SgmlLinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

    #     # Extract links matching 'item.php' and parse them with the spider's method parse_item
    #     Rule(SgmlLinkExtractor(allow=('item\.php', )), callback='parse_item'),
    # )

    # def parse(self, response):
    #     sel = Selector(response)
    #     sites = sel.xpath('//ul/li')
    #     items = []
    #     for site in sites:
    #         item = DmozItem()
    #         item['title'] = site.xpath('a/text()').extract()
    #         item['link'] = site.xpath('a/@href').extract()
    #         item['desc'] = site.xpath('text()').extract()
    #         items.append(item)
    #     return items

    # Try this in a shell
    def parse_item(self, response):
        self.pcounter += 1
        self.log('Hi, this is an item page (%d)! %s' %
                 (self.pcounter, response.url))
        sel = Selector(response)
        item = WikiItem()
        item['title'] = sel.xpath('//title/text()').extract()
        item['pageurl'] = response.url
        # item['content'] = sel.xpath('//div[re:test(@id, "content")]').extract()
        list_links = []
        # for links in sel.xpath('//a/@href').extract():
        for links in sel.xpath(
                '//div[re:test(@id, "content")]//a/@href').extract():
            if links[:6] != '/wiki/':
                continue
            if links.find('wiki/Special:') != -1 or links.find(
                    'wiki/User_talk:') != -1 or links.find(
                    'wiki/User:') != -1 or links.find(
                    'wiki/Talk:') != -1 or links.find('wiki/Category:') != -1:
                continue
            list_links.append(links)
        # item['links'] = list_links
        return item

    def parse_item_long(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        hxs = HtmlXPathSelector(response)
        item = IsBullshitItem()
        # Extract title
        item['title'] = hxs.select('//header/h1/text()').extract()[0]
        # Extract author
        item['author'] = hxs.select('//header/p/a/text()').extract()[0]
        # Extract tag(s)
        item['tag'] = hxs.select(
            "//header/div[@class='post-data']/p/a/text()").extract()
        # Extract date
        item['date'] = hxs.select(
            "//header/div[@class='post-data']/p[contains(text(), '20')]/text()"
        ).extract()[0]
        # Extract location
        item['location'] = hxs.select(
            "//header/div[@class='post-data']/p[contains(text(), 'From')]/text()"
        ).extract()[0].replace('From', '')
        # Extract article url
        urls = hxs.select(
            "//div[@class='breadcrumb-container']/ul[@class='breadcrumb']/li/a/@href"
        ).extract()
        item['url'] = urlparse.urljoin(urls[1], urls[2])
        # Extract article text, with html tags
        item['article_html'] = hxs.select(
            "//div[@role='main']/article").extract()[0]

        return item
Example #13
class StackoverflowSpider(CrawlSpider):
    name = 'stackoverflow'
    allowed_domains = ['stackoverflow.com']
    start_urls = get_stack_urls()

    rules = (
        Rule(SgmlLinkExtractor(allow=r'(.*)\?tab=answers(.*)'),
             callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=r'/questions/'),
             callback='parse_question', follow=False),
    )

    question_xpath = "//div[@id='question']"
    answers_xpath = "//div[@id='answers']//div[@data-answerid]"

    def parse_item(self, response):
        pass

    def parse_question(self, response):
        # print 'I am parsing question'
        hxs = HtmlXPathSelector(response)
        for question_selector in hxs.select(self.question_xpath):
            yield self.get_question(question_selector, response)

        for answer_selector in hxs.select(self.answers_xpath):
            yield self.get_answer(answer_selector, response)


    # label can be 'question' or 'answer'
    def get_user(self, selector, response, label):
        user_loader = XPathItemLoader(item = StackOverflowUser(),
                selector = selector)
        user_loader.add_xpath('user_name', ''.join([
            './/div[contains(@class, "user-details")]',
            '/a/text()'
            ]))
        user_loader.add_xpath('user_link', ''.join([
            './/div[contains(@class, "user-details")]',
            '/a/@href'
            ]))

        if user_loader.get_output_value('user_link'):
            user_loader.add_value('user_id',
                    user_loader.get_output_value('user_link'))

        return user_loader.load_item()

    def get_question(self, selector, response):
        hxs = HtmlXPathSelector(response)
        number_of_answers = hxs.select(''.join([
            '//div[@id="answers"]',
            '//div[contains(@class, "answers-subheader")]',
            '/h2/text()'
            ])).extract()

        question_loader = XPathItemLoader(item = StackOverflowQuestion(),
                selector = selector)
        question_loader.add_xpath('question_content', ''.join([
            ".//td[@class='postcell']",
            "//div[@class='post-text']/p/text()"
            ]))
        question_loader.add_xpath('question_tags', ''.join([
            ".//div[@class='post-taglist']",
            "//a[@class='post-tag']/text()"
            ]))
        question_loader.add_xpath('question_id', ''.join([
            './@data-questionid'
            ]))
        question_loader.add_xpath('marks', ''.join([
            ".//span[contains(@class, 'vote-count-post')]/text()"
            ]))
        question_loader.add_value('asker', self.get_user(selector, response, 'question'))
        question_loader.add_value('number_of_answers',
                int(number_of_answers[0].strip().split(' ')[0]))

        question_title = hxs.select(''.join([
            '//div[contains(@id, "question-header")]',
            '//a[contains(@class, "question-hyperlink")]/text()'
            ])).extract()
        question_loader.add_value('question_title', question_title)
        # print  question_loader.get_output_value('question_title')

        return question_loader.load_item()

    def get_answer(self, selector, response):
        answer_loader = XPathItemLoader(item = StackOverflowAnswer(),
                selector = selector)
        answer_loader.add_xpath('answer_content', ''.join([
            ".//td[@class='answercell']/div[@class='post-text']",
            "/p/text()"
            ]))
        answer_loader.add_xpath('answer_id', ''.join([
            "./@data-answerid"
            ]))
        answer_loader.add_xpath('marks', ''.join([
            ".//span[contains(@class, 'vote-count-post')]/text()"
            ]))
        # is best answer?
        if selector.select('./@class').extract()[0].find('accepted-answer') != -1:
            answer_loader.add_value('is_best_answer', 1)
        else:
            answer_loader.add_value('is_best_answer', 0)
        # get user name
        answer_loader.add_value('answerer', self.get_user(selector, response, 'answer'))

        return answer_loader.load_item()
Example #14
class TestSpider(InitSpider):

    name = "test"
    allowed_domains = ["test.co.kr"]
    login_page = "http://local.test.co.kr/login"
    start_urls = "http://local.test.co.kr/"
    # Define how the site is crawled using Rule objects.
    rules = (
        #Rule(SgmlLinkExtractor(allow=r'-\w+.html$'), callback='parse_item', follow=True),
        Rule(SgmlLinkExtractor(allow=("local\.test\.co\.kr[^\s]*\/*$")), callback='parse_item', follow=True),
    )

    ## init_request() runs first.
    def init_request(self):
        ## Request the login page and hand the response to login().
        return Request(url=self.login_page, callback=self.login)

    ## Use FormRequest to submit the login form on that page.
    def login(self, response):
        return FormRequest.from_response(response,
                    formdata={'id': '0000', 'password': '******'},
                    callback=self.check_login_response)

    ## Parse the returned HTML to decide whether the login succeeded.
    def check_login_response(self, response):
        # check login success
        if "/auth/logout" in response.body:
            ## If logged in, call initialized() to start crawling.
            return self.initialized()
        else:
            return self.error()

    def initialized(self):
        return Request(url=self.start_urls, callback=self.parse_item)

    def parse_item(self, response):
        ## Load the already-collected URLs for de-duplication.
        if self.isFirstLoop:
            self.tempUrls = self.getUrlSet()
            self.isFirstLoop = 0
        site = "test"
        rank = "0"
        title = response.xpath('//title/text()').extract()
        req_url = response.request.url.replace('http://'+host, '', 1)
        res_url = response.url
        s  = re.search("<(!\s*doctype\s*.*?)>", response.body, re.IGNORECASE)
        doctype = s.group(1) if s else ""
        css = response.xpath('//link/@href').extract()
        js = response.xpath('//script/@src').extract()
        layout = response.xpath('//div[@class="debug_layout"]/text()').extract()
        sidebar = response.xpath('//div[@class="debug_side_layout"]/text()').extract()
        emulate = response.xpath('//meta[contains(@content, "IE")]/@content').extract()
        embed_style_cnt = len(response.xpath('//style').extract())
        embed_script_cnt = len(response.xpath('//script').extract()) - len(response.xpath('//script/@src').extract())
        # 호스트부분은 제거해 준다.
        ckurl = req_url.replace("http://local.test.co.kr", "")
        ckurl = req_url.replace("https://local.test.co.kr", "")
        if ckurl.find('?') > -1 :
            ckurl = ckurl.split('?')[0]
        elif len(ckurl.split('/')) > 4 :
            piece = ckurl.split('/')
            ckurl = piece[0]+'/'+piece[1]+'/'+piece[2]+'/'+piece[3]+'/'+piece[4]
        # Check for duplicates.
        if ckurl in self.tempUrls:
            print ">>>>>>>>>>>>>>>[DropItem]:" + ckurl
            raise #DropItem("Duplicate url found: %s" % ckurl)
        else :
            req_url = ckurl
            self.tempUrls.add(req_url)
            if len(layout) > 0 :
                layout = layout[-1]
            else :
                layout = ",".join(layout)
            if len(sidebar) > 0 :
                sidebar = sidebar[-1]
            else :
                sidebar = ",".join(sidebar)
            item = SaraminWebItem()
            item["site"] = site
            item["rank"] = rank
            item["title"] = ",".join(title)
            item["req_url"] = req_url
            item["res_url"] = res_url
            item["doctype"] = doctype
            item["css"] = ",".join(css)
            item["js"] = ",".join(js)
            item["layout"] = layout
            item["sidebar"] = sidebar
            item["emulate"] = ",".join(emulate)
            item["embed_style_cnt"] = embed_style_cnt
            item["embed_script_cnt"] = embed_script_cnt
            # print(item);
            yield item
Example #15
class buildkSpiders(CrawlSpider):
    handle_httpstatus_list = [302]
    name = "bk"
    start_urls = [
        'http://buildkar.com/building-materials/blocks/',
        'http://buildkar.com/building-materials/boards-planks/',
        'http://buildkar.com/building-materials/boards-planks/page/2/',
        'http://buildkar.com/building-materials/bricks/'
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=(".*/building-materials/cement-rmc/.*", ),
                               deny=(".*add-to-cart=.*", ),
                               unique=True),
             callback='parse_item',
             follow=True),
        Rule(SgmlLinkExtractor(allow=(".*/page/.*", ), unique=True),
             callback='parse_item',
             follow=True),
    )

    def parse(self, response):
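        # Only the product detail links are extracted here; the per-field scraping
        # and model-saving code below has been left commented out.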
        #print ">>>>>", response.request.url
        sel = Selector(response)
        items = []
        item = BuildkarItem()
        # item['url'] = response.request.url
        # #import pdb;pdb.set_trace()
        # title = (map(unicode.strip,sel.xpath('//h3[@class="heading-title product-title"]//a/text()').extract()))
        # #print len(title)
        # item['title'] = title
        # category = (map(unicode.strip,sel.xpath('//div[@class="product-meta-wrapper"]/div[@class="wd_product_categories"]/a/text()').extract()))
        # item['category'] = category
        # price1 = (map(unicode.strip,sel.xpath('//span[@class="price"]/del/span[@class="amount"]/text()').extract()))
        # item['price1'] = price1
        # price = (map(unicode.strip,sel.xpath('//ins/span[@class="amount"]/text()').extract()))
        # item['price'] = price
        item['href'] = sel.xpath(
            '//h3[@class="heading-title product-title"]/a/@href').extract()
        # print len (item['href'])
        # description = []
        # for i in range (len(item['href'])) :
        # 	url = item['href'][i]
        # 	html_doc = urllib2.urlopen (url)
        # 	soup = BeautifulSoup (html_doc.read ())
        # 	raw_data = soup.find('div',{'id':"content_description"})
        # 	p =raw_data.text
        # 	print">>>>>",p
        # 	description.append (p)
        # for x1 in range(len(title)):
        # 	print title[x1]
        # 	item1 = BaseProducts()
        # 	item1.source_url = item['url'][x1]
        # 	item1.Sku = title[x1]
        # 	item1.title = title[x1]
        # 	item1.category_name=category[x1]
        # 	item1.description = description[x1]
        # 	item1.source_id = 5
        # 	item1.save()
        # 	item2 = Subscribe_Product()
        # 	item3 = SPM()
        # 	item2.bp = item1
        # 	item2.source_id =5
        # 	item2.Sku = title[x1]
        # 	item2.save()
        # 	item3.sp = item2
        # 	item3.Sku = title[x1]
        # 	try:
        # 		item3.price = price[x1].replace("[","").replace(",","").replace("u","").replace("]","").replace("'","").replace("\xa0","").replace("Rs.","")
        # 	except : item3.price = 0
        # 	try:
        # 		item3.store_price = price1[x1].replace("[","").replace(",","").replace("u","").replace("]","").replace("'","").replace("\xa0","").replace("Rs.","")
        # 	except: item3.store_price = 0
        # 	item3.source_id = 5
        # 	#item3.saller = item['Saller']
        # 	item3.save()
        # 	item4 = Category()
        # 	item4.category_name = category[x1]
        # 	item4.category_path = category[x1]
        # 	item4.level = "3"
        # 	item4.source_id = 5
        # 	item4.save()
        # 	if item['title'] :
        # 		# print item
        return item
Example #16
class MovieSpider(CrawlSpider):
    name = "movie"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com"]

    rules = (
        Rule(LinkExtractor(allow=r"/subject/\d+/($|\?\w+)"), 
            callback="parse_movie", follow=True),
    )
    def parse_movie(self, response):
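        # Skip TV-series pages, then fill the MovieItem field by field
        # using the helper methods below.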
        item = MovieItem()
        
        if self.parse_tv(response, item) is False:
            return None

        item["douban_id"] = response.url.split("/")[-2]
        self.get_name(response, item)
        self.get_name_other(response, item)
        self.get_directors(response, item)
        self.get_actors(response, item)
        self.get_countries(response, item)
        self.get_genres(response, item)
        self.get_languages(response, item)
        self.get_runtime(response, item)
        self.get_description(response, item)
        self.get_release_date(response, item)
        self.get_tags(response, item)
        self.get_image(response, item)
        self.get_douban_rating(response, item)
        
        #for i in item.keys():
            #print(i + "  " + str(item[i]))

        return item

    def get_name(self, response, item):
        name = response.xpath("//title/text()").extract()
        if name: item["name"] = name[0].replace(u" (豆瓣)", "").strip()

    def get_name_other(self, response, item):
        year = response.xpath("//*[@id='info']").re(NAMEOTHER_RE)
        if year: item["name_other"] = year[0]

    def get_directors(self, response, item):
        directors = response.xpath("//a[@rel='v:directedBy']/text()").extract()
        if directors: item["directors"] = '/'.join(directors)

    def get_actors(self, response, item):
        stars = response.xpath("//a[@rel='v:starring']/text()").extract()
        if stars: item["actors"] = '/'.join(stars)

    def get_genres(self, response, item):
        genres = response.xpath("//span[@property='v:genre']/text()").extract()
        if genres: item["genres"] = '/'.join(genres)

    def get_runtime(self, response, item):
        runtime = response.xpath("//span[@property='v:runtime']/text()").re(NUM_RE)
        if runtime:
            item["runtime"] = runtime[0]
            
    def get_douban_rating(self, response, item):
        average = response.xpath("//strong[@property='v:average']/text()").extract()
        if average and average[0] != "": item["douban_rating"] =  average[0]

    def get_tags(self, response, item):
        T = []
        tags = response.xpath("//div[@class='tags-body']/a")
        for tag in tags:
            t = tag.xpath("text()").extract()
            if t: T.append(t[0])
        if T: item["tags"] = '/'.join(T)

    def get_languages(self, response, item):
        S = "".join(response.xpath("//div[@id='info']").extract() )
        M = LANGUAGES_RE.search(S)
        if M is not None:
            item["languages"] = M.group(1)

    def get_countries(self, response, item):
        S = "".join(response.xpath("//div[@id='info']").extract() )
        M = COUNTRIES_RE.search(S)
        if M is not None:
            item["countries"] = M.group(1)

    def get_description(self, response, item):
        summary = response.xpath("//span[@property='v:summary']/text()").extract()
        if summary: item["description"] = "<br/>".join( summary )

    def get_image(self, response, item):
        image = response.xpath("//*[@id='mainpic']/a/img").re(IMAGE_RE)
        if image: item["image"] = image[0]

    def get_release_date(self, response, item):
        comment = response.xpath("//span[@property='v:initialReleaseDate']/text()").extract()
        if comment: item["release_date"] = '/'.join(comment)

    def parse_tv(self, response, item):
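        # Returns False when the info block matches TV_RUNTIME_RE,
        # i.e. the page describes a TV series rather than a movie.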
        S = "".join( response.xpath("//div[@id='info']//text()").extract() )
        M = TV_RUNTIME_RE.search(S)
        if M is not None:
            return False
        return True
Example #17
class ls(CrawlSpider):
    name = "ls"
    download_delay = 2
    allowed_domains = ["xe.gr"]

    start_urls = [
        "http://www.xe.gr/property/search?Publication.age=1&System.item_type=re_land&Transaction.type_channel=117518&page=1&per_page=50"
    ]

    rules = (Rule(LxmlLinkExtractor(
        allow_domains=('xe.gr'),
        restrict_xpaths=("//a[@class='white_button right']")),
                  callback='parse_start_url',
                  follow=True), )

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
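        # Request the spec view ("?mode=spec") of every listing on the results page.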
        for sel in response.xpath("//div[contains(@class,'r_desc')]/h2/a"):
            link = "http://www.xe.gr" + sel.xpath(
                "@href").extract_first() + "?mode=spec"
            yield Request(link, callback=self.parse2)

    def parse2(self, response):
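        # Scrape the spec table of a single land listing; each field is located
        # by its Greek <th> label.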
        # Creating an empty item object
        item = {}
        # Assigning values to it's fields
        item['url'] = response.url

        region_string = response.xpath(
            u"//th[text()='Περιοχή:']/following-sibling::*/text()"
        ).extract_first()
        region_list = region_string.strip().split(' > ')
        item['regionA'] = region_list[0]
        try:
            item['regionB'] = region_list[1]
        except (IndexError):
            item['regionB'] = None
        try:
            item['regionC'] = region_list[2]
        except (IndexError):
            item['regionC'] = None
        try:
            item['regionD'] = region_list[3]
        except (IndexError):
            item['regionD'] = None
        price_string = response.xpath(
            u"//td[@class='auto_price']/span/text()").extract_first()
        try:
            item['price'] = float(price_string.strip().replace(
                u" €", "").replace(".", "").replace(",", "."))
        except:
            item['price'] = None
        item['location_name'] = response.xpath(
            u"//th[text()='Τοποθεσία:']/following-sibling::*/text()"
        ).extract_first()
        item['category'] = response.xpath(
            u"//th[text()='Είδος:']/following-sibling::*/text()"
        ).extract_first()
        area_string = response.xpath(
            u"//th[text()='Εμβαδόν:']/following-sibling::*/text()"
        ).extract_first()
        try:
            item['area'] = float(area_string.strip().replace(".", "").replace(
                ",", "."))
        except:
            item['area'] = None
        item['city_plan'] = response.xpath(
            u"//th[text()='Σχέδιο Πόλης:']/following-sibling::*/text()"
        ).extract_first()
        item['structure_factor'] = response.xpath(
            u"//th[text()='Συντελεστής Δόμησης:']/following-sibling::*/text()"
        ).extract_first()
        item['coverage_factor'] = response.xpath(
            u"//th[text()='Συντελεστής Κάλυψης:']/following-sibling::*/text()"
        ).extract_first()
        facade_length_string = response.xpath(
            u"//th[text()='Πρόσοψη:']/following-sibling::*/text()"
        ).extract_first()
        try:
            item['facade_length'] = float(facade_length_string)
        except:
            item['facade_length'] = None
        try:
            item['facade_count'] = float(
                response.xpath(
                    u"//th[text()='Αριθμός Όψεων:']/following-sibling::*/text()"
                ).extract_first())
        except:
            item['facade_count'] = None
        item['airy'] = response.xpath(
            u"//th[text()='Διαμπερές:']/following-sibling::*/text()"
        ).extract_first()
        item['slope'] = response.xpath(
            u"//th[text()='Κλίση:']/following-sibling::*/text()"
        ).extract_first()
        item['artio'] = response.xpath(
            u"//th[text()='Άρτιο:']/following-sibling::*/text()"
        ).extract_first()
        item['oikodomisimo'] = response.xpath(
            u"//th[text()='Οικοδομήσιμο:']/following-sibling::*/text()"
        ).extract_first()
        item['me_adia'] = response.xpath(
            u"//th[text()='Με άδεια οικοδομής:']/following-sibling::*/text()"
        ).extract_first()
        try:
            item['ktizei'] = float(
                response.xpath(
                    u"//th[text()='Κτίζει:']/following-sibling::*/text()").
                extract_first())
        except:
            item['ktizei'] = None
        item['availability'] = response.xpath(
            u"//th[text()='Διαθεσιμότητα:']/following-sibling::*/text()"
        ).extract_first()
        item['availability_from'] = response.xpath(
            u"//th[text()='Διαθέσιμο από:']/following-sibling::*/text()"
        ).extract_first()
        item['antiparoxi'] = response.xpath(
            u"//th[text()='Και αντιπαροχή:']/following-sibling::*/text()"
        ).extract_first()  # Not sure about this xpath
        item['view'] = response.xpath(
            u"//th[text()='Θέα:']/following-sibling::*/text()").extract_first(
            )
        try:
            item['dist_from_sea'] = float(
                response.xpath(
                    u"//th[text()='Απόσταση από Θάλασσα:']/following-sibling::*/text()"
                ).extract_first())
        except:
            item['dist_from_sea'] = None
        item['paling'] = response.xpath(
            u"//th[text()='Περίφραξη:']/following-sibling::*/text()"
        ).extract_first()
        item['supplies'] = response.xpath(
            u"//th[text()='Παροχές:']/following-sibling::*/text()"
        ).extract_first()
        item['drilling'] = response.xpath(
            u"//th[text()='Γεώτρηση:']/following-sibling::*/text()"
        ).extract_first()
        item['with_building'] = response.xpath(
            u"//th[text()='Κτίσμα:']/following-sibling::*/text()"
        ).extract_first()
        item['corner_plot'] = response.xpath(
            u"//th[text()='Γωνιακό:']/following-sibling::*/text()"
        ).extract_first()
        item['mesites'] = response.xpath(
            u"//th[text()='Μεσίτες δεκτοί:']/following-sibling::*/text()"
        ).extract_first()
        item['epaggelmatiki_xrisi'] = response.xpath(
            u"//th[text()='Επαγγελματική χρήση:']/following-sibling::*/text()"
        ).extract_first()
        item['dimensions'] = response.xpath(
            u"//th[text()='Διαστάσεις:']/following-sibling::*/text()"
        ).extract_first()
        item['contains'] = response.xpath(
            u"//th[text()='Περιέχει:']/following-sibling::*/text()"
        ).extract_first()
        #  Now also fetch the date the listing was last modified.
        yield Request(response.url[:-10],
                      callback=self.parse3,
                      meta={'item': item})

    def parse3(self, response):
        # Retrieving the item
        item = response.meta['item']
        # Assigning more values to it's fields
        x = response.xpath("//td[@class='headItem']/text()").extract_first()
        datelist = x.split(" ")
        months = [
            u'Ιανουαρίου', u'Φεβρουαρίου', u'Μαρτίου', u'Απριλίου', u'Μαΐου',
            u'Ιουνίου', u'Ιουλίου', u'Αυγούστου', u'Σεπτεμβρίου', u'Οκτωβρίου',
            u'Νοεμβρίου', u'Δεκεμβρίου'
        ]
        date = datetime.date(int(datelist[3]),
                             months.index(datelist[2]) + 1, int(datelist[1]))
        item['date'] = date
        try:
            item['details'] = response.xpath("//p[@class='dets']").xpath(
                "text()").extract_first().strip()
        except:
            item['details'] = None
        yield item
Example #18
class MySpider(CrawlSpider):
    name = "swimoutlet"
    allowed_domains = ["swimoutlet.com"]
    start_urls = [
        ##    "http://www.swimoutlet.com/womens-tan-thru-swimsuits-c9374/",
        ##    "http://www.swimoutlet.com/shoes-accessories-c10211/",
        ##    "http://www.swimoutlet.com/swim-caps-c9633/#cat=9633&clrc=481&sortby=Popularity"
        ##    "http://www.swimoutlet.com/womens-swim-dresses-c9373/",
        ##    "http://www.swimoutlet.com/shoes-accessories-c10211/",
        ##    "http://www.swimoutlet.com/swimming-watches-c14082/",
        ##    "http://www.swimoutlet.com/kickboards-c9661/"
        i.strip() for i in urllist
    ]

    rules = (
        ##    Rule (SgmlLinkExtractor(allow=(),
        ##                            restrict_xpaths=('//ul[@class="pagination"]',))
        ##    , follow= True),
        Rule(SgmlLinkExtractor(
            allow=(),
            restrict_xpaths=
            ('//ul[@class="pagination floatR"] | //nav[@id="blockcontentmnutop"]/span[last()]/a',
             )),
             callback="parse_category",
             follow=True), )

    def parse_category(self, response):
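        # Match the page breadcrumb against the preloaded category names and queue
        # one request per product URL, carrying the category data in request.meta.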
        ##  def parse(self,response):

        sel = Selector(response)
        hxs = HtmlXPathSelector(response)
        pageurl = response.url.strip()
        breadcrumb = sel.xpath(
            "//nav[@id='blockcontentmnutop']/span[last()]/a/text()").extract(
            )[0].strip()
        ##    url = response.url
        ##    for i in range(len(urllist)):
        ##      if url == urllist[i]:
        ##        row =(breadcrumb,priceidlist[i],url,cat1list[i],cat2list[i],cat3list[i])
        ##        mywriter.writerow(row)

        for i in range(len(urllist)):
            if breadcrumb == categnamelist[i]:
                producturls = sel.xpath(
                    "//div[@class='pd-details']/a/@href").extract()
                for x in producturls:
                    item = BigCItem()
                    item['Category'] = cat1list[i]
                    item['Category2'] = cat2list[i]
                    item['Category3'] = cat3list[i]
                    item['id1'] = priceidlist[i]
                    request = Request(x, callback=self.parse_items)
                    request.meta["item"] = item
                    yield request


##  def parse(self,response):
##    item = BigCItem()
##    item['Category'] = ''
##    item ['id1'] = 'Apparel'

    def parse_items(self, response):
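        # Scrape one product page: names, prices, description and images, then decode
        # the size/color JavaScript arrays to write product, rule and SKU rows to the CSV.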
        item = response.meta['item']
        sel = Selector(response)
        hxs = HtmlXPathSelector(response)
        pname = sel.xpath("//h1/text()").extract()[0]
        item["Product_Name"] = pname
        item["Option_Set"] = pname
        item[
            "Product_Image_Description_1"] = "Buy " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
        item[
            "MetaDescription"] = "Get your hands on the " + pname + ". Buy it Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
        item[
            "TitleTag"] = "Buy the " + pname + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
        item["Brand_Name"] = sel.xpath(
            "//div[@id='divhoverpopup']/h2/a/@title").extract()[0]
        pcode = sel.xpath(
            "//div[@class='pro-code']/i/text()").extract()[0].replace(
                "Product Code: ", "")
        item["Product_Code"] = pcode
        item["Product_Description"] = sel.xpath(
            "//div[@class='pro-description']/p |//div[@class='pro-description']/ul"
        ).extract()
        item["Product_Description"] = ''.join(
            item["Product_Description"]).encode('utf-8')
        mrp = sel.xpath(
            "//span[@id='ListPrice']/text()| //span[@id='ProductPrice']/text()"
        ).extract()[0].replace("$", "")
        sp = sel.xpath(
            "//span[@id='PriceRange']/text() | //span[@id='SalePrice']/text()"
        ).extract()
        item["Sale_Price"] = ""
        if item["Brand_Name"] in ("FINIS", "Arena", "Speedo", "Finis", "2XU",
                                  "Garmin", "HYDRO-FIT", "Nike", "TYR",
                                  "Yurbuds", "Timex"):
            sortorder = "-300"
        elif item["Brand_Name"] == "Sporti":
            sortorder = "-270"
        else:
            sortorder = "-270"
        item['Retail_Price'], item['Sale_Price'] = mycsv.pricing(
            mrp, sp, item['id1'])

        breadcrumb = sel.xpath(
            "//nav[@class='block-content mnu-top mnu-top-product-detail']/span[last()]/a/text()"
        ).extract()[0].strip()
        if breadcrumb == 'Swim.com Compatible':
            breadcrumb = sel.xpath(
                "//nav[@class='block-content mnu-top mnu-top-product-detail']/span[last()-1]/a/text()"
            ).extract()[0].strip()

        if item['Category'] and item['Category2'] and item['Category3']:
            Category = item['Category'] + '/' + breadcrumb + ';' + item[
                'Category2'] + '/' + breadcrumb + ';' + item[
                    'Category3'] + '/' + breadcrumb
        elif item['Category'] and item['Category2']:
            Category = item['Category'] + '/' + breadcrumb + ';' + item[
                'Category2'] + '/' + breadcrumb + ';'
        else:
            Category = item['Category'] + '/' + breadcrumb

        size = response.xpath(
            '//*[@id="divChooseOption2"]/div[2]/script[1][contains(text(),"arraySize")]'
        ).extract()
        colorArray = response.xpath(
            '//script[@language="JavaScript"][contains(text(),"arrayColor")]'
        ).extract()
        if size or colorArray:
            trackinventory = "By Option"
        else:
            trackinventory = "By Product"

        item["Product_Image_File1"] = response.xpath(
            '//*[@id="divChooseOption2"]/img/@name |//div[@class="box-content block-content mnu-content pro-option"]/img/@name'
        ).extract()

        tup = (
            "Product",
            item["Product_Name"] + "*",
            item["Brand_Name"],
            item["Retail_Price"],
            item["Retail_Price"],
            item["Sale_Price"],  #price
            item["Product_Code"] + "SWMOTLT",
            Category,
            "SWIMOUTLET",
            item["Product_Description"],
            "100",
            item["Product_Name"],
            "15-21 Working days",
            "N",
            sortorder,
            item["MetaDescription"],
            item["TitleTag"],
            item["Product_Image_Description_1"],
            "Y",
            trackinventory,
            "1",
            "2",
            "3",
            "4",
            "5",
            "6",
            "7")
        obj = list(tup)
        c = 0
        for i in item["Product_Image_File1"]:
            c = c + 1
            imgurl = "http://www.swimoutlet.com/photos/" + i + ".jpg"
            if size or colorArray:
                imgurl = "http://www.swimoutlet.com/photos/options/" + i + ".jpg"
            obj.append(imgurl)
            if c == 7:
                break
        row = tuple(obj)

        if size:
            size = response.xpath(
                '//*[@id="divChooseOption2"]/div[2]/script[1][contains(text(),"arraySize")]'
            ).extract()[0].replace("arraySize[0] =", "")
            size = re.sub(r'<script(.*)>', '', size)
            size = size.replace("arraySize = new Array();", "")
            size = re.sub(r'arraySize(.*)=', ',', size)
            size = size.replace("[", '').replace("];", "").replace(
                "'", '"').replace('",', '":').replace("</script>", "")
            size = "[{" + size + "}]"
            item['size'] = {}
            item['size'] = json.loads(size)[0]
        else:
            item['size'] = ""

        if colorArray:
            colorArray = response.xpath(
                '//script[@language="JavaScript"][contains(text(),"arrayColor")]'
            ).extract()[0].replace("arrayColor[0] =", "")
            colorArray = re.sub(r'<script(.*)>', '', colorArray)
            colorArray = colorArray.replace("var arrayColor = new Array();",
                                            "")
            colorArray = re.sub(r'arrayColor(.*)=', ',', colorArray)
            colorArray = colorArray.replace("[", '').replace("];", "").replace(
                "'", '"').replace('",', '":').replace("</script>", "")
            #print colorArray
            colorArray = "[{" + colorArray + "}]"
            item['color'] = {}
            item['color'] = json.loads(colorArray)[0]
            item['variant'] = {}
            for colorcode, color in item['color'].iteritems():
                if item['size'] == "":
                    item['variant'][colorcode + "_" +
                                    colorcode] = "[S]Color= " + color
                elif len(item['size']) == 1:
                    for sizecode, size in item['size'].iteritems():
                        item['variant'][
                            colorcode + "_" +
                            sizecode] = "[S]Color= " + color + ",[RB]Size= " + size
                elif len(item['color']) == 1:
                    for sizecode, size in item['size'].iteritems():
                        item['variant'][
                            colorcode + "_" +
                            sizecode] = "[RB]Color= " + color + ",[S]Size= " + size
                else:
                    for sizecode, size in item['size'].iteritems():
                        item['variant'][
                            colorcode + "_" +
                            sizecode] = "[S]Color= " + color + ",[S]Size= " + size

            combosArray = sel.xpath(
                '//script[@language="JavaScript"][contains(text(),"var separator")]'
            ).extract()[0]
            combosArray = re.findall(r'id=.*name', combosArray)
            combosArray = [
                w.replace("id='size_", "").replace("name", "").replace(
                    '"', "").replace("'", "").replace(" ", "")
                for w in combosArray
            ]
            priceArray = sel.xpath(
                '//script[@language="JavaScript"][contains(text(),"var separator")]'
            ).extract()[0]
            priceArray = re.findall(r'value.*/', priceArray)
            priceArray = [
                w.replace("/", "").replace("value=", "").replace("'", "")
                for w in priceArray
            ]
            item["Price"] = dict(zip(combosArray, priceArray))
            #print pricedict
            notfound = 0

            for key, price in item['Price'].iteritems():
                if key not in item['variant']:
                    notfound = 1
                    break

            if notfound == 0:
                mywriter.writerow(row)
                for key, price in item["Price"].iteritems():
                    row = ("Rule", item['variant'][key], "", "", "", "",
                           pcode + key, "", "SWIMOUTLET", "", "", "", "", "",
                           "", "", "", "", "", "", "", "", "", "", "", "", "",
                           "http://www.swimoutlet.com/photos/options/" +
                           pcode + "-" + key.split("_")[0] + "-zoomin.jpg")
                    mywriter.writerow(row)
                    row1 = ("SKU", item['variant'][key], "", "", "", "",
                            pcode + key, "", "SWIMOUTLET", "", "100", "", "",
                            "", "", "", "", "", "", "", "", "", "", "", "", "",
                            "", "http://www.swimoutlet.com/photos/options/" +
                            pcode + "-" + key.split("_")[0] + "-zoomin.jpg")

                    mywriter.writerow(row1)
Example #19
class TabelogSpider(CrawlSpider):
    name = 'tabebot'
    allowed_domains = ['tabelog.com']
    download_delay = 1.0

    prefectures = [
        'hokkaido',
        'aomori',
        'iwate',
        'miyagi',
        'akita',
        'yamagata',
        'fukushima',
        'ibaraki',
        'tochigi',
        'gunma',
        'saitama',
        'chiba',
        'tokyo',
        'kanagawa',
        'niigata',
        'toyama',
        'ishikawa',
        'fukui',
        'yamanashi',
        'nagano',
        'gifu',
        'shizuoka',
        'aichi',
        'mie',
        'shiga',
        'kyoto',
        'osaka',
        'hyogo',
        'nara',
        'wakayama',
        'tottori',
        'shimane',
        'okayama',
        'hiroshima',
        'yamaguchi',
        'tokushima',
        'kagawa',
        'ehime',
        'kochi',
        'fukuoka',
        'saga',
        'nagasaki',
        'kumamoto',
        'oita',
        'miyazaki',
        'kagoshima',
        'okinawa',
    ]

    categories = [
        'japanese',  # Japanese cuisine
        'RC0102',  # sushi / seafood
        'RC0103',  # tempura / deep-fried dishes
        'RC0104',  # soba / udon / noodles
        'RC0105',  # eel / loach
        'RC0106',  # yakitori / skewers / chicken dishes
        'RC0107',  # sukiyaki / shabu-shabu
        'RC0108',  # oden
        'RC0109',  # okonomiyaki / takoyaki
        'RC0110',  # regional cuisine
        'RC0111',  # rice bowls (donburi)
        'RC0199',  # Japanese food (other)
        'RC0201',  # steak / hamburg steak
        'RC0203',  # teppanyaki
        'RC0202',  # pasta / pizza
        'hamburger',  # hamburgers
        'RC0209',  # Western-style cuisine
        'french',  # French
        'italian',  # Italian
        'RC0219',  # other Western cuisines
        'RC0301',  # Chinese cuisine
        'RC0302',  # gyoza / steamed buns
        'RC0303',  # Chinese congee
        'RC0304',  # Chinese noodles
        'korea',  # Korean cuisine
        'RC0402',  # Southeast Asian cuisine
        'RC0403',  # South Asian cuisine
        'RC0404',  # West Asian cuisine
        'RC0411',  # Latin American cuisine
        'RC0412',  # African cuisine
        'RC0499',  # Asian / ethnic (other)
        'RC1201',  # curry rice
        'RC1202',  # European-style curry
        'RC1203',  # Indian curry
        'RC1204',  # Thai curry
        'RC1205',  # soup curry
        'RC1299',  # curry (other)
        'RC1301',  # yakiniku / horumon (offal)
        'RC1302',  # jingisukan (grilled mutton)
        'nabe',  # hot pot
        'izakaya',  # izakaya
        'RC2102',  # dining bar
        'RC2199',  # izakaya / dining bar (other)
        'RC9901',  # set meals / cafeterias
        'RC9902',  # creative / fusion cuisine
        'RC9903',  # natural food / medicinal cuisine
        'RC9904',  # bento / onigiri
        'RC9999',  # restaurants (other)
        'ramen',  # ramen
        'MC11',  # tsukemen
        'SC0101',  # bread
        'SC0201',  # Western sweets
        'SC0202',  # Japanese sweets / dessert shops
        'SC0203',  # Chinese sweets
        'SC0299',  # sweets (other)
    ]

    start_urls = [
        'http://tabelog.com/{0}/rstLst/{1}/?SrtT=rt&Srt=D'.format(
            prefecture, category) for prefecture in prefectures
        for category in categories
    ]

    rules = [
        # Follow business list pagination
        Rule(LxmlLinkExtractor(allow=(r'[a-z]+/rstLst/RC\d+/\d+/\?.*', ),
                               deny=(r's.tabelog.com')),
             follow=True),

        # Extract business
        Rule(LxmlLinkExtractor(allow=(r'[a-z]+/A\d{4}/A\d{6}/\d+/$', ),
                               deny=(r's.tabelog.com')),
             callback='parse_business'),

        # Follow review list pagination (first page)
        Rule(LxmlLinkExtractor(
            allow=(r'[a-z]+/A\d{4}/A\d{6}/\d+/dtlrvwlst/$', ),
            deny=(r's.tabelog.com')),
             follow=True),

        # COND-0 all reviews
        # COND-1 dinner-time reviews
        # COND-2 lunch-time reviews

        # smp0 simple list
        # smp1 normal view
        # smp2 full text

        # Follow review list pagination and extract reviews
        Rule(LxmlLinkExtractor(
            allow=(r'[a-z]+/A\d{4}/A\d{6}/\d+/dtlrvwlst/COND-0/smp2/\?.+', ),
            deny=(r'favorite_rvwr', r's.tabelog.com')),
             follow=True,
             callback='parse_reviews_and_users'),
    ]

    def is_tabelog(self, response):
        selector = Selector(response)
        return bool(selector.xpath("//img[@id='tabelogo']"))

    def parse_reviews_and_users(self, response):
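        # Re-request the page if it is not a real Tabelog page; otherwise build
        # ReviewItem and UserItem objects from every review node via PyQuery.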
        if not self.is_tabelog(response):
            return Request(url=response.url, dont_filter=True)

        dom = PyQuery(response.body)
        review_nodes = dom('div.rvw-item')
        business_id = int(
            re.findall(r'[a-z]+/A\d{4}/A\d{6}/(\d+)/dtlrvwlst/',
                       response.url)[0])

        reviews_and_users = []
        for review_node in review_nodes:
            user_id = self._extract_user_id(review_node)
            review = self._generate_review(review_node, business_id, user_id)
            if review:
                reviews_and_users.append(review)
            user = self._generate_user(review_node, user_id)
            if user:
                reviews_and_users.append(user)
        return reviews_and_users

    def _extract_user_id(self, review_node):
        user_link = review_node.cssselect(
            '.rvw-item__rvwr-name > a:first-child')
        if user_link:
            url = user_link[0].attrib['href']
            return re.findall(r'rvwr/(.+)/', url)[0]

    def _generate_review(self, review_node, business_id, user_id):
        review = ReviewItem()

        review['review_id'] = int(review_node.getchildren()[0].attrib['name'])
        review['business_id'] = business_id
        set_value_if_true(review, 'user_id', user_id)

        review['visit'] = review_node.cssselect(
            '.rvw-item__visit-month-num')[0].text
        review['text'] = [
            sentence for sentence in review_node.cssselect(
                'div.rvw-item__rvw-comment > p')[0].itertext()
        ]
        review['title'] = review_node.cssselect(
            'p.rvw-item__rvw-title')[0].text_content().strip()

        for meal in ['dinner', 'lunch']:
            css = 'span.rvw-item__usedprice-icon--{0}'.format(meal)
            review['price_{0}'.format(meal)] = review_node.cssselect(css)[0] \
                                                          .getnext().text_content()

            set_value_if_true(review, 'stars_{0}'.format(meal),
                              self._extract_stars(review_node, meal))
        review['situations'] = self._extract_situations(review_node)
        return review

    def _extract_stars(self, review_node, meal):
        lis = review_node.cssselect(
            'li.rvw-item__ratings-item--{0}'.format(meal))
        if not lis:
            return

        stars = {}
        li = lis[0]
        stars['total'] = convert_to_float_if_float(
            li.cssselect('strong.rvw-item__ratings-total-score')[0].text)

        lis = li.cssselect('ul.rvw-item__ratings-dtlscore > li')
        for li, criterion in zip(
                lis, ['taste', 'service', 'ambience', 'cp', 'drink']):
            score = li.cssselect(
                'strong.rvw-item__ratings-dtlscore-score')[0].text
            stars[criterion] = convert_to_float_if_float(score)

        return stars

    def _extract_situations(self, review_node):
        imgs = review_node.cssselect('p.rvw-item__situation > img')
        situations = []
        for img, situation in zip(
                imgs,
            ['friends', 'date', 'settai', 'party', 'family', 'alone']):
            if not img.attrib['src'].endswith('_g.gif'):
                situations.append(situation)
        return situations

    def _generate_user(self, review_node, user_id):
        user = UserItem()

        user['user_id'] = user_id
        user['name'] = review_node.cssselect(
            '.rvw-item__rvwr-name > a > span')[0].text.strip()
        counts = review_node.cssselect('.rvw-item__rvwr-rvwcount')
        if counts:
            count = counts[0].text
            count_candidates = re.findall(r'\d+', count)
            if count_candidates:
                user['review_count'] = int(count_candidates[0])

        profile = review_node.cssselect('.rvw-item__rvwr-profile')
        if profile:
            user['profile'] = profile[0].text_content().strip()
        user['verified'] = bool(review_node.cssselect('.mark-auth-mobile'))
        return user

    def parse_business(self, response):
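        # Extract the restaurant's name, categories, ratings, budgets and
        # location codes from a business detail page.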
        if not self.is_tabelog(response):
            return Request(url=response.url, dont_filter=True)

        selector = Selector(response)

        business = BusinessItem()
        business['business_id'] = int(
            re.findall(r'[a-z]+/A\d{4}/A\d{6}/(\d+)/', response.url)[0])
        business['name'] = selector.xpath(
            "//span[@class='display-name']/text()")[0].extract().strip()
        business['categories'] = selector.xpath(
            "//span[@property='v:category']/text()").extract()

        stars = selector.xpath(
            "//span[@property='v:average']/text()")[0].extract().strip()
        business['stars'] = convert_to_float_if_float(stars)

        for meal in ['dinner', 'lunch']:
            price = selector.xpath(
                "//dt[@class='budget-{0}']/following-sibling::dd/em/a/text()".
                format(meal)).extract()
            if price:
                business['price_{0}'.format(meal)] = price[0]
            stars = selector.xpath(
                "//div[@class='score-s']/span[@class='{0}']/following-sibling::em/text()"
                .format(meal))[0].extract()
            business['stars_{0}'.format(meal)] = convert_to_float_if_float(
                stars)

        review_count = selector.xpath(
            "//em[@property='v:count']/text()")[0].extract()
        business['review_count'] = convert_to_int_if_int(review_count)

        business['prefecture'] = selector.xpath(
            "//p[@class='pref']/a/text()")[0].extract().strip()
        business['area'] = re.findall(r'[a-z]+/(A\d{4})/A\d{6}/\d+/',
                                      response.url)[0]
        business['subarea'] = re.findall(r'[a-z]+/A\d{4}/(A\d{6})/\d+/',
                                         response.url)[0]

        # business['menu_items'] = self._generate_menu_items(response)
        return business

    def _generate_menu_items(self, response):
        # TODO: implement me
        pass
Example #20
class FunTVSpider(CrawlSpider):
    """
    Crawler for TV series on fun.tv (Funshion).
    """

    name = 'fun_tv'
    allowed_domains = ['fun.tv', ]
    start_urls = [
        'http://www.fun.tv/retrieve/c-e794b5e8a786e589a7.n-e5bdb1e78987.pg-1'
    ]

    rules = [
        Rule(sle(allow=('/retrieve/c-e794b5e8a786e589a7.n-e5bdb1e78987.pg-\d+$', )), follow=True, callback='parse1'),
    ]

    def parse1(self, response):
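        # Walk the listing grid, skip shows already stored in the database, and
        # request each show's detail page with a partially filled FunVideo in meta.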
        sel = Selector(response)

        tv_list = sel.css('body div.mod-list.page-wrap div div.mod-wrap-in.mod-vd-lay.fix div.mod-vd-i')

        for tv in tv_list:
            tv_id = tv.css('div.info h3 a::attr(data-id)').extract()[0]

            if db_session.query(FunVideo).filter(FunVideo.id == tv_id).first():
                continue

            name = tv.css('div.info h3 a::attr(title)').extract()[0]
            image = tv.css('div.pic a img::attr(_lazysrc)').extract()[0]
            description = tv.css('div.info p::text').extract()[0]
            point = tv.css('div.info h3 b::text').extract()[0]

            request = Request('http://www.fun.tv{}'.format(tv.css('div.pic a::attr(href)').extract()[0]), callback=self.parse2)
            fv = FunVideo(id=tv_id, name=name, name_pinyin=pinyin.get_initials(name, splitter=''),
                          image=image, description=description, point=point)

            request.meta['tv'] = fv
            yield request

    def parse2(self, response):
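        # Complete the show record, persist it, then queue one request per episode;
        # entries whose link contains a <span> badge are skipped.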
        tv = response.meta['tv']

        sel = Selector(response)
        tv.origin_url = response.url
        tv.director = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(2) span::text').extract())
        tv.starring = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(3) span::text').extract())
        tv.category = ''.join(sel.css('div#main-rt div.mod-datum p:nth-child(4) span::text').extract())

        tv.detail = sel.css('div#main-rt div.mod-datum p.dirtext span:nth-child(2)::text').extract()[0]

        print tv.name, '------->', tv.origin_url

        # type = 1 marks this record as a TV series
        tv.type = 1

        db_session.add(tv)
        db_session.commit()

        sub_tv_list = sel.css('div#playCont div div div div.torrent-panel ul li')

        for st in sub_tv_list:
            try:
                st.css('a span').extract()[0]
            except IndexError:
                sub_tv_index = st.css('::attr(data-idx)').extract()[0]
            else:
                continue

            sub_tv = SubFunViedo(fv_id=tv.id, index=sub_tv_index)
            sub_tv.id = st.css('::attr(data-vid)').extract()[0]
            sub_tv.origin_url = 'http://www.fun.tv{}'.format(st.css('a::attr(href)').extract()[0])

            print sub_tv.index, '-------->', sub_tv.origin_url

            request1 = Request(sub_tv.origin_url, callback=self.parse3)
            request1.meta['sub_tv'] = sub_tv
            yield request1

    def parse3(self, response):
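        # Read the play-count text from the episode page, strip its prefix and
        # thousands separators, then persist the episode.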

        print 'parse 3 ------->'

        sub_tv = response.meta['sub_tv']

        sel = Selector(response)
        play_count = sel.css('div.playInfo.crumbs div.rightBtn.fix a::text').extract()[0]

        sub_tv.play_count = ''.join(play_count[3:].split(','))

        db_session.add(sub_tv)
        db_session.commit()
Example #21
class mensrunning(CrawlSpider):
  name = "roadrunner"
  allowed_domains = ["roadrunnersports.com",
                     "roadrunnersports.scene7.com"]
  start_urls = ["http://www.roadrunnersports.com/rrs/products/BRK1078/mens-brooks-beast-12/"
    #"http://www.roadrunnersports.com/rrs/products/ASC1724/mens-asics-gelkayano-21/",
                #"http://www.roadrunnersports.com/rrs/products/ASC1726/"
##    "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=2094&searchQuery=mensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=1482&searchQuery=mensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=2715&searchQuery=mensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/c/track-spikes/",
##                "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=128&searchQuery=mensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=3555&searchQuery=mensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/mensshoes/?SearchHandle=QT1tZW5zc2hvZXMgbWVudXJyc35CPW1lbnNzaG9lcyBtZW51cnJzfkQ9MjR_RT0wXjFeMl5Qcmlvcml0eTJ_ST1Tb3J0VklQUHJpY2V_Sz00fkw9MX5NPTQ0fg&Action=2&AnswerID=2547&searchQuery=mensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2094&searchQuery=womensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=128&searchQuery=womensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=1482&searchQuery=womensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2715&searchQuery=womensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2905&searchQuery=womensshoes%20menurrs",
##                "http://www.roadrunnersports.com/rrs/womensshoes/?SearchHandle=QT13b21lbnNzaG9lcyBtZW51cnJzfkI9d29tZW5zc2hvZXMgbWVudXJyc35EPTI0fkU9MF4xXjJeUHJpb3JpdHkyfkk9U29ydFZJUFByaWNlfks9NH5MPTF_TT00Nn4&Action=2&AnswerID=2547&searchQuery=womensshoes%20menurrs"
  ]

  rules = (Rule (SgmlLinkExtractor(allow=(),restrict_xpaths=('//td[@id="paging_count"]',)), follow= True),
           Rule (SgmlLinkExtractor(restrict_xpaths=('//div[@class="product_colorways_image"]',)), callback="parse_item", follow= True),)

  csvfile = None
  printHeader = True
  def to_csv(self, item):
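    # Append one product row (plus its SKU rows) to the export CSV, reusing product
    # and SKU ids from the previously uploaded master sheet when the product is found there.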
    start = 0
    end=0
    
    if self.printHeader:
      self.csvfile = open('RoadRunnerSportsRunningShoes.csv','w')
    if self.csvfile:

      strWrite = ''
      #headers
      if self.printHeader: 
        strWrite +='Item Type,Product ID,Product Name,Brand Name,Price,Retail Price,Sale Price,Product Description,Product Code/SKU,Bin Picking Number,'
        strWrite +='Category,Option Set,Product Availability,Current Stock Level,Free Shipping,Sort Order, Meta Description,Page Title, Product Image Description - 1,Product Image Is Thumbnail - 1,'
        strWrite +='Track Inventory,Product Image Sort - 1,Product Image Sort - 2,Product Image Sort - 3,Product Image Sort - 4,Product Image Sort-5,Product Image Sort-6,Product Image Sort-7,'
        strWrite +='Product Image File - 1,Product Image File - 2,Product Image File - 3,Product Image File - 4,Product Image File - 5 ,Product Image File - 6,Product Image File - 7, \n'
        self.printHeader = False

      pfound = 0 #counter to find product from master sheet. If 0 after the loop, product is a NEW product and not uploaded previously
      productid = 0 #Storing the Product ID value for the product row
      for color,sizes in item['variant'][item['sku']].iteritems():
        for i in range(len(namelist)): #Loop to go through all the Item Types in old file.          
          if typelist[i] == "Product" and skulist[i] == (item['sku']+item['color'][color]):           
          #if typelist[i] == "Product" and namelist[i] == (item['Product_Name']+" " + item['color'][color]+"*"): #Comparing Product Names from old sheet and new scrapped
            
            start = i # Counter to store index of found Product Name
            pfound = 1
            productid = idlist[i]
            for r in range(i+1,len(namelist)): #Loop to start at the Counter and Look for next occurance of Item Type = "Product"
              if typelist[r] == "Product" : 
                break   #Loop breaks for next occurance of Item Type = "Product" 
              else:
                  end = end+1 #Counting the number of SKUS for each product from the OLD sheet  
      print "#",pfound
      #not Found Products
      if pfound ==0:
        if item["Brand_Name"] not in("Nike","adidas","Reebok","Puma"):
          for color,sizes in item['variant'][item['sku']].iteritems():
            # generate product row
            strWrite += 'Product,,'+item['Product_Name']+" " + item['color'][color]+"*"+','+item['Brand_Name']+','+item['Retail_Price']+','+item['Retail_Price']+','+item['Sale_Price']+','
            strWrite += '.'.join(item["Product_Description"]).replace(',',';').replace("\n","").replace("\r","").replace('<.*?>)',"").replace("When choosing running shoes, find your perfect fit using this chart. Category types are based on factors like arch height, running habits and body frame.","").replace("This web exclusive item ships separately within the continental U.S. only. You can count on this item to ship in 3-5 business days!","") + ","
            strWrite += item['sku']+item['color'][color]+ ',' + "ROADRUNNER" +','
            strWrite += item['Category']  + ',' + item['Product_Name']+item['color'][color] + ',' + item["Product_Availability"] +','
            strWrite += item["Current_Stock"] + ',' + item["Free_Shipping"] + ',' + item["Sort_Order"]  + "," + "Buy the " + item['Product_Name']+" " + item['color'][color] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
            strWrite += ',' + "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ','
            strWrite += "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ',' + item["Product_Image_Is_Thumbnail_1"] + ',' + item["Track_Inventory"] + ','
            strWrite += item["Product_Image_Sort_1"] + ',' + item["Product_Image_Sort_2"] + ',' + item["Product_Image_Sort_3"] + ','
            strWrite += item["Product_Image_Sort_4"] + ',' + item["Product_Image_Sort_5"] + ',6,7,'
            strWrite += ','.join(item['Product_Image_File1'][color])+',\n'
            #strWrite += 'Product,'+item['productname']+','+item['sku']+','+item['color'][color]+',,,,'+','.join(item[''][color])+',\n'
            #only write availabe products to csv
            for width,sizeList in sizes.iteritems():
              for size,sku in sizeList.iteritems():
                strWrite += 'SKU,,[S]Size= US '+size+'.Width ='+width+',,,,,,'+sku+','+"ROADRUNNER,,,,100"+',\n'
      else:
        if item["Brand_Name"] not in("Nike","adidas","Reebok","Puma"):
          for color,sizes in item['variant'][item['sku']].iteritems():
            print pfound
            # generate product row
            strWrite += 'Product,'+productid+","+item['Product_Name']+" " + item['color'][color]+"*"+','+item['Brand_Name']+','+item['Retail_Price']+','+item['Retail_Price']+','+item['Sale_Price']+','
            strWrite += '.'.join(item["Product_Description"]).replace(',',';').replace("\n","").replace("\r","").replace('<.*?>)',"").replace("When choosing running shoes, find your perfect fit using this chart. Category types are based on factors like arch height, running habits and body frame.","").replace("This web exclusive item ships separately within the continental U.S. only. You can count on this item to ship in 3-5 business days!","") + ","
            strWrite += item['sku']+item['color'][color]+ ',' + "ROADRUNNER" +','
            strWrite += item['Category']  + ',' + item['Product_Name']+item['color'][color] + ',' + item["Product_Availability"] +','
            strWrite += item["Current_Stock"] + ',' + item["Free_Shipping"] + ',' + item["Sort_Order"]  + "," + "Buy the " + item['Product_Name']+" " + item['color'][color] + " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts"
            strWrite += ',' + "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ','
            strWrite += "Buy the " + item['Product_Name']+" " + item['color'][color]+ " Online in India at LiveYourSport.com| Free Shipping and Massive Discounts" + ',' + item["Product_Image_Is_Thumbnail_1"] + ',' + item["Track_Inventory"] + ','
            strWrite += item["Product_Image_Sort_1"] + ',' + item["Product_Image_Sort_2"] + ',' + item["Product_Image_Sort_3"] + ','
            strWrite += item["Product_Image_Sort_4"] + ',' + item["Product_Image_Sort_5"] + ',6,7,'
            strWrite += ','.join(item['Product_Image_File1'][color])+',\n'
        #VARIANT PRINTING SECTION

          old_dict = {} #Dictionary to contain old SKUs and Sizes
          oldlen = 0
          for i in range(start+1,start+1+end): #Storing all list of SKUS in a new list. Will be used for comparing with the new list
            old_dict[0,oldlen]=skulist[i]
            old_dict[1,oldlen]= sizelist[i]
            old_dict[2,oldlen]= idlist[i]
            oldlen = oldlen+1
            
          new_dict = {} #Dictionary to contain new SKUs and Sizes
          c=0
          for width,sizeList in sizes.iteritems():
              for size,sku in sizeList.iteritems():
                new_dict[0,c] = sku
                new_dict[1,c] = 'SKU,,[S]Size= US '+size+'.Width ='+width
                c= c+1
                
          diff_dict = {} #Dict which contains older skus
          r=0
          for i in range(oldlen):
            found = 0
            for x in range(c):
              if old_dict[0,i] == new_dict[0,x]:            
                found = 1
                break
            if found ==0:
              diff_dict[0,r] = old_dict[0,i]
              diff_dict[1,r] = old_dict[1,i]
              diff_dict[2,r] = old_dict[2,i]
              r=r+1

          for width,sizeList in sizes.iteritems():
            t=0
            for size,sku in sizeList.iteritems():
              if sku == old_dict[0,i]:
                strWrite += 'SKU,'+old_dict[2,i]+',[S]Size= US '+size+'.Width ='+width+',,,,,,'+sku+','+"ROADRUNNER,,,,100"+',\n'
                t = 1
              if t==0:# For SKUS which are new and hence will not have a product ID
                strWrite += 'SKU,,[S]Size= US '+size+'.Width ='+width+',,,,,,'+sku+','+"ROADRUNNER,,,,100"+',\n'

          if diff_dict:
            for i in range (r):
              strWrite += 'SKU,'+diff_dict[2,i]+','+diff_dict[1,i] +',,,,,,'+diff_dict[0,i] + ',' +'ROADRUNNER,,,,0,,,,,,,,,,,\n' 
        self.csvfile.write(strWrite.encode('utf8'))

  #def parse_item(self, response):
  def parse(self, response):
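    # Scrape name, pricing, categorisation and colour/image-set links from the product
    # page, then POST the product id to build-selections.jsp for size/width data.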
    sel = Selector(response)
    url = 'http://www.roadrunnersports.com/rrs/product-detail/build-selections.jsp'
    item = BigCItem()
    pname =  response.xpath("//meta[@property='og:title']/@content").extract()[0]
    item ["Product_Name"]  = response.xpath("//meta[@property='og:title']/@content").extract()[0]

    if "Trail" in pname :
      item ["Product_Name"]  = response.xpath("//meta[@property='og:title']/@content").extract()[0] + " Running Shoe"
    
    
    mrp = float(sel.xpath("//span[@class='prod_detail_reg_price']/span/text()").extract()[0])
    
    item ["Retail_Price"]  = str((mrp*65 + mrp*30/100*70/100*65)*112.5/100 + mrp*65*15/100)
    item_sp               = response.xpath("//span[@class='prod_detail_sale_price']/span/text()").extract()
    
    if item_sp:
      sp = float(sel.xpath("//span[@class='prod_detail_sale_price']/span/text()").extract()[0].split("-")[-1].replace("$",""))
      item ["Sale_Price"]         = str((sp*65 + 30/100*70*65)*112.5/100 + sp*65*15/100)
    else:
      item ["Sale_Price"]         = ''
    #categorization
    cat     =  response.xpath("//div[@id='grp_1']/p/span[1]/text()")
    sex =  response.xpath("//meta[@property='og:title']/@content").extract()[0]
    if "Women's" in sex:
      sex = "Women's"
    else:
      sex = "Men's"

    item["Product_Description"] = response.xpath("//div[@id='grp_1']/p").extract() + response.xpath("//div[@id='grp_1']/ul/li").extract()
       
    if cat:
#      item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/" + sel.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()[0].replace("+","")
      cat= ";Shoes/"+sex+" Running Shoes/"+response.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()[0].replace("+","") +" Running Shoes"
      
      item ["Product_Name"]  = response.xpath("//meta[@property='og:title']/@content").extract()[0] + " " + response.xpath("//div[@id='grp_1']/p/span[1]/text()").extract()[0] + " Running Shoe"
    else:
      cat= ""

    if any("hiking" in s for s in item["Product_Description"]) or any("Hiking" in s for s in item["Product_Description"]):
      item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Shoes/Hiking Shoes" + cat
    elif any("trail" in s for s in item["Product_Description"]) or any("Trail" in s for s in item["Product_Description"]):
      item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/Trail Running Shoes" + cat
    elif any("minimalist" in s for s in item["Product_Description"]) or any("barefoot" in s for s in item["Product_Description"]) or any("Barefoot" in s for s in item["Product_Description"]):
      item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/Barefoot Running Shoes" + cat
    elif any("spike" in s for s in item["Product_Description"]):
      item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Running Shoes/Racing Spikes" + cat
    elif any("cross-train" in s for s in item["Product_Description"])or any("trainer" in s for s in item["Product_Description"])or any("training shoe" in s for s in item["Product_Description"]) or any("gym" in s for s in item["Product_Description"]) or any("workout" in s for s in item["Product_Description"]):
      item ["Category"] = "Run & Cycle/Running/Running Shoes;Shoes/"+ sex + " Shoes/Cross Training Shoes" + cat   
    else:
      if cat:
        item ["Category"] = "Run & Cycle/Running/Running Shoes"+ cat
      else:
        item ["Category"] = "NULL"
        
    item ["Brand_Name"]          = response.xpath("//span[@itemprop='brand']/text()").extract()[0]
    if item["Brand_Name"] in ("Asics","Mizuno","Brooks","Saucony","New Balance"):
       item ["Sort_Order"] = str(-300-(20/100*mrp))
    elif item["Brand_Name"] in ("Under Armour","Altra","Hoka One One","Inov8","Salomon","Vibram FiveFingers"):
        item ["Sort_Order"] = str(-270-(20/100*mrp))
    else :
      item ["Sort_Order"] = str(-250-(20/100*mrp))
      
    item["Product_Availability"] = "12-17 Working Days"
    item["Current_Stock"] = "100"
    item ["Free_Shipping"] = "N"
    item["Product_Image_Is_Thumbnail_1"] = "Y"
    item["Track_Inventory"] = "By Option"
    item["Product_Image_Sort_1"] = "1"
    item["Product_Image_Sort_2"] = "2"
    item["Product_Image_Sort_3"] = "3"
    item["Product_Image_Sort_4"] = "4"
    item["Product_Image_Sort_5"] = "5"
    
    item ["imageSetUrls"] = {}
    item ["imageSetUrls2"] = {}
    colors                = response.xpath("//a[@class='ref2QIColor']/@name").extract()
    item ["Product_Image_File1"]      = {}
    hrefs                 = response.xpath("//a[@class='ref2QIColor']/@href").extract()
    item ["color"]     = {}
    for idx,href in enumerate(hrefs):
      #create links to image sets
      if colors[idx] not in item ["imageSetUrls"]:
        item ["imageSetUrls"][colors[idx]] = []
      item ["imageSetUrls"][colors[idx]].append("http://roadrunnersports.scene7.com/is/image/roadrunnersports/"+href.split('/')[-1].split('_')[0]+"-IS?req=set,json&scl=1")
      if colors[idx] not in item ["imageSetUrls2"]:
        item ["imageSetUrls2"][colors[idx]] = []
      item ["imageSetUrls2"][colors[idx]].append("http://roadrunnersports.scene7.com/is/image/roadrunnersports/"+href.split('/')[-1].split('_')[0]+"-IS?req=set,json&scl=1")
      item ["color"][href.split('/')[-1].split('_')[0].split('-')[1]] = colors[idx]
      
    #request product info as json
    item ["sku"]          = response.url.strip('/').split('/')[-2]
    payload               = {'id':item ["sku"]}
    request               = FormRequest(url,formdata=payload,callback=self.parseJsonProduct)
    request.meta['item']  = item

    return request

  #parse product info from json file 
  def parseJsonProduct(self,response):
    item                  = response.meta['item']
    #make a valid json file out of it and remove unneeded data
    prodResponse          = response.body.split('$+$')[0].strip().replace("'",'"')
    prodDict              = {}
    sizeWidthDict         = {}
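    # Build a nested lookup of variant SKUs keyed as
    # prodDict[product sku][color code][width][size] = variant sku, plus a map of each size
    # to the widths it is available in.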
    jsonresponse          = json.loads(prodResponse)
    for product,value in jsonresponse.iteritems():
      if item["sku"] not in prodDict:
        prodDict[item["sku"]]={}
      if value['c'] not in prodDict[item["sku"]]:
        prodDict[item["sku"]][value['c']] ={}
      if value['w'] not in prodDict[item["sku"]][value['c']]:
        prodDict[item["sku"]][value['c']][value['w']]={}
      if value['s'] not in sizeWidthDict:
        sizeWidthDict[value['s']] = []
      if value['w'] not in sizeWidthDict[value['s']]:
        sizeWidthDict[value['s']].append(value['w'])
      prodDict[item["sku"]][value['c']][value['w']][value['s']]=value['sku']
    item['variant']       = prodDict
    item['size_width_list'] = sizeWidthDict
    #request first imageset
    if item["imageSetUrls"]:
      color,href            = item["imageSetUrls"].popitem()
      if len(href)>1:
        item["imageSetUrls"][color] = href[1:]
      request               = Request(href[0],callback=self.parseJsonImageSet)
      request.meta['item']  = item
      return request
      
    self.to_csv(item)
    return item

  def parseJsonImageSet(self,response):
    item                  = response.meta['item']
    imageSetResponse      = response.body
    #make a valid json file out of it, if only one image available it was a list => make a dict 
    imageSetResponse      = imageSetResponse.replace('/*jsonp*/s7jsonResponse(','')
    imageSetResponse      = ','.join(imageSetResponse.split(',')[:-1])
    imageSetResponse      = imageSetResponse.replace('"item":[','"item":')
    imageSetResponse      = imageSetResponse.replace('"item":','"item":[')
    imageSetResponse      = imageSetResponse.replace('}]}}','}}}')
    imageSetResponse      = imageSetResponse[::-1].replace('}}}','}}]}')[::-1]
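    # The reverse/replace/reverse above rewrites only the *last* '}}}' into '}}]}', i.e. it
    # closes the "item" list that the earlier replaces normalized into list form, whether the
    # response contained a single image or a full image set.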

    color                 = response.url.split('-')[1].split('?')[0]
    isImageSet            = False
    if len(response.url.split('-'))>2:
      isImageSet          = True
    item['Product_Image_File1'][color] = []
    
    jsonresponse          = json.loads(imageSetResponse)
    for index,imageItem in enumerate(jsonresponse['set']['item']):
      #check if there is a image set or only one image
      if 'isDefault' not in imageItem['i']:
        imageUrl = 'http://roadrunnersports.scene7.com/is/image/'+imageItem['i']['n']+'?iv='+imageItem['iv']
        #response url is image set => image can be scaled
        if isImageSet:
          imageUrl += '&scl=1'
        item['Product_Image_File1'][color].append(imageUrl)
      else:
        # there is no image set append request for default image
        if item['color'][color] not in item["imageSetUrls"]:
          item ["imageSetUrls"][item['color'][color]] = []
        if item['color'][color] not in item["imageSetUrls2"]:
          item ["imageSetUrls2"][item['color'][color]] = []
        item["imageSetUrls"][item['color'][color]].append('http://roadrunnersports.scene7.com/is/image/roadrunnersports/'+item['sku']+'-'+color+'?req=set,json&scl=1')
        item["imageSetUrls2"][item['color'][color]].append('http://roadrunnersports.scene7.com/is/image/roadrunnersports/'+item['sku']+'-'+color+'?req=set,json&scl=1')

    if item["imageSetUrls"]:
      color,href            = item["imageSetUrls"].popitem()
      if len(href)>1:
        item["imageSetUrls"][color] = href[1:]
      request               = Request(href[0],callback=self.parseJsonImageSet)
      request.meta['item']  = item
      return request

    self.to_csv(item)
    return item
class FBSpider(CrawlSpider):

    name = "fb_kudryasheva"
    allowed_domains = ["facebook.com"]

    start_urls = [
        'https://www.facebook.com/alya.khaitlina',  # start_urls must be absolute URLs
    ]

    rules = (
        Rule(
            LinkExtractor(
                allow=("facebook.com/alya.khaitlina/posts",
                       "facebook.com/photo"),
                #allow=("facebook.com/TatyanaTolstaya", "facebook.com/photo"),
                restrict_xpaths='//a[@class="_5pcq"]'),
            callback='parse_page',
            follow=True), )

    def __init__(self, category=None, *args, **kwargs):
        super(FBSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse_page(self, response):

        # scrape dynamically generated HTML
        self.driver.get(response.url)
        hxs = Selector(text=self.driver.page_source)
        item = ScraperItem()

        # use scrapy shell to find xpath
        # from scrapy.shell import inspect_response
        # inspect_response(response)

        try:
            divs = hxs.xpath(
                '//div[@id="contentArea"]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/descendant-or-self::*/text()'
            ).extract()
            text = u" ".join(divs[1:])
            no_text = len(divs) == 0
        except IndexError:
            no_text = True

        if no_text:
            try:
                text = " ".join(
                    hxs.xpath(
                        '//span[@class="hasCaption"]/child::node()').extract())
            except IndexError:
                text = ""

        item['url'] = response.url
        item['text'] = text
        item['title'] = hxs.xpath('//title/text()').extract()
        item['date'] = hxs.xpath(
            '//span[@class="timestampContent"]/text()').extract()

        comments = float(hxs.xpath('count(//abbr)').extract()[0]) - 1

        try:
            likes = hxs.xpath(
                '//div[@class="UFILikeSentenceText"]/span/span/text()'
            ).extract()[0]

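            # The "like" sentence is either "<name> likes this" or a list such as
            # "A, B and 1,234 others"; the count is approximated by counting the
            # comma-separated names and adding the trailing "others" number when present.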
            if "likes" in likes:
                like_count = 1.0
            else:
                try:
                    like_count = len(likes.split(", "))
                    if "others" in likes:
                        like_count += float(
                            likes.split("and ")[1].split(" others")[0].replace(
                                ",", ""))
                    elif "and" in likes:
                        like_count += 1.0
                except IndexError:
                    like_count = 2.0
        except IndexError:
            like_count = 0.0
        # print "like count: "+str(like_count)

        try:
            shares = hxs.xpath(
                '//a[@class="UFIShareLink"]/text()').extract()[0]

            share_count = float(shares.split(" share")[0].replace(",", ""))
        except IndexError:
            share_count = 0.0

        print like_count, share_count, comments

        item['comment_count'] = [like_count, share_count, comments]

        yield item
class StockSpider(HistorySpider):
    name = 'stocks'
    allowed_domains = ['tase.co.il']
    start_urls = [
        'http://www.tase.co.il/eng/marketdata/stocks/marketdata/Pages/MarketData.aspx'
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=('MarketData\.aspx', )),
             callback='parse_company_list'),
        Rule(SgmlLinkExtractor(allow=('companyMainData\.aspx', )),
             callback='parse_company'),
        Rule(SgmlLinkExtractor(allow=('companyhistorydata\.aspx', )),
             callback='get_history_data'),
        Rule(SgmlLinkExtractor(allow=('companyDetails\.htm', )),
             callback='parse_company_details'),
    )

    header = (
        ('date_', tase.common.get_date),
        ('adjusted_closing_price', tase.common.to_float),
        ('closing_price', tase.common.to_float),
        ('change_', tase.common.to_float),
        ('opening_price', tase.common.to_float),
        ('base_price', tase.common.to_float),
        ('high', tase.common.to_float),
        ('low', tase.common.to_float),
        ('capital_listed_for_trading', tase.common.to_long),
        ('market_cap', tase.common.to_long),
        ('turnover', tase.common.to_long),
        ('volume', tase.common.to_long),
        ('trans', tase.common.to_long),
    )

    header2 = {
        'Total Assets': ('total_assets', tase.common.to_long),
        'Current Assets': ('current_assets', tase.common.to_long),
        'Non Current Assets': ('non_current_assets', tase.common.to_long),
        'Shareholders Equity': ('shareholders_equity', tase.common.to_long),
        'Of which Minority Interest':
        ('of_which_minority_interest', tase.common.to_long),
        'Current Liabilities': ('current_liabilities', tase.common.to_long),
        'Long-Term Liabilities':
        ('long_term_liabilities', tase.common.to_long),
        #'Profit & Loss Statement' : ('', tase.common.to_long),
        'Revenues': ('revenues', tase.common.to_long),
        'Gross Profit': ('gross_profit', tase.common.to_long),
        'Operating Income': ('operating_income', tase.common.to_long),
        'Income Before Tax': ('income_before_tax', tase.common.to_long),
        'Net Income': ('net_income', tase.common.to_long),
        'Income Attributable to Shareholders':
        ('income_atributable_to_shareholders', tase.common.to_long),
        'Earnings per Share': ('earnings_per_share', tase.common.to_float),
        #'Additional Data' : ('', tase.common.to_long),
        'Dividends': ('dividends', tase.common.to_long),
        'Net cash flow generated by operating activities':
        ('net_cash_flow_generated_by_operating_activities',
         tase.common.to_long),
        #'Financial Ratios' : ('', tase.common.to_long),
        'Market to book value': ('market_to_book_value', tase.common.to_float),
        'Price-Earning Ratio': ('price_earning_ratio', tase.common.to_float),
        'Equity-Assets Ratio': ('equity_assets_ratio', tase.common.to_float),
        'Return on Equity': ('return_on_equity', tase.common.to_float)
    }

    details_url = "http://www.tase.co.il/Eng/General/Company/Pages/companyDetails.aspx?subDataType=0&companyID={companyID}&shareID={shareID}"
    history_url = "http://www.tase.co.il/Eng/General/Company/Pages/companyHistoryData.aspx?subDataType=0&companyID={companyID}&shareID={shareID}&intPeriod={period}&intFrequency1=0&IsYield=False&IsDollar=False"

    def get_control_id(self):
        return "g_301c6a3d_c058_41d6_8169_6d26c5d97050"

    # Main companies list, with paging
    def parse_company_list(self, response):
        sel = Selector(response)
        fd = dict()
        inputs = sel.xpath("//input[@type='hidden']")
        for inpt in inputs:
            name = tase.common.get_string(inpt.xpath("@name").extract())
            value = tase.common.get_string(inpt.xpath("@value").extract())
            fd[name] = value
        #print fd
        #req_digest = sel.xpath("//input[@id='__REQUESTDIGEST']/@value").extract()
        #ev_val = sel.xpath("//input[@id='__EVENTVALIDATION']/@value").extract()
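        # The pager links are javascript __doPostBack(...) calls, so each further page is
        # fetched by re-POSTing the page's hidden ASP.NET form fields with __EVENTTARGET
        # set to the control name extracted from the link.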
        links = sel.xpath("//tr[@class='pagerText']/td/a")
        for link in links:
            m = re.search("javascript:__doPostBack\('(.*?)'", link.extract())
            if m:
                url = urllib.unquote(m.group(1))
                fd['__EVENTTARGET'] = url
                #yield FormRequest(self.start_urls[0], method='POST', formdata={'__EVENTTARGET': url, '__EVENTARGUMENT': '', '__REQUESTDIGEST': req_digest, '__EVENTVALIDATION': ev_val})
                #print "url: " + self.start_urls[0]
                yield FormRequest(self.start_urls[0],
                                  method='POST',
                                  formdata=fd)

    def parse_company(self, response):
        sel = Selector(response)
        item = TaseItem()
        item['category'] = category_comp
        item['tase_url'] = response.url
        item['date_'] = ''
        query = parse_qs(urlparse(response.url)[4])  # query
        try:
            item['CompanyID'] = query['CompanyID'][0]
        except KeyError:
            item['CompanyID'] = query['FundID'][0]
        try:
            item['ShareID'] = query['ShareID'][0]
        except KeyError:
            item['ShareID'] = query['FundID'][0]
        try:
            item['name'] = sel.xpath(
                "//td[@class='BigBlue']/text()").extract()[0]
        except IndexError:
            item['name'] = ""
        try:
            base_url = get_base_url(response)
            relative_url = sel.xpath(
                "//td[@rowspan='4']/img/@src").extract()[0]
            item['image_url'] = urljoin(base_url, relative_url)
        except IndexError:
            item['image_url'] = ""
        lst = sel.xpath(
            "//td[contains(child::text(), 'Symbol:')]/following-sibling::td[1]/table/tr/td[1]/text()"
        ).extract()
        if len(lst) > 0:
            item['symbol'] = lst[0]
        else:
            try:
                item['symbol'] = sel.xpath(
                    "//td[contains(., 'Symbol:')]/following-sibling::td[1]/text()"
                ).extract()[0]
            except IndexError:
                item['symbol'] = item['ShareID']
        href = sel.xpath('//tr[1]/td[1]/a[@target="_blank"]/@href').extract()
        url = href[0]
        o = urlparse(url)
        if len(o.netloc) > 0:
            item['url'] = url
        else:
            item['url'] = ''
        try:
            href = sel.xpath("//tr/td[@class='subtitle']/text()").extract()
            item['sector'] = tase.common.unescape(
                urllib.unquote(href[4].strip()))
            item['subsector'] = tase.common.unescape(
                urllib.unquote(href[3].strip()))
        except IndexError:
            item['sector'] = ""
            item['subsector'] = ""
        item['sector_int'] = 0
        item['subsector_int'] = 0
        if PROCESS_FINANCIAL_STATEMENTS:
            yield self.get_company_details(item)
        yield self.process_history(item)

    def get_company_details(self, item):
        url = self.details_url.format(shareID=item['ShareID'],
                                      companyID=item['CompanyID'])
        return Request(url,
                       callback=self.parse_company_details,
                       meta={'item': item})

    def parse_company_details(self, response):
        item = response.request.meta['item']
        sel = Selector(response)
        item['financial_statements'] = []
        for i in range(3):
            fs = self.process_company_statement(sel, i)
            if not fs is None:
                item['financial_statements'].append(fs)
        return self.process_history(item)

    def process_company_statement(self, sel, index):
        table = sel.xpath(
            '//table[@id="ctl00_SPWebPartManager1_g_8e3d9f18_75c6_43cc_bc21_c3e7170427ca_ctl00_gridFinanceReport_DataGrid1"]'
        )
        rows = table.xpath('tr')  #[@class != "gridHeader"]')
        fs = FinancialStatement()
        start = True
        for row in rows:
            if start:
                columns = row.xpath('td[@class="titleGridReg"]/text()')
                if index >= len(columns):
                    return None
                fs['period'] = columns[index].extract()
                start = False
            else:
                name = row.xpath('td/text()')[0].extract().strip()
                values = row.xpath('td/div/text()')
                if len(values) > 0:
                    value = values[index].extract().strip()
                    if not self.header2.get(name) is None:
                        key = self.header2[name][0]
                        func = self.header2[name][1]
                        val = func(value)
                        #self.log('Value: %s' % val)
                        fs[key] = val
        return fs
class NewTaipeiCitySpider(CrawlSpider):
    name = 'new_taipei_city'
    allowed_domains = ['61.60.124.185']

    rules = [
        Rule(SgmlLinkExtractor(allow=('InfoAllList.asp\?a=c.*')),
             follow=True,
             callback='parse_list'),
    ]

    county_name = u'新北市'

    def __init__(self):
        self.start_urls = ['http://61.60.124.185/tpctempdig/InfoAllList.asp']

        super(CrawlSpider, self).__init__()
        self._compile_rules()

    def parse_list(self, response):
        self.log('crawl: %s' % response.url)
        hxs = HtmlXPathSelector(response)

        # Get data
        records = hxs.select("//div[@class='tabs_content']//tr")
        for r in records:
            fields = r.select('.//td/text()').extract()
            if not fields:
                continue

            cfg.logger.debug('fields: %s', fields)
            data_dict = self._process_data_dict(fields, _columns)
            item = NewTaipeiCityItem()
            item['county_name'] = self.county_name
            item['the_category'] = 'kaohsiung_dig_point'
            item['the_idx'] = fields[6]
            ts = re.findall('(\d+)', fields[7])
            item['start_timestamp'] = ts[0]
            item['end_timestamp'] = ts[1]
            item['the_data'] = data_dict

            item['start_timestamp'] = util.tw_date_to_timestamp(
                item['start_timestamp'])
            item['end_timestamp'] = util.tw_date_to_timestamp(
                item['end_timestamp'])

            process_data(item['county_name'], item['the_category'],
                         item['the_idx'], item['start_timestamp'],
                         item['end_timestamp'], {}, item['the_data'])

            yield item

        # Traverse
        items = hxs.select("//div[@id='pagenate']/a/@href")
        for item in items:
            url = 'http://pipegis.kcg.gov.tw/' + re.findall(
                "(default[^']+)", item.extract())[0]
            yield Request(url,
                          callback=self.parse_list,
                          method='POST',
                          errback=self.errback)

    def errback(self, failure):
        self.log('Request failed: %s' % failure)

    def _process_data_dict(self, fields, columns):
        result = {
            column: '' if idx >= len(fields) else fields[idx]
            for (idx, column) in enumerate(columns)
        }
        return result
class MySpider(CrawlSpider):
    name = "gplay"
    allowed_domains = ["play.google.com"]
    start_urls = ["https://play.google.com/store/apps/"]
    rules = (Rule(LinkExtractor(allow=('/store/apps', )), follow=True),
             Rule(LinkExtractor(allow=('/store/apps/details\?')),
                  follow=True,
                  callback='parse_link'))

    def abs_url(self, url, response):
        """Return an absolute link, resolved against <base href> if the page declares one."""
        base = response.xpath('//head/base/@href').extract()
        if base:
            base = base[0]
        else:
            base = response.url
        return urlparse.urljoin(base, url)

    def parse_link(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('/html')
        items = []
        for title in titles:
            item = GplaycrawlerItem()
            item["Link"] = title.select('head/link[5]/@href').extract()
            item["Item_name"] = title.select(
                '//*[@class="document-title"]/div/text()').extract()
            item["Updated"] = title.select(
                '//*[@itemprop="datePublished"]/text()').extract()
            item["Author"] = title.select(
                '//*[@itemprop="author"]/a/span/text()').extract()
            item["Filesize"] = title.select(
                '//*[@itemprop="fileSize"]/text()').extract()
            item["Downloads"] = title.select(
                '//*[@itemprop="numDownloads"]/text()').extract()
            item["Version"] = title.select(
                '//*[@itemprop="softwareVersion"]/text()').extract()
            item["Compatibility"] = title.select(
                '//*[@itemprop="softwareVersion"]/text()').extract()
            item["Content_rating"] = title.select(
                '//*[@itemprop="contentRating"]/text()').extract()
            item["Author_link"] = title.select(
                '//*[@class="dev-link"]/@href').extract()
            item["Author_link_test"] = title.select(
                '//*[@class="content contains-text-link"]/a/@href').extract()
            item["Genre"] = title.select(
                '//*[@itemprop="genre"]/text()').extract()
            item["Price"] = title.select(
                '//*[@class="price buy id-track-click"]/span[2]/text()').extract()
            item["Rating_value"] = title.select(
                '//*[@class="score"]/text()').extract()
            item["Review_number"] = title.select(
                '//*[@class="reviews-num"]/text()').extract()
            item["Description"] = title.select(
                '//*[@class="id-app-orig-desc"]//text()').extract()
            item["IAP"] = title.select(
                '//*[@class="inapp-msg"]/text()').extract()
            item["Developer_badge"] = title.select(
                '//*[@class="badge-title"]//text()').extract()
            item["Physical_address"] = title.select(
                '//*[@class="content physical-address"]/text()').extract()
            item["Video_URL"] = title.select(
                '//*[@class="play-action-container"]/@data-video-url').extract()
            item["Developer_ID"] = title.select(
                '//*[@itemprop="author"]/a/@href').extract()
            items.append(item)
        return items
class BondSpider(HistorySpider):
	name = 'bonds'
	allowed_domains = ['tase.co.il']
	start_urls = [
		'http://www.tase.co.il/eng/marketdata/t-bills/Pages/ShortTermLoan.aspx',
		'http://www.tase.co.il/eng/marketdata/bonds/governmentbonds/Pages/BondsGov.aspx',
		'http://www.tase.co.il/eng/marketdata/bonds/corporatebonds/Pages/BondsByCuts.aspx',
	]

	rules = (
		Rule(SgmlLinkExtractor(allow=(r'ErrorHandler.aspx',)), callback='process_error'),
		Rule(SgmlLinkExtractor(allow=('ShortTermLoan\.aspx',)), callback='parse_bond_list'),
		Rule(SgmlLinkExtractor(allow=('BondsGov\.aspx',)), callback='parse_bond_list'),
		Rule(SgmlLinkExtractor(allow=('BondsByCuts\.aspx',)), callback='parse_bond_list'),
		Rule(SgmlLinkExtractor(allow=('BondsMainData\.aspx',)), callback='parse_bond'),
		Rule(SgmlLinkExtractor(allow=('companyMainData\.aspx',)), callback='parse_bond'),
	)

	header = (
		('date_', tase.common.get_date),
		('adjusted_closing_price', tase.common.to_float),
		('closing_price', tase.common.to_float),
		('change_', tase.common.to_float),
		('gross_yield_to_maturity', tase.common.to_float),
		('opening_price', tase.common.to_float),
		('base_price', tase.common.to_float),
		('high', tase.common.to_float),
		('low', tase.common.to_float),
		('capital_listed_for_trading', tase.common.to_int),
		('market_cap', tase.common.to_int),
		('turnover', tase.common.to_int),
		('volume', tase.common.to_int),
		('trans', tase.common.to_int)
	)

	history_url = "http://www.tase.co.il/TASEEng/General/BONDs/bondsHistoryData.htm?bondType=4&subDataType=5&companyID={companyID}&shareID={shareID}&intPeriod={period}&intFrequency1=0&IsYield=False&IsDollar=False"

	def get_control_id(self):
		return "g_ed8af170_7f0e_440a_85fe_19d9352a2a86"

	# Main companies list, with paging
	def parse_bond_list(self, response):
		sel = Selector(response)
		fd = dict()
		inputs = sel.xpath("//input[@type='hidden']")
		for inpt in inputs:
			name = tase.common.get_string(inpt.xpath("@name").extract())
			value = tase.common.get_string(inpt.xpath("@value").extract())
			fd[name] = value
		links = sel.xpath("//tr[@class='pagerText']/td/a")
		for link in links:
			m = re.search("javascript:__doPostBack\('(.*?)'", link.extract())
			if m:
				url = urllib.unquote(m.group(1))
				fd['__EVENTTARGET'] = url
				#print self.start_urls[2]
				#print fd
				#yield FormRequest(self.start_urls[2], method='POST', formdata={'__EVENTTARGET': url, '__EVENTARGUMENT': ''})
				yield FormRequest(self.start_urls[2], method='POST', formdata=fd)

	# almost same as parse_company
	def parse_bond(self, response):
		sel = Selector(response)
		item = TaseItem()
		item['category'] = category_bond
		item['tase_url'] = response.url
		item['date_'] = ''
		query = parse_qs(urlparse(response.url)[4]) # query
		try:
			item['CompanyID'] = query['CompanyID'][0]
		except KeyError:
			item['CompanyID'] = query['FundID'][0]
		try:
			item['ShareID'] = query['ShareID'][0]
		except KeyError:
			item['ShareID'] = query['FundID'][0]
		try:
			item['name'] = sel.xpath("//td[@class='BigBlue']/text()").extract()[0]
		except IndexError:
			item['name'] = ""
		try:
			base_url = get_base_url(response)
			relative_url = sel.xpath("//td[@rowspan='4']/img/@src").extract()[0]
			item['image_url'] = urljoin(base_url, relative_url)
		except IndexError:
			item['image_url'] = ""
		lst = sel.xpath("//td[contains(child::text(), 'Symbol:')]/following-sibling::td[1]/table/tr/td[1]/text()").extract()
		if len(lst) > 0:
			item['symbol'] = lst[0]
		else:
			try:
				item['symbol'] = sel.xpath("//td[contains(., 'Symbol:')]/following-sibling::td[1]/text()").extract()[0]
			except IndexError:
				item['symbol'] = item['ShareID']
		href = sel.xpath('//tr[1]/td[1]/a[@target="_blank"]/@href').extract()
		if len(href) > 0:
			url = href[0]
			o = urlparse(url)
			if len(o.netloc) > 0:
				item['url'] = url
			else:
				item['url'] = ''
		else:
			item['url'] = ''
		try:
			href = sel.xpath("//tr/td[@class='subtitle']/text()").extract()
			item['sector'] = tase.common.unescape(urllib.unquote(href[4].strip()))
			item['subsector'] = tase.common.unescape(urllib.unquote(href[3].strip()))
		except IndexError:
			item['sector'] = ""
			item['subsector'] = ""
		item['sector_int'] = 0
		item['subsector_int'] = 0
		#url = "http://archive.globes.co.il/searchgl/%s" % item['symbol']
		url = "http://www.globes.co.il/serveen/globes/searchresults.asp?exact=%s" % item['symbol']
		yield self.process_history(item)
class PaipaidaiSpider(CrawlSpider):
    # work around "'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)"
    # errors when handling the site's UTF-8 text under Python 2
    reload(sys)
    sys.setdefaultencoding('utf8')
    name = 'paipaidai4'
    allowed_domains = ['www.ppdai.com']
    download_delay = 2  # delay between requests, in seconds
    #['https://www.itouzi.com/dinvest/invest/detail?id=44335555475675434642733d']
    url1 = ['http://www.ppdai.com/lend/12_s0_p' + str(x)
            for x in range(7, 9)]  # hot listings section
    url2 = ['http://www.ppdai.com/lend/13_s0_p' + str(x)
            for x in range(7, 9)]  # secured listings section
    url3 = ['http://www.ppdai.com/lend/14_s0_p' + str(x)
            for x in range(7, 9)]  # compensated-if-overdue listings
    url4 = ['http://www.ppdai.com/lend/8_s0_p' + str(x)
            for x in range(7, 9)]  # online-merchant section
    url5 = ['http://www.ppdai.com/lend/3_s0_p' + str(x)
            for x in range(7, 9)]  # repeat-borrower listings
    url6 = ['http://www.ppdai.com/lend/15_s0_p' + str(x)
            for x in range(7, 9)]  # partner-institution section
    url7 = ['http://www.ppdai.com/lend/16_s0_p' + str(x)
            for x in range(7, 9)]  # newcomer welfare listings
    url1.extend(url2)
    url1.extend(url3)
    url1.extend(url4)
    url1.extend(url5)
    url1.extend(url6)
    url1.extend(url7)

    start_urls = url1
    #print start_urls

    rules = (Rule(SgmlLinkExtractor(allow=('/list/.*', )),
                  callback='parse_page',
                  follow=True), )

    def parse_page(self, response):
        item = PaipaidaiItem()
        sel = Selector(response)
        item['name'] = sel.xpath('//span[@class=\"\"]/text()').extract()[0]
        item['link'] = response.url
        item['amount'] = sel.xpath(
            '//dd[@id=\"listRestMoney\"]/text()').extract()[0].strip()
        item['min_amount'] = ''
        item['income_rate'] = sel.xpath(
            '//div[@class=\"w528 clearfix\"]/dl/dd/text()').extract()[1]

        term1 = sel.xpath('//dl[@class=\"nodbr\"]/dd/text()').extract()[0]
        term2 = sel.xpath('//dl[@class=\"nodbr\"]/dd/em/text()').extract()[0]
        item['term'] = term1 + term2

        item['area'] = ''
        item['transfer_claim'] = ''
        item['repay_type'] = sel.xpath(
            '//div[@class=\"item item1\"]/text()').extract()[2].strip()
        item['reward'] = ''
        item['protect_mode'] = ''
        item['description'] = sel.xpath(
            '//div[@class=\"lendDetailTab_tabContent\"]/p/text()').extract()[0]
        item['process'] = sel.xpath(
            '//div[@class=\"item\"]/text()').extract()[1].strip()

        #[0].encode('utf-8')
        #[n.encode('utf-8') for n in title]

        yield item
class MovieSpider(CrawlSpider):
    name = "movie"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["http://movie.douban.com"]

    rules = (
        Rule(LinkExtractor(allow=r"/subject/\d+/($|\?\w+)"), 
            callback="parse_movie", follow=True),
    )
    def parse_movie(self, response):
        item = MovieItem()

        item["subject_id"] = int(response.url.split("/")[-2])
        self.get_name(response, item)
        self.get_year(response, item)
        self.get_directors(response, item)
        self.get_actors(response, item)
        self.get_genres(response, item)
        self.get_runtime(response, item)
        self.get_languages(response, item)
        self.get_countries(response, item)
        self.get_average(response, item)
        self.get_vote(response, item)
        self.get_tags(response, item)
        self.get_watched(response, item)
        self.get_wish(response, item)
        self.get_summary(response, item)
        self.get_stars(response, item)
        self.get_comment(response, item)
        self.get_question(response, item)
        self.get_review(response, item)
        self.get_discussion(response, item)
        self.get_image(response, item)

        return item

    def get_stars(self, response, item):
        if not item.get("vote", None): return

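        # The rating breakdown is published as per-star percentages; multiplying each
        # percentage by the total vote count approximates the absolute number of votes
        # per star.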
        xpath = response.xpath("//div[@class='rating_wrap clearbox']/text()").extract()
        stars = "".join( map(unicode.strip, xpath ) ).split("%")[:-1]
        stars = [ int( round((float( "%.3f" % (float(star)/100))) * item["vote"]) )  for star in stars ]
        item["stars"] = stars

    def get_name(self, response, item):
        name = response.xpath("//title/text()").extract()
        if name: item["name"] = name[0].replace(u" (豆瓣)", "").strip()

    def get_year(self, response, item):
        year = response.xpath("//span[@class='year']").re(NUM_RE)
        if year: item["year"] = int( year[0] )

    def get_directors(self, response, item):
        directors = response.xpath("//a[@rel='v:directedBy']/text()").extract()
        if directors: item["directors"] = directors

    def get_actors(self, response, item):
        stars = response.xpath("//a[@rel='v:starring']/text()").extract()
        if stars: item["actors"] = stars

    def get_genres(self, response, item):
        genres = response.xpath("//span[@property='v:genre']/text()").extract()
        if genres: item["genres"] = genres

    def get_runtime(self, response, item):
        if not self.parse_tv(response, item):
            runtime = response.xpath("//span[@property='v:runtime']/text()").re(NUM_RE)
            if runtime:
                item["channel"] = "mv"
                item["runtime"] =  int( runtime[0] )

    def get_average(self, response, item):
        average = response.xpath("//strong[@property='v:average']/text()").extract()
        if average and average[0] != "": item["average"] = float( average[0] ) + 0.0

    def get_vote(self, response, item):
        votes = response.xpath("//span[@property='v:votes']/text()").extract()
        if votes and votes[0] != "": item["vote"] = int( votes[0] )

    def get_tags(self, response, item):
        T = []
        tags = response.xpath("//div[@class='tags-body']/a")
        for tag in tags:
            t = tag.xpath("text()").extract()
            if t: T.append(t[0])
        if T: item["tags"] = T

    def get_watched(self, response, item):
        spec = "//div[@class='subject-others-interests-ft']/a[re:test(@href, 'collections$')]/text()"
        collections = response.xpath(spec).re(NUM_RE)
        if collections: item["watched"] = int( collections[0] )

    def get_wish(self, response, item):
        spec = "//div[@class='subject-others-interests-ft']/a[re:test(@href, 'wishes$')]/text()"
        wishes = response.xpath(spec).re(NUM_RE)
        if wishes: item["wish"] = int( wishes[0] )

    def get_languages(self, response, item):
        S = "".join(response.xpath("//div[@id='info']").extract() )
        M = LANGUAGES_RE.search(S)
        if M is not None:
            item["languages"] = [ lang.strip() for lang in M.group(1).split("/") ]

    def get_countries(self, response, item):
        S = "".join(response.xpath("//div[@id='info']").extract() )
        M = COUNTRIES_RE.search(S)
        if M is not None:
            item["countries"] = [ country.strip() for country in M.group(1).split("/") ]

    def get_summary(self, response, item):
        summary = response.xpath("//span[@property='v:summary']/text()").extract()
        if summary: item["summary"] = "<br/>".join( summary )

    def get_image(self, response, item):
        image = response.xpath("//a[re:test(@href, 'all_photos$')]/text()").re(NUM_RE)
        if image: item["image"] = int( image[0] )

    def get_comment(self, response, item):
        comment = response.xpath("//a[re:test(@href, '/comments$')]/text()").re(NUM_RE)
        if comment: item["comment"] = int( comment[0] )

    def get_question(self, response, item):
        question = response.xpath("//a[re:test(@href, '/questions/\?from=subject$')]/text()").re(NUM_RE)
        if question: item["question"] = int( question[0] )

    def get_review(self, response, item):
        review = response.xpath("//a[re:test(@href, '/reviews$')]/text()").re(NUM_RE)
        if review: item["review"] = int( review[0] )

    def get_discussion(self, response, item):
        discussion = response.xpath("//a[re:test(@href, 'discussion/')]/text()").re(NUM_RE)
        if discussion: item["discussion"] = int( discussion[0] )

    def parse_tv(self, response, item):
        S = "".join( response.xpath("//div[@id='info']//text()").extract() )
        M = TV_RUNTIME_RE.search(S)
        if M is not None:
            item["channel"] = "tv"
            item["runtime"] = int(M.group(1))
            return True
        return False
class TraderaSpider(CrawlSpider):
    name = 'tradera'
    allowed_domains = ['tradera.com']
    start_urls = [
        'http://www.tradera.com/nintendo-gamecube-c3_3004',
        'http://www.tradera.com/playstation-psone-c3_3012',
        'http://www.tradera.com/playstation-2-ps2-c3_3013',
        'http://www.tradera.com/sega-dreamcast-c3_3001',
        'http://www.tradera.com/xbox-c3_3016'
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=('.*', ),
                               restrict_xpaths='//a[@class="nextPageBtn"]'),
             callback='parse_item',
             follow=True),
        Rule(SgmlLinkExtractor(
            allow=('.*', ),
            restrict_xpaths=('//div[@class="ObjectHeadline"]', )),
             callback='parse_item2',
             follow=True),
    )

    #
    def parse_item(self, response):
        """A callback function"""
        hxs = HtmlXPathSelector(response)

    def getStringFromArray(self, array):
        result = u""
        for item in array:
            result = result + u" " + item.strip()

        return result

    def getStringFromXPath(self, hxs, xPath):
        extractedText = hxs.select(xPath).extract()
        return self.getStringFromArray(extractedText)

    def parse_item2(self, response):
        """A callback function that produces traderaItems from auction html"""
        hxs = HtmlXPathSelector(response)
        traderaItem = TraderaItem()

        traderaItem['itemHeading'] = self.getStringFromXPath(
            hxs, '//h1[@class="auction_headline"]/text()')
        traderaItem['leadingBid'] = self.getStringFromXPath(
            hxs, '//label[@id="leadingBidAmount"]/text()')
        traderaItem['bids'] = self.getStringFromXPath(
            hxs, '//h5[@id="numberOfBids"]/text()')
        traderaItem['remainingTime'] = self.getStringFromXPath(
            hxs, '//label[@id="timeLeftLabel"]/text()')
        traderaItem['itemText'] = self.getStringFromXPath(
            hxs, '//div[@class="description"]/p/text()')
        traderaItem['seller'] = self.getStringFromXPath(
            hxs, '//a[@class="blueLink"]/b/text()')
        traderaItem['sellerRating'] = self.getStringFromXPath(
            hxs,
            '//div[@class="rightSideInfoInBoxG-bottomLine"]/a[@class="DSRMedium"]/text()'
        )
        if len(hxs.select('//div[@class="objectInfoOnTop"]/text()')) == 3:
            traderaItem['publiced'] = hxs.select(
                '//div[@class="objectInfoOnTop"]/text()').extract()[1].strip()
            traderaItem['objectID'] = hxs.select(
                '//div[@class="objectInfoOnTop"]/text()').extract()[2].strip()

        return traderaItem
    def load_config(self):

        self.pretty_conf = utils.load_cfg(self.config, pretty=True)
        conf_dump = json.dumps(self.pretty_conf)
        conf = json.loads(conf_dump)

        ### debug
        if self.debug is None:
            self.debug = conf.get('debug', False)

        ### site
        self.site = conf.get('site', u'未知站点')  # default label means "unknown site"
        self.macro = utils.MacroExpander({
            'SITE': self.site,
            'CONF': conf_dump
        })

        ### allowed_domains
        self.allowed_domains = conf.get('domains', [])

        ### start_urls
        urls = conf.get('urls', [])
        self.start_urls = utils.generate_urls(urls, self.macro)
        if isinstance(urls, dict):
            self.start_method = urls.get('method', 'GET')
            self.make_headers(urls.get('headers', {}))
            if urls.get('parse'):
                self.parse_start_url = self.parse_page
        else:
            self.start_method = 'GET'
            self.make_headers({})

        ### rules
        self.tr = HTMLTranslator()
        self.rules = []
        self.page_extractor = None
        for k,v in conf.get('rules', {}).iteritems():

            follow = v.get('follow', True)
            callback = None if follow else 'parse_page'
            follow = True if follow is None else follow

            match = self.macro.expand(v.get('match'))
            regex = self.macro.expand(v.get('regex'))
            css = self.macro.expand(v.get('css'))
            if css:
                xpath = self.tr.css_to_xpath(css)
            else:
                xpath = self.macro.expand(v.get('xpath'))
            pages = v.get('pages')
            sub = v.get('sub')
            vars = v.get('vars')

            rule = Rule(
                SgmlLinkExtractor(
                    allow=regex,
                    restrict_xpaths=xpath,
                    process_value=utils.first_n_pages(regex, pages)
                ),
                process_links=self.sub_links(sub),
                process_request=self.set_vars(k, vars),
                callback=callback,
                follow=follow
            )
            rule.match = match

            self.rules.append(rule)
        self._compile_rules()

        if not self.rules:
            self.parse_start_url = self.parse_page
            self.make_page_extractor(conf.get('urls', []))

        ### mappings(loop/fields)
        self.build_item(conf)

        ### settings
        self.load_settings(conf)

        return conf
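    # A minimal sketch of the config shape load_config() expects, inferred from the
    # conf.get() calls above; the key names come from the code, the example values are
    # illustrative assumptions only:
    #
    #   {
    #       "site": "example",
    #       "debug": false,
    #       "domains": ["example.com"],
    #       "urls": ["http://example.com/list?page=1"],
    #       "rules": {
    #           "detail": {"regex": "/item/\\d+", "follow": false, "pages": 5},
    #           "list": {"css": "div.pager a", "follow": true}
    #       }
    #   }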
class HideMyAssSpider(CrawlSpider):
    name = 'hidemyass'
    start_urls = ['http://hidemyass.com/proxy-list/']
    allowed_domains = ['hidemyass.com']

    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=("//a[@class='next']")),
                  callback='parse',
                  follow=True), )

    def parse(self, response):
        self.log('No item received for %s' % response.url)

        for elem in super(HideMyAssSpider, self).parse(response):
            yield elem

        hxs = HtmlXPathSelector(response)
        links = hxs.select('//tr[@class="altshade"]')

        for link in links:
            ipaddress_parts = link.select('td[2]/span')

            style_text = ipaddress_parts.select('style/text()').extract()
            style_text = style_text[0].split('\n')
            display_none = [
                style[1:style.index('{')] for style in style_text
                if 'none' in style
            ]
            display_inline = [
                style[1:style.index('{')] for style in style_text
                if 'inline' in style
            ]
            display_none = set(display_none)
            display_inline = set(display_inline)

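            # HideMyAss obfuscates the IP column: real octets are mixed with decoy
            # <span>/<div> fragments whose CSS classes are declared display:none in an
            # inline <style> block. Only fragments that are not hidden by class or by an
            # inline style are kept and joined into the address.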
            ipaddress = []

            for ipaddress_part in ipaddress_parts.select('span|div|text()'):
                tag_class = tag_style = tag_name = None
                try:
                    tag_class = ipaddress_part.select('@class').extract()
                except TypeError:
                    # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                    pass

                try:
                    tag_style = ipaddress_part.select('@style').extract()
                except TypeError:
                    # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                    pass

                try:
                    tag_name = ipaddress_part.select("name()")
                except TypeError:
                    # Workaround bug in lxml.etree: Argument 'element' has incorrect type (expected lxml.etree._Element, got _ElementStringResult)
                    pass

                if tag_name:
                    tag_text = ipaddress_part.select('text()').extract()
                else:
                    tag_text = ipaddress_part.extract()

                if tag_style and 'none' in tag_style[0]:
                    continue
                if tag_class and tag_class[0] in display_none:
                    continue

                if isinstance(tag_text, list):
                    tag_text = ''.join(tag_text)

                tag_texts = tag_text.split('.')
                for tag_text in tag_texts:
                    tag_text = tag_text.strip()
                    if not tag_text.isdigit():
                        continue
                    ipaddress.append(tag_text)

            ipaddress = '.'.join(ipaddress)

            loader = WebsiteLoader(selector=link)
            loader.add_value('ipaddress', ipaddress)
            loader.add_xpath('port', 'td[3]/text()')
            loader.add_xpath('country', 'td[4]/span/text()')
            loader.add_xpath('proxy_type', 'td[7]/text()')
            loader.add_xpath('anonimity', 'td[8]/text()')
            loader.add_value('url', response.url)

            item = loader.load_item()

            yield item