Example #1
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = {
    'china': (
        Rule(
            LinkExtractor(
                allow=r'article/.*\.html',
                restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
            callback='parse_item'),
        # Pagination rule (follows the "next page" link), currently disabled:
        # Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    ),
    'tianyancha': (
        Rule(
            LinkExtractor(
                allow=r'company/.*',
                restrict_xpaths='//div[contains(@class,"search-result-single")]//div[@class="header"]'),
            callback='parse_item'),
    ),
}
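The snippet above only defines per-site rule tuples as module-level data. As a minimal sketch of how such a mapping could be wired into a spider (the ChinaSpider class, its start URL and the SITE_RULES alias below are illustrative assumptions, not part of the original snippet):

from scrapy.spiders import CrawlSpider

SITE_RULES = rules  # alias for the per-site mapping defined above

class ChinaSpider(CrawlSpider):
    # Hypothetical spider that consumes the 'china' rule tuple.
    name = 'china'
    start_urls = ['http://www.example.com/']  # placeholder start URL
    rules = SITE_RULES['china']

    def parse_item(self, response):
        # Target of callback='parse_item' in the rules above.
        yield {'url': response.url, 'title': response.xpath('//title/text()').get()}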
Example #2
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@id='info_detail_product']/h5[1]/b",
    'price' : "//div[@id='info_detail_product']/h5[3]/b",
    'category' : "",
    'description' : "//div[@id='info_detail_product']/font",
    'images' : "//div[@class='img_detail_product']/img/@src",
    'canonical' : "",
    'base_url' : "",
    'brand' : ""
}
name = 'chipmobile.com.vn'
allowed_domains = ['chipmobile.com.vn']
start_urls = ['http://chipmobile.com.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/?page=chi-tiet-san-pham']), 'parse_item'),
    Rule(LinkExtractor(allow=['/?page=nhom-san-pham']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
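Generated modules like this one only hold data: the XPATH field map plus the usual spider attributes. As a minimal sketch of the kind of generic spider that could consume such a module (the make_spider factory, the GenericSpider name and the module_path argument are illustrative assumptions, not part of the generated file):

import importlib

from scrapy.spiders import CrawlSpider


def make_spider(module_path):
    # Hypothetical factory: builds a CrawlSpider class from a generated module like the one above.
    cfg = importlib.import_module(module_path)

    class GenericSpider(CrawlSpider):
        name = cfg.name
        allowed_domains = cfg.allowed_domains
        start_urls = cfg.start_urls
        rules = cfg.rules

        def parse_item(self, response):
            # One value per non-empty XPath expression in the generated XPATH map.
            yield {field: response.xpath(xp).get()
                   for field, xp in cfg.XPATH.items() if xp}

    return GenericSpider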
Example #3
class BossSpider(CrawlSpider):
    name = 'boss'
    start_urls = [
        'https://www.zhipin.com/gongsir/5d627415a46b4a750nJ9.html?page=1'
    ]
    url1 = 'https://www.zhipin.com'  # used to build absolute detail-page URLs

    # Rule matching job-list pages (defines the link-extraction rule)
    rules = (Rule(LinkExtractor(allow=r'.+\?page=\d+'),
                  callback="parse_url",
                  follow=True), )

    # Rule matching job-detail pages
    # rules = (
    #     Rule(LinkExtractor(allow=r'.+job_detail/\w+~.html'), callback="detail_parse", follow=False),
    # )

    def parse_url(self, response):
        item = GetBossItem()

        for i in range(1, 15):
            url = response.xpath(
                '//*[@id="main"]/div[2]/div[2]/div[2]/ul/li[{}]/a/@href'.
                format(str(i))).extract()
            url = self.url1 + str(url[0])
            print(url)
            # if item['url']:
            yield Request(
                url,
                callback=self.detail_parse,  # detail-page callback
                meta={'item': item},  # pass the item along via meta
                priority=10,
                dont_filter=True,  # do not filter this request as a duplicate
                #headers=headers
                # headers=self.headers
            )

    def detail_parse(self, response):
        item = response.meta['item']  # receive the item passed via meta
        # company name
        dp_name = response.xpath(
            '//div[@class="job-sec"]/div[@class="name"]/text()').get().strip()
        # company type
        dp_type = response.xpath(
            '//div[@class="level-list"]/li[@class="company-type"]/text()'
        ).getall()[0]
        # company founding date
        dp_founded = response.xpath(
            '//div[@class="level-list"]/li[@class="res-time"]/text()').getall(
            )[0]
        # job title
        job_name = response.xpath(
            '//div[@class="company-info"]/div[@class="name"]/h1/text()').get(
            ).strip()
        # education requirement
        education = response.xpath(
            '//*[@id="main"]/div[1]/div/div/div[2]/p/text()').getall()[2]
        # work-experience requirement
        experience = response.xpath(
            '//*[@id="main"]/div[1]/div/div/div[2]/p/text()').getall()[1]
        # salary
        salary = response.xpath(
            '//*[@id="main"]/div[1]/div/div/div[2]/div[2]/span/text()').get(
            ).strip()
        # recruiting status
        state = response.xpath(
            '//*[@id="main"]/div[3]/div/div[1]/div[2]/p[6]/text()').get(
            ).strip()
        # job description
        description = response.xpath(
            '//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div/text()'
        ).getall()
        description = str(description).strip('[\']\\n ')
        # employee benefits
        welfare = response.xpath(
            '//*[@id="main"]/div[1]/div/div/div[2]/div[3]/div[2]/span/text()'
        ).getall()
        welfare = str(welfare)
        # work address
        address = response.xpath(
            '//div[@class="job-location"]/div[@class="location-address"]/text()'
        ).get().strip()

        item['dp_name'] = dp_name
        item['dp_type'] = dp_type
        item['dp_founded'] = dp_founded
        item['job_name'] = job_name
        item['education'] = education
        item['experience'] = experience
        item['salary'] = salary
        item['state'] = state
        item['description'] = description
        item['welfare'] = welfare
        item['address'] = address

        yield item
Example #4
class YiyaoSpider(CrawlSpider):
    name = 'YiYao'
    allowed_domains = ['www.cpi.ac.cn', 'www.cccmhpie.org.cn']
    start_urls = [
        'http://www.cpi.ac.cn/publish/default/hyzx/index.htm',
        'http://www.cccmhpie.org.cn/ShowNewsList.aspx?QueryStr=x08x12o8q7x09x01w1z4892x9994z6164z5759zO3w8w1u9v5v5v5zO3x10x02x11p4x2X12x01w1u8z8p2x01q9p4x2X12x01w1u9z8w7x08q7x15x15p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w7x08q7x15x15p4q7q8x08x01o8q7x09x01w1p3x2X15q5w7x08q7x15x15z8p5x10x05x13x17x01o3w8w1z8w8q7x16q7p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w8q7x16q7p4q7q8x08x01o8q7x09x01w1w8x11q9q5o0x05x14x15x16pQ7x03x01z8x00x0X15q9p5x10x05x13x17x01o3w8w1u9v5v5v5z8p2x1X1X16w7x08q7x15x15o3w8w1v7u8u9v5z8w7x08q7x15x15o3w8w1u9v5v5v5zO6x05x10x07o3w8w1u9v5v5v5',
        'http://www.cccmhpie.org.cn/ShowNewsList.aspx?QueryStr=x08x12o8q7x09x01w1y2269z8469y1160y4577zO3w8w1u9v5v5v1zO3x10x02x11p4x2X12x01w1u8z8p2x01q9p4x2X12x01w1u9z8w7x08q7x15x15p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w7x08q7x15x15p4q7q8x08x01o8q7x09x01w1p3x2X15q5w7x08q7x15x15z8p5x10x05x13x17x01o3w8w1z8w8q7x16q7p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w8q7x16q7p4q7q8x08x01o8q7x09x01w1w8x11q9q5o0x05x14x15x16pQ7x03x01z8x00x0X15q9p5x10x05x13x17x01o3w8w1u9v5v5v1z8p2x1X1X16w7x08q7x15x15o3w8w1v7u8u9v5z8w7x08q7x15x15o3w8w1u9v5v5v1zO6x05x10x07o3w8w1u9v5v5v1'
    ]
    custom_settings = {
        # concurrent requests
        'CONCURRENT_REQUESTS': 10,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 1000000,
        'CONCURRENT_REQUESTS_PER_IP': 0,
        # download delay
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            # asynchronous MySQL insert pipeline
            'HY_NEWS.pipelines.MysqlTwistedPipeline': 600,
            # de-duplication logic
            # 'HY_NEWS.pipelines.DuplicatesPipeline': 200,
        },
        'DOWNLOADER_MIDDLEWARES': {
            # enable these when using scrapy_splash
            # 'scrapy_splash.SplashCookiesMiddleware': 723,
            # 'scrapy_splash.SplashMiddleware': 725,

            # default proxy middleware
            'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 700,
            # custom proxy-server middleware
            # 'HY_NEWS.util_custom.middleware.middlewares.ProxyMiddleWare': 100,
            # disable Scrapy's built-in user-agent middleware
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            # custom random user-agent middleware
            'HY_NEWS.util_custom.middleware.middlewares.MyUserAgentMiddleware':
            120,
            # built-in retry middleware (disabled)
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            # custom retry middleware
            'HY_NEWS.util_custom.middleware.middlewares.MyRetryMiddleware': 90,
        },
        # enable these when using scrapy_splash
        # 'SPIDER_MIDDLEWARES': {
        #     'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        # },
        # dedupe filter / Splash API endpoint
        # 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        # # 'SPLASH_URL': "http://10.8.32.122:8050/"
        # 'SPLASH_URL': "http://127.0.0.1:8050/"
    }
    rules = (
        Rule(LinkExtractor(restrict_css='.pager a:nth-child(3) '),
             follow=True),
        Rule(LinkExtractor(restrict_css='.news-li a '),
             callback='parse_item',
             follow=True),
        Rule(LinkExtractor(restrict_css='.DocTitle a'),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        item = HyNewsItem()
        resp = response.text
        extractor = GeneralNewsExtractor()
        result = extractor.extract(resp, with_body_html=False)
        title = result['title']
        txt = result['content']
        p_time = result['publish_time']
        lyurl = response.url
        lyname = '医药'
        content_css = [
            '.left-cc',
            '.pagesContent',
        ]
        content = ''
        for css in content_css:
            content = ''.join(response.css(css).extract())
            if content:
                break
        if not content:
            logging.warning(f'{response.url} matched no content CSS selector; content not extracted')
        classify, codes, region = get_category(txt)
        item['title'] = title
        item['txt'] = txt
        item['p_time'] = get_times(p_time)
        item['content'] = content
        item['spider_name'] = 'YiYao'
        item['module_name'] = '行业新闻'
        item['cate'] = classify
        item['region'] = region
        item['code'] = codes
        item['link'] = lyurl
        item['website'] = lyname
        if content:
            yield item
Example #5
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['http://www.lagou.com/']

    rules = (
        # Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True),
        # Rule(LinkExtractor(allow=(r'gongsi/j\d+.html',)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'),
             callback='parse_job',
             follow=False), )

    def parse_job(self, response):
        # parse a Lagou job posting
        item_loader = ArticleItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath("job_city",
                              "//*[@class='job_request']//span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//*[@class='job_request']//span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//*[@class='job_request']//span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//*[@class='job_request']//span[5]/text()")

        item_loader.add_css("tags", '.position-label li::text')
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()

        return job_item

    def start_requests(self):
        # Log in with selenium first, then hand the resulting cookies to Scrapy requests.
        # 1. simulate the login via selenium
        # read the cookies from a file
        cookies = []
        if os.path.exists("lagou.cookie"):
            cookies = pickle.load(open("lagou.cookie", "rb"))

        if not cookies:
            import time
            from selenium import webdriver
            browser = webdriver.Chrome(
                executable_path="C:/scrapy/chromedriver.exe")
            browser.get("https://passport.lagou.com/login/login.html")
            browser.find_element_by_css_selector(
                ".form_body .input.input_white").send_keys("18140523326")
            browser.find_element_by_css_selector(
                '.form_body input[type="password"]').send_keys("linfeng0328")
            browser.find_element_by_css_selector(
                'div[data-view="passwordLogin"] input.btn_lg').click()
            input("检查网页是否有验证码要输入,有就在网页输入验证码,输入完后,控制台回车;如果无验证码,则直接回车")
            time.sleep(3)
            cookies = browser.get_cookies()
            # write the cookies to a file
            pickle.dump(cookies, open("lagou.cookie", "wb"))

        cookie_dict = {}
        for cookie in cookies:
            cookie_dict[cookie["name"]] = cookie["value"]

        for url in self.start_urls:
            yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
Example #6
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = [
        'https://www.lagou.com/jobs/list_%E8%BD%AF%E4%BB%B6%E6%B5%8B%E8%AF%95?px=default&city=%E6%B7%B1%E5%9C%B3#filterBox'
    ]

    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.8',
            'Connection':
            'keep-alive',
            'Cookie':
            'user_trace_token=20171015132411-12af3b52-3a51-466f-bfae-a98fc96b4f90; LGUID=20171015132412-13eaf40f-b169-11e7-960b-525400f775ce; SEARCH_ID=070e82cdbbc04cc8b97710c2c0159ce1; ab_test_random_num=0; X_HTTP_TOKEN=d1cf855aacf760c3965ee017e0d3eb96; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DsXIrWUxpNGLE2g_bKzlUCXPTRJMHxfCs6L20RqgCpUq%26wd%3D%26eqid%3Dee53adaf00026e940000000559e354cc; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_hotjob; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAAFCAAEG50060B788C4EED616EB9D1BF30380575; _gat=1; _ga=GA1.2.471681568.1508045060; LGSID=20171015203008-94e1afa5-b1a4-11e7-9788-525400f775ce; LGRID=20171015204552-c792b887-b1a6-11e7-9788-525400f775ce',
            'Host':
            'www.lagou.com',
            'Origin':
            'https://www.lagou.com',
            'Referer':
            'https://www.lagou.com/',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
    }

    rules = (
        Rule(LinkExtractor(allow=(r'zhaopin/.*', )), follow=True),
        Rule(LinkExtractor(allow=(r'gongsi/j\d\.html', )), follow=True),
        Rule(LinkExtractor(
            allow=(r'jobs/.*', ),
            restrict_css=("div#s_position_list ul.item_con_list"),
        ),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        Item_loader = LagouItemLoader(item=LagouItem(), response=response)

        # TITLE
        Item_loader.add_xpath("title", "//div[@class='job-name']/@title")
        # URL
        Item_loader.add_value("url", response.url)
        # salary
        Item_loader.add_xpath("salary",
                              "//dd[@class='job_request']/p/span[1]/text()")
        # job_city
        Item_loader.add_xpath("job_city",
                              "//dd[@class='job_request']/p/span[2]/text()")
        # work_years
        Item_loader.add_xpath("work_years",
                              "//dd[@class='job_request']/p/span[3]/text()")
        # degree_need
        Item_loader.add_xpath("degree_need",
                              "//dd[@class='job_request']/p/span[4]/text()")
        # job_type
        Item_loader.add_xpath("job_type",
                              "//dd[@class='job_request']/p/span[5]/text()")
        # tags
        Item_loader.add_xpath("tags", "//li[@class='labels']/text()")
        # publish-time
        Item_loader.add_xpath("publish_time",
                              "//p[@class='publish_time']/text()")
        # job_advantage
        Item_loader.add_xpath("job_advantage",
                              "//dd[@class='job-advantage']/p/text()")
        # job_desc
        Item_loader.add_xpath("job_desc", "//dd[@class='job_bt']/div/p/text()")
        # work_addr
        Item_loader.add_xpath("work_addr",
                              "//div[@class='work_addr']/a/text()")
        # company_name
        Item_loader.add_xpath("company_name",
                              "//dl[@class='job_company']/dt/a/img/@alt")
        # company_url
        Item_loader.add_xpath("company_url",
                              "//dl[@class='job_company']/dt/a/@href")

        lagou_item_loader = Item_loader.load_item()
        return lagou_item_loader
Example #7
class XiaoshoumoviesSpider(CrawlSpider):
    """
       name:scrapy唯一定位实例的属性,必须唯一
       allowed_domains:允许爬取的域名列表,不设置表示允许爬取所有
       start_urls:起始爬取列表
       start_requests:它就是从start_urls中读取链接,然后使用make_requests_from_url生成Request,
                       这就意味我们可以在start_requests方法中根据我们自己的需求往start_urls中写入
                       我们自定义的规律的链接
       parse:回调函数,处理response并返回处理后的数据和需要跟进的url
       log:打印日志信息
       closed:关闭spider
     """
    name = 'XiaoShouMovies'
    allowed_domains = ['www.p4vip.com']
    start_urls = ['http://www.p4vip.com/?m=vod-type-id-1.html', 'http://www.p4vip.com/?m=vod-type-id-2.html',
                  'http://www.p4vip.com/?m=vod-type-id-3.html', 'http://www.p4vip.com/?m=vod-type-id-4.html',
                  'http://www.p4vip.com/?m=vod-type-id-16.html']

    # Link extractor: pulls the specified URLs out of the pages returned for the start URLs
    # The rules tuple holds the different rule parsers (each one wraps a particular parsing rule)
    rules = (
        # Rule parser: every page reached through the extracted links is parsed with the given rule (callback)
        # Rule(LinkExtractor(allow=r'.*type-id-8-.*'), follow=True),
        Rule(LinkExtractor(restrict_xpaths='//a[text()="下一页"]'), follow=True),  # follow the "next page" link
        # Rule(LinkExtractor(allow=r'.*detail-id-\d{1,8}\.html'),callback='parse_list',  follow=True),
        Rule(LinkExtractor(restrict_xpaths='//a[@class="link-hover"]'), callback='parse_list', follow=True),
        # The site keeps all play links together: any single episode page already exposes the
        # play URLs for the whole series, so crawling one page per title is enough and we do
        # not need to follow every episode link.
        Rule(LinkExtractor(allow=r'.*play-id-\d{1,8}-src-1-num-1.html'), callback='parse_link', follow=False),
    )

    # method that parses a list page
    def parse_list(self, response):
        # # extract the various movie fields with XPath expressions
        # for a in a_list:
        #     item = MovieByXiaoShouItem()
        #     item['movie_name'] = a.xpath('@title').get()
        #     item['movie_cover'] = a.xpath('./img/@data-original').get()
        #     yield item
        # print(response.xpath('//dt[@class="name"]/text()').get())
        item = MoviespidersItem()
        try:
            # item['num'] = re.findall('(?<=detail-id-).*(?=\.html)',response.request.url)[0]
            item['num'] = re.findall(r'\d{1,8}', response.request.url)[1]
            item['name'] = response.xpath('//dt[@class="name"]/text()').get()
            item['cover'] = response.xpath('//img[@class="lazy"]/@data-original').get()
            starrings = response.xpath('//dt[2]/a/text()')
            starring_arr = []
            for starring in starrings:
                starring_arr.append(starring.get())
            item['starrings'] = ','.join(starring_arr)
            item['type'] = response.xpath('//dt[3]/a/text()').get()
            item['director'] = response.xpath('//dd[1]/a/text()').get()
            item['region'] = response.xpath('//dd[1]/dd/text()').get()
            item['year'] = response.xpath('//dd[2]/text()').get()
            item['language'] = response.xpath('//dd[3]/text()').get()
            introduction = response.xpath('//div[@class="tab-jq"]/span/text()').get()
            if introduction is None:
                introduction = response.xpath('//div[@class="tab-jq"]/text()').get().strip()
            item['introduction'] = introduction
            item['state'] = response.xpath('//span[@class="bz"]/text()').get()
            item['fromUrl'] = response.request.url
        except Exception:
            with open('./exception.txt', 'w') as f:
                f.write(response.xpath('//dt[@class="name"]/text()').get() + '   ---   ' + response.request.url + '\n')
                f.close()
        return item

    # parse the play URL out of the playback page
    def parse_link(self, response):
        item = MovieplayItem()
        try:
            item['num'] = re.findall(r'\d{1,8}', response.request.url)[1]
            playLink = re.findall(r'(?<=mac_url\=unescape\(\').*?(?=\'\))', response.text)[0]
            link = execjs.eval("unescape('" + playLink + "')")
            item['playLink'] = link
        except Exception:
            with open('./exception.txt', 'w') as f:
                f.write(
                    response.xpath('//dt[@class="name"]/text()').get() + '   -url-   ' + response.request.url + '\n')
                f.close()
        return item
Example #8
class EzyVisionSpider(Spider):
    name = 'specsavers_nz-ezyvision.co.nz'
    allowed_domains = ('ezyvision.co.nz', )
    start_urls = ['http://www.ezyvision.co.nz/']
    rules = (Rule(LinkExtractor(allow=('Brand', ))), )

    def __init__(self, *args, **kwargs):
        super(EzyVisionSpider, self).__init__(*args, **kwargs)
        dispatcher.connect(self.spider_idle, signals.spider_idle)

    def spider_idle(self, spider):
        self.log('spider idle called')
        if spider.name == self.name:
            req = Request('http://www.ezyvision.co.nz/search',
                          callback=self.parse_search)
            self.crawler.engine.crawl(req, self)

    def parse(self, response):
        urls = response.xpath('//a/@href').extract()
        urls = [url for url in urls if 'Brand=' in url]
        for url in urls:
            yield Request(urljoin(get_base_url(response), url),
                          callback=self.parse_brand)

    def parse_brand(self, response):
        brand = url_query_parameter(response.url, 'Brand', '')
        urls = response.xpath(
            '//section[@id="productList"]//a/@href').extract()
        for url in urls:
            yield Request(urljoin(get_base_url(response), url),
                          meta={'brand': brand},
                          callback=self.parse_product)

    def parse_search(self, response):

        brand = url_query_parameter(response.url, 'Brand', '')
        urls = response.xpath(
            '//section[@id="productList"]//a/@href').extract()
        for url in urls:
            yield Request(urljoin(get_base_url(response), url),
                          meta={'brand': brand},
                          callback=self.parse_product)

        yield Request('http://www.ezyvision.co.nz/ajax/search',
                      callback=self.parse_ajax_search)

    def parse_ajax_search(self, response):
        base_url = 'http://www.ezyvision.co.nz/product/'
        data = json.loads(response.body)

        if data.get('products', None):
            for product in data['products']:
                yield Request(urljoin(base_url, product['url']),
                              meta={'brand': ''},
                              callback=self.parse_product)

            products_loaded = int(response.meta.get('products_loaded', 6)) + 6
            formdata = {
                'action': 'load products',
                'productsLoaded': str(products_loaded)
            }
            yield FormRequest('http://www.ezyvision.co.nz/ajax/search',
                              dont_filter=True,
                              formdata=formdata,
                              callback=self.parse_ajax_search,
                              meta={'products_loaded': products_loaded})

    def parse_product(self, response):
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//section[@class="product"]//h1/text()')
        loader.add_value('url', response.url)
        loader.add_value('brand', response.meta.get('brand', ''))
        price = ''.join(response.xpath('//h2[@id="tprices"]/text()').extract())
        loader.add_value('price', price)
        image_url = response.xpath(
            '//figure[@class="main"]//img/@src').extract()[0]
        if image_url.endswith('.jpg'):
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url))
        cat = response.xpath(
            '//article[@class="breadcrumbs"]//text()').extract()
        cat = [r for r in cat if r.strip().replace(u'\u203a', '')]
        cat = cat[2:-1]
        for c in cat:
            loader.add_value('category', c)
        loader.add_xpath('identifier', '//input[@name="product_id"]/@value')
        loader.add_xpath('sku', '//input[@name="product_id"]/@value')
        loader.add_value('shipping_cost', '0')
        item = loader.load_item()

        metadata = SpecSaversMeta()
        promotional_data = response.xpath(
            '//font[@color="red" and contains(text(), "use this code")]//text()'
        ).extract()
        metadata['promotion'] = ' '.join(
            promotional_data).strip() if promotional_data else ''
        item['metadata'] = metadata
        yield item
Example #9
class NingboSpider(CrawlSpider):
    name = 'ningbo'
    allowed_domains = ['ningbo.gov.cn']
    start_urls = ['http://gtog.ningbo.gov.cn/col/col381/index.html']

    rules = (Rule(LinkExtractor(allow=r'.*gtog.ningbo.gov.cn/art.*'),
                  callback='parse_page',
                  follow=False), )

    cont_dict = {}

    def parse_item(self, response):
        print("5. parse_item(): " +
              datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
              " -> " + response.url)
        title = response.xpath("//*[@id='ivs_title']/text()").get()
        cont = response.xpath("//*[@id='ivs_content']").get()
        index_id = str('_NULL')
        pub_org = response.xpath(
            "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[1]").get()

        pub_time = response.xpath(
            "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[2]").get()
        doc_id = str('_NULL')
        region = str('宁波')
        update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00")

        if not title:
            return

        print("\t>>> " + title)
        for key in keys:
            if key in title:
                self.dict_add_one(re.sub('[\s+]', ' ', title), response.url,
                                  re.sub('[\s+]', ' ', cont),
                                  re.sub('[\s+]', ' ', pub_time),
                                  re.sub('[\s+]', ' ', pub_org), index_id,
                                  doc_id, region, update_time, key)

        item = YqcNingboSpiderItem(cont_dict=self.cont_dict)

        yield item

    def dict_add_one(self, title, url, cont, pub_time, pub_org, index_id,
                     doc_id, region, update_time, doc_key):
        time.sleep(0.3)
        if title in self.cont_dict:
            self.cont_dict[title]['key_cnt'] += 1
            self.cont_dict[title][
                'doc_key'] = self.cont_dict[title]['doc_key'] + ',' + doc_key
        else:
            cnt_dict = {
                'key_cnt': 1,
                'title': title,
                'url': url,
                'cont': cont,
                'pub_time': pub_time,
                'pub_org': pub_org,
                'index_id': index_id,
                'doc_id': doc_id,
                'region': region,
                'update_time': update_time,
                'doc_key': doc_key
            }

            self.cont_dict[title] = cnt_dict

    def parse_page(self, response):
        url = response.url

        print("4. parse_page(): " +
              datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
              " -> " + url)

        url_prefix = 'http://gtog.ningbo.gov.cn'

        if str('REPORT_NDOC_006051') in url or str(
                'REPORT_NDOC_006010') in url:
            print("\t>>> debug: " + url)

        if str('currentPage') in url:
            tr_list = response.xpath(
                "//*[@id='main']/div[1]/div/div[2]/table/tbody//tr")

            for tr in tr_list:
                # print(tr)
                url = tr.xpath("./td[1]/a/@href").get()
                full_url = url_prefix + url

                yield scrapy.Request(full_url, callback=self.parse_item)

        else:
            if str('REPORT_NDOC_006051') in url or str(
                    'REPORT_NDOC_006010') in url:
                print('\t>>> no currentPage')

            title = response.xpath("//*[@id='ivs_title']/text()").get()
            cont = response.xpath("//*[@id='ivs_content']").get()
            index_id = str('_NULL')
            pub_org = response.xpath(
                "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[1]").get(
                )

            pub_time = response.xpath(
                "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[2]").get(
                )
            doc_id = str('_NULL')
            region = str('宁波')
            update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00")

            if not title:
                return

            print("\t>>> " + title)
            for key in keys:
                if key in title:
                    # print("\t>>> included")
                    self.dict_add_one(re.sub('[\s+]', ' ',
                                             title), response.url,
                                      re.sub('[\s+]', ' ', cont),
                                      re.sub('[\s+]', ' ',
                                             pub_time), pub_org, index_id,
                                      doc_id, region, update_time, key)

            item = YqcNingboSpiderItem(cont_dict=self.cont_dict)

            print("6. parse_page(): " +
                  datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') +
                  " -> " + url)
            # print("\n")
            # print(item)

            yield item
Example #10
class OddsSpider(CrawlSpider):
    name = 'odds'
    allowed_domains = ['www.oddsportal.com']

    start_urls = [
        'https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/'
    ]

    #def start_requests(self):
    #  yield SplashRequest(self.link, args={'wait': 4}, meta={'real_url': self.link})

    rules = (
        # Rule(LinkExtractor(allow=('/soccer/'),
        #           deny=('/standings/')), process_request='use_splash'),
        Rule(LinkExtractor(
            allow=(r'/soccer/[a-z-]+/[a-z0-9-]+/[a-zA-Z0-9-]+/'),
            deny=(
                "/soccer/[a-z-]+/[a-z0-9-]+/results",
                "/soccer/[a-z-]+/[a-z0-9-]+/standing",
            )),
             callback='parse_items',
             process_request='use_splash',
             follow=True), )

    def _requests_to_follow(self, response):
        if not isinstance(
                response,
            (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [
                lnk for lnk in rule.link_extractor.extract_links(response)
                if lnk not in seen
            ]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    #def splash_request(self, request):
    #  return SplashRequest(url=request.url, args={'wait': 4}, meta={'real_url': request.url})

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def parse_items(self, response):

        items = OddsportalItem()
        items['country'] = response.css("a:nth-child(4)::text").get()
        items['liga'] = response.css("a:nth-child(5)::text").get()
        items['teams'] = response.css("h1::text").get()
        items['data'] = response.css(".t1559221200-4-1-1-1::text").get()
        yield items
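For the use_splash hook above to take effect, the project also needs the scrapy-splash plumbing enabled in its settings. A minimal sketch, mirroring the commented-out settings in Example #4 (the SPLASH_URL value is a placeholder for whatever Splash instance is actually running):

# settings.py (sketch; assumes a Splash instance is reachable at SPLASH_URL)
SPLASH_URL = 'http://127.0.0.1:8050/'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'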
Example #11
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//title",
    'price': "//tr[2]/td/font/strong | //font[@color='#FF0000']/strong",
    'category': "",
    'description': "//tr[2]/td/p/span",
    'images': "//table//tr[1]/td/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'tamduc.org'
allowed_domains = ['tamduc.org']
start_urls = ['http://tamduc.org']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/Product/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/listProduct/']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #12
class QicheSpider(CrawlSpider):
    name = 'qiche'
    allowed_domains = ['autohome.com.cn']
    start_urls = ['https://www.autohome.com.cn/news/']
    #page_links = LinkExtractor(allow=r'news/\d+/')
    contentlinks = LinkExtractor(allow=r'news/\d+?/\d+?.html#pvareaid=\d+')
    rules = (
        #Rule(page_links),
        Rule(contentlinks, callback="parse_item", follow=True),
    )

    # def parse(self, response):

    def parse_item(self, response):
        item = QichezhijiaItem()
        item['name'] = self.get_name(response)
        item['time'] = self.get_time(response)
        item['source'] = self.get_source(response)
        item['type'] = self.get_type(response)
        item['editor'] = self.get_editor(response)
        item['content'] = self.get_content(response)
        yield item

    def get_name(self,response):
        name = response.xpath('//div[@class="row"]/div/div/h1/text()').extract()
        if len(name):
            name = name[0]
        else:
            name = "NULL"
        return name

    def get_time(self,response):
        time = response.xpath('//div[@class="row"]/div/div/div[1]/span[1]/text()').extract()
        if len(time):
            time = time[0]
        else:
            time = "NULL"
        return time

    def get_source(self,response):
        source = response.xpath('//div[@class="row"]/div/div/div[1]/span[2]/text()').extract()
        if len(source):
            source = source[0]
        else:
            source = "Null"
        return source

    def get_type(self,response):
        type = response.xpath('//div[@class="row"]/div/div/div[1]/span[3]/text()').extract()
        if len(type):
            type = type[0]
        else:
            type = "NULL"
        return type

    def get_editor(self,response):
        editor = response.xpath('//div[@class="row"]/div/div/div[1]/div/a/text()').extract()
        if len(editor):
            editor = editor[0]
        else:
            editor = "NULL"
        return editor

    def get_content(self,response):
        content = response.xpath('//div/div[@id="articleContent"]/p/text()').extract()
        if len(content):
            content= ' '.join(content)
        else:
            content = "NULL"
        return content
Example #13
class GenericCrawlSpider(CrawlSpider):

    crawl_specification = settings

    # load parser from specification
    try:
        parser_class = shared.get_class(crawl_specification.parser)
        parser = parser_class(data=crawl_specification.parser_data)
    except (AttributeError, TypeError) as exc:
        MLOG.exception(exc)

    domain = urlparse(start_url).netloc

    name = crawler_name

    allowed_domains = [domain]

    start_urls = [start_url]

    denied_extensions = [
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp',
        'tif', 'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'mp3',
        'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff', '3gp',
        'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
        'm4a', 'm4v', 'flv', 'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc',
        'docx', 'odt', 'ods', 'odg', 'odp', 'css', 'exe', 'bin', 'rss',
        'zip', 'rar', 'gz', 'tar'
    ]
    if isinstance(crawl_specification.parser, ParagraphParser):
        denied_extensions.append("pdf")

    rules = [
        Rule(VerboseLxmlLinkExtractor(logname=crawler_name,
                                      spec=crawl_specification,
                                      deny=crawl_specification.blacklist,
                                      allow=crawl_specification.whitelist,
                                      deny_extensions=denied_extensions),
             callback=parser.parse,
             follow=True,
             errback=parser.errback)
    ]

    # ensure that start_urls are also parsed
    parse_start_url = parser.parse

    def __init__(self):
        super().__init__()
        # setup individual logger for every spider
        if self.crawl_specification.logs:
            self.s_log = shared.simple_logger(
                loger_name="crawlspider",
                file_path=os.path.join(self.crawl_specification.logs,
                                       self.name + ".log"))
        else:
            self.s_log = shared.simple_logger(loger_name="crawlspider")

        # enter spider to parser
        self.parser.spider = self

        for hand in self.s_log.handlers:
            self.logger.logger.addHandler(hand)
        self.s_log.info("[__init__] - Crawlspider logger setup finished.")

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)
Example #14
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1[@class='product-name']",
    'price':
    "//span[@class='special-price']/span|//div[@class='price-box']/span/span[@class='price']",
    'category': "//div[@class='col-md-12 breadcrumbs']/ul/li/a",
    'description': "//div[@id='yt_tab_products']",
    'images': "//div/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'hcplus.vn'
allowed_domains = ['hcplus.vn']
start_urls = ['http://hcplus.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z-]+/?[a-zA-Z0-9-]+.html$']),
         'parse_item'),
    Rule(LinkExtractor(allow=['vn/[a-zA-Z-]+.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #15
class BBCSpider(Spider):
    name = 'bbc_spider'
    allowed_domains = ['www.bbc.com']
    start_urls = ['http://www.bbc.com/news']

    rules = (Rule(LinkExtractor(allow="http://www.bbc.com/news"),
                  callback='parse'), )

    def parse(self, response):
        items = []

        for article in response.xpath(
                '//*[@class="nw-c-most-read__items gel-layout '
                'gel-layout--no-flex"]/ol/li'):
            item = Article()
            item["Title"] = article.xpath(
                'span/div/a/span/text()').extract()[0]
            temp = article.xpath('span/div/a/@href').extract()[0]
            item["URL"] = ''.join("http://www.bbc.com" + str(temp))
            #item["Summary"] = article.xpath('article/div/header/p[2]/text()').extract()[0]
            # item["Photo"] = article.xpath('article/figure/a/img/@src').extract()[0]
            item["Site"] = "BBC News"

            items.append(item)

            if item["Title"] != "":
                title = item["Title"]
                title = title.encode('utf-8').strip()
            else:
                title = ""

            #if item["Summary"] != "":
            #    summary = item["Summary"]
            #    summary = summary.encode('utf-8').strip()
            #else:
            #    summary = ""

            if item["URL"] != "":
                url = item["URL"]
                url = url.encode('utf-8').strip()
            else:
                url = ""

            if item["Site"] != "":
                site = item["Site"]
            else:
                site = ""

            summary = ""

            text = get_article(item["URL"]).encode('utf-8').strip()
            text = text.replace('\n', ' ')

            with open("db_data.txt", "a") as myfile:
                myfile.write('\t')
                myfile.write(title)
                myfile.write('\t')
                myfile.write(summary)
                myfile.write('\t')
                myfile.write("")
                myfile.write('\t')
                myfile.write(url)
                myfile.write('\t')
                myfile.write(site)
                myfile.write('\t')
                myfile.write(text)
                myfile.write('\n')

            myfile.close()
Example #16
class PexelSpider(CrawlSpider):
    name = "PexelSpider"
    allowed_domains = ['www.pexels.com']
    start_urls = ["https://www.pexels.com/"]
    rules = [
        Rule(LinkExtractor(allow_domains="www.pexels.com"),
             follow=True,
             callback='parse_link')
    ]

    def parse_link(self, response):
        image_links = response.xpath(
            '//*[@download="true"]/../a/@href').extract()
        path_to_store_image = '/home/fahad/Spyder_Projects/PexelCrawler/images/'
        request_url = response.request.url
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
        }

        if not os.path.exists(path_to_store_image):
            os.makedirs(path_to_store_image)

        for image_link in image_links:
            if "images.pexels.com" not in image_link:
                continue
            image_id = image_link.split('/')[4]
            image_name = path_to_store_image + image_id + '.jpeg'
            if os.path.exists(image_name):
                continue

            picture_request = requests.get(image_link, headers=headers)
            if picture_request.status_code == 200:
                with open(image_name, 'wb') as f:
                    f.write(picture_request.content)
            else:
                print("response code %d for image id %s" %
                      (picture_request.status_code, image_id))

        if '/photo/' not in request_url and '/photos/' not in request_url:
            for page_no in range(0, 41):
                Ajax_Request_URL = 'https://www.pexels.com/?dark=true&format=js&page=%d' % page_no
                yield scrapy.Request(url=Ajax_Request_URL,
                                     headers=headers,
                                     callback=self.Ajax_Parse)
                time.sleep(1)

    def Ajax_Parse(self, response):
        path_to_store_image = '/home/fahad/Spyder_Projects/PexelCrawler/images/'
        if not os.path.exists(path_to_store_image):
            os.makedirs(path_to_store_image)

        image_links = response.xpath(
            '//*[@download="true"]/../a/@href').extract()
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
        }

        for image_link in image_links:
            if "images.pexels.com" not in image_link:
                continue
            image_id = image_link.split('/')[4]
            image_name = path_to_store_image + image_id + '.jpeg'
            if os.path.exists(image_name):
                continue

            picture_request = requests.get(image_link, headers=headers)
            if picture_request.status_code == 200:
                with open(image_name, 'wb') as f:
                    f.write(picture_request.content)
            else:
                print("response code %d for image id %s" %
                      (picture_request.status_code, image_id))
Example #17
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name':
    "//div[@class='col-xs-12 col-sm-12 col-md-12 col-lg-7 body-holder']/div[@class='summary entry-summary body']/div[@class='title']/h1[@class='product_title entry-title']",
    'price':
    "//div[@class='summary entry-summary body']/div[@class='prices clearfix']/ins/span[@class='amount']",
    'category':
    "//div[@class='breadcrumb-nav-holder minimal']/ul[@class='mc-breadcrumb']/li/span",
    'description':
    "//div[@class='container-fluid']/div[@class='tab-holder']/div[@class='tab-content']/div[@id='tab-description']",
    'images':
    "//div[@class='col-xs-12 col-sm-8 col-sm-offset-2 col-md-offset-3 col-lg-offset-0 col-md-6 col-lg-5 gallery-holder']/div[@class='images']/a/img/@src",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'bepluaviet.vn'
allowed_domains = ['bepluaviet.vn']
start_urls = ['http://bepluaviet.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/shop/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/sp/[/a-zA-Z0-9-]+$']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #18
class ScieloBr(BasePortiaSpider):
    name = "scielo_br"
    allowed_domains = [u'www.scielo.br', u'www.scielo.org.mx']
    start_urls = [
        # backfill of missing OA issues
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170004&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170005&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170006&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1679-395120160007&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100001&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100002&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100003&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100004&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110001&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110002&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110003&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110004&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120001&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120002&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120003&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120004&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130001&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130002&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130003&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130004&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100004&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110004&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120004&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420130002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420130003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420130004&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920160001&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920160005&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920160017&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170002&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170010&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170015&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170017&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170036&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170039&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170040&lng=pt&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1808-243220160001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1808-243220160002&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920100001&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920100002&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920110001&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920110002&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920120001&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920120002&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920140001&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920140002&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920150001&lng=en&nrm=iso',
        'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920150002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100004&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100005&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020110001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020110002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520160001&lng=pt&nrm=is',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520160002&lng=pt&nrm=is',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520160003&lng=pt&nrm=is',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520170001&lng=pt&nrm=is',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520170002&lng=pt&nrm=is',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520170003&lng=pt&nrm=is',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820160001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820160002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820160003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820170001&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820170002&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820170003&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1984-467020170001&lng=en&nrm=iso',
        u'http://www.scielo.org.mx/scielo.php?script=sci_issues&pid=0185-3309&lng=en&nrm=iso',
        u'http://www.scielo.br/scielo.php?script=sci_issues&pid=0102-0935&lng=en&nrm=iso',
        'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-204X2000000900007&lng=en&nrm=iso&tlng=pt',
        'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-09351999000600014&lng=en&nrm=iso&tlng=pt',
        'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-09352011000500030&lng=en&nrm=iso&tlng=pt',
        'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-09352011000600001&lng=en&nrm=iso&tlng=pt',
        'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-204X2015000900854&lng=en&nrm=iso&tlng=pt'
    ]
    rules = [
        #Rule(
        #    LinkExtractor(
        #        allow=(
        #            u'script=sci_issuetoc&pid=.*&lng=en&nrm=iso$'
        #        ),
        #    )),
        Rule(
            LinkExtractor(
                allow=(
                    u'script=sci_arttext&pid=.*&lng=en&nrm=iso&tlng=en',
                    u'script=sci_arttext&pid=.*&lng=en&nrm=iso&tlng=pt'
                ),),
            callback='parse_item',
            )
    ]

    def get_pdf(response):
        try:
            pdf_link_elem = Utils.select_element_by_content(response, "//*[@id='toolBox']/div/ul/li/a", "English (pdf)|Portuguese (pdf)")
        except Exception as e:
            return ""
        pdf_link = pdf_link_elem.xpath("@href").extract_first()
        pdf_link = urlparse.urljoin(response.url, pdf_link)
        return pdf_link

    def get_title(response):
        # the title also comes in several different formats
        title = response.xpath("//p[@class='trans-title']/text()").extract_first()
        if title is None:
            title = response.xpath("//p[@class='title']/text()").extract_first()
            if title is None:
                try:
                    title_elem = response.xpath("//div[contains(@class, 'index')]//p[@align='CENTER']")[0]
                    title = " ".join(title_elem.xpath(".//text()").extract())
                except Exception as e:
                    top_a_elem = response.xpath("//div[contains(@class, 'index')]//a[@name='top']")
                    title = top_a_elem.xpath("./..//b/text()").extract_first()
                    if title is None:
                        title_elem = response.xpath("//div[contains(@class, 'index')]/p")[2]
                        title = " ".join(title_elem.xpath(".//text()").extract())

        print "title is %s" % title
        return Utils.format_text(title)

    def get_abstract(response):
        try:
            abstract_elem = Utils.select_element_by_content(response, "//div[contains(@class, 'index')]//p", "ABSTRACT|Abstract")
            abstract_text = Utils.get_all_inner_texts(abstract_elem, "./following-sibling::p[1]")
            return abstract_text
        except Exception as e:
            return ""

    def get_keyword(response):
        try:
            keyword_elem = Utils.select_element_by_content(response, "//div[contains(@class, 'index')]//p", "Keywords|Index terms|Key words")
            keyword_text = "".join(keyword_elem.xpath(".//text()").extract()).replace("Keywords:", "").split(",")
        except Exception as e:
            return ""
        return keyword_text

    def get_author(response):
        # The author block comes in two formats.
        try:
            author = response.xpath("//div[@class='autores']/p[@class='author']/span[@class='author-name']/text()").extract()
            if len(author) == 0:
                sup_elem = response.xpath("//div[contains(@class, 'index')]/p//sup")[0]
                author_elem = sup_elem.xpath('./..')
                tag_name = author_elem.xpath('name()').extract_first()
                while tag_name != "p":
                    author_elem = author_elem.xpath('./..')
                    tag_name = author_elem.xpath('name()').extract_first()
                    
                author_raw_text = author_elem.extract()
                author = author_raw_text
        except Exception as e:
            return ""

        return author

    def get_author_sup(response):
        # The author superscripts come in two formats.
        try:
            author_sup = response.xpath("//div[@class='autores']/p[@class='author']/sup/a/text()").extract()
            if len(author_sup) == 0:
                i = 0
                num = 1 #len(response.xpath("//div[contains(@class, 'index')]/p//sup"))
                sup_elem = response.xpath("//div[contains(@class, 'index')]/p//sup")[0]
                author_elem = sup_elem.xpath("./..")
                author_sup = author_elem.xpath("./sup/text()").extract()
        except Exception as e:
            return ""

        return author_sup

    def get_author_affiliation(response):
        # The author affiliations come in two formats.
        try:
            author_aff = []
            elems = response.xpath("//p[@class='aff']")
            for elem in elems:
                txt = " ".join(elem.xpath(".//text()").extract()).strip().replace("\n", "")
                txt = ' '.join(txt.split()) #remove multi space
                author_aff.append(txt)

            if len(author_aff) == 0:
                sup_elem = response.xpath("//div[contains(@class, 'index')]/p//sup")[-1]
                author_raw_text = sup_elem.xpath("./..").extract()
                author_aff = author_raw_text
                print "author affiliation is :%s" % author_aff
        except Exception as e:
            return  ""
        return author_aff

    items = [[Item(PortiaItem,
                   None,
                   '',
                   [Field(u'pdf_link',
                          get_pdf,
                          []),
                       Field(u'journal',
                             '.content h2 a',
                             []),
                       Field(u'print_issn',
                             'h2:nth-child(6) *::text',
                             [Regex(u'Print version ISSN (\\d+-\\d+)')]),
                       Field(u'online_issn',
                             'h2:nth-child(6) *::text',
                             [Regex(u'On-line version ISSN (\\d+-\\d+)')]),
                       Field(u'issue',
                             'h3 *::text',
                             [Regex(u'.*(no\\.|supl\\.|n\\.)(\\d+).*$')]),
                       Field(u'volumn',
                             'h3 *::text',
                             [Regex(u'(vol\\.\\d+|ahead of print)')]),
                       Field(u'date',
                             'h3 *::text',
                             [Regex(u'.*(\\d{4})$')]),
                       Field(u'doi',
                             'h4 *::text',
                             [Regex(u'dx.doi.org/(.*)')]),
                       Field(u'title',
                             get_title,
                             []),
                       Field(u'author_raw',
                             get_author,
                             []),
                       #Field(u'author_sup',
                       #      get_author_sup,
                       #      []),
                       #Field(u'author_affiliation_raw',  # named *_raw: a processor is already registered for 'author_affiliation' on the item and would strip the markup from the scraped data
                       #      get_author_affiliation,
                       #      []),
                       Field(u'abstracts',
                             get_abstract,
                             []),
                       Field(u'keywords',
                             get_keyword,
                             []),
                       Field(u'copyright',
                             '.copyright *::text',
                             []),
                       Field(u'license_text',
                             '.license',
                             []),
                       Field(u'license_url',
                             '.license a:first-child::attr(href)',
                             [])])]]
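    # A minimal standalone sketch (not part of the original spider) of what the ISSN
    # Regex field processors above extract; the sample header string is hypothetical,
    # mimicking the "Print version ISSN ... On-line version ISSN ..." text on the pages.
    def _demo_issn_regex(header=u'Print version ISSN 0101-6628  On-line version ISSN 0102-0935'):
        import re
        print(re.search(r'Print version ISSN (\d+-\d+)', header).group(1))    # -> 0101-6628
        print(re.search(r'On-line version ISSN (\d+-\d+)', header).group(1))  # -> 0102-0935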
Exemple #19
0
class OldAutoSpider(CrawlSpider):
    name = "old_autos"
    allowed_domains = ['turbo.az']
    start_urls = [
        'https://turbo.az/autos/%s' % page
        for page in range(3895724, 603621, -1)
    ]

    # def start_requests(self):
    #     """
    #     :param self:
    #     """
    #     try:
    #         token = cfscrape.get_tokens(OldAutoSpider.start_urls[0])
    #         for url in OldAutoSpider.start_urls:
    #             yield scrapy.Request(
    #                 url=url,
    #                 cookies=token,
    #             )

    custom_settings = {
        'ITEM_PIPELINES': {
            'auto.pipelines.SaveOldAutosPipeline': 200,
            'auto.pipelines.OldAutoPipeline': 300,
        },
        'DOWNLOAD_DELAY': 2,
        'ROBOTSTXT_OBEY': True,
        'COOKIES_ENABLED': False,
        # 'ROTATING_PROXY_LIST': [
        #     'http://*****:*****@209.127.191.180:80',
        #     'http://*****:*****@193.8.56.119:80',
        #     'http://*****:*****@185.164.56.20:80',
        #     'http://*****:*****@45.130.255.243:80',
        #     'http://*****:*****@45.95.96.132:80',
        #     'http://*****:*****@45.95.96.237:80',
        # ],
    }
    rules = (Rule(
        LinkExtractor(restrict_xpaths=['//div[@class="product-body"]']),
        callback='parse_auto',
        follow=False), )

    def parse_auto(self, response):
        exists = response.xpath(
            '//div[@class="product-properties-container"]').extract_first()
        salon = response.xpath(
            '//div[@class="products-i vipped salon"]').extract()
        manat = response.xpath(
            '//div[@class="product-price"]/span/text()').extract_first()
        selector = Selector(response)
        l = OldAutosItemLoader(OldAutoItem(), selector)
        if exists:
            if salon:
                pass
            else:
                l.add_xpath(
                    'city',
                    '//li[@class="product-properties-i"]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'brand',
                    '//ul[@class="product-properties"]/li[2]/div[@class="product-properties-value"]/a/text()'
                )
                l.add_xpath(
                    'model',
                    '//ul[@class="product-properties"]/li[3]/div[@class="product-properties-value"]/a/text()'
                )
                l.add_xpath(
                    'year',
                    '//ul[@class="product-properties"]/li[4]/div[@class="product-properties-value"]/a/text()'
                )
                l.add_xpath(
                    'bodytype',
                    '//ul[@class="product-properties"]/li[5]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'color',
                    '//ul[@class="product-properties"]/li[6]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'engine',
                    '//ul[@class="product-properties"]/li[7]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'power',
                    '//ul[@class="product-properties"]/li[8]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'fuel',
                    '//ul[@class="product-properties"]/li[9]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'mileage',
                    '//ul[@class="product-properties"]/li[10]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'transmission',
                    '//ul[@class="product-properties"]/li[11]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'drivetype',
                    '//ul[@class="product-properties"]/li[12]/div[@class="product-properties-value"]/text()'
                )
                l.add_xpath(
                    'new',
                    '//ul[@class="product-properties"]/li[13]/div[@class="product-properties-value"]/text()'
                )
                if (manat == "AZN"):
                    l.add_xpath('pricem',
                                '//div[@class="product-price"]/text()')
                    l.add_value('priced', '1')
                else:
                    l.add_xpath('priced',
                                '//div[@class="product-price"]/text()')
                    l.add_value('pricem', '1')
                l.add_xpath('order',
                            '//div[@class="product-statistics"]/p[3]/text()')
                # l.add_value('adddate', datetime.datetime.now())
                l.add_xpath('adddate',
                            '//div[@class="product-statistics"]/p[2]/text()')

            return l.load_item()
Exemple #20
0
class CKSpider(CrawlSpider):

    name = 'dvwa_login'

    form_username = '******'
    form_password = '******'
    username = '******'
    password = '******'

    allowed_domains = ['192.168.57.30']

    login_page = 'http://192.168.57.30/login.php'

    start_urls = [
        'http://192.168.57.30/index.php',
    ]

    rules = [
        Rule(LinkExtractor(allow=(), deny=('/logout*')),
             callback="parse_item",
             follow=True)
    ]

    def start_requests(self):
        logging.debug("start send request")
        yield Request(url=self.login_page,
                      callback=self.login,
                      dont_filter=True)

    def login(self, response):
        logging.debug("submit login ")
        yield FormRequest.from_response(response,
                                        formdata={
                                            self.form_username: self.username,
                                            self.form_password: self.password
                                        },
                                        callback=self.check_login_response)
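        # FormRequest.from_response pre-fills every input found in the login form
        # (including any hidden CSRF-token field) and only overrides the entries passed
        # in formdata, so the redacted username/password field names above are enough.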

    def check_login_response(self, response):
        if "logout" in response.body:
            logging.debug("finish login")
            return self.parse(response)

    def parse(self, response):
        logging.debug("run parse item")
        yield self.parse_item(response)
        logging.debug("run parse")
        parsed_response = self._parse_response(response,
                                               self.parse_start_url,
                                               cb_kwargs={},
                                               follow=True)
        for requests_or_item in parsed_response:
            logging.debug("\nrequest or item after log in: \n")
            logging.debug(requests_or_item)
            yield requests_or_item

    def parse_item(self, response):

        url_obj = urlparse.urlsplit(response.url)
        url_ret = urlparse.urlunsplit(
            (url_obj.scheme, url_obj.netloc, url_obj.path, '', ''))

        item = URLItem()
        item['url_base'] = url_ret
        item['url_parameters'] = url_obj.query

        return item
Exemple #21
0
class XiamiSpider(BaseSpider):

    name = 'xiami'
    extractor = xiami_extractor
    download_delay = 0.1
    proxy_stable = True
    start_urls = ['http://www.xiami.com/']
    allowed_domains = [
        'www.xiami.com',
        'i.xiami.com',
    ]
    rules = (
        # Find some available indexes
        Rule(LinkExtractor(allow=(
            'com/chart',
            'com/genre',
            'com/zone',
        ))),
        # Find collect pages
        Rule(LinkExtractor(allow=(
            'com/collect$',
            'com/collect/\d+',
        ))),
        # Find artist indexes
        Rule(LinkExtractor(allow=(
            'artist/index',
            'artist/tag',
        ))),
        # Find artist pages
        Rule(LinkExtractor(allow=(
            'com/artist/\w+$',
            'search/find/artist',
        )),
             process_request='store_artists'),
    )
    set_artist = set()
    api_count = 'http://www.xiami.com/count/getplaycount?id=%s&type=song'

    def __init__(self):
        super(XiamiSpider, self).__init__()
        dispatcher.connect(self.spider_idle, signals.spider_idle)

    def store_artists(self, request):
        """Store artist urls into set rather than send requests"""
        if request.url not in self.set_artist:
            with open('xiami_artist_%s' % self.time_stamp, 'a') as output:
                output.write(request.url + '\n')
            self.set_artist.add(request.url)
        return None

    def spider_idle(self):
        """Send 10 requests of artist when request queue is empty in order to
        limit memory consumption"""
        count = 10
        while self.set_artist and count:
            count -= 1
            url = self.set_artist.pop()
            request = Request(url, dont_filter=True, callback=self.get_info)
            self.crawler.engine.crawl(request, self)
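        # Per Scrapy's spider_idle contract, scheduling requests from this handler keeps
        # the spider open, so the crawl only ends once set_artist has been fully drained.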

    def get_info(self, response):
        """Find callback function for different urls"""
        try:
            if re.search('artist/\d+', response.url) or \
                    re.search('i\.xiami\.com/[^/]+$', response.url):
                self.get_artist(response)
            elif re.search('album/\d+', response.url):
                self.get_albums(response)
            elif re.search('song/\d+', response.url):
                self.get_songs(response)
            elif 'count/getplaycount' in response.url:
                self.get_count(response)
            else:
                self.get_pages(response)
        except (AttributeError, TypeError):
            return
        request = self.gen_info(response)
        if not request:
            self.save(response.meta['source_id'], response.meta['raw_info'],
                      response.meta['result'])
        else:
            yield request

    def get_artist(self, response):
        result = self.extractor.parse_response(response)
        raw_info = {
            'html': response.body,
            'albums': [],
        }
        source_id = self.get_source_id(response)
        if 'redirect_urls' in response.meta:
            response.meta.pop('redirect_times')
            response.meta.pop('redirect_ttl')
            response.meta.pop('redirect_urls')
        response.meta.update({
            'requests': [],
            'raw_info': raw_info,
            'result': result,
            'source_id': source_id
        })
        soup = BeautifulSoup(response.body)
        album_info = soup.find('div', id='artist_album')
        if album_info:
            a_info = album_info.find('a', class_='more')
            response.meta['requests'].append(
                urlparse.urljoin(response.url, a_info['href']))

    @staticmethod
    def get_pages(response):
        soup = BeautifulSoup(response.body)
        div_info = soup.find('div', class_='albumBlock_list')
        p_info = div_info.find_all('p', class_='cover')
        for p in p_info:
            if p.find('span', class_='pubbing'):
                continue
            response.meta['requests'].append(
                urlparse.urljoin(response.url, p.a['href']))
        a_info = soup.find('a', class_='p_redirect_l')
        if a_info:
            response.meta['requests'].append(
                urlparse.urljoin(response.url, a_info['href']))

    def get_albums(self, response):
        raw_info = {
            'html': response.body,
            'songs': [],
        }
        response.meta['raw_info']['albums'].append(raw_info)
        response.meta['result']['albums'].append(
            self.extractor.parse_response_album(response))
        soup = BeautifulSoup(response.body)
        td_info = soup.find_all('td', class_='song_name')
        for td in td_info:
            response.meta['requests'].append(
                urlparse.urljoin(response.url, td.a['href']))

    def get_songs(self, response):
        response.meta['raw_info']['albums'][-1]['songs'].append(response.body)
        response.meta['result']['albums'][-1]['songs'].append(
            self.extractor.parse_response_song(response))
        m = re.search('song/(\d+)', response.url)
        response.meta['requests'].append(self.api_count % m.group(1))

    @staticmethod
    def get_count(response):
        data = json.loads(response.body)
        response.meta['result']['albums'][-1]['songs'][-1][
            'song_played'] = data['plays']

    def gen_info(self, response):
        if not response.meta['requests']:
            return None
        url = response.meta['requests'].pop()
        request = Request(url, meta=response.meta, callback=self.get_info)
        if re.search('song/\d+', url):
            self.download_delay = 1
            request.meta['download_slot'] = 'song'
        return request

    def save(self, source_id, raw_info, result):
        raw_json = json.dumps(raw_info, ensure_ascii=False, sort_keys=True)
        page_id = '%s_%s' % (self.get_source_name(), source_id)
        if self.storage:
            self.storage.save(page_id, raw_json)
        result_json = json.dumps(result, ensure_ascii=False, sort_keys=True)
        with open('%s_result_%s' % (self.name, self.time_stamp),
                  'a') as output:
            output.write(result_json + '\n')

    @staticmethod
    def get_source_id(response):
        m = re.search("id = '(\d+)'", response.body)
        return m.group(1)

    @staticmethod
    def process_request_headers(request):
        """Process request to get 200 response for xiami

            Xiami checks User-Agent in headers. Keep referer empty can keep
            away from login operation.
        """
        request.headers.setdefault(
            'User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/51.0.2704.103 Safari/537.36')
        if 'redirect_urls' not in request.meta:
            request.headers['Referer'] = None
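    # A hedged sketch (this wiring is not shown above): process_request_headers could be
    # attached to a crawl rule the same way store_artists is, e.g.
    #
    #     Rule(LinkExtractor(allow=('com/chart', 'com/genre', 'com/zone')),
    #          process_request='process_request_headers'),
    #
    # so that every extracted request gets the User-Agent/Referer treatment it describes.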
Exemple #22
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//table[@class='catalog-detail'][2]//tr[1]/td[1]/span",
    'price': "//div[@class='pro'][3]/span[@class='value']",
    'category': "//ul[@class='breadcrumb-navigation']/li/a",
    'description': "//table[@class='catalog-detail'][2]//tr[1]/td[1]",
    'images': "//div[@id='catalog-detail-main-image']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'thegioigiay.vn'
allowed_domains = ['thegioigiay.vn']
start_urls = ['http://thegioigiay.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/products+/\d+/\d+/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/products+/\d+/']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Exemple #23
0
class WalgreensProductsSpider(CrawlSpider):
    """Walgreens products spider."""

    name = 'products'
    allowed_domains = ['walgreens.com', 'bazaarvoice.com']
    start_urls = ['https://www.walgreens.com/store/catalog/shopLanding']
    rules = (
        Rule(LinkExtractor(allow=('/store/c/', ),
                           deny=('/ID=[^-]+-product', )),
             callback='parse_listing',
             follow=True),
        Rule(LinkExtractor(allow=('/ID=[^-]+-product', )),
             callback='parse_product'),
    )

    # -------------------------------------------------------------------------

    def parse_listing(self, response):
        """
        Extract product list.
        
        @url https://www.walgreens.com/store/c/eyes/ID=360457-tier3
        @returns requests 1
        """
        blob = response.css('script').re_first(
            r'__APP_INITIAL_STATE__ = (\{.+\});')
        if not blob:
            return

        data = json.loads(blob)

        if not data['searchResult'].get('productList'):
            return

        for each in data['searchResult']['productList']:
            yield response.follow(each['productInfo']['productURL'],
                                  callback=self.parse_product)

        limit = response.meta.get('limit', 24)
        offset = int(url_query_parameter(response.url, 'No', 0)) + limit

        # This must be yielded rather than returned: parse_listing is a generator, and a
        # value returned from a generator callback is discarded by Scrapy.
        yield response.follow(add_or_replace_parameter(
            response.url, 'No', offset),
                              callback=self.parse_listing,
                              meta={
                                  'offset': offset,
                                  'limit': limit
                              })
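    # The @url / @returns lines in the docstrings above and below are Scrapy contracts;
    # they can be exercised with `scrapy check products` (assuming the spider is
    # registered under that name in the project).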

    # -------------------------------------------------------------------------

    def parse_product(self, response):
        """
        Extract product details.
        
        @url https://www.walgreens.com/store/c/l.a.-colors-eyeliner-&-brow-pencil/ID=prod6248128-product
        @returns requests 1
        """
        loader = ProductItemLoader(ProductItem(), response)
        loader.add_value('id', response.url, re=r'/ID=([^-]+)')
        loader.add_css('name', '#productTitle')
        loader.add_css('regular_price', '#regular-price-info')
        loader.add_css('unit_price', '#unit-price')
        loader.add_xpath(
            'category',
            '//ul[has-class("nav__bread-crumbs")]/li[position() > 2]//a')
        loader.add_value('url', response.url)
        loader.add_css('description',
                       '#Details-0 + .wag-accordion-tab-content[id]')
        loader.add_css('warnings',
                       '#Warnings-1 + .wag-accordion-tab-content[id]')
        loader.add_css('ingredients',
                       '#Ingredients-2 + .wag-accordion-tab-content[id]')
        loader.add_css('shipping',
                       '#Shipping-3 + .wag-accordion-tab-content[id]')
        loader.add_css('main_image', '#productImg::attr(src)')
        loader.add_css('image_urls', '#thumbnailImages img::attr(src)')
        loader.add_css('rating', '#reviewsData > .pr10::text')
        loader.add_css('reviews_count',
                       '#reviewsData > .ml10::text',
                       re=r'\d+')
        product = loader.load_item()

        if product.get('reviews_count') and product['reviews_count'] > 0:
            return self.request_reviews(product)
        return product

    # -------------------------------------------------------------------------

    def parse_reviews(self, response):
        """Extract product reviews."""
        product = response.meta.get('product') or {}
        product['reviews'] = product.get('reviews') or []
        data = json.loads(response.body)

        for each in data['Results']:
            review = self.extract_review(each)
            product['reviews'].append(review)

        if product.get('reviews_count') > len(product['reviews']):
            offset = response.meta.get('offset') + len(data['Results'])
            return self.request_reviews(product, offset=offset)
        return product

    # -------------------------------------------------------------------------

    def request_reviews(self, product, offset=0, limit=30):
        """Request reviews."""
        return scrapy.FormRequest(
            method='GET',
            url='https://api.bazaarvoice.com/data/reviews.json',
            formdata={
                'Filter': 'ProductId:%s' % product['id'],
                'Sort': 'Helpfulness:desc',
                'Limit': str(limit),
                'Offset': str(offset),
                'Include': 'Comments',
                'Stats': 'Reviews',
                'passkey': 'tpcm2y0z48bicyt0z3et5n2xf',
                'apiversion': '5.4'
            },
            meta={
                'offset': offset,
                'limit': limit,
                'product': product
            },
            callback=self.parse_reviews)

    # -------------------------------------------------------------------------

    def extract_review(self, data):
        """Extract review details."""
        loader = ReviewItemLoader(ReviewItem())
        loader.add_value('id', data.get('Id'))
        loader.add_value('rating', data.get('Rating'))
        loader.add_value('title', data.get('Title'))
        loader.add_value('text', data.get('ReviewText'))
        loader.add_value('is_featured', data.get('IsFeatured'))
        loader.add_value('published_at', data.get('SubmissionTime'))
        loader.add_value('positive_feedback_count',
                         data.get('TotalPositiveFeedbackCount'))
        loader.add_value('negative_feedback_count',
                         data.get('TotalNegativeFeedbackCount'))
        loader.add_value('reviewer', self.extract_reviewer(data))
        return loader.load_item()

    # -------------------------------------------------------------------------

    def extract_reviewer(self, data):
        """Extract reviewer details."""
        loader = ReviewerItemLoader(ReviewerItem())
        loader.add_value('id', data.get('AuthorId'))
        loader.add_value('username', data.get('UserNickname'))
        loader.add_value('location', data.get('UserLocation'))
        loader.add_value(
            'properties', {
                name: data.get('Value')
                for name, data in data.get('ContextDataValues', {}).items()
            })
        return loader.load_item()
Exemple #24
0
class MopSpider(CrawlSpider):
    name = 'mop'

    allowed_domains = ['mop.com']

    start_urls = ['http://dzh.mop.com/']

    post_extract = LxmlLinkExtractor(
        allow=('/\d+.html', '/nofresh/\d+', 'mop\.com/\d+'),
        allow_domains=('dzh.mop.com'),
        # deny=(
        #
        # ),
        # deny_domains=(
        #
        # )
    )

    author_extract = LxmlLinkExtractor(
        allow=('/space/\d+/profile', ),
        allow_domains=('hi.mop.com'),
        # deny=(
        #
        # ),
        # deny_domains=(
        #
        # )
    )

    author_page_extract = LxmlLinkExtractor(
        allow=('/space/\d+', ),
        allow_domains=('hi.mop.com'),
        # deny=(
        #
        # ),
        # deny_domains=(
        #
        # )
    )

    fans_extract = LxmlLinkExtractor(
        allow=('/space/\d+/fans', ),
        allow_domains=('hi.mop.com'),
        # deny=(
        #
        # ),
        # deny_domains=(
        #
        # )
    )

    friends_extract = LxmlLinkExtractor(
        allow=('/space/\d+/follow', ),
        allow_domains=('hi.mop.com'),
        # deny=(
        #
        # ),
        # deny_domains=(
        #
        # )
    )

    follow_extract = LxmlLinkExtractor(
        # allow=(
        #     '/s/[0-9]+',
        # ),
        allow_domains=('dzh.mop.com'),
        # deny=(
        #     '/print.html'
        # ),
        # deny_domains=(
        #     'q.blog.sina.com.cn'
        # )
    )

    rules = (
        Rule(author_extract, follow=True, callback='parse_author'),
        Rule(fans_extract, follow=True, callback='parse_fans'),
        Rule(friends_extract, follow=True, callback='parse_friends'),
        Rule(author_page_extract, follow=True),
        Rule(post_extract, follow=True, callback='parse_post'),
        # Rule(follow_extract, follow=True, callback='parse_follow'),
        Rule(follow_extract, follow=True),
    )

    a_p_count = 0
    a_count = 0
    p_count = 0
    f_count = 0

    # def parse_page(self, response):
    #     self.a_p_count += 1
    #     print('author page: ', self.a_p_count, '  ', response.url)

    def parse_author(self, response):
        # self.a_count += 1
        # print('author: ', self.a_count, '  ', response.url)
        author_item = get_author_item(response)
        author_id = author_item['author_id']

        data_param = 'data=%7B"header"%3A%7B%7D%2C"req"%3A%7B"User%2FSubCount"%3A%7B"uid"%3A"' + \
                     author_id + '"%7D%2C"User%2FSnsCount"%3A%7B"uid"%3A"' + author_id + '"%7D%7D%7D'

        data_url = 'http://hi.mop.com/ajax/get?' + data_param
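        # The percent-encoded blob above decodes to the JSON payload
        #   data={"header":{},"req":{"User/SubCount":{"uid":<id>},"User/SnsCount":{"uid":<id>}}}
        # A rough equivalent (sketch only, using urllib's quote; the exact set of escaped
        # characters differs slightly from the hand-built string):
        #
        #     payload = json.dumps({"header": {}, "req": {
        #         "User/SubCount": {"uid": author_id},
        #         "User/SnsCount": {"uid": author_id}}}, separators=(",", ":"))
        #     data_url = 'http://hi.mop.com/ajax/get?data=' + quote(payload)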

        yield Request(
            url=data_url,
            callback=self.parse_author_data,
            method='POST',
            meta={'author_item': author_item},
            priority=10,
        )

    def parse_author_data(self, response):
        author_item = response.meta['author_item']
        data_json = response.text
        try:
            json_obj = json.loads(data_json)
            if json_obj:
                friends_num = json_obj['resp']['User/SnsCount']['retObj'][
                    'follow']
                author_item['friends_num'] = friends_num

                fans_num = json_obj['resp']['User/SnsCount']['retObj']['fans']
                author_item['fans_num'] = fans_num

                post_num = json_obj['resp']['User/SubCount']['retObj'][
                    'subject']
                author_item['post_num'] = post_num

                reply_num = json_obj['resp']['User/SubCount']['retObj'][
                    'reply']
                author_item['reply_num'] = reply_num
        finally:
            yield author_item

    def parse_post(self, response):
        # self.p_count += 1
        # print('post: ', self.p_count, '  ', response.url)
        post_item = get_post_item(response)
        post_id = post_item['post_id']

        for comment_item in get_comment_item(response, post_id):
            post_item['comment_ids'].append(comment_item['comment_id'])

            yield comment_item

        yield post_item

    # def parse_follow(self, response):
    #     self.f_count += 1
    #     print('follow: ', self.f_count, '  ', response.url)

    def parse_fans(self, response):
        sel = Selector(response)
        user_id = sel.xpath('//div[@class="hpUserInfo1"]/@uid').extract_first()

        fans_list = get_fans_item(response)
        for fans_id, fans_url in fans_list:
            fans_item = FansItem()
            fans_item['fans_id'] = fans_id
            fans_item['friends_id'] = user_id

            yield fans_item
            yield Request(url=fans_url + '/profile',
                          callback=self.parse_author)

    def parse_friends(self, response):
        sel = Selector(response)
        user_id = sel.xpath('//div[@class="hpUserInfo1"]/@uid').extract_first()

        friends_list = get_fans_item(response)
        for friends_id, friends_url in friends_list:
            fans_item = FansItem()
            fans_item['fans_id'] = user_id
            fans_item['friends_id'] = friends_id

            yield fans_item
            yield Request(url=friends_url + '/profile',
                          callback=self.parse_author)
Exemple #25
0
class FoshanSpider(CrawlSpider):
    name = "foshan"
    allowed_domains = ["foshannews.com", "foshannews.net"]
    start_urls = [
        'https://www.foshannews.com/', 'https://www.foshannews.com/fstt/',
        'https://www.foshannews.com/cc/', 'https://www.foshannews.com/nh/',
        'https://www.foshannews.com/sd/', 'https://www.foshannews.com/gm/',
        'https://www.foshannews.com/ss/', 'https://www.foshannews.com/jdyw/',
        'https://www.foshannews.com/cc/sstt/',
        'https://www.foshannews.com/sd/sdtt/',
        'https://www.foshannews.com/nh/nhtt/',
        'https://www.foshannews.com/gm/gmtt/',
        'https://www.foshannews.com/ss/sstt/'
    ]

    url_pattern = r'./*/t(\d{8})_(\d+)\.html'

    rules = (Rule(LinkExtractor(allow=(url_pattern)), 'parse_news'), )

    page_link = set()

    def start_requests(self):

        self.page_link = {
            'https://www.foshannews.com/', 'https://www.foshannews.com/fstt/',
            'https://www.foshannews.com/cc/', 'https://www.foshannews.com/nh/',
            'https://www.foshannews.com/sd/', 'https://www.foshannews.com/gm/',
            'https://www.foshannews.com/ss/',
            'https://www.foshannews.com/jdyw/'
        }

        for local in [
                'cc/cctt', 'sd/sdtt', 'nh/nhtt', 'gm/gmtt', 'ss/sstt', 'fstt',
                'jdyw'
        ]:
            for i in range(1, 35):
                url = "https://www.foshannews.com/{}/index_{}.html".format(
                    local, str(i))
                self.page_link.add(url)

        for url in self.page_link:
            yield self.make_requests_from_url(url)

    url_map = dict()

    def get_url_id(self, url):
        return url.split('/')[-1].split('.')[0]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))

        item = NewsItem()

        url_id = self.get_url_id(str(response.url))
        if url_id in self.url_map:
            self.url_map[url_id] += 1
            return item
        else:
            self.url_map[url_id] = 1

        item['contents'] = {
            'link': str(response.url),
            'title': u'',
            'passage': u''
        }
        item['contents']['title'] = sel.xpath('//h1/text()').extract_first()

        divs = sel.xpath('//div[@class=\'cont\']/div')
        list_doc = []
        for l in divs.xpath('.//div/text() | .//div/b/text()').extract():
            t = l.strip()
            if len(t) > 0:
                list_doc.append(t)

        for l in divs.xpath('.//span/text()').extract():
            t = l.strip()
            if len(t) > 0:
                list_doc.append(t)

        for l in divs.xpath('.//p/text() | .//p/strong/text()').extract():
            t = l.strip()
            if len(t) > 0:
                list_doc.append(t)

        item['contents']['passage'] = list_doc

        return item
Exemple #26
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='Gpod_box_giua']/div[@class='protp2']/h1",
    'price':
    "//div[@class='Gpod_box_giua']/div[@class='protp2']/ol/li/b/text()",
    'category': "//div[@class='menudd']/h1/a",
    'description':
    "//div[@class='Gpod_box_giua']/table//tr/td/div/div[@id='country1']",
    'images': "//div[@class='hbimg zoomp']/a[@id='Zoomer']/@href",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'tanphat.com.vn'
allowed_domains = ['tanphat.com.vn']
start_urls = ['http://www.tanphat.com.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+_id+\d+\.html']), 'parse_item'),
    Rule(
        LinkExtractor(allow=['/[a-zA-Z0-9-]+_dm+\d+\.html($|\?page=\d+$)'],
                      deny=['max=', 'min=', 'brand=', 'filter=']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Exemple #27
0
class AlbawabaSpider(CrawlSpider):

    name = "albawabacrawler"
    #allowed_domains =[url[0][8:] for url in csv.reader(open('/home/chrx/Desktop/Scrapy/HezbollahScraper/urls.csv','r'),delimiter =',')]
    allowed_domains = ["www.albawaba.com"]

    #start_urls = [url[0] for url in csv.reader(open('/home/chrx/Desktop/Scrapy/HezbollahScraper/urls.csv','r'),delimiter =',')]
    start_urls = ["https://www.albawaba.com"]
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3
    }
    rules = [
        Rule(LinkExtractor(unique=True),
             follow=True,
             callback="check_buzzwords")
    ]

    terms = []
    locations = []
    organizations = []
    wordlist = []

    with open('C:/Users/Alex/Desktop/HezbollahScrapper/terms_english.csv',
              'r') as csvfile:
        terms_reader = csv.reader(csvfile, delimiter=',')
        for row in terms_reader:
            terms.append(row[0])

    with open(
            'C:/Users/Alex/Desktop/HezbollahScrapper/organizations_english.csv',
            'r') as csvfile:
        terms_reader = csv.reader(csvfile, delimiter=',')
        for row in terms_reader:
            organizations.append(row[0])
    for term in terms:
        for organization in organizations:
            wordlist.append(tuple((term, organization)))
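    # wordlist is the Cartesian product of terms x organizations; check_buzzwords below
    # records a hit whenever both members of a pair occur in the same paragraph.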

    def check_buzzwords(self, response):
        url = response.url
        contenttype = response.headers.get("content-type",
                                           b"").decode('utf-8').lower()
        items = []

        paragraph_text = response.css("p::text")
        p_texts = [p.get() for p in paragraph_text]

        for p_text in p_texts:
            p_text_lower = p_text.lower()
            for word_row in self.wordlist:
                if word_row[0].lower() in p_text_lower and word_row[1].lower(
                ) in p_text_lower:
                    item = TutorialItem()
                    item["word"] = word_row[0]
                    item["url"] = url
                    item["sentence"] = p_text
                    items.append(item)

        return (items)

    # Only extract follow-up requests from responses that expose a text encoding
    # (binary responses are skipped).
    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) is not None:
            return CrawlSpider._requests_to_follow(self, response)
        else:
            return []
Exemple #28
0
class MonsterSpider(CrawlSpider):

    name = 'monster'
    allowed_domains = ['monster.fr']
    start_urls = gen_start_urls("http://www.monster.fr/emploi/recherche/?q=%s",
                                KEYWORDS, "-")
    login_page = 'https://login.monster.fr/Login/SignIn'
    rules = [
        Rule(
            LinkExtractor(allow=['.*jobPosition=.*?']),
            'parse_job',
        ),
        Rule(LinkExtractor(allow=('.*page=.*?', )), follow=True)
    ]

    def parse_job(self, response):
        job = ScraperItem()
        sel = Selector(response)
        job['url'] = response.url
        job_offer = sel.xpath('//title/text()').extract()
        job_offer = job_offer[0].strip()
        job_offer = job_offer.split('-')
        job['name'] = job_offer[0]
        job["email"] = None
        job["phone"] = None
        return job

    @staticmethod
    def execute_js():
        from scraper.models import Jobs, db_connect
        from selenium.webdriver.common.action_chains import ActionChains
        from sqlalchemy.orm import sessionmaker
        from selenium.common.exceptions import NoSuchElementException, UnexpectedAlertPresentException
        from selenium import webdriver
        import re
        import time

        # Get DB engine
        engine = db_connect()
        Session = sessionmaker()
        Session.configure(bind=engine)
        session = Session()

        # Iterate through job urls
        urls = []
        q = session.query(
            Jobs).filter((Jobs.url.like('http://offre-emploi.monster.fr%'))
                         & (Jobs.processed == False)).all()
        for url in q:
            urls.append(url.url)

        # Init browser
        profile = webdriver.FirefoxProfile()
        profile.set_preference("browser.cache.disk.enable", False)
        profile.set_preference("browser.cache.memory.enable", False)
        profile.set_preference("browser.cache.offline.enable", False)
        profile.set_preference("network.http.use-cache", False)

        browser = webdriver.Firefox(profile)
        action = ActionChains(browser)

        # Login to user space
        browser.get("https://login.monster.fr/Login/SignIn", )
        browser.find_element_by_name("EmailAddress").send_keys(EMAIL)
        browser.find_element_by_name("Password").send_keys(PASSWORD)

        elem = browser.find_element_by_xpath(
            "//*[@id=\"signInContent\"]/form/div[3]/input[1]")
        action.move_to_element(elem).click().perform()
        time.sleep(5)

        # for each url, click on 'postuler'
        link = "http://offre-emploi.monster.fr/Apply/Apply.aspx?JobID="
        for url in urls:
            apply_link = re.findall(r"\b\d{6}\w+", url)
            try:
                apply_link = link + apply_link[0]
                print "* Processing %s" % url
                browser.get(apply_link)
                if 'Vous postulez' in browser.page_source.encode("utf-8"):
                    browser.find_element_by_css_selector(
                        "#CoverLetter1_DropDownListLetters > option:nth-child(2)"
                    ).click()
                    browser.find_element_by_css_selector(
                        "#rbAuthorizedNo0").click()

                    # Click on "POSTULER"
                    browser.find_element_by_id('btnSubmit').click()
                    time.sleep(5)

                else:
                    pass

            except NoSuchElementException:
                raise

            except UnexpectedAlertPresentException:
                alert = browser.switch_to_alert()
                #alert.dismiss()
                continue

            finally:
                # Update database
                session.query(Jobs).filter(Jobs.url == url).update(
                    {'processed': True})
                session.commit()
                session.close()

        browser.close()
Exemple #29
0
class PCComponentes(CrawlSpider):
    name = 'sonae-pccomponentes'
    allowed_domains = ['pccomponentes.pt']
    start_urls = ['https://www.pccomponentes.pt/']

    categories = LinkExtractor(
        restrict_xpaths=('//*[contains(@class, "menu-principal")]',
                         '//*[contains(@class, "enlaces-clave")]'))

    rules = (Rule(categories, callback='parse_category'), )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)

        if hasattr(self, 'prev_crawl_id'):
            filename = os.path.join(DATA_DIR,
                                    '%s_products.csv' % self.prev_crawl_id)

            with open(filename) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    product = Product()
                    for key in row:
                        if row[key]:
                            product[key] = row[key].decode('utf8')
                    yield Request(row['url'],
                                  self.parse_product,
                                  meta={'item': Product(product)})
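    # When prev_crawl_id is supplied, products from the previous crawl's CSV are re-queued
    # straight to parse_product, presumably to refresh price and stock data without
    # re-walking the category tree.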

    def parse_category(self, response):
        try:
            data = SpiderSchema(response).get_products()
        except:
            return
        products = False
        for product in data:
            if not product.get('sku'):
                continue
            products = True
            loader = ProductLoader(Product(), response=response)
            loader.add_value('identifier', product['sku'])
            loader.add_value('url', product['url'][0])
            loader.add_value('name', product['name'])
            loader.add_value('sku', product['sku'])
            category = response.css('a.GTM-breadcumb::text').extract(
            )[1:] or response.meta.get('category')
            loader.add_value('category', category)
            loader.add_value('image_url', product['image'])
            loader.add_value('brand', product['brand'])
            if product['offers']['properties']['availability'] != 'in stock':
                loader.add_value('stock', 0)
            price = product['offers']['properties']['price']
            yield Request(loader.get_output_value('url'),
                          self.parse_product,
                          meta={'item': Product(loader.load_item())})
        if not products:
            return

        page = url_query_parameter(response.url, 'page')
        if page:
            url = add_or_replace_parameter(response.url, 'page', int(page) + 1)
        else:
            id_families = response.xpath(
                '//input[@data-key="idFamilies"]/@value').extract_first()
            if id_families:
                url = add_or_replace_parameter(
                    'https://www.pccomponentes.pt/listado/ajax?page=0&order=price-desc',
                    'idFamilies[]', id_families)
            elif response.url.endswith('/novedades/'):
                return
            elif response.url.endswith('/'):
                url = response.url + 'ajax?page=0&order=price-desc'
            else:
                return

        yield Request(url, self.parse_category, meta={'category': category})

    def parse_product(self, response):
        item = response.meta['item']
        data = SpiderSchema(response).get_product()
        category = response.css('a.GTM-breadcumb::text').extract()[1:]
        loader = ProductLoaderEU(Product(), response=response)
        loader.add_value(None, item)
        loader.replace_value('price', data['offers']['properties']['price'])
        loader.replace_value('category', category)
        if data['offers']['properties']['availability'] != 'inStock':
            loader.replace_value('stock', 0)
        yield loader.load_item()
Exemple #30
0
class BiqugeSpider(CrawlSpider):
    name = 'biquge'
    allowed_domains = ['www.biquyun.com', 'biquyun.com']
    start_urls = ['http://www.biquyun.com/']

    custom_settings = {
        "DOWNLOAD_DELAY": 0.5,
        "USE_PROXY": False,
        "IGNORE_NOVEL": set({}),
        "RETAIN_NOVEL": set({}),
    }
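    # USE_PROXY, IGNORE_NOVEL and RETAIN_NOVEL are not built-in Scrapy settings;
    # presumably they are consumed by project-specific middlewares or pipelines not shown here.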

    rules = (
        Rule(LinkExtractor(allow=r'\d+_\d+/$'), callback='parse_novel', follow=True),
        # Rule(LinkExtractor(allow=r'.*?/\d+.html'), callback='parse_chapter', follow=True,
        #      process_request="custom_process_request"),
    )

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BiqugeSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_close, signal=signals.spider_closed)
        return spider

    def __init__(self, *args, **kwargs):
        super(BiqugeSpider, self).__init__(*args, **kwargs)

    def spider_close(self):
        self.logger.info("{0} finish.".format(self.name))

    def parse_novel(self, response):
        author_widget = response.xpath("//div[@id='info']/p[1]/text()").extract()
        author = "".join([get_author_by_biquge(author) for author in author_widget])
        novel = response.css("#info h1::text").extract_first().strip()

        # Skip novels that already exist in storage.
        if novel_is_exists(novel_name=novel, author_name=author):
            self.logger.info("Skipping existing novel [{0}]".format(novel))
            return None

        item_loader = ItemLoader(item=NovelItem(), response=response)
        item_loader.add_value("url", response.url)
        item_loader.add_css("image_url", "#fmimg img::attr(src)")
        item_loader.add_css("site_name", ".header_logo a::text")
        item_loader.add_value("novel_name", novel)
        item_loader.add_value("spider_name", self.name)
        item_loader.add_value("author", author)

        category = response.xpath("//div[@class='con_top']/a[2]/text()").extract_first()
        if not category:
            categories = response.css(".con_top::text").extract()
            category = get_category_by_biquge("".join(categories))
        category = category or "其他小说"
        item_loader.add_value("category", category)

        item_loader.add_css("intro", "#intro")
        item = item_loader.load_item()
        yield item

        urls = response.css("#list dl dd a::attr(href)").extract()
        for url in urls:
            url = parse.urljoin(response.url, url)
            yield Request(url=url, dont_filter=True, meta={"novel": novel, "author": author},
                          callback=self.parse_chapter)

    def parse_chapter(self, response):
        author = response.meta.get("author", "")
        novel = response.meta.get("novel", "")

        item_loader = ItemLoader(item=ChapterItem(), response=response)
        item_loader.add_value("url", response.url)

        index = get_chapter_index_by_biquge(response.url)
        item_loader.add_value("index", index)

        item_loader.add_css("name", ".bookname h1::text")
        item_loader.add_value("novel_name", novel)
        item_loader.add_value("author_name", author)
        item = item_loader.load_item()
        return item