from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = {
    'china': (
        Rule(LinkExtractor(
            allow=r'article/.*\.html',
            restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
            callback='parse_item'),
        # "下一页" = next-page link
        # Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    ),
    'tianyancha': (
        Rule(LinkExtractor(
            allow=r'company/.*',
            restrict_xpaths='//div[contains(@class,"search-result-single")]//div[@class="header"]'),
            callback='parse_item'),
    ),
}
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@id='info_detail_product']/h5[1]/b",
    'price': "//div[@id='info_detail_product']/h5[3]/b",
    'category': "",
    'description': "//div[@id='info_detail_product']/font",
    'images': "//div[@class='img_detail_product']/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'chipmobile.com.vn'
allowed_domains = ['chipmobile.com.vn']
start_urls = ['http://chipmobile.com.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/?page=chi-tiet-san-pham']), 'parse_item'),
    Rule(LinkExtractor(allow=['/?page=nhom-san-pham']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
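# Note: the auto-generated templates in this file only declare an XPATH field mapping
# and link-extraction rules; the parse_item callback itself is presumably supplied by a
# shared base spider that generator.py wires in (not shown here). A minimal, hypothetical
# sketch of what such a callback could look like, assuming plain dict items and the
# field names used above:
import scrapy


class GeneratedSpiderSketch(scrapy.Spider):
    """Hypothetical stand-in for the generated base class."""
    name = 'generated_sketch'

    def parse_item(self, response):
        item = {'url': response.url}
        for field, xpath in XPATH.items():
            if not xpath:
                continue  # fields without a configured selector stay empty
            values = response.xpath(xpath).getall()
            # keep every match for image URLs, join text fields into one string
            item[field] = values if field == 'images' else ' '.join(v.strip() for v in values)
        yield item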
class BossSpider(CrawlSpider): name = 'boss' start_urls = [ 'https://www.zhipin.com/gongsir/5d627415a46b4a750nJ9.html?page=1' ] url1 = 'https://www.zhipin.com' #用来做拼接 # 匹配职位列表页的规则(定义抽取连接规则) rules = (Rule(LinkExtractor(allow=r'.+\?page=\d+'), callback="parse_url", follow=True), ) # 匹配详情页的规则 # rules = ( # Rule(LinkExtractor(allow=r'.+job_detail/\w+~.html'), callback="detail_parse", follow=False), # ) def parse_url(self, response): item = GetBossItem() for i in range(1, 15): url = response.xpath( '//*[@id="main"]/div[2]/div[2]/div[2]/ul/li[{}]/a/@href'. format(str(i))).extract() url = self.url1 + str(url[0]) print(url) # if item['url']: yield Request( url, callback=self.detail_parse, #回调详情页函数 meta={'item': item}, #将参数传递给meta# priority=10, dont_filter=True, #强制不过滤 #headers=headers # headers=self.headers ) def detail_parse(self, response): item = response.meta['item'] #接收item # 企业名称 dp_name = response.xpath( '//div[@class="job-sec"]/div[@class="name"]/text()').get().strip() # 企业类型 dp_type = response.xpath( '//div[@class="level-list"]/li[@class="company-type"]/text()' ).getall()[0] # 企业成立时间 dp_founded = response.xpath( '//div[@class="level-list"]/li[@class="res-time"]/text()').getall( )[0] # 职位名称 job_name = response.xpath( '//div[@class="company-info"]/div[@class="name"]/h1/text()').get( ).strip() # 学历要求 education = response.xpath( '//*[@id="main"]/div[1]/div/div/div[2]/p/text()').getall()[2] # 工作经验要求 experience = response.xpath( '//*[@id="main"]/div[1]/div/div/div[2]/p/text()').getall()[1] # 薪资 salary = response.xpath( '//*[@id="main"]/div[1]/div/div/div[2]/div[2]/span/text()').get( ).strip() # 招聘状态 state = response.xpath( '//*[@id="main"]/div[3]/div/div[1]/div[2]/p[6]/text()').get( ).strip() # 职位描述 description = response.xpath( '//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div/text()' ).getall() description = str(description).strip('[\']\\n ') # 员工福利 welfare = response.xpath( '//*[@id="main"]/div[1]/div/div/div[2]/div[3]/div[2]/span/text()' ).getall() welfare = str(welfare) # 工作地址 address = response.xpath( '//div[@class="job-location"]/div[@class="location-address"]/text()' ).get().strip() item['dp_name'] = dp_name item['dp_type'] = dp_type item['dp_founded'] = dp_founded item['job_name'] = job_name item['education'] = education item['experience'] = experience item['salary'] = salary item['state'] = state item['description'] = description item['welfare'] = welfare item['address'] = address yield item
class YiyaoSpider(CrawlSpider): name = 'YiYao' allowed_domains = ['www.cpi.ac.cn', 'www.cccmhpie.org.cn'] start_urls = [ 'http://www.cpi.ac.cn/publish/default/hyzx/index.htm', 'http://www.cccmhpie.org.cn/ShowNewsList.aspx?QueryStr=x08x12o8q7x09x01w1z4892x9994z6164z5759zO3w8w1u9v5v5v5zO3x10x02x11p4x2X12x01w1u8z8p2x01q9p4x2X12x01w1u9z8w7x08q7x15x15p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w7x08q7x15x15p4q7q8x08x01o8q7x09x01w1p3x2X15q5w7x08q7x15x15z8p5x10x05x13x17x01o3w8w1z8w8q7x16q7p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w8q7x16q7p4q7q8x08x01o8q7x09x01w1w8x11q9q5o0x05x14x15x16pQ7x03x01z8x00x0X15q9p5x10x05x13x17x01o3w8w1u9v5v5v5z8p2x1X1X16w7x08q7x15x15o3w8w1v7u8u9v5z8w7x08q7x15x15o3w8w1u9v5v5v5zO6x05x10x07o3w8w1u9v5v5v5', 'http://www.cccmhpie.org.cn/ShowNewsList.aspx?QueryStr=x08x12o8q7x09x01w1y2269z8469y1160y4577zO3w8w1u9v5v5v1zO3x10x02x11p4x2X12x01w1u8z8p2x01q9p4x2X12x01w1u9z8w7x08q7x15x15p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w7x08q7x15x15p4q7q8x08x01o8q7x09x01w1p3x2X15q5w7x08q7x15x15z8p5x10x05x13x17x01o3w8w1z8w8q7x16q7p3x0X14x18x0X14o3w8w1p3p9p3p3x0X14x18x0X14z8w8q7x16q7p4q7q8x08x01o8q7x09x01w1w8x11q9q5o0x05x14x15x16pQ7x03x01z8x00x0X15q9p5x10x05x13x17x01o3w8w1u9v5v5v1z8p2x1X1X16w7x08q7x15x15o3w8w1v7u8u9v5z8w7x08q7x15x15o3w8w1u9v5v5v1zO6x05x10x07o3w8w1u9v5v5v1' ] custom_settings = { # 并发请求 'CONCURRENT_REQUESTS': 10, # 'CONCURRENT_REQUESTS_PER_DOMAIN': 1000000, 'CONCURRENT_REQUESTS_PER_IP': 0, # 下载暂停 'DOWNLOAD_DELAY': 0.5, 'ITEM_PIPELINES': { # 设置异步入库方式 'HY_NEWS.pipelines.MysqlTwistedPipeline': 600, # 去重逻辑 # 'HY_NEWS.pipelines.DuplicatesPipeline': 200, }, 'DOWNLOADER_MIDDLEWARES': { # 调用 scrapy_splash 打开此设置 # 'scrapy_splash.SplashCookiesMiddleware': 723, # 'scrapy_splash.SplashMiddleware': 725, # 设置设置默认代理 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 700, # 设置请求代理服务器 # 'HY_NEWS.util_custom.middleware.middlewares.ProxyMiddleWare': 100, # 设置scrapy 自带请求头 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, # 自定义随机请求头 'HY_NEWS.util_custom.middleware.middlewares.MyUserAgentMiddleware': 120, # 重试中间件 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, # 重试中间件 'HY_NEWS.util_custom.middleware.middlewares.MyRetryMiddleware': 90, }, # 调用 scrapy_splash 打开此设置 # 'SPIDER_MIDDLEWARES': { # 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, # }, # 去重/api端口 # 'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter', # # 'SPLASH_URL': "http://10.8.32.122:8050/" # 'SPLASH_URL': "http://127.0.0.1:8050/" } rules = ( Rule(LinkExtractor(restrict_css='.pager a:nth-child(3) '), follow=True), Rule(LinkExtractor(restrict_css='.news-li a '), callback='parse_item', follow=True), Rule(LinkExtractor(restrict_css='.DocTitle a'), callback='parse_item', follow=True), ) def parse_item(self, response): item = HyNewsItem() resp = response.text extractor = GeneralNewsExtractor() result = extractor.extract(resp, with_body_html=False) title = result['title'] txt = result['content'] p_time = result['publish_time'] lyurl = response.url lyname = '医药' content_css = [ '.left-cc', '.pagesContent', ] for content in content_css: content = ''.join(response.css(content).extract()) if content: break if not content: logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent') classify, codes, region = get_category(txt) item['title'] = title item['txt'] = txt item['p_time'] = get_times(p_time) item['content'] = content item['spider_name'] = 'YiYao' item['module_name'] = '行业新闻' item['cate'] = classify item['region'] = region item['code'] = codes item['link'] = lyurl item['website'] = 
lyname
        if content:
            yield item
class LagouSpider(CrawlSpider): name = 'lagou' allowed_domains = ['www.lagou.com'] start_urls = ['http://www.lagou.com/'] rules = ( # Rule(LinkExtractor(allow=(r'zhaopin/.*',)), follow=True), # Rule(LinkExtractor(allow=(r'gongsi/j\d+.html',)), follow=True), Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=False), ) def parse_job(self, response): # 解析拉勾网的职位 item_loader = ArticleItemLoader(item=LagouJobItem(), response=response) item_loader.add_css("title", ".job-name::attr(title)") item_loader.add_value("url", response.url) item_loader.add_value("url_object_id", get_md5(response.url)) item_loader.add_css("salary", ".job_request .salary::text") item_loader.add_xpath("job_city", "//*[@class='job_request']//span[2]/text()") item_loader.add_xpath("work_years", "//*[@class='job_request']//span[3]/text()") item_loader.add_xpath("degree_need", "//*[@class='job_request']//span[4]/text()") item_loader.add_xpath("job_type", "//*[@class='job_request']//span[5]/text()") item_loader.add_css("tags", '.position-label li::text') item_loader.add_css("publish_time", ".publish_time::text") item_loader.add_css("job_advantage", ".job-advantage p::text") item_loader.add_css("job_desc", ".job_bt div") item_loader.add_css("job_addr", ".work_addr") item_loader.add_css("company_name", "#job_company dt a img::attr(alt)") item_loader.add_css("company_url", "#job_company dt a::attr(href)") item_loader.add_value("crawl_time", datetime.now()) job_item = item_loader.load_item() return job_item def start_requests(self): # 使用selenium模拟登录后拿到cookie交给scrapy的request使用 # 1、通过selenium模拟登录 # 从文件中读取cookies cookies = [] if os.path.exists("lagou.cookie"): cookies = pickle.load(open("lagou.cookie", "rb")) if not cookies: import time from selenium import webdriver browser = webdriver.Chrome( executable_path="C:/scrapy/chromedriver.exe") browser.get("https://passport.lagou.com/login/login.html") browser.find_element_by_css_selector( ".form_body .input.input_white").send_keys("18140523326") browser.find_element_by_css_selector( '.form_body input[type="password"]').send_keys("linfeng0328") browser.find_element_by_css_selector( 'div[data-view="passwordLogin"] input.btn_lg').click() input("检查网页是否有验证码要输入,有就在网页输入验证码,输入完后,控制台回车;如果无验证码,则直接回车") time.sleep(3) cookies = browser.get_cookies() # 写入cookie到文件中 pickle.dump(cookies, open("lagou.cookie", "wb")) cookie_dict = {} for cookie in cookies: cookie_dict[cookie["name"]] = cookie["value"] for url in self.start_urls: yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
class LagouSpider(CrawlSpider): name = 'lagou' allowed_domains = ['www.lagou.com'] start_urls = [ 'https://www.lagou.com/jobs/list_%E8%BD%AF%E4%BB%B6%E6%B5%8B%E8%AF%95?px=default&city=%E6%B7%B1%E5%9C%B3#filterBox' ] custom_settings = { "COOKIES_ENABLED": False, "DOWNLOAD_DELAY": 1, 'DEFAULT_REQUEST_HEADERS': { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Connection': 'keep-alive', 'Cookie': 'user_trace_token=20171015132411-12af3b52-3a51-466f-bfae-a98fc96b4f90; LGUID=20171015132412-13eaf40f-b169-11e7-960b-525400f775ce; SEARCH_ID=070e82cdbbc04cc8b97710c2c0159ce1; ab_test_random_num=0; X_HTTP_TOKEN=d1cf855aacf760c3965ee017e0d3eb96; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DsXIrWUxpNGLE2g_bKzlUCXPTRJMHxfCs6L20RqgCpUq%26wd%3D%26eqid%3Dee53adaf00026e940000000559e354cc; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_hotjob; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAAFCAAEG50060B788C4EED616EB9D1BF30380575; _gat=1; _ga=GA1.2.471681568.1508045060; LGSID=20171015203008-94e1afa5-b1a4-11e7-9788-525400f775ce; LGRID=20171015204552-c792b887-b1a6-11e7-9788-525400f775ce', 'Host': 'www.lagou.com', 'Origin': 'https://www.lagou.com', 'Referer': 'https://www.lagou.com/', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', } } rules = ( Rule(LinkExtractor(allow=(r'zhaopin/.*', )), follow=True), Rule(LinkExtractor(allow=(r'gongsi/j\d\.html', )), follow=True), Rule(LinkExtractor( allow=(r'jobs/.*', ), restrict_css=("div#s_position_list ul.item_con_list"), ), callback='parse_item', follow=False), ) def parse_item(self, response): Item_loader = LagouItemLoader(item=LagouItem(), response=response) # TITLE Item_loader.add_xpath("title", "//div[@class='job-name']/@title") # URL Item_loader.add_value("url", response.url) # salary Item_loader.add_xpath("salary", "//dd[@class='job_request']/p/span[1]/text()") # job_city Item_loader.add_xpath("job_city", "//dd[@class='job_request']/p/span[2]/text()") # work_years Item_loader.add_xpath("work_years", "//dd[@class='job_request']/p/span[3]/text()") # degree_need Item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()") # job_type Item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()") # tags Item_loader.add_xpath("tags", "//li[@class='labels']/text()") # publish-time Item_loader.add_xpath("publish_time", "//p[@class='publish_time']/text()") # job_advantage Item_loader.add_xpath("job_advantage", "//dd[@class='job-advantage']/p/text()") # job_desc Item_loader.add_xpath("job_desc", "//dd[@class='job_bt']/div/p/text()") # work_addr Item_loader.add_xpath("work_addr", "//div[@class='work_addr']/a/text()") # company_name Item_loader.add_xpath("company_name", "//dl[@class='job_company']/dt/a/img/@alt") # company_url Item_loader.add_xpath("company_url", "//dl[@class='job_company']/dt/a/@href") lagou_item_loader = Item_loader.load_item() return lagou_item_loader
class XiaoshoumoviesSpider(CrawlSpider): """ name:scrapy唯一定位实例的属性,必须唯一 allowed_domains:允许爬取的域名列表,不设置表示允许爬取所有 start_urls:起始爬取列表 start_requests:它就是从start_urls中读取链接,然后使用make_requests_from_url生成Request, 这就意味我们可以在start_requests方法中根据我们自己的需求往start_urls中写入 我们自定义的规律的链接 parse:回调函数,处理response并返回处理后的数据和需要跟进的url log:打印日志信息 closed:关闭spider """ name = 'XiaoShouMovies' allowed_domains = ['www.p4vip.com'] start_urls = ['http://www.p4vip.com/?m=vod-type-id-1.html', 'http://www.p4vip.com/?m=vod-type-id-2.html', 'http://www.p4vip.com/?m=vod-type-id-3.html', 'http://www.p4vip.com/?m=vod-type-id-4.html', 'http://www.p4vip.com/?m=vod-type-id-16.html'] # 连接提取器:会去起始url响应回来的页面中提取指定的url # rules元组中存放的是不同的规则解析器(封装好了某种解析规则) rules = ( # 规则解析器:可以将连接提取器提取到的所有连接表示的页面进行指定规则(回调函数)的解析 # Rule(LinkExtractor(allow=r'.*type-id-8-.*'), follow=True), Rule(LinkExtractor(restrict_xpaths='//a[text()="下一页"]'), follow=True), # Rule(LinkExtractor(allow=r'.*detail-id-\d{1,8}\.html'),callback='parse_list', follow=True), Rule(LinkExtractor(restrict_xpaths='//a[@class="link-hover"]'), callback='parse_list', follow=True), # 由于该网站所有的播放链接都是存在一起的,即在任何一个剧集播放页面即可拿到全集的播放地址链接。所以单独爬取一个页面即可,不需要爬取全部页面链接 Rule(LinkExtractor(allow='.*play-id-\d{1,8}-src-1-num-1.html'), callback='parse_link', follow=False), ) # 解析列表页方法 def parse_list(self, response): # # 根据xpath表达式提取电影各种信息 # for a in a_list: # item = MovieByXiaoShouItem() # item['movie_name'] = a.xpath('@title').get() # item['movie_cover'] = a.xpath('./img/@data-original').get() # yield item # print(response.xpath('//dt[@class="name"]/text()').get()) item = MoviespidersItem() try: # item['num'] = re.findall('(?<=detail-id-).*(?=\.html)',response.request.url)[0] item['num'] = re.findall(r'\d{1,8}', response.request.url)[1] item['name'] = response.xpath('//dt[@class="name"]/text()').get() item['cover'] = response.xpath('//img[@class="lazy"]/@data-original').get() starrings = response.xpath('//dt[2]/a/text()') starring_arr = [] for starring in starrings: starring_arr.append(starring.get()) item['starrings'] = ','.join(starring_arr) item['type'] = response.xpath('//dt[3]/a/text()').get() item['director'] = response.xpath('//dd[1]/a/text()').get() item['region'] = response.xpath('//dd[1]/dd/text()').get() item['year'] = response.xpath('//dd[2]/text()').get() item['language'] = response.xpath('//dd[3]/text()').get() introduction = response.xpath('//div[@class="tab-jq"]/span/text()').get() if introduction is None: introduction = response.xpath('//div[@class="tab-jq"]/text()').get().strip() item['introduction'] = introduction item['state'] = response.xpath('//span[@class="bz"]/text()').get() item['fromUrl'] = response.request.url except Exception: with open('./exception.txt', 'w') as f: f.write(response.xpath('//dt[@class="name"]/text()').get() + ' --- ' + response.request.url + '\n') f.close() return item # 解析播放页的播放地址 def parse_link(self, response): item = MovieplayItem() try: item['num'] = re.findall(r'\d{1,8}', response.request.url)[1] playLink = re.findall(r'(?<=mac_url\=unescape\(\').*?(?=\'\))', response.text)[0] link = execjs.eval("unescape('" + playLink + "')") item['playLink'] = link except Exception: with open('./exception.txt', 'w') as f: f.write( response.xpath('//dt[@class="name"]/text()').get() + ' -url- ' + response.request.url + '\n') f.close() return item
class EzyVisionSpider(Spider): name = 'specsavers_nz-ezyvision.co.nz' allowed_domains = ('ezyvision.co.nz', ) start_urls = ['http://www.ezyvision.co.nz/'] rules = (Rule(LinkExtractor(allow=('Brand', ))), ) def __init__(self, *args, **kwargs): super(EzyVisionSpider, self).__init__(*args, **kwargs) dispatcher.connect(self.spider_idle, signals.spider_idle) def spider_idle(self, spider): self.log('spider idle called') if spider.name == self.name: req = Request('http://www.ezyvision.co.nz/search', callback=self.parse_search) self.crawler.engine.crawl(req, self) def parse(self, response): urls = response.xpath('//a/@href').extract() urls = [url for url in urls if 'Brand=' in url] for url in urls: yield Request(urljoin(get_base_url(response), url), callback=self.parse_brand) def parse_brand(self, response): brand = url_query_parameter(response.url, 'Brand', '') urls = response.xpath( '//section[@id="productList"]//a/@href').extract() for url in urls: yield Request(urljoin(get_base_url(response), url), meta={'brand': brand}, callback=self.parse_product) def parse_search(self, response): brand = url_query_parameter(response.url, 'Brand', '') urls = response.xpath( '//section[@id="productList"]//a/@href').extract() for url in urls: yield Request(urljoin(get_base_url(response), url), meta={'brand': brand}, callback=self.parse_product) yield Request('http://www.ezyvision.co.nz/ajax/search', callback=self.parse_ajax_search) def parse_ajax_search(self, response): base_url = 'http://www.ezyvision.co.nz/product/' data = json.loads(response.body) if data.get('products', None): for product in data['products']: yield Request(urljoin(base_url, product['url']), meta={'brand': ''}, callback=self.parse_product) products_loaded = int(response.meta.get('products_loaded', 6)) + 6 formdata = { 'action': 'load products', 'productsLoaded': str(products_loaded) } yield FormRequest('http://www.ezyvision.co.nz/ajax/search', dont_filter=True, formdata=formdata, callback=self.parse_ajax_search, meta={'products_loaded': products_loaded}) def parse_product(self, response): loader = ProductLoader(item=Product(), response=response) loader.add_xpath('name', '//section[@class="product"]//h1/text()') loader.add_value('url', response.url) loader.add_value('brand', response.meta.get('brand', '')) price = ''.join(response.xpath('//h2[@id="tprices"]/text()').extract()) loader.add_value('price', price) image_url = response.xpath( '//figure[@class="main"]//img/@src').extract()[0] if image_url.endswith('.jpg'): loader.add_value('image_url', urljoin(get_base_url(response), image_url)) cat = response.xpath( '//article[@class="breadcrumbs"]//text()').extract() cat = [r for r in cat if r.strip().replace(u'\u203a', '')] cat = cat[2:-1] for c in cat: loader.add_value('category', c) loader.add_xpath('identifier', '//input[@name="product_id"]/@value') loader.add_xpath('sku', '//input[@name="product_id"]/@value') loader.add_value('shipping_cost', '0') item = loader.load_item() metadata = SpecSaversMeta() promotional_data = response.xpath( '//font[@color="red" and contains(text(), "use this code")]//text()' ).extract() metadata['promotion'] = ' '.join( promotional_data).strip() if promotional_data else '' item['metadata'] = metadata yield item
class NingboSpider(CrawlSpider): name = 'ningbo' allowed_domains = ['ningbo.gov.cn'] start_urls = ['http://gtog.ningbo.gov.cn/col/col381/index.html'] rules = (Rule(LinkExtractor(allow=r'.*gtog.ningbo.gov.cn/art.*'), callback='parse_page', follow=False), ) cont_dict = {} def parse_item(self, response): print("5. parse_item(): " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + " -> " + response.url) title = response.xpath("//*[@id='ivs_title']/text()").get() cont = response.xpath("//*[@id='ivs_content']").get() index_id = str('_NULL') pub_org = response.xpath( "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[1]").get() pub_time = response.xpath( "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[2]").get() doc_id = str('_NULL') region = str('宁波') update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00") if not title: return print("\t>>> " + title) for key in keys: if key in title: self.dict_add_one(re.sub('[\s+]', ' ', title), response.url, re.sub('[\s+]', ' ', cont), re.sub('[\s+]', ' ', pub_time), re.sub('[\s+]', ' ', pub_org), index_id, doc_id, region, update_time, key) item = YqcNingboSpiderItem(cont_dict=self.cont_dict) yield item def dict_add_one(self, title, url, cont, pub_time, pub_org, index_id, doc_id, region, update_time, doc_key): time.sleep(0.3) if title in self.cont_dict: self.cont_dict[title]['key_cnt'] += 1 self.cont_dict[title][ 'doc_key'] = self.cont_dict[title]['doc_key'] + ',' + doc_key else: cnt_dict = { 'key_cnt': 1, 'title': title, 'url': url, 'cont': cont, 'pub_time': pub_time, 'pub_org': pub_org, 'index_id': index_id, 'doc_id': doc_id, 'region': region, 'update_time': update_time, 'doc_key': doc_key } self.cont_dict[title] = cnt_dict def parse_page(self, response): url = response.url print("4. parse_page(): " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + " -> " + url) url_prefix = 'http://gtog.ningbo.gov.cn' if str('REPORT_NDOC_006051') in url or str( 'REPORT_NDOC_006010') in url: print("\t>>> debug: " + url) if str('currentPage') in url: tr_list = response.xpath( "//*[@id='main']/div[1]/div/div[2]/table/tbody//tr") for tr in tr_list: # print(tr) url = tr.xpath("./td[1]/a/@href").get() full_url = url_prefix + url yield scrapy.Request(full_url, callback=self.parse_item) else: if str('REPORT_NDOC_006051') in url or str( 'REPORT_NDOC_006010') in url: print('\t>>> no currentPage') title = response.xpath("//*[@id='ivs_title']/text()").get() cont = response.xpath("//*[@id='ivs_content']").get() index_id = str('_NULL') pub_org = response.xpath( "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[1]").get( ) pub_time = response.xpath( "//*[@id='c']/tbody/tr[4]/td/table/tbody/tr/td/text()[2]").get( ) doc_id = str('_NULL') region = str('宁波') update_time = datetime.datetime.now().strftime("%Y-%m-%d 00:00:00") if not title: return print("\t>>> " + title) for key in keys: if key in title: # print("\t>>> included") self.dict_add_one(re.sub('[\s+]', ' ', title), response.url, re.sub('[\s+]', ' ', cont), re.sub('[\s+]', ' ', pub_time), pub_org, index_id, doc_id, region, update_time, key) item = YqcNingboSpiderItem(cont_dict=self.cont_dict) print("6. parse_page(): " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + " -> " + url) # print("\n") # print(item) yield item
class OddsSpider(CrawlSpider):
    name = 'odds'
    allowed_domains = ['www.oddsportal.com']
    start_urls = [
        'https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/'
    ]

    # def start_requests(self):
    #     yield SplashRequest(self.link, args={'wait': 4}, meta={'real_url': self.link})

    rules = (
        # Rule(LinkExtractor(allow=('/soccer/'),
        #                    deny=('/standings/')), process_request='use_splash'),
        Rule(LinkExtractor(
            allow=(r'/soccer/[a-z-]+/[a-z0-9-]+/[a-zA-Z0-9-]+/'),
            deny=(
                "/soccer/[a-z-]+/[a-z0-9-]+/results",
                "/soccer/[a-z-]+/[a-z0-9-]+/standing",
            )),
            callback='parse_items',
            process_request='use_splash',
            follow=True),
    )

    def _requests_to_follow(self, response):
        # Re-implemented so the crawl rules are also applied to Splash responses,
        # which are not plain HtmlResponse instances.
        if not isinstance(
                response,
                (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [
                lnk for lnk in rule.link_extractor.extract_links(response)
                if lnk not in seen
            ]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    # def splash_request(self, request):
    #     return SplashRequest(url=request.url, args={'wait': 4}, meta={'real_url': request.url})

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })
        return request

    def parse_items(self, response):
        items = OddsportalItem()
        items['country'] = response.css("a:nth-child(4)::text").get()
        items['liga'] = response.css("a:nth-child(5)::text").get()
        items['teams'] = response.css("h1::text").get()
        items['data'] = response.css(".t1559221200-4-1-1-1::text").get()
        yield items
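# For the Splash-based spider above to actually render pages through Splash, scrapy-splash
# normally has to be enabled in the project settings as well. A minimal sketch using the
# middleware orders documented by scrapy-splash; the SPLASH_URL value is an assumption for
# a locally running Splash instance:
SPLASH_URL = 'http://127.0.0.1:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'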
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//title",
    'price': "//tr[2]/td/font/strong | //font[@color='#FF0000']/strong",
    'category': "",
    'description': "//tr[2]/td/p/span",
    'images': "//table//tr[1]/td/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'tamduc.org'
allowed_domains = ['tamduc.org']
start_urls = ['http://tamduc.org']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/Product/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/listProduct/']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
class QicheSpider(CrawlSpider):
    name = 'qiche'
    allowed_domains = ['autohome.com.cn']
    start_urls = ['https://www.autohome.com.cn/news/']

    # page_links = LinkExtractor(allow=r'news/\d+/')
    contentlinks = LinkExtractor(allow=r'news/\d+?/\d+?.html#pvareaid=\d+')
    rules = (
        # Rule(page_links),
        Rule(contentlinks, callback="parse_item", follow=True),
    )

    # def parse(self, response):

    def parse_item(self, response):
        item = QichezhijiaItem()
        item['name'] = self.get_name(response)
        item['time'] = self.get_time(response)
        item['source'] = self.get_source(response)
        item['type'] = self.get_type(response)
        item['editor'] = self.get_editor(response)
        item['content'] = self.get_content(response)
        yield item

    def get_name(self, response):
        name = response.xpath('//div[@class="row"]/div/div/h1/text()').extract()
        return name[0] if name else "NULL"

    def get_time(self, response):
        time = response.xpath('//div[@class="row"]/div/div/div[1]/span[1]/text()').extract()
        return time[0] if time else "NULL"

    def get_source(self, response):
        source = response.xpath('//div[@class="row"]/div/div/div[1]/span[2]/text()').extract()
        return source[0] if source else "Null"

    def get_type(self, response):
        type = response.xpath('//div[@class="row"]/div/div/div[1]/span[3]/text()').extract()
        return type[0] if type else "NULL"

    def get_editor(self, response):
        editor = response.xpath('//div[@class="row"]/div/div/div[1]/div/a/text()').extract()
        return editor[0] if editor else "NULL"

    def get_content(self, response):
        content = response.xpath('//div/div[@id="articleContent"]/p/text()').extract()
        return ' '.join(content) if content else "NULL"
class GenericCrawlSpider(CrawlSpider):
    crawl_specification = settings

    # load parser from specification
    try:
        parser_class = shared.get_class(crawl_specification.parser)
        parser = parser_class(data=crawl_specification.parser_data)
    except (AttributeError, TypeError) as exc:
        MLOG.exception(exc)

    domain = urlparse(start_url).netloc

    name = crawler_name
    allowed_domains = [domain]
    start_urls = [start_url]

    denied_extensions = [
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
        'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'mp3', 'wma', 'ogg',
        'wav', 'ra', 'aac', 'mid', 'au', 'aiff', '3gp', 'asf', 'asx', 'avi',
        'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv', 'm4a', 'm4v', 'flv',
        'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods',
        'odg', 'odp', 'css', 'exe', 'bin', 'rss', 'zip', 'rar', 'gz', 'tar'
    ]
    if isinstance(crawl_specification.parser, ParagraphParser):
        denied_extensions.append("pdf")

    rules = [
        Rule(VerboseLxmlLinkExtractor(logname=crawler_name,
                                      spec=crawl_specification,
                                      deny=crawl_specification.blacklist,
                                      allow=crawl_specification.whitelist,
                                      deny_extensions=denied_extensions),
             callback=parser.parse,
             follow=True,
             errback=parser.errback)
    ]

    # ensure that start_urls are also parsed
    parse_start_url = parser.parse

    def __init__(self):
        super().__init__()
        # setup individual logger for every spider
        if self.crawl_specification.logs:
            self.s_log = shared.simple_logger(
                loger_name="crawlspider",
                file_path=os.path.join(self.crawl_specification.logs,
                                       self.name + ".log"))
        else:
            self.s_log = shared.simple_logger(loger_name="crawlspider")
        # enter spider to parser
        self.parser.spider = self

        for hand in self.s_log.handlers:
            self.logger.logger.addHandler(hand)
        self.s_log.info("[__init__] - Crawlspider logger setup finished.")

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1[@class='product-name']",
    'price': "//span[@class='special-price']/span|//div[@class='price-box']/span/span[@class='price']",
    'category': "//div[@class='col-md-12 breadcrumbs']/ul/li/a",
    'description': "//div[@id='yt_tab_products']",
    'images': "//div/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'hcplus.vn'
allowed_domains = ['hcplus.vn']
start_urls = ['http://hcplus.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z-]+/?[a-zA-Z0-9-]+.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['vn/[a-zA-Z-]+.html']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
class BBCSpider(Spider): name = 'bbc_spider' allowed_domains = ['www.bbc.com'] start_urls = ['http://www.bbc.com/news'] rules = (Rule(LinkExtractor(allow="http://www.bbc.com/news"), callback='parse'), ) def parse(self, response): items = [] for article in response.xpath( '//*[@class="nw-c-most-read__items gel-layout ' 'gel-layout--no-flex"]/ol/li'): item = Article() item["Title"] = article.xpath( 'span/div/a/span/text()').extract()[0] temp = article.xpath('span/div/a/@href').extract()[0] item["URL"] = ''.join("http://www.bbc.com" + str(temp)) #item["Summary"] = article.xpath('article/div/header/p[2]/text()').extract()[0] # item["Photo"] = article.xpath('article/figure/a/img/@src').extract()[0] item["Site"] = "BBC News" items.append(item) if item["Title"] != "": title = item["Title"] title = title.encode('utf-8').strip() else: title = "" #if item["Summary"] != "": # summary = item["Summary"] # summary = summary.encode('utf-8').strip() #else: # summary = "" if item["URL"] != "": url = item["URL"] url = url.encode('utf-8').strip() else: url = "" if item["Site"] != "": site = item["Site"] else: site = "" summary = "" text = get_article(item["URL"]).encode('utf-8').strip() text = text.replace('\n', ' ') with open("db_data.txt", "a") as myfile: myfile.write('\t') myfile.write(title) myfile.write('\t') myfile.write(summary) myfile.write('\t') myfile.write("") myfile.write('\t') myfile.write(url) myfile.write('\t') myfile.write(site) myfile.write('\t') myfile.write(text) myfile.write('\n') myfile.close()
class PexelSpider(CrawlSpider): name = "PexelSpider" allowed_domains = ['www.pexels.com'] start_urls = ["https://www.pexels.com/"] rules = [ Rule(LinkExtractor(allow_domains="www.pexels.com"), follow=True, callback='parse_link') ] def parse_link(self, response): image_links = response.xpath( '//*[@download="true"]/../a/@href').extract() path_to_store_image = '/home/fahad/Spyder_Projects/PexelCrawler/images/' request_url = response.request.url headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0' } if not os.path.exists(path_to_store_image): os.makedirs(path_to_store_image) for image_link in image_links: if "images.pexels.com" not in image_link: continue image_id = image_link.split('/')[4] image_name = path_to_store_image + image_id + '.jpeg' if os.path.exists(image_name): continue picture_request = requests.get(image_link, headers=headers) if picture_request.status_code == 200: with open(image_name, 'wb') as f: f.write(picture_request.content) else: print( "response code %d for image id %s" % picture_request.status_code, image_id) if '/photo/' not in request_url and '/photos/' not in request_url: for page_no in range(0, 41): Ajax_Request_URL = 'https://www.pexels.com/?dark=true&format=js&page=%d' % page_no yield scrapy.Request(url=Ajax_Request_URL, headers=headers, callback=self.Ajax_Parse) time.sleep(1) def Ajax_Parse(self, response): path_to_store_image = '/home/fahad/Spyder_Projects/PexelCrawler/images/' if not os.path.exists(path_to_store_image): os.makedirs(path_to_store_image) image_links = response.xpath( '//*[@download="true"]/../a/@href').extract() headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0' } for image_link in image_links: if "images.pexels.com" not in image_link: continue image_id = image_link.split('/')[4] image_name = path_to_store_image + image_id + '.jpeg' if os.path.exists(image_name): continue picture_request = requests.get(image_link, headers=headers) if picture_request.status_code == 200: with open(image_name, 'wb') as f: f.write(picture_request.content) else: print( "response code %d for image id %s" % picture_request.status_code, image_id)
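# Design note for PexelSpider above: instead of downloading files synchronously with
# requests inside the callbacks, Scrapy's built-in ImagesPipeline can handle the image
# downloads asynchronously. A minimal sketch of the settings side (the store path is
# illustrative and Pillow must be installed); the callbacks would then simply
# yield {'image_urls': image_links} instead of calling requests.get():
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/home/fahad/Spyder_Projects/PexelCrawler/images'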
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='col-xs-12 col-sm-12 col-md-12 col-lg-7 body-holder']/div[@class='summary entry-summary body']/div[@class='title']/h1[@class='product_title entry-title']",
    'price': "//div[@class='summary entry-summary body']/div[@class='prices clearfix']/ins/span[@class='amount']",
    'category': "//div[@class='breadcrumb-nav-holder minimal']/ul[@class='mc-breadcrumb']/li/span",
    'description': "//div[@class='container-fluid']/div[@class='tab-holder']/div[@class='tab-content']/div[@id='tab-description']",
    'images': "//div[@class='col-xs-12 col-sm-8 col-sm-offset-2 col-md-offset-3 col-lg-offset-0 col-md-6 col-lg-5 gallery-holder']/div[@class='images']/a/img/@src",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'bepluaviet.vn'
allowed_domains = ['bepluaviet.vn']
start_urls = ['http://bepluaviet.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/shop/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/sp/[/a-zA-Z0-9-]+$']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
class ScieloBr(BasePortiaSpider): name = "scielo_br" allowed_domains = [u'www.scielo.br', u'www.scielo.org.mx'] start_urls = [ #OA补漏 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170003&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170004&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170005&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0102-093520170006&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1679-395120160007&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100001&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100002&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100003&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820100004&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110001&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110002&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110003&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820110004&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120001&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120002&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120003&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820120004&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130001&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130002&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130003&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-509820130004&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100003&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420100004&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110003&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420110004&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120003&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420120004&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420130002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420130003&lng=en&nrm=iso', 
'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1980-576420130004&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920160001&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920160005&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920160017&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170002&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170010&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170015&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170017&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170036&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170039&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0031-104920170040&lng=pt&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1808-243220160001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1808-243220160002&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920100001&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920100002&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920110001&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920110002&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920120001&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920120002&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920140001&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920140002&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920150001&lng=en&nrm=iso', 'http://www.scielo.org.mx/scielo.php?script=sci_issuetoc&pid=0185-330920150002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100003&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100004&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020100005&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020110001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2175-786020110002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520160001&lng=pt&nrm=is', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520160002&lng=pt&nrm=is', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520160003&lng=pt&nrm=is', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520170001&lng=pt&nrm=is', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520170002&lng=pt&nrm=is', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=2177-705520170003&lng=pt&nrm=is', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820160001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820160002&lng=en&nrm=iso', 
'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820160003&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820170001&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820170002&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=0101-662820170003&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_issuetoc&pid=1984-467020170001&lng=en&nrm=iso' u'http://www.scielo.org.mx/scielo.php?script=sci_issues&pid=0185-3309&lng=en&nrm=iso', u'http://www.scielo.br/scielo.php?script=sci_issues&pid=0102-0935&lng=en&nrm=iso', 'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-204X2000000900007&lng=en&nrm=iso&tlng=pt' 'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-09351999000600014&lng=en&nrm=iso&tlng=pt' 'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-09352011000500030&lng=en&nrm=iso&tlng=pt', 'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-09352011000600001&lng=en&nrm=iso&tlng=pt', 'http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-204X2015000900854&lng=en&nrm=iso&tlng=pt' ] rules = [ #Rule( # LinkExtractor( # allow=( # u'script=sci_issuetoc&pid=.*&lng=en&nrm=iso$' # ), # )), Rule( LinkExtractor( allow=( u'script=sci_arttext&pid=.*&lng=en&nrm=iso&tlng=en', u'script=sci_arttext&pid=.*&lng=en&nrm=iso&tlng=pt' ),), callback='parse_item', ) ] def get_pdf(response): try: pdf_link_elem = Utils.select_element_by_content(response, "//*[@id='toolBox']/div/ul/li/a", "English (pdf)|Portuguese (pdf)") except Exception as e: return "" pdf_link = pdf_link_elem.xpath("@href").extract_first() pdf_link = urlparse.urljoin(response.url, pdf_link) return pdf_link def get_title(response): #title也有多种情况..醉了 title = response.xpath("//p[@class='trans-title']/text()").extract_first() if title is None: title = response.xpath("//p[@class='title']/text()").extract_first() if title is None: try: title_elem = response.xpath("//div[contains(@class, 'index')]//p[@align='CENTER']")[0] title = " ".join(title_elem.xpath(".//text()").extract()) except Exception as e: top_a_elem = response.xpath("//div[contains(@class, 'index')]//a[@name='top']") title = top_a_elem.xpath("./..//b/text()").extract_first() if title is None: title_elem = response.xpath("//div[contains(@class, 'index')]/p")[2] title = " ".join(title_elem.xpath(".//text()").extract()) print "title is %s" % title return Utils.format_text(title) def get_abstract(response): try: abstract_elem = Utils.select_element_by_content(response, "//div[contains(@class, 'index')]//p", "ABSTRACT|Abstract") abstract_text = Utils.get_all_inner_texts(abstract_elem, "./following-sibling::p[1]") return abstract_text except Exception as e: return "" def get_keyword(response): try: keyword_elem = Utils.select_element_by_content(response, "//div[contains(@class, 'index')]//p", "Keywords|Index terms|Key words") keyword_text = "".join(keyword_elem.xpath(".//text()").extract()).replace("Keywords:", "").split(",") except Exception as e: return "" return keyword_text def get_author(response): #有两种格式的author try: author = response.xpath("//div[@class='autores']/p[@class='author']/span[@class='author-name']/text()").extract() if len(author) == 0: sup_elem = response.xpath("//div[contains(@class, 'index')]/p//sup")[0] author_elem = sup_elem.xpath('./..') tag_name = author_elem.xpath('name()').extract_first() while tag_name != "p": author_elem = author_elem.xpath('./..') tag_name = author_elem.xpath('name()').extract_first() 
author_raw_text = author_elem.extract() author = author_raw_text except Exception as e: return "" return author def get_author_sup(response): #有两种格式的author_sup try: author_sup = response.xpath("//div[@class='autores']/p[@class='author']/sup/a/text()").extract() if len(author_sup) == 0: i = 0 num = 1 #len(response.xpath("//div[contains(@class, 'index')]/p//sup")) sup_elem = response.xpath("//div[contains(@class, 'index')]/p//sup")[0] author_elem = sup_elem.xpath("./..") author_sup = author_elem.xpath("./sup/text()").extract() except Exception as e: return "" return author_sup def get_author_affiliation(response): #有两种格式的author_affiliation try: author_aff = [] elems = response.xpath("//p[@class='aff']") for elem in elems: txt = " ".join(elem.xpath(".//text()").extract()).strip().replace("\n", "") txt = ' '.join(txt.split()) #remove multi space author_aff.append(txt) if len(author_aff) == 0: sup_elem = response.xpath("//div[contains(@class, 'index')]/p//sup")[-1] author_raw_text = sup_elem.xpath("./..").extract() author_aff = author_raw_text print "author affiliation is :%s" % author_aff except Exception as e: return "" return author_aff items = [[Item(PortiaItem, None, '', [Field(u'pdf_link', get_pdf, []), Field(u'journal', '.content h2 a', []), Field(u'print_issn', 'h2:nth-child(6) *::text', [Regex(u'Print version ISSN (\\d+-\\d+)')]), Field(u'online_issn', 'h2:nth-child(6) *::text', [Regex(u'On-line version ISSN (\\d+-\\d+)')]), Field(u'issue', 'h3 *::text', [Regex(u'.*(no\\.|supl\\.|n\\.)(\\d+).*$')]), Field(u'volumn', 'h3 *::text', [Regex(u'(vol\\.\\d+|ahead of print)')]), Field(u'date', 'h3 *::text', [Regex(u'.*(\\d{4})$')]), Field(u'doi', 'h4 *::text', [Regex(u'dx.doi.org/(.*)')]), Field(u'title', get_title, []), Field(u'author_raw', get_author, []), #Field(u'author_sup', # get_author_sup, # []), #Field(u'author_affiliation_raw', #如果命名为author_affiliation,那么因为此field已经在item里面注册了process,爬取到的数据标签会被处理掉 # get_author_affiliation, # []), Field(u'abstracts', get_abstract, []), Field(u'keywords', get_keyword, []), Field(u'copyright', '.copyright *::text', []), Field(u'license_text', '.license', []), Field(u'license_url', '.license a:first-child::attr(href)', [])])]]
class OldAutoSpider(CrawlSpider): name = "old_autos" allowed_domains = ['turbo.az'] start_urls = [ 'https://turbo.az/autos/%s' % page for page in range(3895724, 603621, -1) ] # def start_requests(self): # """ # :param self: # """ # try: # token = cfscrape.get_tokens(OldAutoSpider.start_urls[0]) # for url in OldAutoSpider.start_urls: # yield scrapy.Request( # url=url, # cookies=token, # ) custom_settings = { 'ITEM_PIPELINES': { 'auto.pipelines.SaveOldAutosPipeline': 200, 'auto.pipelines.OldAutoPipeline': 300, }, 'DOWNLOAD_DELAY': 2, 'ROBOTSTXT_OBEY': True, 'COOKIES_ENABLED': False, # 'ROTATING_PROXY_LIST': [ # 'http://*****:*****@209.127.191.180:80', # 'http://*****:*****@193.8.56.119:80', # 'http://*****:*****@185.164.56.20:80', # 'http://*****:*****@45.130.255.243:80', # 'http://*****:*****@45.95.96.132:80', # 'http://*****:*****@45.95.96.237:80', # ], } rules = (Rule( LinkExtractor(restrict_xpaths=['//div[@class="product-body"]']), callback='parse_auto', follow=False), ) def parse_auto(self, response): exists = response.xpath( '//div[@class="product-properties-container"]').extract_first() salon = response.xpath( '//div[@class="products-i vipped salon"]').extract() manat = response.xpath( '//div[@class="product-price"]/span/text()').extract_first() selector = Selector(response) l = OldAutosItemLoader(OldAutoItem(), selector) if exists: if salon: pass else: l.add_xpath( 'city', '//li[@class="product-properties-i"]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'brand', '//ul[@class="product-properties"]/li[2]/div[@class="product-properties-value"]/a/text()' ) l.add_xpath( 'model', '//ul[@class="product-properties"]/li[3]/div[@class="product-properties-value"]/a/text()' ) l.add_xpath( 'year', '//ul[@class="product-properties"]/li[4]/div[@class="product-properties-value"]/a/text()' ) l.add_xpath( 'bodytype', '//ul[@class="product-properties"]/li[5]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'color', '//ul[@class="product-properties"]/li[6]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'engine', '//ul[@class="product-properties"]/li[7]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'power', '//ul[@class="product-properties"]/li[8]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'fuel', '//ul[@class="product-properties"]/li[9]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'mileage', '//ul[@class="product-properties"]/li[10]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'transmission', '//ul[@class="product-properties"]/li[11]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'drivetype', '//ul[@class="product-properties"]/li[12]/div[@class="product-properties-value"]/text()' ) l.add_xpath( 'new', '//ul[@class="product-properties"]/li[13]/div[@class="product-properties-value"]/text()' ) if (manat == "AZN"): l.add_xpath('pricem', '//div[@class="product-price"]/text()') l.add_value('priced', '1') else: l.add_xpath('priced', '//div[@class="product-price"]/text()') l.add_value('pricem', '1') l.add_xpath('order', '//div[@class="product-statistics"]/p[3]/text()') # l.add_value('adddate', datetime.datetime.now()) l.add_xpath('adddate', '//div[@class="product-statistics"]/p[2]/text()') return l.load_item()
class CKSpider(CrawlSpider):
    name = 'dvwa_login'
    form_username = '******'
    form_password = '******'
    username = '******'
    password = '******'
    allowed_domains = ['192.168.57.30']
    login_page = 'http://192.168.57.30/login.php'
    start_urls = [
        'http://192.168.57.30/index.php',
    ]
    rules = [
        Rule(LinkExtractor(allow=(), deny=('/logout*')),
             callback="parse_item",
             follow=True)
    ]

    def start_requests(self):
        logging.debug("start send request")
        yield Request(url=self.login_page, callback=self.login, dont_filter=True)

    def login(self, response):
        logging.debug("submit login")
        yield FormRequest.from_response(response,
                                        formdata={
                                            self.form_username: self.username,
                                            self.form_password: self.password
                                        },
                                        callback=self.check_login_response)

    def check_login_response(self, response):
        # use the decoded body; response.body is bytes and would never match a
        # str literal on Python 3
        if "logout" in response.text:
            logging.debug("finish login")
            return self.parse(response)

    def parse(self, response):
        logging.debug("run parse item")
        yield self.parse_item(response)
        logging.debug("run parse")
        parsed_response = self._parse_response(response,
                                               self.parse_start_url,
                                               cb_kwargs={},
                                               follow=True)
        for requests_or_item in parsed_response:
            logging.debug("\nrequest or item after log in: \n")
            logging.debug(requests_or_item)
            yield requests_or_item

    def parse_item(self, response):
        url_obj = urlparse.urlsplit(response.url)
        url_ret = urlparse.urlunsplit(
            (url_obj.scheme, url_obj.netloc, url_obj.path, '', ''))
        item = URLItem()
        item['url_base'] = url_ret
        item['url_parameters'] = url_obj.query
        return item
class XiamiSpider(BaseSpider): name = 'xiami' extractor = xiami_extractor download_delay = 0.1 proxy_stable = True start_urls = ['http://www.xiami.com/'] allowed_domains = [ 'www.xiami.com', 'i.xiami.com', ] rules = ( # Find some available indexes Rule(LinkExtractor(allow=( 'com/chart', 'com/genre', 'com/zone', ))), # Find collect pages Rule(LinkExtractor(allow=( 'com/collect$', 'com/collect/\d+', ))), # Find artist indexes Rule(LinkExtractor(allow=( 'artist/index', 'artist/tag', ))), # Find artist pages Rule(LinkExtractor(allow=( 'com/artist/\w+$', 'search/find/artist', )), process_request='store_artists'), ) set_artist = set() api_count = 'http://www.xiami.com/count/getplaycount?id=%s&type=song' def __init__(self): super(XiamiSpider, self).__init__() dispatcher.connect(self.spider_idle, signals.spider_idle) def store_artists(self, request): """Store artist urls into set rather than send requests""" if request.url not in self.set_artist: with open('xiami_artist_%s' % self.time_stamp, 'a') as output: output.write(request.url + '\n') self.set_artist.add(request.url) return None def spider_idle(self): """Send 10 requests of artist when request queue is empty in order to limit memory consumption""" count = 10 while self.set_artist and count: count -= 1 url = self.set_artist.pop() request = Request(url, dont_filter=True, callback=self.get_info) self.crawler.engine.crawl(request, self) def get_info(self, response): """Find callback function for different urls""" try: if re.search('artist/\d+', response.url) or \ re.search('i\.xiami\.com/[^/]+$', response.url): self.get_artist(response) elif re.search('album/\d+', response.url): self.get_albums(response) elif re.search('song/\d+', response.url): self.get_songs(response) elif 'count/getplaycount' in response.url: self.get_count(response) else: self.get_pages(response) except (AttributeError, TypeError): return request = self.gen_info(response) if not request: self.save(response.meta['source_id'], response.meta['raw_info'], response.meta['result']) else: yield request def get_artist(self, response): result = self.extractor.parse_response(response) raw_info = { 'html': response.body, 'albums': [], } source_id = self.get_source_id(response) if 'redirect_urls' in response.meta: response.meta.pop('redirect_times') response.meta.pop('redirect_ttl') response.meta.pop('redirect_urls') response.meta.update({ 'requests': [], 'raw_info': raw_info, 'result': result, 'source_id': source_id }) soup = BeautifulSoup(response.body) album_info = soup.find('div', id='artist_album') if album_info: a_info = album_info.find('a', class_='more') response.meta['requests'].append( urlparse.urljoin(response.url, a_info['href'])) @staticmethod def get_pages(response): soup = BeautifulSoup(response.body) div_info = soup.find('div', class_='albumBlock_list') p_info = div_info.find_all('p', class_='cover') for p in p_info: if p.find('span', class_='pubbing'): continue response.meta['requests'].append( urlparse.urljoin(response.url, p.a['href'])) a_info = soup.find('a', class_='p_redirect_l') if a_info: response.meta['requests'].append( urlparse.urljoin(response.url, a_info['href'])) def get_albums(self, response): raw_info = { 'html': response.body, 'songs': [], } response.meta['raw_info']['albums'].append(raw_info) response.meta['result']['albums'].append( self.extractor.parse_response_album(response)) soup = BeautifulSoup(response.body) td_info = soup.find_all('td', class_='song_name') for td in td_info: response.meta['requests'].append( urlparse.urljoin(response.url, 
td.a['href'])) def get_songs(self, response): response.meta['raw_info']['albums'][-1]['songs'].append(response.body) response.meta['result']['albums'][-1]['songs'].append( self.extractor.parse_response_song(response)) m = re.search('song/(\d+)', response.url) response.meta['requests'].append(self.api_count % m.group(1)) @staticmethod def get_count(response): data = json.loads(response.body) response.meta['result']['albums'][-1]['songs'][-1][ 'song_played'] = data['plays'] def gen_info(self, response): if not response.meta['requests']: return None url = response.meta['requests'].pop() request = Request(url, meta=response.meta, callback=self.get_info) if re.search('song/\d+', url): self.download_delay = 1 request.meta['download_slot'] = 'song' return request def save(self, source_id, raw_info, result): raw_json = json.dumps(raw_info, ensure_ascii=False, sort_keys=True) page_id = '%s_%s' % (self.get_source_name(), source_id) if self.storage: self.storage.save(page_id, raw_json) result_json = json.dumps(result, ensure_ascii=False, sort_keys=True) with open('%s_result_%s' % (self.name, self.time_stamp), 'a') as output: output.write(result_json + '\n') @staticmethod def get_source_id(response): m = re.search("id = '(\d+)'", response.body) return m.group(1) @staticmethod def process_request_headers(request): """Process request to get 200 response for xiami Xiami checks User-Agent in headers. Keep referer empty can keep away from login operation. """ request.headers.setdefault( 'User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/51.0.2704.103 Safari/537.36') if 'redirect_urls' not in request.meta: request.headers['Referer'] = None
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//table[@class='catalog-detail'][2]//tr[1]/td[1]/span",
    'price': "//div[@class='pro'][3]/span[@class='value']",
    'category': "//ul[@class='breadcrumb-navigation']/li/a",
    'description': "//table[@class='catalog-detail'][2]//tr[1]/td[1]",
    'images': "//div[@id='catalog-detail-main-image']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'thegioigiay.vn'
allowed_domains = ['thegioigiay.vn']
start_urls = ['http://thegioigiay.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/products+/\d+/\d+/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/products+/\d+/']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
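# The generated modules above and below only declare data: an XPATH field map,
# link-extraction rules and site metadata. The project's shared base spider is
# not shown here, so the following is only a rough sketch (class and method
# names are hypothetical) of how such a map could drive a generic parse_item.
from scrapy.spiders import CrawlSpider


class GeneratedSiteSpider(CrawlSpider):
    """Hypothetical consumer of a generated XPATH map like the one above."""

    XPATH = {}  # e.g. the dict defined above would be attached here

    def parse_item(self, response):
        item = {'url': response.url}
        for field, xpath in self.XPATH.items():
            if not xpath:
                continue  # empty selectors (canonical, brand, ...) are skipped
            last_step = xpath.rpartition('/')[2]
            if last_step.startswith('@'):
                # attribute selectors (e.g. images) already yield strings
                values = response.xpath(xpath).getall()
            else:
                # element selectors: collect their text nodes
                values = response.xpath(xpath + '//text()').getall()
            values = [v.strip() for v in values if v.strip()]
            item[field] = values if field == 'images' else ' '.join(values)
        yield item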
class WalgreensProductsSpider(CrawlSpider):
    """Walgreens products spider."""

    name = 'products'
    allowed_domains = ['walgreens.com', 'bazaarvoice.com']
    start_urls = ['https://www.walgreens.com/store/catalog/shopLanding']
    rules = (
        Rule(LinkExtractor(allow=('/store/c/', ),
                           deny=('/ID=[^-]+-product', )),
             callback='parse_listing',
             follow=True),
        Rule(LinkExtractor(allow=('/ID=[^-]+-product', )),
             callback='parse_product'),
    )

    # -------------------------------------------------------------------------
    def parse_listing(self, response):
        """
        Extract product list.

        @url https://www.walgreens.com/store/c/eyes/ID=360457-tier3
        @returns requests 1
        """
        blob = response.css('script').re_first(
            r'__APP_INITIAL_STATE__ = (\{.+\});')
        if not blob:
            return
        data = json.loads(blob)
        if not data['searchResult'].get('productList'):
            return
        for each in data['searchResult']['productList']:
            yield response.follow(each['productInfo']['productURL'],
                                  callback=self.parse_product)
        limit = response.meta.get('limit', 24)
        offset = int(url_query_parameter(response.url, 'No', 0)) + limit
        # This must be yielded, not returned: parse_listing is a generator and
        # a returned request would be silently dropped by Scrapy.
        yield response.follow(add_or_replace_parameter(response.url, 'No',
                                                       offset),
                              callback=self.parse_listing,
                              meta={
                                  'offset': offset,
                                  'limit': limit
                              })

    # -------------------------------------------------------------------------
    def parse_product(self, response):
        """
        Extract product details.

        @url https://www.walgreens.com/store/c/l.a.-colors-eyeliner-&-brow-pencil/ID=prod6248128-product
        @returns requests 1
        """
        loader = ProductItemLoader(ProductItem(), response)
        loader.add_value('id', response.url, re=r'/ID=([^-]+)')
        loader.add_css('name', '#productTitle')
        loader.add_css('regular_price', '#regular-price-info')
        loader.add_css('unit_price', '#unit-price')
        loader.add_xpath(
            'category',
            '//ul[has-class("nav__bread-crumbs")]/li[position() > 2]//a')
        loader.add_value('url', response.url)
        loader.add_css('description',
                       '#Details-0 + .wag-accordion-tab-content[id]')
        loader.add_css('warnings',
                       '#Warnings-1 + .wag-accordion-tab-content[id]')
        loader.add_css('ingredients',
                       '#Ingredients-2 + .wag-accordion-tab-content[id]')
        loader.add_css('shipping',
                       '#Shipping-3 + .wag-accordion-tab-content[id]')
        loader.add_css('main_image', '#productImg::attr(src)')
        loader.add_css('image_urls', '#thumbnailImages img::attr(src)')
        loader.add_css('rating', '#reviewsData > .pr10::text')
        loader.add_css('reviews_count', '#reviewsData > .ml10::text',
                       re=r'\d+')

        product = loader.load_item()
        if product.get('reviews_count') and product['reviews_count'] > 0:
            return self.request_reviews(product)
        return product

    # -------------------------------------------------------------------------
    def parse_reviews(self, response):
        """Extract product reviews."""
        product = response.meta.get('product') or {}
        product['reviews'] = product.get('reviews') or []

        data = json.loads(response.body)
        for each in data['Results']:
            review = self.extract_review(each)
            product['reviews'].append(review)

        if product.get('reviews_count') > len(product['reviews']):
            offset = response.meta.get('offset') + len(data['Results'])
            return self.request_reviews(product, offset=offset)
        return product

    # -------------------------------------------------------------------------
    def request_reviews(self, product, offset=0, limit=30):
        """Request reviews from the Bazaarvoice API."""
        return scrapy.FormRequest(
            method='GET',
            url='https://api.bazaarvoice.com/data/reviews.json',
            formdata={
                'Filter': 'ProductId:%s' % product['id'],
                'Sort': 'Helpfulness:desc',
                'Limit': str(limit),
                'Offset': str(offset),
                'Include': 'Comments',
                'Stats': 'Reviews',
                'passkey': 'tpcm2y0z48bicyt0z3et5n2xf',
                'apiversion': '5.4'
            },
            meta={
                'offset': offset,
                'limit': limit,
                'product': product
            },
            callback=self.parse_reviews)

    # -------------------------------------------------------------------------
    def extract_review(self, data):
        """Extract review details."""
        loader = ReviewItemLoader(ReviewItem())
        loader.add_value('id', data.get('Id'))
        loader.add_value('rating', data.get('Rating'))
        loader.add_value('title', data.get('Title'))
        loader.add_value('text', data.get('ReviewText'))
        loader.add_value('is_featured', data.get('IsFeatured'))
        loader.add_value('published_at', data.get('SubmissionTime'))
        loader.add_value('positive_feedback_count',
                         data.get('TotalPositiveFeedbackCount'))
        loader.add_value('negative_feedback_count',
                         data.get('TotalNegativeFeedbackCount'))
        loader.add_value('reviewer', self.extract_reviewer(data))
        return loader.load_item()

    # -------------------------------------------------------------------------
    def extract_reviewer(self, data):
        """Extract reviewer details."""
        loader = ReviewerItemLoader(ReviewerItem())
        loader.add_value('id', data.get('AuthorId'))
        loader.add_value('username', data.get('UserNickname'))
        loader.add_value('location', data.get('UserLocation'))
        loader.add_value(
            'properties', {
                name: value.get('Value')
                for name, value in data.get('ContextDataValues', {}).items()
            })
        return loader.load_item()
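# The listing pagination above leans on two small w3lib helpers; a quick
# illustration of what they do (URL taken from the contract docstring above):
from w3lib.url import add_or_replace_parameter, url_query_parameter

url = 'https://www.walgreens.com/store/c/eyes/ID=360457-tier3?No=24'
offset = int(url_query_parameter(url, 'No', 0)) + 24   # -> 48
next_url = add_or_replace_parameter(url, 'No', str(offset))
# next_url == 'https://www.walgreens.com/store/c/eyes/ID=360457-tier3?No=48'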
class MopSpider(CrawlSpider):
    name = 'mop'
    allowed_domains = ['mop.com']
    start_urls = ['http://dzh.mop.com/']

    post_extract = LxmlLinkExtractor(
        allow=('/\d+.html', '/nofresh/\d+', 'mop\.com/\d+'),
        allow_domains=('dzh.mop.com', ),
    )
    author_extract = LxmlLinkExtractor(
        allow=('/space/\d+/profile', ),
        allow_domains=('hi.mop.com', ),
    )
    author_page_extract = LxmlLinkExtractor(
        allow=('/space/\d+', ),
        allow_domains=('hi.mop.com', ),
    )
    fans_extract = LxmlLinkExtractor(
        allow=('/space/\d+/fans', ),
        allow_domains=('hi.mop.com', ),
    )
    friends_extract = LxmlLinkExtractor(
        allow=('/space/\d+/follow', ),
        allow_domains=('hi.mop.com', ),
    )
    follow_extract = LxmlLinkExtractor(
        # allow=(
        #     '/s/[0-9]+',
        # ),
        allow_domains=('dzh.mop.com', ),
        # deny=(
        #     '/print.html'
        # ),
        # deny_domains=(
        #     'q.blog.sina.com.cn'
        # )
    )

    rules = (
        Rule(author_extract, follow=True, callback='parse_author'),
        Rule(fans_extract, follow=True, callback='parse_fans'),
        Rule(friends_extract, follow=True, callback='parse_friends'),
        Rule(author_page_extract, follow=True),
        Rule(post_extract, follow=True, callback='parse_post'),
        # Rule(follow_extract, follow=True, callback='parse_follow'),
        Rule(follow_extract, follow=True),
    )

    a_p_count = 0
    a_count = 0
    p_count = 0
    f_count = 0

    # def parse_page(self, response):
    #     self.a_p_count += 1
    #     print('author page: ', self.a_p_count, ' ', response.url)

    def parse_author(self, response):
        # self.a_count += 1
        # print('author: ', self.a_count, ' ', response.url)
        author_item = get_author_item(response)
        author_id = author_item['author_id']
        data_param = (
            'data=%7B"header"%3A%7B%7D%2C"req"%3A%7B"User%2FSubCount"%3A%7B"uid"%3A"'
            + author_id + '"%7D%2C"User%2FSnsCount"%3A%7B"uid"%3A"' +
            author_id + '"%7D%7D%7D')
        data_url = 'http://hi.mop.com/ajax/get?' + data_param
        yield Request(
            url=data_url,
            callback=self.parse_author_data,
            method='POST',
            meta={'author_item': author_item},
            priority=10,
        )

    def parse_author_data(self, response):
        author_item = response.meta['author_item']
        data_json = response.text
        try:
            json_obj = json.loads(data_json)
            if json_obj:
                friends_num = json_obj['resp']['User/SnsCount']['retObj'][
                    'follow']
                author_item['friends_num'] = friends_num
                fans_num = json_obj['resp']['User/SnsCount']['retObj']['fans']
                author_item['fans_num'] = fans_num
                post_num = json_obj['resp']['User/SubCount']['retObj'][
                    'subject']
                author_item['post_num'] = post_num
                reply_num = json_obj['resp']['User/SubCount']['retObj'][
                    'reply']
                author_item['reply_num'] = reply_num
        finally:
            yield author_item

    def parse_post(self, response):
        # self.p_count += 1
        # print('post: ', self.p_count, ' ', response.url)
        post_item = get_post_item(response)
        post_id = post_item['post_id']
        for comment_item in get_comment_item(response, post_id):
            post_item['comment_ids'].append(comment_item['comment_id'])
            yield comment_item
        yield post_item

    # def parse_follow(self, response):
    #     self.f_count += 1
    #     print('follow: ', self.f_count, ' ', response.url)

    def parse_fans(self, response):
        sel = Selector(response)
        user_id = sel.xpath('//div[@class="hpUserInfo1"]/@uid').extract_first()
        fans_list = get_fans_item(response)
        for fans_id, fans_url in fans_list:
            fans_item = FansItem()
            fans_item['fans_id'] = fans_id
            fans_item['friends_id'] = user_id
            yield fans_item
            yield Request(url=fans_url + '/profile',
                          callback=self.parse_author)

    def parse_friends(self, response):
        sel = Selector(response)
        user_id = sel.xpath('//div[@class="hpUserInfo1"]/@uid').extract_first()
        friends_list = get_fans_item(response)
        for friends_id, friends_url in friends_list:
            fans_item = FansItem()
            fans_item['fans_id'] = user_id
            fans_item['friends_id'] = friends_id
            yield fans_item
            yield Request(url=friends_url + '/profile',
                          callback=self.parse_author)
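# The hand-built data_param above is just a percent-encoded JSON document.
# A sketch of producing the same request URL with the standard library
# (endpoint and field names are copied from the spider, not re-verified;
# urlencode also percent-encodes the double quotes, which a JSON-reading
# endpoint should accept all the same):
import json
from urllib.parse import urlencode


def build_author_stats_url(author_id):
    payload = {
        "header": {},
        "req": {
            "User/SubCount": {"uid": author_id},
            "User/SnsCount": {"uid": author_id},
        },
    }
    return 'http://hi.mop.com/ajax/get?' + urlencode(
        {'data': json.dumps(payload, separators=(',', ':'))})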
class FoshanSpider(CrawlSpider):
    name = "foshan"
    allowed_domains = ["foshannews.com", "foshannews.net"]
    start_urls = [
        'https://www.foshannews.com/',
        'https://www.foshannews.com/fstt/',
        'https://www.foshannews.com/cc/',
        'https://www.foshannews.com/nh/',
        'https://www.foshannews.com/sd/',
        'https://www.foshannews.com/gm/',
        'https://www.foshannews.com/ss/',
        'https://www.foshannews.com/jdyw/',
        'https://www.foshannews.com/cc/sstt/',
        'https://www.foshannews.com/sd/sdtt/',
        'https://www.foshannews.com/nh/nhtt/',
        'https://www.foshannews.com/gm/gmtt/',
        'https://www.foshannews.com/ss/sstt/'
    ]
    url_pattern = r'./*/t(\d{8})_(\d+)\.html'
    rules = (Rule(LinkExtractor(allow=(url_pattern)), 'parse_news'), )
    page_link = set()

    def start_requests(self):
        self.page_link = {
            'https://www.foshannews.com/',
            'https://www.foshannews.com/fstt/',
            'https://www.foshannews.com/cc/',
            'https://www.foshannews.com/nh/',
            'https://www.foshannews.com/sd/',
            'https://www.foshannews.com/gm/',
            'https://www.foshannews.com/ss/',
            'https://www.foshannews.com/jdyw/'
        }
        for local in [
                'cc/cctt', 'sd/sdtt', 'nh/nhtt', 'gm/gmtt', 'ss/sstt', 'fstt',
                'jdyw'
        ]:
            for i in range(1, 35):
                url = "https://www.foshannews.com/{}/index_{}.html".format(
                    local, str(i))
                self.page_link.add(url)
        for url in self.page_link:
            yield self.make_requests_from_url(url)

    url_map = dict()

    def get_url_id(self, url):
        return url.split('/')[-1].split('.')[0]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        item = NewsItem()
        url_id = self.get_url_id(str(response.url))
        if url_id in self.url_map:
            self.url_map[url_id] += 1
            return item
        else:
            self.url_map[url_id] = 1
        item['contents'] = {
            'link': str(response.url),
            'title': u'',
            'passage': u''
        }
        # '//h1/text()' (the original '////h1/text()' is not valid XPath)
        item['contents']['title'] = sel.xpath('//h1/text()').extract_first()
        divs = sel.xpath('//div[@class=\'cont\']/div')
        list_doc = []
        for l in divs.xpath('.//div/text() | .//div/b/text()').extract():
            t = l.strip()
            if len(t) > 0:
                list_doc.append(t)
        for l in divs.xpath('.//span/text()').extract():
            t = l.strip()
            if len(t) > 0:
                list_doc.append(t)
        for l in divs.xpath('.//p/text() | .//p/strong/text()').extract():
            t = l.strip()
            if len(t) > 0:
                list_doc.append(t)
        item['contents']['passage'] = list_doc
        return item
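# A possible simplification (hypothetical helper, not part of the original
# project): url_map above is only used as a "seen" marker, so a set-backed
# filter plus an early `return None` would keep duplicate articles out of the
# feed instead of emitting an empty NewsItem for them.
class SeenArticleFilter:
    """Tracks the tYYYYMMDD_NNN id segment of article URLs already emitted."""

    def __init__(self):
        self.seen = set()

    def is_new(self, url):
        url_id = url.split('/')[-1].split('.')[0]
        if url_id in self.seen:
            return False
        self.seen.add(url_id)
        return True
# Usage inside parse_news would then start with:
#     if not self.seen_filter.is_new(str(response.url)):
#         return None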
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='Gpod_box_giua']/div[@class='protp2']/h1",
    'price': "//div[@class='Gpod_box_giua']/div[@class='protp2']/ol/li/b/text()",
    'category': "//div[@class='menudd']/h1/a",
    'description': "//div[@class='Gpod_box_giua']/table//tr/td/div/div[@id='country1']",
    'images': "//div[@class='hbimg zoomp']/a[@id='Zoomer']/@href",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'tanphat.com.vn'
allowed_domains = ['tanphat.com.vn']
start_urls = ['http://www.tanphat.com.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+_id+\d+\.html']), 'parse_item'),
    Rule(
        LinkExtractor(allow=['/[a-zA-Z0-9-]+_dm+\d+\.html($|\?page=\d+$)'],
                      deny=['max=', 'min=', 'brand=', 'filter=']),
        'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
class AlbawabaSpider(CrawlSpider):
    name = "albawabacrawler"
    #allowed_domains = [url[0][8:] for url in csv.reader(open('/home/chrx/Desktop/Scrapy/HezbollahScraper/urls.csv', 'r'), delimiter=',')]
    allowed_domains = ["www.albawaba.com"]
    #start_urls = [url[0] for url in csv.reader(open('/home/chrx/Desktop/Scrapy/HezbollahScraper/urls.csv', 'r'), delimiter=',')]
    start_urls = ["https://www.albawaba.com"]
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 5,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 3
    }
    rules = [
        Rule(LinkExtractor(unique=True),
             follow=True,
             callback="check_buzzwords")
    ]

    terms = []
    locations = []
    organizations = []
    wordlist = []

    with open('C:/Users/Alex/Desktop/HezbollahScrapper/terms_english.csv',
              'r') as csvfile:
        terms_reader = csv.reader(csvfile, delimiter=',')
        for row in terms_reader:
            terms.append(row[0])

    with open(
            'C:/Users/Alex/Desktop/HezbollahScrapper/organizations_english.csv',
            'r') as csvfile:
        terms_reader = csv.reader(csvfile, delimiter=',')
        for row in terms_reader:
            organizations.append(row[0])

    for term in terms:
        for organization in organizations:
            wordlist.append((term, organization))

    def check_buzzwords(self, response):
        url = response.url
        # headers values are bytes, so the default must be bytes as well
        contenttype = response.headers.get("content-type",
                                           b"").decode('utf-8').lower()
        items = []

        paragraph_text = response.css("p::text")
        p_texts = [p.get() for p in paragraph_text]
        for p_text in p_texts:
            p_text_lower = p_text.lower()
            for word_row in self.wordlist:
                if (word_row[0].lower() in p_text_lower
                        and word_row[1].lower() in p_text_lower):
                    item = TutorialItem()
                    item["word"] = word_row[0]
                    item["url"] = url
                    item["sentence"] = p_text
                    items.append(item)
        return items

    # gets the requests to follow recursively
    def _requests_to_follow(self, response):
        if getattr(response, "encoding", None) is not None:
            return CrawlSpider._requests_to_follow(self, response)
        return []
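# The class-level nested loops above compute a cartesian product of the two
# CSV columns; the same wordlist can be built with itertools (sketch, assuming
# the same single-column CSV layout as the files referenced above):
import csv
import itertools


def load_wordlist(terms_csv, organizations_csv):
    def first_column(path):
        with open(path, newline='') as f:
            return [row[0] for row in csv.reader(f) if row]

    return list(itertools.product(first_column(terms_csv),
                                  first_column(organizations_csv)))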
class MonsterSpider(CrawlSpider):
    name = 'monster'
    allowed_domains = ['monster.fr']
    start_urls = gen_start_urls("http://www.monster.fr/emploi/recherche/?q=%s",
                                KEYWORDS, "-")
    login_page = 'https://login.monster.fr/Login/SignIn'
    rules = [
        Rule(LinkExtractor(allow=['.*jobPosition=.*?']), 'parse_job'),
        Rule(LinkExtractor(allow=('.*page=.*?', )), follow=True)
    ]

    def parse_job(self, response):
        job = ScraperItem()
        sel = Selector(response)
        job['url'] = response.url
        job_offer = sel.xpath('//title/text()').extract()
        job_offer = job_offer[0].strip()
        job_offer = job_offer.split('-')
        job['name'] = job_offer[0]
        job["email"] = None
        job["phone"] = None
        return job

    @staticmethod
    def execute_js():
        from scraper.models import Jobs, db_connect
        from selenium.webdriver.common.action_chains import ActionChains
        from sqlalchemy.orm import sessionmaker
        from selenium.common.exceptions import NoSuchElementException, UnexpectedAlertPresentException
        from selenium import webdriver
        import re
        import time

        # Get DB engine
        engine = db_connect()
        Session = sessionmaker()
        Session.configure(bind=engine)
        session = Session()

        # Collect the job urls still to be processed
        urls = []
        q = session.query(Jobs).filter(
            (Jobs.url.like('http://offre-emploi.monster.fr%'))
            & (Jobs.processed == False)).all()
        for url in q:
            urls.append(url.url)

        # Init browser with caching disabled
        profile = webdriver.FirefoxProfile()
        profile.set_preference("browser.cache.disk.enable", False)
        profile.set_preference("browser.cache.memory.enable", False)
        profile.set_preference("browser.cache.offline.enable", False)
        profile.set_preference("network.http.use-cache", False)
        browser = webdriver.Firefox(profile)
        action = ActionChains(browser)

        # Login to user space
        browser.get("https://login.monster.fr/Login/SignIn")
        browser.find_element_by_name("EmailAddress").send_keys(EMAIL)
        browser.find_element_by_name("Password").send_keys(PASSWORD)
        elem = browser.find_element_by_xpath(
            "//*[@id=\"signInContent\"]/form/div[3]/input[1]")
        action.move_to_element(elem).click().perform()
        time.sleep(5)

        # For each url, click on 'postuler'
        link = "http://offre-emploi.monster.fr/Apply/Apply.aspx?JobID="
        for url in urls:
            apply_link = re.findall(r"\b\d{6}\w+", url)
            try:
                apply_link = link + apply_link[0]
                print("* Processing %s" % url)
                browser.get(apply_link)
                # page_source is already text, so compare it directly
                if 'Vous postulez' in browser.page_source:
                    browser.find_element_by_css_selector(
                        "#CoverLetter1_DropDownListLetters > option:nth-child(2)"
                    ).click()
                    browser.find_element_by_css_selector(
                        "#rbAuthorizedNo0").click()
                    # Click on "POSTULER"
                    browser.find_element_by_id('btnSubmit').click()
                    time.sleep(5)
                else:
                    pass
            except NoSuchElementException:
                raise
            except UnexpectedAlertPresentException:
                alert = browser.switch_to_alert()
                #alert.dismiss()
                continue
            finally:
                # Update database
                session.query(Jobs).filter(Jobs.url == url).update(
                    {'processed': True})
                session.commit()
        session.close()
        browser.close()
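# Note that the find_element_by_* helpers used in execute_js were deprecated
# and later removed in Selenium 4. A sketch of the same login steps with the
# current locator API (illustrative only; selectors copied from above):
from selenium import webdriver
from selenium.webdriver.common.by import By


def login(email, password):
    browser = webdriver.Firefox()
    browser.get("https://login.monster.fr/Login/SignIn")
    browser.find_element(By.NAME, "EmailAddress").send_keys(email)
    browser.find_element(By.NAME, "Password").send_keys(password)
    browser.find_element(
        By.XPATH, '//*[@id="signInContent"]/form/div[3]/input[1]').click()
    return browser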
class PCComponentes(CrawlSpider):
    name = 'sonae-pccomponentes'
    allowed_domains = ['pccomponentes.pt']
    start_urls = ['https://www.pccomponentes.pt/']

    categories = LinkExtractor(
        restrict_xpaths=('//*[contains(@class, "menu-principal")]',
                         '//*[contains(@class, "enlaces-clave")]'))

    rules = (Rule(categories, callback='parse_category'), )

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url)
        if hasattr(self, 'prev_crawl_id'):
            filename = os.path.join(DATA_DIR,
                                    '%s_products.csv' % self.prev_crawl_id)
            with open(filename) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    product = Product()
                    for key in row:
                        if row[key]:
                            product[key] = row[key].decode('utf8')
                    yield Request(row['url'],
                                  self.parse_product,
                                  meta={'item': Product(product)})

    def parse_category(self, response):
        try:
            data = SpiderSchema(response).get_products()
        except:
            return
        products = False
        for product in data:
            if not product.get('sku'):
                continue
            products = True
            loader = ProductLoader(Product(), response=response)
            loader.add_value('identifier', product['sku'])
            loader.add_value('url', product['url'][0])
            loader.add_value('name', product['name'])
            loader.add_value('sku', product['sku'])
            category = (response.css('a.GTM-breadcumb::text').extract()[1:]
                        or response.meta.get('category'))
            loader.add_value('category', category)
            loader.add_value('image_url', product['image'])
            loader.add_value('brand', product['brand'])
            if product['offers']['properties']['availability'] != 'in stock':
                loader.add_value('stock', 0)
            price = product['offers']['properties']['price']
            yield Request(loader.get_output_value('url'),
                          self.parse_product,
                          meta={'item': Product(loader.load_item())})

        if not products:
            return

        page = url_query_parameter(response.url, 'page')
        if page:
            url = add_or_replace_parameter(response.url, 'page',
                                           int(page) + 1)
        else:
            id_families = response.xpath(
                '//input[@data-key="idFamilies"]/@value').extract_first()
            if id_families:
                url = add_or_replace_parameter(
                    'https://www.pccomponentes.pt/listado/ajax?page=0&order=price-desc',
                    'idFamilies[]', id_families)
            elif response.url.endswith('/novedades/'):
                return
            elif response.url.endswith('/'):
                url = response.url + 'ajax?page=0&order=price-desc'
            else:
                return
        yield Request(url, self.parse_category, meta={'category': category})

    def parse_product(self, response):
        item = response.meta['item']
        data = SpiderSchema(response).get_product()
        category = response.css('a.GTM-breadcumb::text').extract()[1:]
        loader = ProductLoaderEU(Product(), response=response)
        loader.add_value(None, item)
        loader.replace_value('price', data['offers']['properties']['price'])
        loader.replace_value('category', category)
        if data['offers']['properties']['availability'] != 'inStock':
            loader.replace_value('stock', 0)
        yield loader.load_item()
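# The optional prev_crawl_id attribute checked in start_requests is presumably
# supplied as a spider argument; Scrapy turns `-a name=value` arguments into
# instance attributes. Equivalent when driving the spider from code (the crawl
# id value here is made up for illustration):
#
#     scrapy crawl sonae-pccomponentes -a prev_crawl_id=20240101
#
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(PCComponentes, prev_crawl_id='20240101')
    process.start()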
class BiqugeSpider(CrawlSpider):
    name = 'biquge'
    allowed_domains = ['www.biquyun.com', 'biquyun.com']
    start_urls = ['http://www.biquyun.com/']
    custom_settings = {
        "DOWNLOAD_DELAY": 0.5,
        "USE_PROXY": False,
        "IGNORE_NOVEL": set(),
        "RETAIN_NOVEL": set(),
    }
    rules = (
        Rule(LinkExtractor(allow=r'\d+_\d+/$'),
             callback='parse_novel',
             follow=True),
        # Rule(LinkExtractor(allow=r'.*?/\d+.html'), callback='parse_chapter',
        #      follow=True, process_request="custom_process_request"),
    )

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BiqugeSpider, cls).from_crawler(crawler, *args,
                                                       **kwargs)
        crawler.signals.connect(spider.spider_close,
                                signal=signals.spider_closed)
        return spider

    def __init__(self, *args, **kwargs):
        super(BiqugeSpider, self).__init__(*args, **kwargs)

    def spider_close(self):
        self.logger.info("{0} finished.".format(self.name))

    def parse_novel(self, response):
        author_widget = response.xpath(
            "//div[@id='info']/p[1]/text()").extract()
        author = "".join(
            [get_author_by_biquge(author) for author in author_widget])
        novel = response.css("#info h1::text").extract_first().strip()
        # Skip novels that are already stored
        if novel_is_exists(novel_name=novel, author_name=author):
            self.logger.info("过滤已存在的小说[{0}]".format(novel))
            return None
        item_loader = ItemLoader(item=NovelItem(), response=response)
        item_loader.add_value("url", response.url)
        item_loader.add_css("image_url", "#fmimg img::attr(src)")
        item_loader.add_css("site_name", ".header_logo a::text")
        item_loader.add_value("novel_name", novel)
        item_loader.add_value("spider_name", self.name)
        item_loader.add_value("author", author)
        category = response.xpath(
            "//div[@class='con_top']/a[2]/text()").extract_first()
        if not category:
            categories = response.css(".con_top::text").extract()
            category = get_category_by_biquge("".join(categories))
        category = category or "其他小说"
        item_loader.add_value("category", category)
        item_loader.add_css("intro", "#intro")
        item = item_loader.load_item()
        yield item

        urls = response.css("#list dl dd a::attr(href)").extract()
        for url in urls:
            url = parse.urljoin(response.url, url)
            yield Request(url=url,
                          dont_filter=True,
                          meta={"novel": novel, "author": author},
                          callback=self.parse_chapter)

    def parse_chapter(self, response):
        author = response.meta.get("author", "")
        novel = response.meta.get("novel", "")
        item_loader = ItemLoader(item=ChapterItem(), response=response)
        item_loader.add_value("url", response.url)
        index = get_chapter_index_by_biquge(response.url)
        item_loader.add_value("index", index)
        item_loader.add_css("name", ".bookname h1::text")
        item_loader.add_value("novel_name", novel)
        item_loader.add_value("author_name", author)
        item = item_loader.load_item()
        return item
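# In newer Scrapy (>= 1.7) the novel/author hand-off between parse_novel and
# parse_chapter can also go through cb_kwargs instead of response.meta. A
# minimal, self-contained sketch of that pattern (spider name and the literal
# "example" values are illustrative; the CSS selectors are the ones used above):
import scrapy


class ChapterHandoffSpider(scrapy.Spider):
    name = 'chapter-handoff-demo'
    start_urls = ['http://www.biquyun.com/']

    def parse(self, response):
        for href in response.css("#list dl dd a::attr(href)").getall():
            yield response.follow(
                href,
                callback=self.parse_chapter,
                cb_kwargs={"novel": "example", "author": "example"},
            )

    def parse_chapter(self, response, novel, author):
        yield {
            "url": response.url,
            "novel": novel,
            "author": author,
            "name": response.css(".bookname h1::text").get(),
        }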