class A58comSpider(CrawlSpider):
    name = '58com'
    allowed_domains = ['58.com']
    start_urls = ['http://nj.58.com/hezu/0/']
    rules = (
        Rule(LinkExtractor(allow=r'hezu/0/pn\d+/'), follow=True),
        Rule(LinkExtractor(allow=r'hezu/.*?shtml'), callback='parse_item', follow=True),
    )
    headers = header_list.get_header()
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        # 'ITEM_PIPELINES': {
        #     'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        # },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        # 'MONGO_DB': 'imdb',
        'JOBDIR': 'info/58crawl/001',
    }

    def parse_item(self, response):
        print(response.text)
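# Every spider in this project registers BaseTemp.middlewares.UserAgentMiddleware
# and pulls request headers from header_list.get_header(); neither is shown in
# this section. A minimal sketch, assuming header_list simply rotates User-Agent
# strings and the middleware copies one onto every outgoing request (the real
# BaseTemp code may differ):
import random


class HeaderList:
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    ]

    def get_header(self):
        # return a headers dict suitable for scrapy.Request(headers=...)
        return {'User-Agent': random.choice(self.USER_AGENTS)}


header_list = HeaderList()


class UserAgentMiddleware:
    """Downloader middleware that sets a random User-Agent on each request."""

    def process_request(self, request, spider):
        request.headers.setdefault(b'User-Agent', random.choice(HeaderList.USER_AGENTS))
        return None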
class LieSpider(CrawlSpider):
    name = 'lie'
    allowed_domains = ['chahaoba.com']
    start_urls = ['https://www.chahaoba.com/分类:骗子号码']
    rules = (
        Rule(LinkExtractor(allow=r'.*pagefrom.*'), follow=True),
        Rule(LinkExtractor(allow=r'.*?\d{11}'), callback='parse_item', follow=True),
    )
    headers = header_list.get_header()
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        # 'ITEM_PIPELINES': {
        #     'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        # },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        # 'MONGO_DB': 'imdb',
        'JOBDIR': 'info/chahao/001',
    }

    def parse_item(self, response):
        print(response.url)
class PhonenumSpider(scrapy.Spider):
    name = 'phonenum'
    allowed_domains = ['www.so.com']
    start_urls = ['https://www.so.com/s?q=13716919636']
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'phone',
        'JOBDIR': 'info/phone/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()

    def parse(self, response):
        # parse the phone number info from the search result snippet
        item = PhoneItem()
        msg = response.xpath('//p[@class="mh-detail"]/text()').extract()[0].split()
        item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
        item['phone_num'] = msg[0]
        item['area'] = msg[1]
        item['service_provider'] = msg[2]
        item['mongo_collection'] = 'info'
        yield item
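# The spiders above and below stash the target collection name on each item
# ('mongo_collection') and delegate storage to BaseTemp.pipelines.MongoPipeline,
# which is not shown in this section. A minimal sketch, assuming the pipeline
# reads the spider's MONGO_DB setting and pops 'mongo_collection' before
# inserting (the real implementation may differ):
import pymongo


class MongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # MONGO_DB comes from each spider's custom_settings
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'scrapy'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        doc = dict(item)
        # each item carries its own target collection name
        collection = doc.pop('mongo_collection', 'default')
        self.db[collection].insert_one(doc)
        return item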
class CreditmanageSpider(scrapy.Spider):
    name = 'creditmanage'
    allowed_domains = ['credit-manage.com']
    start_urls = ['http://credit-manage.com/']
    headers = header_list.get_header()
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
        'RETRY_TIMES': 1000,
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
            # 'BaseTemp.middlewares.ProxyMiddleware': 10,
        },
        'MONGO_DB': 'creditmanage',
        # 'JOBDIR': 'info/hc360/002',
    }

    def parse(self, response):
        # submit the search form
        yield scrapy.FormRequest(url='http://credit-manage.com/search.htm',
                                 formdata={'condition': '杨勇'},
                                 callback=self.parse_page,
                                 headers=self.headers)

    def parse_page(self, response):
        print(response.text)
class ImdbcrawlSpider(CrawlSpider):
    name = 'imdbcrawl'
    allowed_domains = ['www.imdb.cn']
    start_urls = ['http://www.imdb.cn/NowPlaying/']
    rules = (
        Rule(LinkExtractor(allow=r'Sections/.*'), follow=True),
        Rule(LinkExtractor(allow=r'title/tt\d+'), callback='parse_item', follow=True),
    )
    headers = header_list.get_header()
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'imdb',
        'JOBDIR': 'info/imdbcrawl.cn/001',
    }

    def parse_item(self, response):
        movie_item = BasetempItem()
        movie_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
        movie_item['title'] = response.xpath(
            '//div[@class="fk-3"]/div/h3/text()').extract()[0].strip()
        movie_item['time'] = self.get_time(response)
        movie_item['area'] = self.get_area(response)
        movie_item['mongo_collection'] = 'movie1'  # target MongoDB collection
        yield movie_item

    def get_time(self, response):
        # extract the release year from the detail page
        match = re.search(r'<i>上映时间:</i><a.*?>(\d+)</a>', response.text)
        return match.group(1).strip() if match else ''

    def get_area(self, response):
        # extract the country/region from the detail page
        match = re.search(r'<i>国家:</i><a.*?>(.*?)</a>', response.text)
        return match.group(1).strip() if match else ''
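# BasetempItem is defined elsewhere in the project (presumably BaseTemp/items.py)
# and is not shown in this section. Judging only by the fields assigned above,
# an assumed sketch of its definition:
import scrapy


class BasetempItem(scrapy.Item):
    crawl_time = scrapy.Field()        # crawl date, 'YYYY-MM-DD'
    title = scrapy.Field()             # movie title
    time = scrapy.Field()              # release year
    area = scrapy.Field()              # country / region
    mongo_collection = scrapy.Field()  # routing key consumed by the Mongo pipeline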
class Hc360Spider(scrapy.Spider):
    name = 'hc360'
    allowed_domains = ['hc360.com']
    start_urls = ['https://js.hc360.com/category/cn.html']
    headers = header_list.get_header()
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        'RETRY_HTTP_CODES': [500, 503, 504, 400, 403, 404, 408],
        'RETRY_TIMES': 1000,
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
            # 'BaseTemp.middlewares.ProxyMiddleware': 10,
        },
        'MONGO_DB': 'hc360',
        # 'JOBDIR': 'info/hc360/002',
    }

    def parse(self, response):
        area_url = response.xpath('//article/ul/li/a/@href').extract()
        # area_url = ['https://js.hc360.com/cn/sh/']
        for url in area_url:
            yield scrapy.Request(url=response.urljoin(url),
                                 headers=self.headers,
                                 callback=self.parse_page_num)

    def parse_page_num(self, response):
        # Grab the total page count from the first response so the page
        # requests can be issued directly; following the auto-discovered
        # "next page" links gets the crawler banned by the site.
        nums = re.search('共(.*)页', response.text).group(1)
        page_num = int(nums.strip())
        for num in range(2, page_num):
            page_url = response.url + str(num) + '/'
            yield scrapy.Request(url=page_url,
                                 headers=self.headers,
                                 callback=self.parse_area)

    def parse_area(self, response):
        # parse and build the local company URLs
        item = HcUrlItem()
        comp_url = response.xpath('//article/ul/li/div')
        for url in comp_url:
            item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
            item['comp_name'] = url.xpath('a/text()').extract()[0]
            url_temp = url.xpath('a/@href').extract()[0]
            item['comp_id'] = re.search('/company-(.*?)/', url_temp).group(1)
            item['comp_page'] = item['comp_id'] + 'b2b.hc360.com/shop/company.html'
            item['mongo_collection'] = 'url'
            yield item
class ChahaoSpider(scrapy.Spider):
    name = 'chahao'
    allowed_domains = ['chahaoba.com']
    start_urls = ['https://www.chahaoba.com/分类:骗子号码']
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.MongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'swindler',
        'JOBDIR': 'info/chahaoba/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()

    def parse(self, response):
        item = ChaohaoItem()
        temps = response.xpath('//div[@class="mw-category"]//li')
        print(temps)
        for temp in temps:
            nums = temp.xpath('a/text()').extract()[0]
            num = get_all_num(nums)
            item['num'] = num
            item['lables'] = '诈骗'  # label: "scam/fraud"
            item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
            item['mongo_collection'] = 'tel'
            yield item
        try:
            next_page = re.search('.*<a href="(.*?)" title=".*?">下一页</a>',
                                  response.text).group(1)
            next_page = response.urljoin(next_page.replace('amp;', '').replace('amp', ''))
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse)
        except Exception:
            print('Already on the last page')
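# get_all_num is imported from elsewhere in the project and is not shown here.
# Judging by its use above (reducing a listed category entry to a bare number),
# a plausible sketch is below; get_num, used by the second ChahaoSpider further
# down, presumably behaves the same way. This is an assumption only.
import re


def get_all_num(text):
    # keep only the digit characters of the listed entry
    return ''.join(re.findall(r'\d', text))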
class LieSpider(scrapy.Spider):
    name = 'pianzi'
    allowed_domains = ['pianzi.com.cn']
    start_urls = ['http://www.pianzi.com.cn/shouji_1/']
    headers = header_list.get_header()
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.JsonPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'swindler',
        'JOBDIR': 'info/pianzi/002',
        # 'LOG_FILE': 'imdb_log.txt',
    }

    def parse(self, response):
        item = ChaohaoItem()
        temps = response.xpath('//ul[@class="news_list"]/li/a/@title').extract()
        for temp in temps:
            item['num'] = temp
            item['lables'] = '被举报电话'  # label: "reported phone number"
            item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
            # item['mongo_collection'] = 'tel'
            yield item
        try:
            next_page = re.search('.*<a href="(.*?)">下一页</a>', response.text).group(1)
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse)
        except Exception:
            print('Already on the last page')

    def parse_item(self, response):
        pass
class ChahaoSpider(scrapy.Spider):
    name = 'chahao'
    allowed_domains = ['chahaoba.com']
    start_urls = [
        'https://www.chahaoba.com/index.php?title=%E5%88%86%E7%B1%BB:%E9%AA%97%E5%AD%90%E5%8F%B7%E7%A0%81&%3Bpagefrom=%2B02227393016%EF%BC%9B%2B37911183&pagefrom=%2B0222999767#mw-pages'
    ]
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        # 'ITEM_PIPELINES': {
        #     'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        # },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        # 'MONGO_DB': 'imdb',
        # 'JOBDIR': 'info/chahao/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()
    page = 1

    def parse(self, response):
        nums = response.xpath('//div[@class="mw-category"]//li')
        for num in nums:
            phone_nums = get_num(num.xpath('a/text()').extract()[0])
            if len(phone_nums) == 11 and phone_nums.startswith('1'):
                num_link = response.urljoin(phone_nums)
                phone_num = phone_nums
                yield scrapy.Request(url=num_link,
                                     headers=self.headers,
                                     meta={'phone_num': phone_num},
                                     callback=self.parse_detail)
        try:
            next_page = re.search('.*<a href="(.*?)" title=".*?">下一页</a>',
                                  response.text).group(1)
            next_page = response.urljoin(next_page.replace('amp;', '').replace('amp', ''))
            yield scrapy.Request(next_page,
                                 headers=self.headers,
                                 callback=self.parse)
        except Exception:
            print('Already on the last page')

    def parse_detail(self, response):
        # parse the number's detail page
        try:
            area = re.search('归属省份地区:<a href=".*?">(.*?)</a>', response.text).group(1)
        except Exception:
            area = '未知'  # unknown
        try:
            provider = re.search('电信运营商:<a href=".*?">(.*?)</a>', response.text).group(1)
        except Exception:
            provider = '未知'  # unknown
        title = '诈骗电话'  # label: "scam phone number"
        print(title)
        print(area)
        print(provider)
class ImdbSpider(scrapy.Spider):
    name = 'imdb'
    allowed_domains = ['www.imdb.cn']
    start_urls = ['http://www.imdb.cn/nowplaying/1']
    custom_settings = {
        # no login needed for this project
        'COOKIES_ENABLED': False,
        'ITEM_PIPELINES': {
            'BaseTemp.pipelines.ImdbMongoPipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BaseTemp.middlewares.UserAgentMiddleware': 200,
        },
        'MONGO_DB': 'imdb',
        'JOBDIR': 'info/imdb.com/001',
        # 'LOG_FILE': 'imdb_log.txt',
    }
    headers = header_list.get_header()

    def parse(self, response):
        # extract the detail links and request the next listing page
        for page in range(1, 2):
            movie_url = response.xpath('//div[@class="ss-3 clear"]/a/@href').extract()
            for url in movie_url:
                yield scrapy.Request(url=response.urljoin(url),
                                     headers=self.headers,
                                     callback=self.parse_movie)
            next_page = 'http://www.imdb.cn/nowplaying/{0}'.format(page)
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_movie(self, response):
        # parse the movie detail page
        movie_item = BasetempItem()
        movie_item['crawl_time'] = datetime.now().strftime('%Y-%m-%d')
        movie_item['title'] = response.xpath(
            '//div[@class="fk-3"]/div/h3/text()').extract()[0].strip()
        movie_item['time'] = self.get_time(response)
        movie_item['area'] = self.get_area(response)
        movie_item['mongo_collection'] = 'movie'  # target MongoDB collection
        yield movie_item

    def get_time(self, response):
        # extract the release year from the detail page
        match = re.search(r'<i>上映时间:</i><a.*?>(\d+)</a>', response.text)
        return match.group(1).strip() if match else ''

    def get_area(self, response):
        # extract the country/region from the detail page
        match = re.search(r'<i>国家:</i><a.*?>(.*?)</a>', response.text)
        return match.group(1).strip() if match else ''