class ItcastPipeline(object):
    def __init__(self):
        self.log = Log().get_logger()
        """Export JSON: use the built-in JsonItemExporter, or write the JSON yourself with the json module."""
        filename = Config.get_results_path() + 'teachers.json'
        # self.file = open(filename, 'w', encoding='utf-8')
        self.file = open(filename, 'wb')
        self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
        self.exporter.start_exporting()
        """Export CSV: the custom MyCsvItemExporter lets us fix the column order and delimiter."""
        csv_filename = Config.get_results_path() + 'teachers.csv'
        self.csv_file = open(csv_filename, 'wb')
        fields = ['name', 'grade', 'info']
        self.csv_exporter = MyCsvItemExporter(fields=fields, file=self.csv_file, encoding='utf-8')
        """Store into MongoDB."""
        client = pymongo.MongoClient('mongodb://*****:*****@localhost:27017/')
        db = client['scrapydb']  # select the database
        self.collection = db['teachers']  # select the collection

    def process_item(self, item, spider):
        # Called whenever the spider yields an item.
        if isinstance(item, ItcastItem):
            # line = json.dumps(dict(item), ensure_ascii=False) + ",\n"
            # self.file.writelines(line)
            self.exporter.export_item(item)
            self.csv_exporter.export_item(item)
            # result = self.collection.insert_one(dict(item))
            # self.log.info(result.inserted_id)
        return item

    def open_spider(self, spider):
        # Optional; called when the spider is opened.
        self.log.info('open_itcast_spider')

    def close_spider(self, spider):
        # Optional; called when the spider is closed. (Not observed being called here; reason unknown.)
        self.exporter.finish_exporting()
        self.file.close()
        self.csv_file.close()
        self.log.info('close_itcast_spider')
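The pipelines in this section rely on a MyCsvItemExporter that is not shown here. A minimal sketch of what such a class could look like, assuming it simply maps a fields argument onto Scrapy's built-in CsvItemExporter (the delimiter default and the class body are assumptions, not the project's actual code):

from scrapy.exporters import CsvItemExporter


class MyCsvItemExporter(CsvItemExporter):
    """Hypothetical sketch: a CsvItemExporter with a fixed column order and delimiter."""

    def __init__(self, fields=None, delimiter=',', **kwargs):
        # fields_to_export controls which columns appear and in what order;
        # delimiter (and any other leftover kwargs) is passed through to csv.writer.
        kwargs['fields_to_export'] = fields
        kwargs['delimiter'] = delimiter
        super(MyCsvItemExporter, self).__init__(**kwargs)

With this sketch, the calls above such as MyCsvItemExporter(fields=fields, file=self.csv_file, encoding='utf-8') work unchanged, because file and encoding are forwarded to the base exporter.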
class JDPipeline(object):
    def __init__(self):
        self.log = Log().get_logger()
        """Export JSON: use the built-in JsonItemExporter, or write the JSON yourself with the json module."""
        filename = Config.get_results_path() + 'jd.json'
        self.file = open(filename, 'wb')
        self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
        self.exporter.start_exporting()
        """Export CSV: the custom MyCsvItemExporter lets us fix the column order and delimiter."""
        csv_filename = Config.get_results_path() + 'jd.csv'
        self.csv_file = open(csv_filename, 'wb')
        fields = ['title', 'price', 'comment', 'product_id']
        self.csv_exporter = MyCsvItemExporter(fields=fields, file=self.csv_file, encoding='gbk')

    def process_item(self, item, spider):
        # Called whenever the spider yields an item.
        self.log.info('JD.com item received')
        self.exporter.export_item(item)
        self.csv_exporter.export_item(item)
        return item

    def open_spider(self, spider):
        # Optional; called when the spider is opened.
        self.log.info('open_jd_spider')

    def close_spider(self, spider):
        # Optional; called when the spider is closed. (Not observed being called here; reason unknown.)
        self.exporter.finish_exporting()
        self.file.close()
        self.log.info('close_jd_spider')
class BeautyImageSpider(scrapy.Spider):
    name = "BeautyImageSpider"
    allowed_domains = ["lab.scrapyd.cn"]
    start_urls = ['http://lab.scrapyd.cn/archives/55.html']
    custom_settings = {
        'ITEM_PIPELINES': {'ScrapyStudy.pipelines.BeautyImagePipeline': 300, },
        # 'DOWNLOADER_MIDDLEWARES': {"ScrapyStudy.middlewares.SeleniumMiddleware": 401, },
        'IMAGES_STORE': r'D:\ImageSpider',  # where downloaded images are saved (raw string avoids backslash escapes)
    }

    def __init__(self):
        self.log = Log().get_logger()
        super(BeautyImageSpider, self).__init__()

    def start_requests(self):
        for urls in self.start_urls:
            yield scrapy.Request(url=urls, callback=self.parse)

    def parse(self, response):
        # filename = Config.get_results_path() + "beauty_image.html"
        # # 1. Save the raw page
        # with open(filename, 'wb+') as file:  # must be opened in binary mode
        #     file.write(response.body)
        item = BeautyImageItem()
        # image_urls = response.css(".post-content img::attr(src)").extract()  # list of image URLs
        image_urls = response.xpath('//div[@class="post-content"]//img/@src').extract()
        title = response.css(".post-title a::text").extract_first()
        item['image_url'] = image_urls
        item['title'] = title
        yield item  # return the finished item directly

    def closed(self, spider):
        self.log.info("BeautyImageSpider_closed")
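BeautyImagePipeline and BeautyImageItem are referenced above but not defined in this section. A minimal sketch of the pipeline, assuming it builds on Scrapy's built-in ImagesPipeline and that the item exposes the image_url list filled in by the spider (the class body is an assumption, not the project's actual code):

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class BeautyImagePipeline(ImagesPipeline):
    """Hypothetical sketch: download every URL stored in item['image_url']."""

    def get_media_requests(self, item, info):
        # ImagesPipeline downloads each yielded request and saves the file
        # under the IMAGES_STORE directory configured in custom_settings.
        for url in item['image_url']:
            yield Request(url)

Note that ImagesPipeline requires Pillow; renaming files per post title would be done by overriding file_path().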
class Sina7x24Pipeline(object):
    def __init__(self):
        self.log = Log().get_logger()
        """Export JSON: use the built-in JsonItemExporter, or write the JSON yourself with the json module."""
        filename = Config.get_results_path() + 'sina7x24.json'
        self.file = open(filename, 'wb')
        self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
        # self.exporter = MyJsonLinesItemExporter(self.file, indent=0, encoding='utf-8')
        self.exporter.start_exporting()
        """Store into MongoDB."""
        client = pymongo.MongoClient('mongodb://*****:*****@localhost:27017/')
        db = client['sinadb']  # select the database
        self.collection = db['7x24']  # select the collection

    def process_item(self, item, spider):
        # Called whenever the spider yields an item.
        if isinstance(item, Sina7x24Item):
            self.exporter.export_item(item)
            myquery = {"date": item['date']}
            newvalues = {"$set": dict(item)}
            result = self.collection.update_many(myquery, newvalues, upsert=True)
            # self.log.info('Upserted document id: %s' % result.upserted_id)
        return item

    def open_spider(self, spider):
        # Optional; called when the spider is opened.
        self.log.info('open_sina7x24_spider')

    def close_spider(self, spider):
        # Optional; called when the spider is closed. (Not observed being called here; reason unknown.)
        self.exporter.finish_exporting()
        self.file.close()
        self.log.info('close_sina7x24_spider')
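Sina7x24Item is used by this pipeline and by both Sina spiders but is not defined in this section. Assuming it carries only the two fields those spiders fill in, a sketch would be:

import scrapy


class Sina7x24Item(scrapy.Item):
    date = scrapy.Field()  # timestamp string built by the spider, e.g. "2019-07-01 09:30:15" (format assumed)
    info = scrapy.Field()  # text of the news flash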
class FileUtils(object):
    """File utility class."""

    def __init__(self):
        self.log = Log().get_logger()

    def read_yaml(self, path):
        try:
            with open(path, 'r', encoding='utf-8') as file:
                data_dict = yaml.safe_load(file)  # safe_load avoids the unsafe default loader
                self.log.info('Read yaml file successfully')
                if isinstance(data_dict, dict):
                    return data_dict
                else:
                    return dict()
        except Exception as e:
            self.log.error('Exception while reading yaml file: %s' % e)
            return dict()

    def write_yaml(self, path, data):
        try:
            # 'a' appends, 'w' overwrites
            with open(path, 'w', encoding='utf-8') as file:
                yaml.dump(data, file)
                self.log.info('Wrote data to yaml file successfully')
                return True
        except Exception as e:
            self.log.error('Exception while writing yaml file: %s' % e)
            return None

    def read_txt(self, path):
        try:
            with open(path, 'r', encoding='utf-8-sig') as file:
                message_list = file.readlines()
                if len(message_list) > 0:
                    msg_list = []
                    for line in message_list:
                        line = line.strip().replace(" ", "")  # drop newlines and spaces
                        if len(line) > 0:  # skip empty lines
                            msg_list.append(line)
                    if len(msg_list) > 0:
                        return msg_list
                return None
        except Exception as e:
            self.log.error('Exception while reading txt file: %s' % e)
            return None

    def write_txt(self, path, text):
        try:
            # 'a' appends, 'w' overwrites
            with open(path, 'w', encoding='utf-8') as file:
                file.write(str(text))
                self.log.info('Wrote data to file successfully')
                return True
        except Exception as e:
            self.log.error('Exception while writing txt file: %s' % e)
            return None

    def del_file(self, path):
        ls = os.listdir(path)
        for i in ls:
            c_path = os.path.join(path, i)
            if os.path.isdir(c_path):
                self.del_file(c_path)
            else:
                os.remove(c_path)

    def copy_file_random(self, file_dir, tar_dir, num):
        # copy num randomly chosen images
        sample = random.sample(os.listdir(file_dir), num)
        for name in sample:
            shutil.copyfile(file_dir + name, tar_dir + name)

    def clip_file_random(self, file_dir, tar_dir, num):
        # move num randomly chosen images, then duplicate each one several times
        sample = random.sample(os.listdir(file_dir), num)
        for name in sample:
            shutil.copyfile(file_dir + name, tar_dir + name)
            os.remove(file_dir + name)
            self.copy_redo(tar_dir, name, 32)

    def copy_redo(self, file_dir, name, num):
        # duplicate an image several times within the same directory
        for index in range(0, num):
            shutil.copyfile(file_dir + name, file_dir + str(index) + name)

    def copy_word(self, path):
        # copy the entire content of a Word document (pictures, text, formatting) to the clipboard
        try:
            pythoncom.CoInitialize()
            w = win32com.client.DispatchEx('Word.Application')
            try:
                # run in the background: no window, no alerts
                w.Visible = 0
                w.DisplayAlerts = 0
                doc = w.Documents.Open(path)  # open the document; an absolute path is required
                doc.Content.Copy()            # copy the whole document
                doc.Close()                   # close the document
            except Exception as e:
                self.log.error('Exception while copying Word document: %s' % e)
            finally:
                if w:
                    # when driving COM, always make sure the Word application exits
                    self.log.info('Quitting the Word application')
                    w.Quit()
                    del w
                    pythoncom.CoUninitialize()
        except Exception as e:
            pass
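A short usage sketch of FileUtils (the file names below are placeholders, not paths from the project):

if __name__ == '__main__':
    utils = FileUtils()
    # Round-trip a small dict through YAML (placeholder path).
    utils.write_yaml('config.yaml', {'timeout': 25, 'retries': 3})
    print(utils.read_yaml('config.yaml'))
    # Read a UTF-8 text file as a list of stripped, non-empty lines (placeholder path).
    print(utils.read_txt('keywords.txt'))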
class Sina7x24SplashSpider(scrapy.Spider):
    name = "Sina7x24SplashSpider"
    allowed_domains = ["finance.sina.com.cn"]
    # start_urls = ['http://finance.sina.com.cn/7x24/']
    custom_settings = {
        'ITEM_PIPELINES': {'ScrapyStudy.pipelines.Sina7x24Pipeline': 300, },
        'CONCURRENT_REQUESTS': 1,
        'SPLASH_URL': 'http://192.168.99.100:8050/',  # Splash service address, local or remote
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',  # Splash-aware duplicate filter
        'HTTPCACHE_STORAGE': 'scrapy_splash.SplashAwareFSCacheStorage',  # Splash-aware cache storage
    }

    def __init__(self):
        self.log = Log().get_logger()
        self.date_list = []
        self.is_first = True
        super(Sina7x24SplashSpider, self).__init__()

    def start_requests(self):
        urls = 'http://finance.sina.com.cn/7x24/'
        yield SplashRequest(url=urls, callback=self.parse)

    def parse(self, response):
        filename = Config.get_results_path() + "sina7x24.html"
        # 1. Save the raw page
        with open(filename, 'wb+') as file:  # must be opened in binary mode
            file.write(response.body)
        items = []
        news = response.xpath("//div[@class='bd_i bd_i_og clearfix']")
        day = time.strftime("%Y-%m-%d ", time.localtime(time.time()))  # current date
        for each in news[::-1]:
            item = Sina7x24Item()
            # extract() returns unicode strings; normalize-space() strips spaces, newlines and other whitespace
            times = each.xpath("normalize-space(div/p/text())").extract()
            info = each.xpath("normalize-space(div[2]/div/p/text())").extract()
            # time = each.css(".bd_i_time_c::text").extract()
            # info = each.css(".bd_i_txt_c::text").extract()
            date = day + times[0]
            if self.is_first:
                item['date'] = date
                item['info'] = info[0]
                self.date_list.append(date)
                print(date + info[0])
            else:
                if date in self.date_list:
                    continue
                else:
                    item['date'] = date
                    item['info'] = info[0]
                    self.date_list.append(date)
                    print('-' * 60)
                    print(date + info[0])
            items.append(item)
        self.is_first = False
        # print('count: %s' % len(items))
        return items  # return the collected items directly

    def closed(self, spider):
        self.log.info("Sina7x24SplashSpider_closed")
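Since the 7x24 page is rendered by JavaScript, Splash usually needs a moment to execute it before returning HTML. One way to allow for that, as a sketch (the 2-second wait is an assumption, not a value from the project), is to pass render arguments with the request:

    # Drop-in variant of start_requests above (requires: from scrapy_splash import SplashRequest).
    def start_requests(self):
        yield SplashRequest(
            url='http://finance.sina.com.cn/7x24/',
            callback=self.parse,
            endpoint='render.html',  # Splash endpoint that returns the rendered HTML
            args={'wait': 2},        # seconds Splash waits before snapshotting the page (assumed value)
        )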
from scrapy import cmdline
from ScrapyStudy.public.Log import Log

log = Log().get_logger()
log.info(20 * '-' + 'start' + 20 * '-')

"""Run a single spider"""
# cmdline.execute("scrapy crawl BeautyImageSpider".split())
cmdline.execute("scrapy crawl Sina7x24SeleniumSpider".split())
# cmdline.execute("scrapy crawl Sina7x24SplashSpider".split())
# cmdline.execute("scrapy crawl itcast".split())  # run the crawl command; the split() call must not be removed
# cmdline.execute("scrapy crawl jd".split())      # run the crawl command; the split() call must not be removed
# cmdline.execute("scrapy crawl itcast -o results//teachers0.json".split())  # -o writes the output in the given format
# cmdline.execute("scrapy crawl itcast -o results//teachers0.csv".split())
# cmdline.execute("scrapy crawl itcast -o results//teachers0.xml".split())

"""Run multiple spiders"""
# cmdline.execute("scrapy crawlall".split())
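The crawlall command above is a custom Scrapy command that is not shown here. An alternative sketch that runs several of the project's spiders from one script without a custom command (spider names taken from this section):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads the project's settings.py
process.crawl('itcast')  # schedule spiders by name
process.crawl('jd')
process.start()  # blocks until every scheduled spider has finished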
class Sina7x24SeleniumSpider(scrapy.Spider):
    name = "Sina7x24SeleniumSpider"
    allowed_domains = ["finance.sina.com.cn"]
    # start_urls = ['http://finance.sina.com.cn/7x24/']
    custom_settings = {
        'ITEM_PIPELINES': {'ScrapyStudy.pipelines.Sina7x24Pipeline': 300, },
        'DOWNLOADER_MIDDLEWARES': {"ScrapyStudy.middlewares.SeleniumMiddleware": 401, },
        'CONCURRENT_REQUESTS': 1,
    }

    def __init__(self):
        self.log = Log().get_logger()
        self.driver_firefox()
        self.date_list = []
        self.is_first = True
        super(Sina7x24SeleniumSpider, self).__init__()

    def driver_firefox(self):
        options = Options()  # the Options import path differs per browser
        options.add_argument('-headless')  # headless mode
        self.driver = webdriver.Firefox(firefox_options=options)  # driver start-up is slow; newer Selenium versions use options=options
        # self.driver = webdriver.Firefox()
        # self.driver.maximize_window()
        self.driver.set_page_load_timeout(25)

    def start_requests(self):
        urls = 'http://finance.sina.com.cn/7x24/'
        while True:  # for i in range(50):
            try:
                title = self.driver.title  # probe the driver to check it is still alive
            except Exception as e:
                self.log.info('Browser process was killed; restarting the driver')
                if 'without establishing a connection' in str(e):
                    self.driver_firefox()
            time.sleep(25)
            yield scrapy.Request(url=urls, callback=self.parse, dont_filter=True)  # dont_filter=True disables duplicate filtering

    def parse(self, response):
        # filename = Config.get_results_path() + "sina7x24.html"
        # # 1. Save the raw page
        # with open(filename, 'wb+') as file:  # must be opened in binary mode
        #     file.write(response.body)
        items = []
        news = response.xpath("//div[@class='bd_i bd_i_og clearfix']")
        day = time.strftime("%Y-%m-%d ", time.localtime(time.time()))  # current date
        for each in news[::-1]:
            item = Sina7x24Item()
            # extract() returns unicode strings; normalize-space() strips spaces, newlines and other whitespace
            times = each.xpath("normalize-space(div/p/text())").extract()
            info = each.xpath("normalize-space(div[2]/div/p/text())").extract()
            # time = each.css(".bd_i_time_c::text").extract()
            # info = each.css(".bd_i_txt_c::text").extract()
            date = day + times[0]
            if self.is_first:
                item['date'] = date
                item['info'] = info[0]
                self.date_list.append(date)
                print(date + info[0])
            else:
                if date in self.date_list:
                    continue
                else:
                    item['date'] = date
                    item['info'] = info[0]
                    self.date_list.append(date)
                    print('-' * 60)
                    print(date + info[0])
            items.append(item)
        self.is_first = False
        # print('count: %s' % len(items))
        return items  # return the collected items directly

    def closed(self, spider):
        self.log.info("Sina7x24SeleniumSpider_closed")
        self.driver.close()
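The SeleniumMiddleware wired into DOWNLOADER_MIDDLEWARES above is not shown in this section. A minimal sketch of how such a downloader middleware could work, assuming it reuses the driver created by the spider (the class body is an assumption, not the project's actual middleware):

from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    """Hypothetical sketch: render requests with the spider's own Selenium driver."""

    def process_request(self, request, spider):
        driver = getattr(spider, 'driver', None)
        if driver is None:
            return None  # no driver on this spider: fall back to Scrapy's normal downloader
        driver.get(request.url)
        # Returning a Response here short-circuits the download and hands the
        # rendered HTML straight to the spider's parse() callback.
        return HtmlResponse(url=driver.current_url, body=driver.page_source,
                            encoding='utf-8', request=request)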
class MongoDBStudy(object):
    def __init__(self):
        self.log = Log().get_logger()
        """
        1. Connect to MongoDB
        Use MongoClient from the PyMongo library. Normally you pass MongoDB's IP and port: the first
        argument is the host, the second is the port (defaults to 27017 if omitted).
        """
        self.client = pymongo.MongoClient('mongodb://*****:*****@localhost:27017/')  # a connection URI achieves the same effect
        """
        2. Select a database
        MongoDB is organised into separate databases, and you must specify which one to operate on
        (the test database is used as the example here).
        Note: in MongoDB a database is only created once content has been inserted into it. That is,
        after "creating" the database you must create a collection (table) and insert a document (record)
        before the database really exists. The same applies to collections.
        """
        self.db = self.client.sinadb
        # self.db = self.client.scrapydb
        # self.db = self.client['testdb']  # the two forms are equivalent
        """
        2.2 List all databases in MongoDB and check whether a given database exists.
        """
        dblist = self.client.list_database_names()
        if "sinadb" in dblist:
            self.log.info("Database already exists!")
        else:
            self.log.info("Database does not exist!")
        """
        3. Select a collection
        Each database contains many collections, similar to tables in a relational database. The next step
        is to select the collection to operate on, here a collection named students. As with databases,
        there are two equivalent ways to do this.
        """
        # self.collection = self.db.teachers
        self.collection = self.db['7x24']
        # self.collection = self.db['students']
        collist = self.db.list_collection_names()
        if "7x24" in collist:  # check whether the 7x24 collection exists
            self.log.info("Collection already exists!")
        else:
            self.log.info("Collection does not exist!")

    def db_insert(self):
        """
        4. Insert data.
        For the students collection, build a new student record as a dict and insert it by
        calling the collection's insert method.
        """
        student = {"name": "Google", "alexa": "1", "url": "https://www.google.com"}
        result = self.collection.insert_one(student)
        # In MongoDB every document has a unique _id; if _id is not given explicitly,
        # MongoDB generates an ObjectId automatically.
        self.log.info(result.inserted_id)
        students = [
            {"name": "QQ", "alexa": "101", "url": "https://www.qq.com"},
            {"name": "Facebook", "alexa": "10", "url": "https://www.facebook.com"},
        ]
        results = self.collection.insert_many(students)  # insert several documents into the collection
        # log the _id values of all inserted documents
        self.log.info(results.inserted_ids)
        # students = [
        #     {"_id": 1, "name": "知乎", "alexa": "103", "url": "https://www.zhihu.com"},
        #     {"_id": 2, "name": "Github", "alexa": "109", "url": "https://www.github.com"}
        # ]
        # results = self.collection.insert_many(students)  # insert with explicitly chosen _id values
        # # log the _id values of all inserted documents
        # self.log.info(results.inserted_ids)

    def db_find(self):
        # results = self.collection.find_one()  # fetch the first document in the collection
        # self.log.info(results)
        #
        # lists = self.collection.find()
        lists = list(self.collection.find({}, {"_id": 0, "date": 1, "info": 1}))
        self.log.info("Total documents: %s" % len(lists))
        for x in lists:  # iterate over all matching documents
            self.log.info(x)
        # find() can return only specific fields by setting the wanted fields to 1 in the projection.
        # Apart from _id, you cannot mix 0 and 1 in the same projection: if one field is set to 0,
        # all other fields are returned, and vice versa.
        #
        # self.collection.find({}, {"name": 0, "alexa": 1})  # raises an exception
        # value_list = self.collection.find({}, {"_id": 0, "name": 1, "alexa": 1})
        # for x in value_list:
        #     self.log.info(x)
        #
        # self.log.info('-' * 20)
        #
        # # query with an exact condition
        # myquery = {"name": "Facebook"}
        # value_list = self.collection.find(myquery)
        # for x in value_list:
        #     self.log.info(x)
        #
        # self.log.info('-' * 20)
        #
        # # advanced query: documents whose first letter has an ASCII value greater than "H"
        # myquery = {"name": {"$gt": "H"}}
        # value_list = self.collection.find(myquery)
        # for x in value_list:
        #     self.log.info(x)
        #
        # self.log.info('-' * 20)
        #
        # # query with a regular expression: documents whose name starts with "F"
        # myquery = {"name": {"$regex": "^F"}}
        # value_list = self.collection.find(myquery)
        # for x in value_list:
        #     self.log.info(x)
        #
        # self.log.info('-' * 20)
        #
        # # limit the number of returned documents with limit(), which takes a single numeric argument
        # value_list = self.collection.find().limit(2)
        # for x in value_list:
        #     self.log.info(x)

    def db_update(self):
        value_list = self.collection.find()  # fetch all documents in the collection
        for x in value_list:
            self.log.info(x)
        # # change documents whose alexa value is "103" to "111"
        # myquery = {"alexa": "103"}
        # newvalues = {"$set": {"alexa": "111"}}
        # result = self.collection.update_one(myquery, newvalues)  # only the first matching document is modified
        # self.log.info('Update result: %s' % result.modified_count)
        # change documents whose alexa value is "111" to "112"
        myquery = {"alexa": "111"}
        newvalues = {"$set": {"alexa": "112"}}
        result = self.collection.update_many(myquery, newvalues)  # modify all matching documents
        self.log.info('Update result: %s' % result.modified_count)
        self.log.info('-' * 20)
        value_list = self.collection.find()  # fetch all documents again
        for x in value_list:
            self.log.info(x)

    def db_sort(self):
        # value_list = self.collection.find().sort("alexa")  # sort by the alexa field in ascending order
        value_list = self.collection.find().sort("alexa", -1)  # sort by the alexa field in descending order
        for x in value_list:
            self.log.info(x)

    def db_delete(self):
        # myquery = {"name": "Taobao"}
        # self.collection.delete_one(myquery)  # delete the first document whose name is "Taobao"
        myquery = {"name": {"$regex": "^G"}}
        result = self.collection.delete_many(myquery)  # delete all documents whose name starts with "G"
        self.log.info('Delete result: %s' % result.deleted_count)
        self.db.drop_collection("7x24")  # drop the entire collection
        value_list = self.collection.find()
        for x in value_list:
            self.log.info(x)
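A usage sketch that exercises the helper methods in order (running it against a real server will modify the sinadb.7x24 collection):

if __name__ == '__main__':
    study = MongoDBStudy()  # connects and reports whether the database/collection exist
    study.db_insert()       # insert_one, then insert_many
    study.db_find()         # projected listing of the stored documents
    study.db_update()       # update_many on the matching documents
    study.db_sort()         # listing sorted by the alexa field
    study.db_delete()       # delete by regex, then drop the whole collection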