def __init__(self):
    """Prepare the logger and the JSON / CSV exporters for the JD results.

    Both output files live under ``Config.get_results_path()`` and are
    opened in binary write mode. The JSON exporter is started here.
    """
    self.log = Log().get_logger()

    # JSON output through the built-in JsonItemExporter (a hand-written
    # writer based on the json module would be an alternative).
    self.file = open(Config.get_results_path() + 'jd.json', 'wb')
    self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
    self.exporter.start_exporting()

    # CSV output through the custom MyCsvItemExporter, which allows the
    # column order and the delimiter to be specified.
    self.csv_file = open(Config.get_results_path() + 'jd.csv', 'wb')
    column_order = ['title', 'price', 'comment', 'product_id']
    self.csv_exporter = MyCsvItemExporter(
        fields=column_order,
        file=self.csv_file,
        encoding='gbk',
    )
def parse(self, response):
    """Save the raw page and build one ``ItcastItem`` per teacher entry.

    :param response: downloaded page; ``response.body`` is written to
        ``teacher.html`` and ``response.xpath`` drives the extraction.
    :return: list of ``ItcastItem`` objects with ``name``, ``grade`` and
        ``info`` filled in.
    """
    # Step 1: keep a copy of the page on disk. The body is written as-is,
    # so the file is opened in binary mode.
    page_path = Config.get_results_path() + "teacher.html"
    with open(page_path, 'wb+') as page:
        page.write(response.body)

    # Item key -> XPath evaluated relative to each teacher node.
    # normalize-space() strips spaces, line breaks and similar characters.
    field_queries = (
        ('name', "normalize-space(h3/text())"),
        ('grade', "normalize-space(h4/text())"),
        ('info', "normalize-space(p/text())"),
    )

    teachers = []
    for node in response.xpath("//div[@class='li_txt']"):
        teacher = ItcastItem()
        # Run every query first, then assign; each extract() result is a
        # list and only its first element is used.
        extracted = [
            (key, node.xpath(query).extract())
            for key, query in field_queries
        ]
        for key, values in extracted:
            teacher[key] = values[0]
        teachers.append(teacher)

    # The collected items are returned directly as a list.
    return teachers
def __init__(self):
    """Prepare the logger, JSON / CSV exporters and the MongoDB collection.

    Output files are created under ``Config.get_results_path()`` in binary
    write mode; the JSON exporter is started here.
    """
    self.log = Log().get_logger()

    # JSON output through the built-in JsonItemExporter (a hand-written
    # writer based on the json module would be an alternative).
    self.file = open(Config.get_results_path() + 'teachers.json', 'wb')
    self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
    self.exporter.start_exporting()

    # CSV output through the custom MyCsvItemExporter, which allows the
    # column order and the delimiter to be specified.
    self.csv_file = open(Config.get_results_path() + 'teachers.csv', 'wb')
    column_order = ['name', 'grade', 'info']
    self.csv_exporter = MyCsvItemExporter(
        fields=column_order,
        file=self.csv_file,
        encoding='utf-8',
    )

    # Database storage: database 'scrapydb', collection 'teachers'.
    mongo = pymongo.MongoClient('mongodb://*****:*****@localhost:27017/')
    self.collection = mongo['scrapydb']['teachers']
def __init__(self):
    """Prepare the logger, the JSON exporter and the MongoDB collection.

    The JSON file is created under ``Config.get_results_path()`` in binary
    write mode and the exporter is started here.
    """
    self.log = Log().get_logger()

    # JSON output through the built-in JsonItemExporter (a hand-written
    # writer based on the json module would be an alternative).
    self.file = open(Config.get_results_path() + 'sina7x24.json', 'wb')
    self.exporter = JsonItemExporter(self.file, indent=0, encoding='utf-8')
    self.exporter.start_exporting()

    # Database storage: database 'sinadb', collection '7x24'.
    mongo = pymongo.MongoClient('mongodb://*****:*****@localhost:27017/')
    self.collection = mongo['sinadb']['7x24']
def parse(self, response):
    """Save the raw page and schedule a request for every category link.

    :param response: downloaded page; ``response.body`` is written to
        ``jd.html`` and ``response.xpath`` is used to locate the links.
    :return: list of ``Request`` objects, one per ``href`` containing
        ``'category'``, each handled by ``self.parse_category``.
    """
    # Step 1: keep a copy of the page on disk. The body is written as-is,
    # so the file is opened in binary mode.
    page_path = Config.get_results_path() + "jd.html"
    with open(page_path, 'wb+') as page:
        page.write(response.body)

    # Collect all product categories: every matching href is turned into
    # an absolute URL on the wap.jd.com host.
    return [
        Request("http://wap.jd.com" + href, callback=self.parse_category)
        for anchor in response.xpath('/html/body/div[5]/div[2]/a')
        for href in anchor.xpath('@href').extract()
        if 'category' in href
    ]
def __init__(self, name=None):
    """Build a logger that writes to a per-day log file and to the console.

    :param name: name handed to ``logging.Logger``; defaults to ``None``.
    """
    # The log file is named after the current local date.
    today = time.strftime("%Y-%m-%d")
    log_path = os.path.join(Config.get_log_path(), today + '.log')

    self.logger = logging.Logger(name)
    self.logger.setLevel(logging.DEBUG)

    # One handler for the file, one for the console.
    self.logfile = logging.FileHandler(log_path, encoding="UTF-8")
    self.control = logging.StreamHandler()

    # Log format: timestamp, file name, line number, message.
    self.formater = logging.Formatter(
        '%(asctime)s - %(filename)s - %(lineno)d : %(message)s'
    )

    # Both handlers pass DEBUG and above and share the same format.
    handlers = (self.logfile, self.control)
    for handler in handlers:
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(self.formater)
        self.logger.addHandler(handler)

    # NOTE(review): both handlers are closed right after being attached,
    # exactly as before. Presumably this relies on FileHandler reopening
    # its stream on the next emit -- confirm this is intentional.
    for handler in handlers:
        handler.close()
def parse(self, response):
    """Save the raw page and return the news entries not seen before.

    Entries are visited in reverse document order. While ``self.is_first``
    is true every entry is emitted; afterwards an entry is skipped when its
    date string is already in ``self.date_list``. Each emitted date is
    appended to ``self.date_list`` and ``self.is_first`` is cleared once
    the page has been processed.

    :param response: downloaded page; ``response.body`` is written to
        ``sina7x24.html`` and ``response.xpath`` drives the extraction.
    :return: list of ``Sina7x24Item`` objects with ``date`` and ``info`` set.
    """
    # Step 1: keep a copy of the page on disk. The body is written as-is,
    # so the file is opened in binary mode.
    page_path = Config.get_results_path() + "sina7x24.html"
    with open(page_path, 'wb+') as page:
        page.write(response.body)

    fresh_items = []
    entries = response.xpath("//div[@class='bd_i bd_i_og clearfix']")
    # Current local date, prefixed to the text taken from each entry.
    today = time.strftime("%Y-%m-%d ", time.localtime(time.time()))
    first_pass = self.is_first

    for entry in entries[::-1]:
        item = Sina7x24Item()
        # normalize-space() strips spaces, line breaks and similar
        # characters; each extract() result is a list whose first element
        # is used.
        stamp = entry.xpath("normalize-space(div/p/text())").extract()
        text = entry.xpath("normalize-space(div[2]/div/p/text())").extract()
        date = today + stamp[0]

        # After the first pass, skip anything already recorded.
        if not first_pass and date in self.date_list:
            continue

        item['date'] = date
        item['info'] = text[0]
        self.date_list.append(date)
        if not first_pass:
            print('-' * 60)
        print(date + text[0])
        fresh_items.append(item)

    self.is_first = False
    # The collected items are returned directly.
    return fresh_items