def process_item(self, item, spider):
    """Enrich a scraped item and insert it into the configured MongoDB collection.

    The item's ``created_at`` value is normalised through ``DateUtil.convert``.
    Items whose timestamp is not newer than ``self.recent`` are returned
    untouched and not inserted. Otherwise ``mblogid`` is set to the MD5 of
    ``title + user``, the ``admin``/``price``/``tag`` fields are filled in
    from ``self.extract`` and the item is inserted into
    ``self.db[self.mongo_col]``.

    Args:
        item: Mapping-like scraped item; ``created_at``, ``title``, ``user``
            and ``text`` are read from it.
        spider: The spider that produced the item (unused here).

    Returns:
        The item, on every path. Failures are logged with a traceback and
        the item is still returned, so later pipeline stages never
        receive ``None``.
    """
    logging.warning('开始插入表%s', self.mongo_col)
    try:
        dt = DateUtil.convert(item["created_at"])  # normalise the timestamp
        if dt <= self.recent:
            # Already stored, or too old: do not insert again.
            return item

        # Title + user is the uniqueness key for a record.
        item["mblogid"] = DateUtil.calc_md5(item['title'] + item['user'])
        item["created_at"] = dt

        admin, price, tag = self.extract(
            item['text'] + item['title'], self.tAdmin, self.tPrice, self.tTag
        )
        item["admin"] = admin
        item["price"] = price
        item["tag"] = tag

        # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
        # removed in PyMongo 4; switch to insert_one once the installed
        # driver version is confirmed.
        self.db[self.mongo_col].insert(dict(item))
    except Exception:
        # The failure may happen before 'mblogid' is assigned, so look it up
        # defensively; indexing here would raise KeyError and hide the
        # original error. logging.exception also records the traceback.
        logging.exception('编号为:%s的数据插入异常', item.get('mblogid'))
    return item
def process_item(self, item, spider):
    """Enrich a scraped item and insert it into the configured MongoDB collection.

    The item's ``created_at`` value is normalised through ``DateUtil.convert``.
    Items whose timestamp is not newer than ``self.recent`` are returned
    untouched and not inserted. Otherwise the ``admin``/``price``/``tag``
    fields are filled in from ``self.extract`` (applied to ``text`` only)
    and the item is inserted into ``self.db[self.mongo_col]``.

    ``mblogid`` is not computed here; it is read from the item as supplied.

    Args:
        item: Mapping-like scraped item; ``created_at``, ``mblogid`` and
            ``text`` are read from it.
        spider: The spider that produced the item (unused here).

    Returns:
        The item, on every path. Failures are logged with a traceback and
        the item is still returned, so later pipeline stages never
        receive ``None``.
    """
    logging.warning('开始插入表%s', self.mongo_col)
    logging.warning('当前插入数据错误表信息......')
    try:
        dt = DateUtil.convert(item["created_at"])  # normalise the timestamp
        logging.debug("dt=====> %s", dt)
        if dt <= self.recent:
            # Already stored, or too old: do not insert again.
            return item

        logging.debug("item=====> %s", item)
        # Indexed (not .get) on purpose: an item without 'mblogid' is
        # treated as a failure and is not inserted.
        logging.debug("mblogid====> %s", item["mblogid"])

        item["created_at"] = dt
        logging.debug("created_at====> %s", item["created_at"])

        admin, price, tag = self.extract(
            item['text'], self.tAdmin, self.tPrice, self.tTag
        )
        logging.debug("admin====> %s", admin)
        logging.debug("price====> %s", price)
        logging.debug("tag====> %s", tag)
        item["admin"] = admin
        item["price"] = price
        item["tag"] = tag
        logging.debug("item=======> %s", item)

        # NOTE(review): Collection.insert is deprecated in PyMongo 3 and
        # removed in PyMongo 4; switch to insert_one once the installed
        # driver version is confirmed.
        self.db[self.mongo_col].insert(dict(item))
    except Exception:
        # 'mblogid' may be missing (that can itself be the failure), so look
        # it up defensively; indexing here would raise KeyError and hide the
        # original error. logging.exception also records the traceback.
        logging.exception('编号为:%s的数据插入异常', item.get('mblogid'))
    return item