def parse(self, response):
    # Build a selector over the response page
    hxs = Selector(response)
    # Each node matched by parse_xpath becomes one item
    for con in hxs.xpath(self.xpathConf.get("parse_xpath")):
        ss = Selector(text=con.extract())
        el = WeiboComItemLoader(selector=ss)
        # Fixed source metadata for WeChat public-account results
        el.add_value("site_source", "mp.weixin.qq.com")
        el.add_value("site_type", "weixin")
        el.add_value("task_id", self.task_id)
        # Stamp the item with today's date (YYYY-MM-DD)
        nowTime = time.localtime()
        nowDate = datetime.datetime(nowTime[0], nowTime[1], nowTime[2])
        el.add_value("catch_date", nowDate.strftime('%Y-%m-%d'))
        el.add_xpath('site_url', self.xpathConf.get("site_url"))
        yield el.load_item()
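All of the parse() methods in this section feed a WeiboComItemLoader, whose definition is not shown here. A minimal sketch of what it might look like, assuming the pre-1.0 Scrapy API that log.msg implies, with the item fields inferred from the add_value/add_xpath calls in these spiders:

# Hypothetical sketch -- the project's real WeiboComItem / WeiboComItemLoader
# definitions are not shown in this section.
from scrapy.item import Item, Field
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader.processor import TakeFirst, Join

class WeiboComItem(Item):
    site_source = Field()
    site_type = Field()
    task_id = Field()
    catch_date = Field()
    site_url = Field()
    author = Field()
    user_url = Field()
    content = Field()
    publish_time = Field()
    attitude = Field()
    comments = Field()
    repost = Field()

class WeiboComItemLoader(ItemLoader):
    default_item_class = WeiboComItem
    default_output_processor = TakeFirst()  # collapse extracted lists to one value
    content_out = Join()                    # content may span several text nodes

With TakeFirst() as the default output processor, each add_value/add_xpath call ends up as a single scalar in the loaded item even though XPath extraction always returns a list.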
def parse(self, response):
    # Build a selector over the response page
    hxs = Selector(response)
    for con in hxs.xpath(self.xpathConf.get("parse_xpath")):
        ss = Selector(text=con.extract())
        el = WeiboComItemLoader(selector=ss)
        # Fixed source metadata: 'site_source', 'site_type', plus the task id
        el.add_value("site_source", "weibo.com")
        el.add_value("site_type", "weibo")
        el.add_value("task_id", self.task_id)
        # Stamp the item with today's date (YYYY-MM-DD)
        nowTime = time.localtime()
        nowDate = datetime.datetime(nowTime[0], nowTime[1], nowTime[2])
        el.add_value("catch_date", nowDate.strftime('%Y-%m-%d'))
        # Per-post fields, all driven by the configured XPath expressions
        el.add_xpath('author', self.xpathConf.get("author"))
        el.add_xpath('user_url', self.xpathConf.get("user_url"))
        el.add_xpath('site_url', self.xpathConf.get("site_url"))
        el.add_value('content', ss.xpath(self.xpathConf.get("content")).extract())
        el.add_xpath('publish_time', self.xpathConf.get("publish_time"))
        # Interaction counters default to "0" when the node is missing
        attitude = ss.xpath(self.xpathConf.get("attitude")).extract()
        if attitude:
            el.add_value("attitude", attitude)
        else:
            el.add_value("attitude", "0")
        comments = ss.xpath(self.xpathConf.get("comments")).extract()
        if comments:
            el.add_value("comments", comments)
        else:
            el.add_value("comments", "0")
        repost = ss.xpath(self.xpathConf.get("repost")).extract()
        if repost:
            el.add_value("repost", repost)
        else:
            el.add_value("repost", "0")
        log.msg(repost)
        yield el.load_item()
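The attitude/comments/repost handling above repeats the same extract-or-default-to-"0" pattern three times. A small helper could fold that into one call; this refactor is only a suggestion and is not part of the original spider:

# Hypothetical helper; field names double as xpathConf keys, as in the spider above.
def add_count(loader, selector, xpath_conf, field):
    values = selector.xpath(xpath_conf.get(field)).extract()
    loader.add_value(field, values if values else "0")

# inside the loop in parse():
#     add_count(el, ss, self.xpathConf, "attitude")
#     add_count(el, ss, self.xpathConf, "comments")
#     add_count(el, ss, self.xpathConf, "repost")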
def parse(self, response):
    # Build a selector over the response page
    hxs = Selector(response)
    for con in hxs.xpath(self.xpathConf.get("parse_xpath")):
        ss = Selector(text=con.extract())
        el = WeiboComItemLoader(selector=ss)
        el.add_value("site_source", "news.sina.com.cn")
        el.add_value("site_type", "news")
        el.add_value("task_id", self.task_id)
        # Stamp the item with today's date (YYYY-MM-DD)
        nowTime = time.localtime()
        nowDate = datetime.datetime(nowTime[0], nowTime[1], nowTime[2])
        el.add_value("catch_date", nowDate.strftime('%Y-%m-%d'))
        # Baidu search results link through a redirect: resolve it first,
        # then keep only hits that point back at a Sina news page
        baidu_url = ss.xpath(self.xpathConf.get("site_url")).extract()
        if baidu_url and len(baidu_url) > 0:
            source_url = urlUtil.getRedirectUrl(baidu_url[0], timeout=10)
            sinaUrl = fo.findSinaNewsUrl(source_url)
            if sinaUrl:
                el.add_value('site_url', sinaUrl)
            else:
                continue
        else:
            continue
        yield el.load_item()
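urlUtil.getRedirectUrl and fo.findSinaNewsUrl are project helpers whose implementations are not shown here. As an illustration only, a redirect resolver along these lines would be enough for the call above, assuming the same Python 2 era as the rest of the code:

# Hypothetical sketch, not the project's actual urlUtil implementation.
import urllib2

def getRedirectUrl(url, timeout=10):
    """Follow HTTP redirects and return the final URL, or the original on error."""
    try:
        return urllib2.urlopen(url, timeout=timeout).geturl()
    except Exception:
        return url

findSinaNewsUrl then presumably matches the resolved URL against the news.sina.com.cn pattern and returns it or None, which is why both else branches skip to the next search result.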