def __init__(self, minutes, isDaemon=False, logger=LoggerUtil.getFileLogger()):
    BaseExecuter.__init__(self, logger)
    self.minutes = int(minutes)
    self.thread = threading.Thread(name=self.__class__.__name__, target=self.proc)
    # Thread.setDaemon() is deprecated; set the daemon attribute directly.
    self.thread.daemon = isDaemon
def __init__(self, logger=LoggerUtil.getFileLogger()):
    BaseExecuter.__init__(self, logger)
    self.connection = ParseConnection()
    self.track = []
    self.follow = []
    self.twitterIdInfo = {}
    self.hashtagAlbumIdMap = {}
    self.running = threading.Event()
    self.thread = threading.Thread(target=self.main)
def __init__(self, connection, logger=LoggerUtil.getFileLogger()):
    self.connection = connection
    self.logger = logger
def debugLog(self, message):
    LoggerUtil.encodedLog(self.logger.debug, message)

def warnLog(self, message):
    # logging.Logger.warn() is a deprecated alias; use warning().
    LoggerUtil.encodedLog(self.logger.warning, message)

def errorLog(self, message):
    LoggerUtil.encodedLog(self.logger.error, message)

def infoLog(self, message):
    LoggerUtil.encodedLog(self.logger.info, message)
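These wrappers assume a project-specific LoggerUtil that exposes getFileLogger(), getSelfLogger(name), and encodedLog(fn, message). The real implementation is not shown in this section; the following is only a minimal sketch of an interface that would satisfy the calls above, and every detail in it is an assumption.

import logging

class LoggerUtil:
    """Hypothetical sketch of the logger helper assumed by the snippets above."""

    @staticmethod
    def getFileLogger(path="app.log"):
        # Assumption: returns a logging.Logger that writes to a file.
        logger = logging.getLogger("file")
        if not logger.handlers:
            handler = logging.FileHandler(path, encoding="utf-8")
            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
            logger.addHandler(handler)
            logger.setLevel(logging.DEBUG)
        return logger

    def getSelfLogger(self, name):
        # Assumption: returns a named logger for a spider or pipeline.
        return logging.getLogger(name)

    @staticmethod
    def encodedLog(logFunc, message):
        # Assumption: guards against encoding errors before delegating to the logger.
        try:
            logFunc(message)
        except UnicodeEncodeError:
            logFunc(str(message).encode("utf-8", "replace").decode("utf-8"))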
class xljs2Spider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("xljs2Spider")
    name = 'xljs2'
    allowed_domains = [
        'mil.news.sina.com.cn', "comment.sina.com.cn",
        "roll.mil.news.sina.com.cn"
    ]
    start_urls = ["http://roll.mil.news.sina.com.cn/col/zgjq/index.shtml"]

    def parse(self, response):
        # Walk the paginated news list of the column.
        for p in range(1, 800):
            next_url = "http://roll.mil.news.sina.com.cn/col/zgjq/index_{0}.shtml".format(p)
            print("page====", next_url, "====")
            yield response.follow(url=next_url, callback=self.parse_news_list)

    def parse_news_list(self, response):
        datas = set(response.xpath("//div[@class='fixList']//a"))
        for item in datas:
            new_url = item.xpath("@href").extract()[0]
            title = item.xpath("text()").extract()[0]
            print(new_url, title)
            metadata = {"title": title}
            yield response.follow(url=new_url, meta=metadata, callback=self.parse_news)

    def parse_news(self, response):
        url = response.url
        title = response.meta["title"]
        id = re.findall(r"-(.*)\.", url.split("/")[-1])[0]
        id = id[1:]
        context = response.xpath(
            "//div[@class='article' or @id='artibody']//p/text()").extract()
        publish_time = response.xpath(
            "//div[@class='date-source']/span[1]/text() | //span[@class='time-source']/span[@class='titer']/text()"
        ).extract()[0]
        print("id,title", id, title, context, publish_time)
        comment_url = (
            "http://comment.sina.com.cn/page/info?version=1&format=json&channel=jc&newsid=comos-{0}&group=0&compress=0&ie=utf-8&oe=utf-8"
            "&page={1}&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user&callback=jsonp_1602817818772&_=1602817818772"
        ).format(id, 1)
        metadata = {
            "id": id,
            "title": title,
            "context": context,
            "publish_time": publish_time,
        }
        print(metadata)
        print("comment_url===", comment_url)
        yield response.follow(url=comment_url, meta=metadata, callback=self.parse_comment)

    def parse_comment(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        context = response.meta["context"]
        publish_time = response.meta["publish_time"]
        data = re.findall("{(.*)}", response.text)
        if len(data) > 0:
            # The comment API wraps its JSON in a JSONP callback; strip the wrapper.
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)["result"]
            count = jsondata["count"]
            join_count = count["total"]
            comment_count = count["show"]
            xljsxw = FHJSXW()
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            xljsxw["id"] = id
            xljsxw["title"] = title
            xljsxw["context"] = "|".join(context)
            xljsxw["publish_time"] = publish_time
            xljsxw["comment_count"] = comment_count
            xljsxw["join_count"] = join_count
            xljsxw["accumulator_count"] = 0
            xljsxw["pdate"] = pdate
            xljsxw["data_source"] = "新浪网"
            xljsxw["data_module"] = "中国军情"
            print(xljsxw)
            yield xljsxw
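The spiders in this section populate FHJSXW and FHJSXWPL Scrapy items whose definitions live elsewhere in the project. A minimal sketch of the assumed definitions, derived only from the fields the spiders assign:

import scrapy

class FHJSXW(scrapy.Item):
    # News-level record (assumed definition, fields taken from the assignments above).
    id = scrapy.Field()
    title = scrapy.Field()
    context = scrapy.Field()
    publish_time = scrapy.Field()
    comment_count = scrapy.Field()
    join_count = scrapy.Field()
    accumulator_count = scrapy.Field()
    pdate = scrapy.Field()
    data_source = scrapy.Field()
    data_module = scrapy.Field()

class FHJSXWPL(scrapy.Item):
    # Per-comment record (assumed definition).
    id = scrapy.Field()
    title = scrapy.Field()
    user_name = scrapy.Field()
    user_id = scrapy.Field()
    comment_id = scrapy.Field()
    comment_contents = scrapy.Field()
    comment_date = scrapy.Field()
    uptimes = scrapy.Field()
    reply_comment_ids = scrapy.Field()
    pdate = scrapy.Field()
    data_source = scrapy.Field()
    data_module = scrapy.Field()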
def __init__(self, connection, logger=LoggerUtil.getFileLogger()):
    self.logger = logger
    self.connection = connection
    self.className = self.getClassName()
def __init__(self, logger=LoggerUtil.getFileLogger()):
    self.logger = logger
class MysqlPipeline():
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("mysqlPipelineLogger")

    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host,
                                  user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  charset='utf8',
                                  port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Route news items and comment items to their respective tables;
        # pass any other item type through unchanged.
        if isinstance(item, FHJSXW):
            table = "yq_data"
        elif isinstance(item, FHJSXWPL):
            table = "yq_pl_data"
        else:
            return item
        data = dict(item)
        self.logger.info("insert data: %s", data)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (table, keys, values)
        print("executing sql:", sql)
        print(tuple(data.values()))
        try:
            self.cursor.execute(sql, tuple(data.values()))
            self.db.commit()
        except Exception as e:
            print("error:", e)
            self.db.rollback()
        return item
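MysqlPipeline pulls its connection parameters from the crawler settings, so the project's settings.py needs the MYSQL_* keys and the pipeline registration. A sketch of the corresponding settings; the values and the module path are placeholders, not taken from the project:

# settings.py (sketch; values are placeholders)
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'yq'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '******'
MYSQL_PORT = 3306

ITEM_PIPELINES = {
    # Module path is an assumption; adjust to wherever MysqlPipeline actually lives.
    'jszlwz.pipelines.MysqlPipeline': 300,
}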
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")
    name = 'fhjw2'
    allowed_domains = [
        'shankapi.ifeng.com', "comment.ifeng.com", "mil.ifeng.com",
        "tech.ifeng.com", "ishare.ifeng.com", "news.ifeng.com",
        "survey.news.ifeng.com"
    ]
    start_urls = ['https://mil.ifeng.com/']

    def parse(self, response):
        aitems = set(response.xpath("//div[@class='news-34dpVmYc']//a"))
        for aitem in aitems:
            item = {}
            url = "https:" + aitem.xpath("@href").extract()[0]
            title = aitem.xpath("@title").extract()[0]
            print(url, title)
            id = url.split("/")[-1]
            commentUrl = "ucms_" + id
            item["id"] = id
            item["title"] = title
            item["commentUrl"] = commentUrl
            item["url"] = url
            comment_url = ("https://comment.ifeng.com/get.php?orderby=create_time&"
                           "docUrl={0}&format=js&job=1&p=1&pageSize=1").format(commentUrl)
            yield response.follow(comment_url, meta=item, callback=self.parse_comment)

    def parse_comment(self, response):
        id = response.meta["id"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        title = response.meta["title"]
        data = re.findall("{(.*)}",
                          response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}", response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            print("count===", id, title, count, join_count)
            metaitem = {}
            metaitem["id"] = id
            metaitem["title"] = title
            metaitem["count"] = count
            metaitem["join_count"] = join_count
            metaitem["commentUrl"] = commentUrl
            print("url====", url)
            yield response.follow(url, meta=metaitem, callback=self.parse_context)

    def parse_context(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        publish_time = response.xpath(
            "//p[@class='time-1Mgp9W-1']/span[1]/text()").extract()[0].strip()
        context = response.xpath(
            "//div[@class='text-3w2e3DBc']//p/text()").extract()
        accumulator_url = ("https://survey.news.ifeng.com/api/getaccumulatorweight?format=js&"
                           "key={0}ding&serviceid=2&callback=getaccumulator").format(commentUrl)
        accumulator_meta = {}
        accumulator_meta["id"] = id
        accumulator_meta["title"] = title
        accumulator_meta["publish_time"] = publish_time
        accumulator_meta["context"] = context
        accumulator_meta["count"] = count
        accumulator_meta["join_count"] = join_count
        accumulator_meta["commentUrl"] = commentUrl
        print("fetching recommend count:", accumulator_url)
        yield scrapy.Request(accumulator_url, meta=accumulator_meta,
                             callback=self.parse_context2)

    def parse_context2(self, response):
        fhjsxw = FHJSXW()
        id = response.meta["id"]
        title = response.meta["title"]
        context = response.meta["context"]
        publish_time = response.meta["publish_time"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        data = re.findall(r"\"browse\":{(.*)}}}", response.text)
        print(data)
        if len(data) > 0:
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            accumulator_count = jsondata[commentUrl + "ding"]
            print("result:", id, title, context, publish_time, count,
                  join_count, accumulator_count)
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = "|".join(context)
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = accumulator_count
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军事首页"
            yield fhjsxw
        else:
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = context
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = 0
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军事首页"
            yield fhjsxw
class xljsSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("xljsSpider")
    name = 'xljspl'
    allowed_domains = [
        'mil.news.sina.com.cn', "comment.sina.com.cn",
        "roll.mil.news.sina.com.cn"
    ]
    start_urls = ["http://roll.mil.news.sina.com.cn/col/gjjq/index.shtml"]

    def parse(self, response):
        for p in range(1, 500):
            next_url = "http://roll.mil.news.sina.com.cn/col/gjjq/index_{0}.shtml".format(p)
            print("page====", next_url, "====")
            yield response.follow(url=next_url, callback=self.parse_news_list)

    def parse_news_list(self, response):
        datas = set(response.xpath("//div[@class='fixList']//a"))
        print("datas:", len(datas))
        for item in datas:
            new_url = item.xpath("@href").extract()[0]
            title = item.xpath("text()").extract()[0]
            print(new_url, title)
            metadata = {"title": title}
            yield response.follow(url=new_url, meta=metadata, callback=self.parse_news)

    def parse_news(self, response):
        url = response.url
        title = response.meta["title"]
        id = re.findall(r"-(.*)\.", url.split("/")[-1])[0]
        id = id[1:]
        print("id,title", id, title)
        comment_url = (
            "http://comment.sina.com.cn/page/info?version=1&format=json&channel=jc&newsid=comos-{0}&group=0&compress=0&ie=utf-8&oe=utf-8"
            "&page={1}&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user&callback=jsonp_1602817818772&_=1602817818772"
        ).format(id, 1)
        metadata = {"id": id, "title": title, "page": 1}
        print(metadata)
        print("comment_url===", comment_url)
        yield response.follow(url=comment_url, meta=metadata, callback=self.parse_comment)

    def parse_comment(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        page = response.meta["page"]
        data = re.findall("{(.*)}", response.text)
        if len(data) > 0:
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)["result"]
            print(jsondata)
            count = jsondata["count"]
            join_count = count["total"]
            comment_count = count["show"]
            cmntlist = jsondata["cmntlist"]
            threaddict = jsondata["threaddict"]
            if len(cmntlist) > 0:
                for cmn in cmntlist:
                    xljsxwpl = FHJSXWPL()
                    xljsxwpl["id"] = id
                    xljsxwpl["title"] = title
                    xljsxwpl["user_name"] = cmn["nick"]
                    xljsxwpl["user_id"] = cmn["uid"]
                    xljsxwpl["comment_id"] = cmn["mid"]
                    xljsxwpl["comment_contents"] = cmn["content"]
                    xljsxwpl["comment_date"] = cmn["time"]
                    xljsxwpl["uptimes"] = cmn["agree"]
                    xljsxwpl["reply_comment_ids"] = ""
                    pdate = datetime.datetime.now().strftime('%Y-%m-%d')
                    xljsxwpl["pdate"] = pdate
                    xljsxwpl["data_source"] = "新浪网"
                    xljsxwpl["data_module"] = "国际军情"
                    print(xljsxwpl)
                    yield xljsxwpl
                if len(threaddict.values()) > 0:
                    for threaddictitem in threaddict.values():
                        for thread in threaddictitem["list"]:
                            xljsxwpl = FHJSXWPL()
                            xljsxwpl["id"] = id
                            xljsxwpl["title"] = title
                            xljsxwpl["user_name"] = thread["nick"]
                            xljsxwpl["user_id"] = thread["uid"]
                            xljsxwpl["comment_id"] = thread["mid"]
                            xljsxwpl["comment_contents"] = thread["content"]
                            xljsxwpl["comment_date"] = thread["time"]
                            xljsxwpl["uptimes"] = thread["agree"]
                            xljsxwpl["reply_comment_ids"] = ""
                            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
                            xljsxwpl["pdate"] = pdate
                            xljsxwpl["data_source"] = "新浪网"
                            xljsxwpl["data_module"] = "国际军情"
                            print(xljsxwpl)
                            yield xljsxwpl
                # Keep the incremented page number in meta so the next callback
                # continues from the right page instead of re-reading page 1.
                page = page + 1
                response.meta["page"] = page
                comment_url = (
                    "http://comment.sina.com.cn/page/info?version=1&format=json&channel=jc&newsid=comos-{0}&group=0&compress=0&ie=utf-8&oe=utf-8"
                    "&page={1}&page_size=10&t_size=3&h_size=3&thread=1&uid=unlogin_user&callback=jsonp_1602817818772&_=1602817818772"
                ).format(id, page)
                print("next_page==", comment_url)
                yield scrapy.Request(comment_url, meta=response.meta,
                                     callback=self.parse_comment)
            else:
                print("done")
class LhgtjsSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("LhgtjsSpider")
    name = 'lhgtjs'
    filepath = "jszlwz/data/lhgtjsdatas2.csv"
    allowed_domains = ['data.un.org']
    # start_urls = ['http://data.un.org/Handlers/ExplorerHandler.ashx?m=EDATA',
    #               'http://data.un.org/Handlers/ExplorerHandler.ashx?m=FAO',
    #               'http://data.un.org/Handlers/ExplorerHandler.ashx?m=ICS']
    start_urls = ['http://data.un.org/Handlers/ExplorerHandler.ashx?m=ICS']

    def __init__(self, url=None, *args, **kwargs):
        # A spider with a custom __init__ should forward extra args to the base class.
        super().__init__(*args, **kwargs)
        if os.path.exists(self.filepath):
            os.remove(self.filepath)
        print("start url===", url)

    def parse(self, response):
        # Collect the dataset links on the explorer page.
        items = set(response.xpath('//@href'))
        self.logger.warning("test %s", items)
        for item in items:
            tempItem = item.extract()
            if re.findall(r"Data\.aspx\?.*\d", tempItem):
                dataurl = urllib.parse.unquote(str(tempItem).replace("\\\"", ""))
                new_url = 'http://data.un.org/' + dataurl
                print(new_url)
                yield response.follow(new_url, callback=self.parse_datadetail)

    def get_nextpage_url(self, page, url):
        print("url===", url)
        datatemp = url.split("?")[1].split("&")
        dataMartId = datatemp[0].split("=")[1]
        dataFilter = datatemp[1].split("=")[1]
        if dataMartId == "ICS":
            new_url = ('http://data.un.org/Handlers/DataHandler.ashx?Service=page'
                       '&Page={0}'
                       '&DataFilter={1}'
                       '&DataMartId={2}'
                       '&UserQuery=&c=2,5,6,7,8'
                       '&s=_crEngNameOrderBy:asc,,yr:desc,_utEngNameOrderBy:asc'
                       ).format(page, dataFilter, dataMartId)
        else:
            new_url = ('http://data.un.org/Handlers/DataHandler.ashx?Service=page'
                       '&Page={0}'
                       '&DataFilter={1}'
                       '&DataMartId={2}'
                       '&UserQuery=&c=2,5,6,7,8'
                       '&s=_crEngNameOrderBy:asc,_enID:asc,yr:desc'
                       ).format(page, dataFilter, dataMartId)
        print(new_url, dataMartId)
        return new_url, dataMartId

    def parse_data(self, response):
        name = response.meta['name']
        type = response.meta['type']
        items = response.xpath('//div[@class="DataContainer"]//tr')
        for item in items:
            tds = item.xpath("./td//text()").extract()
            print("raw row===", type + "|" + name + "|" + "|".join(tds))
            with open(self.filepath, mode="a", encoding="utf-8") as f:
                f.write(type + "|" + name + "|" + "|".join(tds) + "\n")

    def parse_datadetail(self, response):
        name = response.xpath('//div[@class="SeriesMeta"]//h2/text()').extract_first()
        total = response.xpath('//span[contains(@id, "spanPageCountB")]/text()').extract_first()
        for i in range(int(total)):
            page = i + 1
            new_url, dataMartId = self.get_nextpage_url(page, response.url)
            print("test===", new_url, dataMartId)
            self.logger.warning(new_url)
            yield response.follow(new_url, meta={'name': name, 'type': dataMartId},
                                  callback=self.parse_data)
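parse_data writes each UNdata table row as a pipe-delimited line of the form dataMartId|series name|cell values. A small sketch of reading those rows back; the dictionary keys are illustrative names, and the meaning of the trailing cells depends on the UNdata table layout:

def read_lhgtjs_rows(path="jszlwz/data/lhgtjsdatas2.csv"):
    # Sketch: parse the pipe-delimited file produced by LhgtjsSpider.parse_data.
    rows = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip("\n").split("|")
            if len(parts) >= 2:
                rows.append({
                    "dataMartId": parts[0],   # e.g. "ICS"
                    "series": parts[1],       # series/page title
                    "cells": parts[2:],       # remaining table cells, layout-dependent
                })
    return rows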
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")
    name = 'fhjwpl'
    allowed_domains = ['shankapi.ifeng.com', "comment.ifeng.com"]

    def start_requests(self):
        url = ('https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default'
               '/000/{0}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback')
        t = time.time()
        nowTime = int(round(t * 1000))
        url = url.format(nowTime)
        yield Request(url)

    def parse(self, response):
        print(response.text)
        data = re.findall("{(.*)}", response.text)
        if len(data) > 0:
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            code = jsondata.get("code")
            data = jsondata.get("data")
            print(code)
            if code == 0:
                isEnd = data.get("isEnd")
                newsstream = data.get("newsstream")
                for index, item in enumerate(newsstream):
                    print("data:", item)
                    id = item["id"]
                    newsTime = item["newsTime"]
                    skey = item["skey"]
                    url = item["url"]
                    commentUrl = item["commentUrl"]
                    source = item["source"]
                    title = item["title"]
                    item["p"] = 1
                    comment_url = ("https://comment.ifeng.com/get.php?orderby=create_time&"
                                   "docUrl={0}&format=js&job=1&p=1&pageSize=20").format(commentUrl)
                    yield response.follow(comment_url, meta=item, callback=self.parse_comment)
                    if index == len(newsstream) - 1:
                        # The last entry's id and timestamp seed the next page of the column feed.
                        new_url = ('https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default'
                                   '/{0}/{1}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback')
                        timeArray = time.strptime(newsTime, "%Y-%m-%d %H:%M:%S")
                        timeStamp = int(time.mktime(timeArray))
                        print(id, timeStamp)
                        new_url = new_url.format(id, timeStamp)
                        print(new_url)
                        if isEnd:
                            print("done")
                        else:
                            yield scrapy.Request(url=new_url, callback=self.parse)

    def parse_comment(self, response):
        id = response.meta["id"]
        newsTime = response.meta["newsTime"]
        skey = response.meta["skey"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        source = response.meta["source"]
        title = response.meta["title"]
        p = response.meta["p"]
        data = re.findall("{(.*)}",
                          response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}", response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            if len(comments) > 0:
                for comment in comments:
                    comment_id = comment["comment_id"]
                    uname = comment["uname"]
                    user_id = comment["user_id"]
                    comment_contents = comment["comment_contents"]
                    comment_date = comment["comment_date"]
                    uptimes = comment["uptimes"]
                    parents = comment["parent"]
                    reply_comment_ids = []
                    if len(parents) > 0:
                        for parent in parents:
                            reply_comment_ids.append(parent["comment_id"])
                    print("comment===", id, title, comment_id, uname, user_id,
                          comment_contents, comment_date, uptimes, reply_comment_ids)
                    fhjsxwpl = FHJSXWPL()
                    fhjsxwpl["id"] = id
                    fhjsxwpl["title"] = title
                    fhjsxwpl["comment_id"] = comment_id
                    fhjsxwpl["comment_contents"] = comment_contents
                    fhjsxwpl["comment_date"] = comment_date
                    fhjsxwpl["user_name"] = uname
                    fhjsxwpl["user_id"] = user_id
                    fhjsxwpl["uptimes"] = uptimes
                    fhjsxwpl["reply_comment_ids"] = ",".join(reply_comment_ids)
                    pdate = datetime.datetime.now().strftime('%Y-%m-%d')
                    fhjsxwpl["pdate"] = pdate
                    fhjsxwpl["data_source"] = "凤凰网"
                    fhjsxwpl["data_module"] = "军情热点"
                    yield fhjsxwpl
                # Keep the incremented page number in meta before requesting the next page.
                p = p + 1
                response.meta["p"] = p
                comment_url = ("https://comment.ifeng.com/get.php?orderby=create_time&"
                               "docUrl={0}&format=js&job=1&p={1}&pageSize=20").format(commentUrl, p)
                yield scrapy.Request(comment_url, meta=response.meta,
                                     callback=self.parse_comment)
            else:
                print("done")
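parse and parse_comment repeat the same step: strip the JSONP callback wrapper with a "{(.*)}" regex and feed the result to json.loads. A small helper capturing that pattern could be shared across the ifeng and Sina spiders; this is a suggested refactoring sketch, not code that exists in the project:

import json
import re

def jsonp_to_dict(text):
    """Extract the outermost {...} object from a JSONP response body.

    Returns the parsed dict, or None when no JSON object is found
    (mirrors the `if len(data) > 0` guards used by the spiders).
    """
    match = re.search(r"{.*}", text)
    if match is None:
        return None
    return json.loads(match.group(0))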
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")
    name = 'fhjwpl2'
    allowed_domains = [
        'shankapi.ifeng.com', "comment.ifeng.com", "mil.ifeng.com"
    ]
    start_urls = ['https://mil.ifeng.com/']

    def parse(self, response):
        aitems = set(response.xpath("//div[@class='news-34dpVmYc']//a"))
        for aitem in aitems:
            item = {}
            url = "https:" + aitem.xpath("@href").extract()[0]
            title = aitem.xpath("@title").extract()[0]
            print(url, title)
            id = url.split("/")[-1]
            commentUrl = "ucms_" + id
            item["id"] = id
            item["title"] = title
            item["commentUrl"] = commentUrl
            item["url"] = url
            item["p"] = 1
            comment_url = ("https://comment.ifeng.com/get.php?orderby=create_time&"
                           "docUrl={0}&format=js&job=1&p=1&pageSize=20").format(commentUrl)
            yield response.follow(comment_url, meta=item, callback=self.parse_comment)

    def parse_comment(self, response):
        id = response.meta["id"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        title = response.meta["title"]
        p = response.meta["p"]
        data = re.findall("{(.*)}",
                          response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}", response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            if len(comments) > 0:
                for comment in comments:
                    comment_id = comment["comment_id"]
                    uname = comment["uname"]
                    user_id = comment["user_id"]
                    comment_contents = comment["comment_contents"]
                    comment_date = comment["comment_date"]
                    uptimes = comment["uptimes"]
                    parents = comment["parent"]
                    reply_comment_ids = []
                    if len(parents) > 0:
                        for parent in parents:
                            reply_comment_ids.append(parent["comment_id"])
                    print("comment===", id, title, comment_id, uname, user_id,
                          comment_contents, comment_date, uptimes, reply_comment_ids)
                    fhjsxwpl = FHJSXWPL()
                    fhjsxwpl["id"] = id
                    fhjsxwpl["title"] = title
                    fhjsxwpl["comment_id"] = comment_id
                    fhjsxwpl["comment_contents"] = comment_contents
                    fhjsxwpl["comment_date"] = comment_date
                    fhjsxwpl["user_name"] = uname
                    fhjsxwpl["user_id"] = user_id
                    fhjsxwpl["uptimes"] = uptimes
                    fhjsxwpl["reply_comment_ids"] = ",".join(reply_comment_ids)
                    pdate = datetime.datetime.now().strftime('%Y-%m-%d')
                    fhjsxwpl["pdate"] = pdate
                    fhjsxwpl["data_source"] = "凤凰网"
                    fhjsxwpl["data_module"] = "军事首页"
                    yield fhjsxwpl
                # Keep the incremented page number in meta before requesting the next page.
                p = p + 1
                response.meta["p"] = p
                comment_url = ("https://comment.ifeng.com/get.php?orderby=create_time&"
                               "docUrl={0}&format=js&job=1&p={1}&pageSize=20").format(commentUrl, p)
                yield scrapy.Request(comment_url, meta=response.meta,
                                     callback=self.parse_comment)
            else:
                print("done")
class fhjwSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("fhjwSpider")
    name = 'fhjw'
    allowed_domains = [
        'shankapi.ifeng.com', "comment.ifeng.com", "mil.ifeng.com",
        "tech.ifeng.com", "ishare.ifeng.com", "news.ifeng.com",
        "survey.news.ifeng.com"
    ]

    def start_requests(self):
        url = ('https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default'
               '/000/{0}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback')
        t = time.time()
        nowTime = int(round(t * 1000))
        url = url.format(nowTime)
        yield Request(url)

    def parse(self, response):
        print(response.text)
        data = re.findall("{(.*)}", response.text)
        if len(data) > 0:
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            code = jsondata.get("code")
            data = jsondata.get("data")
            print(code)
            if code == 0:
                isEnd = data.get("isEnd")
                newsstream = data.get("newsstream")
                for index, item in enumerate(newsstream):
                    print("data:", item)
                    id = item["id"]
                    newsTime = item["newsTime"]
                    skey = item["skey"]
                    url = item["url"]
                    commentUrl = item["commentUrl"]
                    source = item["source"]
                    title = item["title"]
                    item["p"] = 1
                    comment_url = ("https://comment.ifeng.com/get.php?orderby=create_time&"
                                   "docUrl={0}&format=js&job=1&p=1&pageSize=1").format(commentUrl)
                    yield response.follow(comment_url, meta=item, callback=self.parse_comment)
                    if index == len(newsstream) - 1:
                        # The last entry's id and timestamp seed the next page of the column feed.
                        new_url = ('https://shankapi.ifeng.com/shanklist/_/getColumnInfo/_/default'
                                   '/{0}/{1}/20/14-35083-/getColumnInfoCallback?callback=getColumnInfoCallback')
                        timeArray = time.strptime(newsTime, "%Y-%m-%d %H:%M:%S")
                        timeStamp = int(time.mktime(timeArray))
                        print(id, timeStamp)
                        new_url = new_url.format(id, timeStamp)
                        print(new_url)
                        if isEnd:
                            print("done")
                        else:
                            yield scrapy.Request(url=new_url, callback=self.parse)

    def parse_comment(self, response):
        id = response.meta["id"]
        newsTime = response.meta["newsTime"]
        skey = response.meta["skey"]
        url = response.meta["url"]
        commentUrl = response.meta["commentUrl"]
        source = response.meta["source"]
        title = response.meta["title"]
        p = response.meta["p"]
        data = re.findall("{(.*)}",
                          response.text.encode('utf-8').decode('unicode_escape'))
        if len(data) > 0:
            jsondata_result = "{" + re.findall("{(.*)}", response.text)[0] + "}"
            jsondata = json.loads(jsondata_result)
            count = jsondata["count"]
            join_count = jsondata["join_count"]
            comments = jsondata["comments"]
            print("count===", id, title, newsTime, count, join_count)
            metaitem = {}
            metaitem["id"] = id
            metaitem["title"] = title
            metaitem["newsTime"] = newsTime
            metaitem["count"] = count
            metaitem["join_count"] = join_count
            metaitem["commentUrl"] = commentUrl
            print("url====", url)
            yield response.follow(url, meta=metaitem, callback=self.parse_context)

    def parse_context(self, response):
        id = response.meta["id"]
        title = response.meta["title"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        publish_time = response.xpath(
            "//p[@class='time-1Mgp9W-1']/span[1]/text()").extract()[0].strip()
        context = response.xpath(
            "//div[@class='text-3w2e3DBc']//p/text()").extract()
        accumulator_url = ("https://survey.news.ifeng.com/api/getaccumulatorweight?format=js&"
                           "key={0}ding&serviceid=2&callback=getaccumulator").format(commentUrl)
        accumulator_meta = {}
        accumulator_meta["id"] = id
        accumulator_meta["title"] = title
        accumulator_meta["publish_time"] = publish_time
        accumulator_meta["context"] = context
        accumulator_meta["count"] = count
        accumulator_meta["join_count"] = join_count
        accumulator_meta["commentUrl"] = commentUrl
        print("fetching recommend count:", accumulator_url)
        yield scrapy.Request(accumulator_url, meta=accumulator_meta,
                             callback=self.parse_context2)

    def parse_context2(self, response):
        fhjsxw = FHJSXW()
        id = response.meta["id"]
        title = response.meta["title"]
        context = response.meta["context"]
        publish_time = response.meta["publish_time"]
        count = response.meta["count"]
        join_count = response.meta["join_count"]
        commentUrl = response.meta["commentUrl"]
        data = re.findall(r"\"browse\":{(.*)}}}", response.text)
        print(data)
        if len(data) > 0:
            jsondata_result = "{" + data[0] + "}"
            jsondata = json.loads(jsondata_result)
            print(jsondata)
            accumulator_count = jsondata[commentUrl + "ding"]
            print("result:", id, title, context, publish_time, count,
                  join_count, accumulator_count)
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = "|".join(context)
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = accumulator_count
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军情热点"
            yield fhjsxw
        else:
            fhjsxw["id"] = id
            fhjsxw["title"] = title
            fhjsxw["context"] = context
            fhjsxw["publish_time"] = publish_time
            fhjsxw["comment_count"] = count
            fhjsxw["join_count"] = join_count
            fhjsxw["accumulator_count"] = 0
            pdate = datetime.datetime.now().strftime('%Y-%m-%d')
            fhjsxw["pdate"] = pdate
            fhjsxw["data_source"] = "凤凰网"
            fhjsxw["data_module"] = "军情热点"
            yield fhjsxw
class CmanoSpider(scrapy.Spider):
    logger_util = LoggerUtil()
    logger = logger_util.getSelfLogger("CmanoSpider")
    name = 'cmano'
    allowed_domains = ['www.cmano-db.com']
    start_urls = ['http://www.cmano-db.com/']

    def __init__(self, url=None, *args, **kwargs):
        # Forward extra args to the base class when overriding __init__.
        super().__init__(*args, **kwargs)
        print("start url===", url)

    def parse(self, response):
        # Collect the category links from the navigation bar.
        items = set(response.xpath(
            '//ul[contains(@class, "nav navbar-nav")]/li/ul/li/a/@href'))
        self.logger.warning("test %s", items)
        listCountrys = ['Ship', 'Submarine', 'Facility', 'Aircraft']
        noListCountrys = ["Weapon", "Sensor"]
        for item in items:
            tempItem = item.extract()
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(tempItem)
            self.logger.warning(tempItem)
            if tempItem[0:-1].lower() in [c.lower() for c in listCountrys]:
                self.logger.warning("test %s", tempItem)
                yield response.follow(new_url, callback=self.parse_country)
            elif tempItem[0:-1].lower() in [c.lower() for c in noListCountrys]:
                self.logger.warning("test %s", tempItem)
                yield response.follow(new_url, callback=self.parse_no_country_list)

    def parse_country(self, response):
        items = set(response.xpath('//div[contains(@class, "country")]/h4/a/@href'))
        self.logger.warning("test %s", items)
        for index, item in enumerate(items):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(item.extract())
            self.logger.warning("country test %s", new_url)
            yield response.follow(new_url, callback=self.parse_country_list)

    def parse_country_list(self, response):
        self.logger.warning("country-list test %s", response.url)
        items = set(response.xpath(
            '//table[contains(@class, "table table-striped table-hover")]//@href'))
        gjitem = GJItem()
        gjitem["name"] = response.url.split("/")[-2]
        print("-----------response.url", response.url)
        print("-----------gjname", gjitem)
        self.logger.warning("test %s", items)
        for index, item in enumerate(items):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(item.extract())
            self.logger.warning("test %s", new_url)
            yield response.follow(new_url, meta={'item': gjitem},
                                  callback=self.parse_country_detail)

    def parse_no_country_list(self, response):
        self.logger.warning("country-list test %s", response.url)
        items = set(response.xpath('//div[contains(@class, "country")]//@href'))
        self.logger.warning("test %s", items)
        for index, item in enumerate(items):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(item.extract())
            self.logger.warning("test %s", new_url)
            yield response.follow(new_url, meta={'item': None},
                                  callback=self.parse_country_detail)

    def parse_country_detail(self, response):
        label = response.url.split("/")[-3].strip()
        print("--------------label", label)
        name = response.xpath("//h3[@id='typography']/text()").extract()[0].strip()
        print("--------------name", name)
        img = CmanoItem()
        self.logger.warning("country-detail test %s", response.url)
        imageitems = response.xpath("//div[contains(@class,'col-lg-7')]/a/img/@src")
        image_urls = []
        for index, item in enumerate(imageitems):
            new_url = 'http://www.cmano-db.com/' + urllib.parse.unquote(item.extract())
            image_urls.append(new_url)
        img["image_urls"] = image_urls
        yield img

        img1 = AttrItem()
        img1["name"] = name
        img1["attr"] = "label"
        img1["value"] = label
        print("--------------label", img1)
        yield img1

        item = response.meta['item']
        print("--------------item", item)
        if item is not None:
            imggj = RelaItem()
            imggj["name_partA"] = name
            imggj["name_partB"] = item["name"]
            imggj["rela"] = "服役国家"
            print("--------------服役国家", imggj)
            yield imggj

        if len(image_urls) > 0:
            img0 = AttrItem()
            imagesname = image_urls[0].split("/")[-1].strip()
            filepath = "images/" + imagesname.split("_")[0] + "/" + imagesname
            print("--------------filepath", filepath)
            img0["name"] = name
            img0["attr"] = "image_path"
            img0["value"] = filepath
            yield img0

        # Key/value attributes from the first spec table ("Attr: value" rows).
        items = response.xpath("//div[contains(@class,'col-lg-7')]/table[1]//td/text()")
        for index, item in enumerate(items):
            tempdata = item.extract().split(":")
            print("--------------tempdata", tempdata)
            if len(tempdata[0].strip()) > 0 and len(tempdata) == 2:
                img2 = AttrItem()
                img2["name"] = name
                img2["attr"] = tempdata[0].strip()
                img2["value"] = tempdata[1].strip()
                print("--------------attr", img2)
                yield img2

        weaponsList = ["Weapons:", "Weapons / Loadouts:"]
        sensorsList = ["Sensors / EW:", "Sensors:"]

        # Relations from the second table (weapons or sensors, depending on the <u> heading).
        itemsrela1 = response.xpath("//div[contains(@class,'col-lg-7')]/table[2]//a/text()")
        print("------------itemsrela1", itemsrela1)
        relatype = response.xpath(
            "//div[contains(@class,'col-lg-7')]/table[2]//u/text()").extract()
        print("------------relatype", relatype)
        for index, item in enumerate(itemsrela1):
            tempdata = item.extract()
            print("--------------tempdata", tempdata)
            if len(tempdata.strip()) > 0 and relatype:
                img3 = RelaItem()
                img3["name_partA"] = name
                img3["name_partB"] = tempdata.strip()
                if relatype[0] in sensorsList:
                    img3["rela"] = "传感器配置"
                elif relatype[0] in weaponsList:
                    img3["rela"] = "武器负载"
                print("--------------rela", img3)
                yield img3

        # Same extraction for the third table.
        itemsrela12 = response.xpath("//div[contains(@class,'col-lg-7')]/table[3]//a/text()")
        print("------------itemsrela12", itemsrela12)
        relatype2 = response.xpath(
            "//div[contains(@class,'col-lg-7')]/table[3]//u/text()").extract()
        print("------------relatype2", relatype2)
        for index, item in enumerate(itemsrela12):
            tempdata = item.extract()
            print("--------------tempdata", tempdata)
            if len(tempdata.strip()) > 0 and relatype2:
                img4 = RelaItem()
                img4["name_partA"] = name
                img4["name_partB"] = tempdata.strip()
                if relatype2[0] in sensorsList:
                    img4["rela"] = "传感器配置"
                elif relatype2[0] in weaponsList:
                    img4["rela"] = "武器负载"
                print("--------------rela", img4)
                yield img4