def get_inner_url_list(self, url):
    """
    :param url: channel front-page URL to parse
    :return: url_list, the article URLs found in the channel's main column
    """
    writelog("huxiu: parsing source URL: " + url)
    selector = self.parser(url)
    url_tmp_list = list(
        set(
            selector.xpath(
                '//*[@id="index"]//div[@class="mod-info-flow"]//div[@class="mob-ctt channel-list-yh"]//a[@class="transition msubstr-row2"]/@href'
            )))
    # The hrefs are relative, e.g. {'/article/220006.html', '/article/219989.html',
    # '/article/220008.html'}; prepend base_url to build the full URLs.
    url_list = []
    for url_tmp in url_tmp_list:
        full_url_tmp = self.base_url + url_tmp
        url_list.append(full_url_tmp)
    writelog("huxiu: inner_url_list returned:\n" + json.dumps(url_list) + "\n")
    return url_list
def get_inner_url_list(self, url):
    """
    :param url: channel front-page URL to parse
    :return: url_list, the article URLs found in the channel's main column
    """
    writelog("tmtpost: parsing source URL: " + url)
    selector = self.parser(url)
    url_tmp_list = list(
        set(
            selector.xpath(
                '/html/body//section/div[1]/div//div[3]/a/@href')))
    # The hrefs are relative, e.g. {'/article/220006.html', ...}; prepend
    # base_url to build the full URLs.
    url_list = []
    for url_tmp in url_tmp_list:
        full_url_tmp = self.base_url + url_tmp
        url_list.append(full_url_tmp)
    writelog("tmtpost: inner_url_list returned:\n" + json.dumps(url_list) + "\n")
    return url_list
def is_url_processed(url):
    # The article id is the MD5 hash of its URL.
    md5 = hashlib.md5()
    md5.update(url.encode(encoding='utf-8'))
    article_id = md5.hexdigest()
    # Open the database connection configured in conf.ini.
    cfg = configparser.ConfigParser()
    cfg.read("conf.ini")
    db_host = cfg.get("database", "host")
    db_port = cfg.getint("database", "port")
    db_name = cfg.get("database", "dbname")
    db_user = cfg.get("database", "user")
    db_pass = cfg.get("database", "pass")
    db = pymysql.connect(host=db_host,
                         user=db_user,
                         password=db_pass,
                         db=db_name,
                         port=db_port,
                         use_unicode=True,
                         charset="utf8")
    cur = db.cursor()
    sql_select_from_article = "select 1 as cnt from 91_article where id=%s"
    values = (article_id,)  # must be a tuple; (article_id) is just a string
    result_data = cur.execute(sql_select_from_article, values)
    cur.close()
    db.close()
    if result_data > 0:
        writelog("URL already in the database: {}\n".format(url))
        return True
    return False
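# A minimal usage sketch, not part of the original source: `crawl_unprocessed`
# is a hypothetical helper showing how is_url_processed() is meant to gate a
# crawl loop; `spider` can be any of the spider instances defined in this
# project whose get_news() takes a single URL.
def crawl_unprocessed(spider, inner_url_list):
    news_list = []
    for inner_url in inner_url_list:
        # Skip articles whose MD5(url) id already exists in 91_article.
        if is_url_processed(inner_url):
            continue
        news = spider.get_news(inner_url)
        if news:
            news_list.append(news)
    return news_list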
def get_inner_url_list_new(self, url):
    writelog("====>>>> huxiu: parsing source URL: {}\n".format(url))
    inner_url_list = []
    selector = self.parser(url)
    for sel in selector.xpath(
            "//*[@id='index']//div[@class='mod-info-flow']/div[@class='mod-b mod-art clearfix']"
    ):
        item = {}
        title_datas = sel.xpath(
            "div[@class='mob-ctt channel-list-yh']/h2/a/text()")
        item['title'] = title_datas[0]
        link_datas = sel.xpath(
            "div[@class='mob-ctt channel-list-yh']/h2/a/@href")
        item['link'] = self.base_url + link_datas[0]
        desc_datas = sel.xpath(
            "div[@class='mob-ctt channel-list-yh']/div[@class='mob-sub']/text()"
        )
        item['desc'] = desc_datas[0]
        img_datas = sel.xpath(
            "div[@class='mod-thumb pull-left ']/a[@class='transition']/img[@class='lazy']/@data-original"
        )
        # When the lead image is a video thumbnail the xpath gets involved;
        # skip entries with no parsable image for now.
        if not img_datas:
            continue
        item['img'] = img_datas[0]
        inner_url_list.append(item)
    # writelog("huxiu: inner_url_list: {}\n".format(json.dumps(inner_url_list)))
    return inner_url_list
def get_inner_url_list_new(self, url):
    writelog("====>>>> 36kr: parsing source URL: {}\n".format(url))
    inner_url_list = []
    get_payload = {
        'per_page': 20,
        'page': 1,
        '_': str(int(time.time() * 1000))  # cache-busting timestamp
    }
    headers = {
        'Host': '36kr.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    data = self._craw('get', url, headers, get_payload)
    data = json.loads(data)
    articles = data['data']['items']
    for article in articles:
        # One dict per article.
        item = {}
        item['title'] = article['title']
        item['link'] = 'https://36kr.com/p/{}.html'.format(article['id'])
        item['desc'] = article['summary']
        item['img'] = article['cover']
        inner_url_list.append(item)
    # writelog("36kr: inner_url_list returned:\n" + json.dumps(inner_url_list) + "\n")
    return inner_url_list
def _excepthook(exc_type, exc_value, exc_traceback):
    # Global fallback for uncaught exceptions: log, pause, then chain to the
    # default hook. (The parameters no longer shadow the type() builtin.)
    print("Caught unhandled exception, type: {}, value: {}\n".format(
        str(exc_type), str(exc_value)))
    print("Press any key to continue")
    os.system('pause')
    err_msg = '\n ======================== unhandled exception ======================== \n'
    err_msg += ''.join(
        traceback.format_exception(exc_type, exc_value, exc_traceback))
    writelog(err_msg)
    sys.__excepthook__(exc_type, exc_value, exc_traceback)
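# Hooking it up: assigning the handler to sys.excepthook is the standard-library
# mechanism for global exception handling; whether the original project installs
# it exactly this way (and where) is an assumption.
sys.excepthook = _excepthook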
def parser(self, url):
    """
    :param url: URL of the page to parse
    :return: selector, an lxml element tree (None if the request fails)
    """
    response = self.session.get(url=url, headers=self.headers, verify=True)
    if response.status_code == 200:
        selector = html.fromstring(response.text)
        return selector
    else:
        writelog("woshipm: network request failed, please check! url: " + url)
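# A minimal sketch of parser() in use, assuming the woshipm spider class from
# this project is importable here; parser() returns an lxml element tree (or
# None on failure), which supports arbitrary xpath() queries.
def demo_parser():
    spider = woshipm()
    selector = spider.parser('http://www.woshipm.com/category/pmd')
    if selector is None:
        return []
    # xpath() returns a plain list of matched text nodes here.
    return selector.xpath('//h2[@class="post-title"]/a/text()')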
def writeIntoMysql(news, web_src_id, web_src_name, web_platform_id, img, desc):
    # Open the database connection configured in conf.ini.
    cfg = configparser.ConfigParser()
    cfg.read("conf.ini")
    db_host = cfg.get("database", "host")
    db_port = cfg.getint("database", "port")
    db_name = cfg.get("database", "dbname")
    db_user = cfg.get("database", "user")
    db_pass = cfg.get("database", "pass")
    db = pymysql.connect(host=db_host,
                         user=db_user,
                         password=db_pass,
                         db=db_name,
                         port=db_port,
                         use_unicode=True,
                         charset="utf8")
    cur = db.cursor()
    try:
        # The article id is the MD5 hash of its URL (see is_url_processed).
        md5 = hashlib.md5()
        md5.update(news['url'].encode(encoding='utf-8'))
        article_id = md5.hexdigest()
        # Insert the 91_article record.
        sql_insert1 = "insert into 91_article(id,create_by,create_date,title,keywords,image,websrc_id,websrc_name,web_platform_id,description) values(%s,%s,now(),%s,%s,%s,%s,%s,%s,%s)"
        values1 = (article_id, "webcrawler", news['title'], news['labels'],
                   img, web_src_id, web_src_name, web_platform_id, desc)
        cur.execute(sql_insert1, values1)
        # Insert the 91_article_data record.
        sql_insert2 = "insert into 91_article_data(id,content,copyfrom) values(%s,%s,%s)"
        values2 = (article_id, news['text'], news['author'])
        cur.execute(sql_insert2, values2)
        # Commit both inserts together.
        db.commit()
    except Exception as e:
        # Roll back both inserts on any error.
        db.rollback()
        writelog("Database write failed! Reason: {}\n".format(e))
        return False
    finally:
        cur.close()
        db.close()
    writelog("Database write succeeded!\n")
    return True
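# End-to-end sketch, not from the original source: fetch a channel, dedupe,
# parse, and persist. The call shape mirrors the 36kr spider's
# get_inner_url_list_new() / get_news(url, title, summary) pair; the id/name
# arguments are placeholders for values defined elsewhere in the project.
def crawl_and_store(spider, channel_url, web_src_id, web_src_name,
                    web_platform_id):
    for item in spider.get_inner_url_list_new(channel_url):
        if is_url_processed(item['link']):
            continue  # already persisted, skip
        news = spider.get_news(item['link'], item['title'], item['desc'])
        if news:
            writeIntoMysql(news, web_src_id, web_src_name, web_platform_id,
                           item['img'], item['desc'])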
def get_news(self, url, title, summary):
    writelog("36kr: about to process url: " + url)
    driver = self.create_phantomJS()
    try:
        driver.get(url)
        # Wait for the page to finish loading.
        driver.implicitly_wait(30)
        page_src_code = driver.page_source
        news = {}
        news['url'] = url
        news['author'] = u"36Kr"
        news['title'] = driver.title
        news['content'] = summary
        news['labels'] = "36kr默认标签"
        # Collect the article body from every <section class="textblock">.
        full_content = u""
        pattern = re.compile(
            r'<section class="textblock">([\s\S]*?)<\/section>', re.S)
        items_withtag = re.findall(pattern, page_src_code)
        for item in items_withtag:
            full_content += item
        news['text'] = full_content
        # Alternative: read the rendered text via Selenium instead of regex.
        # textblock_element = driver.find_element_by_xpath(
        #     '//section[@class="textblock"]')
        # news['text'] = textblock_element.text
        writelog("36kr: finished normally! url: " + url)
        return news
    except Exception as e:
        writelog("36kr: parsing failed! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** traceback:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        return None
    finally:
        # Always release the PhantomJS instance, even on failure.
        driver.quit()
def get_inner_url_list(self, url):
    """
    :param url: channel front-page URL to parse
    :return: url_list, the article URLs found in the channel's main column
    """
    writelog("woshipm: parsing source URL: " + url)
    selector = self.parser(url)
    url_tmp_list = list(
        set(selector.xpath('//h2[@class="post-title"]/a/@href')))
    # woshipm hrefs are already absolute, so no base_url prefix is needed.
    url_list = []
    for url_tmp in url_tmp_list:
        url_list.append(url_tmp)
    writelog("woshipm: inner_url_list returned:\n" + json.dumps(url_list) + "\n")
    return url_list
def get_inner_url_list(self, url):
    """
    :param url: channel front-page URL to parse
    :return: url_list, the article URLs found in the channel's main column
    """
    writelog("chanpin100: parsing source URL: " + url)
    selector = self.parser(url)
    url_tmp_list = list(
        set(selector.xpath('//h4[@class="media-heading"]/a/@href')))
    # The hrefs are relative, e.g. {'/article/220006.html', ...}; prepend
    # base_url to build the full URLs.
    url_list = []
    for url_tmp in url_tmp_list:
        full_url_tmp = self.base_url + url_tmp
        url_list.append(full_url_tmp)
    writelog("chanpin100: inner_url_list returned:\n" + json.dumps(url_list) + "\n")
    return url_list
def get_inner_url_list(self, url):
    """
    :param url: channel front-page URL to parse
    :return: url_list, the article URLs found in the channel's main column
    """
    writelog("leiphone: parsing source URL: " + url)
    selector = self.parser(url)
    # Alternative selector for the picture carousel:
    # '//div[@class="lph-picShow idx-picShow clr"]//a/@href'
    url_tmp_list = list(set(
        selector.xpath('//div[@class="img"]//a/@href')))
    url_list = []
    for url_tmp in url_tmp_list:
        # Skip javascript: pseudo-links.
        if "javascript" in url_tmp:
            continue
        # leiphone hrefs are already absolute, so no base_url prefix is needed.
        url_list.append(url_tmp)
    writelog("leiphone: inner_url_list returned:\n" + json.dumps(url_list) + "\n")
    return url_list
def get_news(self, url):
    """
    :param url: article URL to fetch
    flag: set on failure, used to decide the return value
    news: dict holding the parsed fields
    :return: news on success, None on failure
    """
    writelog("tmtpost: about to parse url: " + url)
    news = {}
    flag = None
    try:
        news['url'] = url
        # Rewrite http://www.tmtpost.com/... to the mobile
        # http://m.tmtpost.com/... form.
        news['link'] = url[0:7] + "m" + url[10:]
        selector = self.parser(url)
        news['author'] = u'钛媒体'
        title = selector.xpath('/html/head/title/text()')[0]
        # Keep the title up to the first '-' or '|' separator.
        tmp = ""
        for i in title:
            if i == '-' or i == '|':
                break
            tmp += i
        # Alternative: selector.xpath('/html/body/div[5]//div[1]/div[1]/a/span/text()')[0]
        news['title'] = tmp
        content = selector.xpath('//div[@class="inner"]//p')
        article = ""
        temp = []
        for i in content:
            img_url = i.xpath('img/@src')
            temp.append(i.text)
            temp.append(img_url)
        # Drop the trailing entry.
        temp.pop()
        # Rebuild the article as markdown: text paragraphs plus ![]() images.
        for i in temp:
            if i:
                if type(i) == list:
                    article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                else:
                    article = article + i + "\n\n"
        # Build a summary from the first ~400 characters of plain text.
        summary = ""
        for i in temp:
            if len(summary) > 400:
                break
            if i and type(i) != list:
                summary = summary + i + "\n"
        news['content'] = summary
        full_article = self.getContent(url)
        news['text'] = full_article
        # news['text'] = article
        cover_list = selector.xpath('//img[@class="aligncenter"]/@src')
        if cover_list:
            news['cover'] = cover_list[0]
        else:
            writelog("tmtpost: cannot parse cover! url=" + url)
            news['cover'] = "钛媒体默认封面"
        labels_list = selector.xpath(
            '/html/body//section//span[1]/a/text()')
        if labels_list:
            news['labels'] = labels_list[0]
        else:
            writelog("tmtpost: cannot parse labels! url=" + url)
            news['labels'] = "钛媒体默认标签"
        news['service'] = 'Article.AddArticle'
    except Exception as e:
        writelog("tmtpost: parsing failed, please check! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** print_exception:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        flag = 1
    if flag is None:
        writelog("tmtpost: finished normally! url=" + url)
        return news
    return None
if __name__ == '__main__':
    spider = zaodu()
    url = 'https://www.zaodula.com'
    inner_url_list = spider.get_inner_url_list(url)
    writelog("zaodula inner_url_list: " + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        news = spider.get_news(inner_url)
        if news:
            writelog("zaodula url_title: " + news['title'])
def get_news(self, url):
    """
    :param url: article URL to fetch
    flag: set on failure, used to decide the return value
    news: dict holding the parsed fields
    :return: news on success, None on failure
    """
    writelog("woshipm: about to process url: " + url)
    news = {}
    flag = None
    try:
        news['url'] = url
        selector = self.parser(url)
        title = selector.xpath('/html/head/title/text()')[0]
        news['author'] = u'人人都是产品经理'
        # Keep the title up to the first '|' separator.
        tmp = ""
        for i in title:
            if i == '|':
                break
            tmp += i
        news['title'] = tmp
        content = selector.xpath('//div[@class="grap"]//p')
        article = ""
        temp = []
        cover_list = []
        for i in content:
            img_url = i.xpath('img/@src')
            temp.append(i.text)
            temp.append(img_url)
            cover_list.append(img_url)
        # Drop the boilerplate trailing paragraphs.
        for i in range(6):
            temp.pop()
        # Rebuild the article as markdown: text paragraphs plus ![]() images.
        for i in temp:
            if i:
                if type(i) == list:
                    article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                else:
                    article = article + i + "\n\n"
        # Build a summary from the first ~400 characters of plain text.
        summary = ""
        for i in temp:
            if len(summary) > 400:
                break
            if i and type(i) != list:
                summary = summary + i + "\n"
        news['content'] = summary
        full_article = self.getContent(url)
        news['text'] = full_article
        # news['text'] = article
        # Use the first image found as the cover.
        for i in cover_list:
            if i:
                news['cover'] = i[0]
                break
        news['labels'] = u"产品项目"
    except Exception as e:
        writelog("woshipm: parsing failed, please check! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** print_exception:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        flag = 1
    if flag is None:
        writelog("woshipm: finished normally! url=" + url)
        return news
    return None
if __name__ == '__main__':
    spider = woshipm()
    url = 'http://www.woshipm.com/category/pmd'
    inner_url_list = spider.get_inner_url_list(url)
    writelog("woshipm inner_url_list: " + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        writelog("woshipm news: " + json.dumps(spider.get_news(inner_url)))
def get_news(self, url):
    """
    :param url: article URL to fetch
    flag: set on failure, used to decide the return value
    news: dict holding the parsed fields
    :return: news on success, None on failure
    """
    writelog("zaodula: about to process url: " + url)
    news = {}
    flag = None
    try:
        news['url'] = url
        selector = self.parser(url)
        title = selector.xpath('/html/head/title/text()')[0]
        news['author'] = u'早读课'
        # Keep the title up to the first '-', '丨' or '—' separator.
        tmp = ""
        for i in title:
            if i == '-' or i == '丨' or i == '—':
                break
            tmp += i
        news['title'] = tmp
        content = selector.xpath('//div[@class="single-content"]//p')
        article = ""
        temp = []
        cover_list = []
        for a_tag in content:
            href_url = a_tag.xpath('a/@href')
            temp.append(a_tag.text)
            temp.append(href_url)
            cover_list.append(href_url)
        # Drop the boilerplate trailing paragraphs.
        for i in range(6):
            temp.pop()
        # Rebuild the article as markdown: text paragraphs plus ![]() links.
        for i in temp:
            if i:
                if type(i) == list:
                    article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                else:
                    article = article + i + "\n\n"
        # Build a summary from the first ~400 characters of plain text.
        summary = ""
        for i in temp:
            if len(summary) > 400:
                break
            if i and type(i) != list:
                summary = summary + i + "\n"
        news['content'] = summary
        full_article = self.getContent(url)
        news['text'] = full_article
        # news['text'] = article
        # Use the first captured link as the cover.
        for i in cover_list:
            if i:
                news['cover'] = i[0]
                break
        labels_list = selector.xpath('//div[@class="single-cat"]/a/text()')
        if labels_list:
            news['labels'] = labels_list[0]
        else:
            writelog("zaodula: cannot parse labels! url=" + url)
            news['labels'] = "早读课默认标签"
    except Exception as e:
        writelog("zaodula: parsing failed, please check! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** traceback:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        flag = 1
    if flag is None:
        writelog("zaodula: finished normally! url=" + url)
        return news
    return None
        else:
            writelog("chanpin100: cannot parse labels! url=" + url)
            news['labels'] = "产品100默认标签"
    except Exception as e:
        writelog("chanpin100: parsing failed, please check! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** print_exception:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        flag = 1
    if flag is None:
        writelog("chanpin100: finished normally! url=" + url)
        return news
    return None

if __name__ == '__main__':
    spider = chanpin()
    url = 'http://www.chanpin100.com/pm'
    inner_url_list = spider.get_inner_url_list(url)
    for inner_url in inner_url_list:
        writelog("chanpin100 news: " + json.dumps(spider.get_news(inner_url)))
def get_news(self, url):
    """
    :param url: article URL to fetch
    flag: set on failure, used to decide the return value
    news: dict holding the parsed fields
    :return: news on success, None on failure
    """
    writelog("huxiu: about to process url: " + url)
    news = {}
    flag = None
    try:
        news['url'] = url
        # Rewrite https://www.huxiu.com/... to the mobile
        # https://m.huxiu.com/... form.
        news['link'] = url[0:8] + "m" + url[11:]
        selector = self.parser(url)
        news['title'] = selector.xpath(
            '/html/head/title/text()')[0].replace('-虎嗅网', '')
        news['author'] = u'虎嗅网'
        # Alternative: selector.xpath('//div[3]/div[1]/div[2]/a[1]/text()')[0].strip()
        content = selector.xpath('//div[@class="article-content-wrap"]//p')
        article = ""
        temp = []
        for i in content:
            img_url = i.xpath('img/@src')
            temp.append(i.text)
            temp.append(img_url)
        # Rebuild the article as markdown: text paragraphs plus ![]() images.
        for i in temp:
            if i:
                if type(i) == list:
                    article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                else:
                    article = article + i + "\n\n"
        # Build a summary from the first ~400 characters of plain text.
        summary = ""
        for i in temp:
            if len(summary) > 400:
                break
            if i and type(i) != list:
                summary = summary + i + "\n"
        news['content'] = summary.replace('\xa0', '')
        full_article = self.getContent(url)
        news['text'] = full_article
        # news['text'] = article
        # The lead image doubles as the cover. (The original stored this
        # under 'labels', but that value was always overwritten below.)
        cover_srcs = selector.xpath(
            '//div[@class="article-img-box"]/img/@src')
        if cover_srcs:
            news['cover'] = cover_srcs[0]
        else:
            writelog("huxiu: cannot parse cover! url=" + url)
        labels_list = selector.xpath(
            '//div[@class="column-link-box"]/a/text()')
        if labels_list:
            news['labels'] = ""
            for label in labels_list:
                news['labels'] += (" " + label)
        else:
            news['labels'] = "虎嗅网默认标签"
        news['service'] = 'Article.AddArticle'
    except Exception as e:
        writelog("huxiu: parsing failed! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** traceback:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        flag = 1
    if flag is None:
        writelog("huxiu: finished normally! url: " + url)
        return news
    return None
def get_news(self, url):
    """
    :param url: article URL to fetch
    flag: set on failure, used to decide the return value
    news: dict holding the parsed fields
    :return: news on success, None on failure
    """
    writelog("leiphone: about to process url: " + url)
    news = {}
    flag = None
    try:
        news['url'] = url
        # Rewrite https://www.leiphone.com/... to the mobile
        # https://m.leiphone.com/... form.
        news['link'] = url[0:8] + "m" + url[11:]
        selector = self.parser(url)
        news['author'] = u'雷锋网'
        news['title'] = selector.xpath('/html/head/title/text()')[0]
        # Alternative: selector.xpath('/html/body//section/div/article//a/text()')[0].strip()
        content = selector.xpath('//div[@class="lph-article-comView"]//p')
        article = ""
        temp = []
        first_img = True
        for i in content:
            img_url = i.xpath('img/@src')
            if img_url:
                writelog("leiphone: found img link: " + json.dumps(img_url))
            # Use the first image found as the cover.
            if first_img and img_url:
                news['cover'] = img_url[0]
                first_img = False
            temp.append(i.text)
            temp.append(img_url)
        # Drop the boilerplate trailing entries.
        temp.pop()
        temp.pop()
        # Rebuild the article as markdown: text paragraphs plus ![]() images.
        for i in temp:
            if i:
                if type(i) == list:
                    article = article + "\n" + "![](" + i[0] + ")" + "\n\n"
                else:
                    article = article + i + "\n\n"
        # Build a summary from the first ~400 characters of plain text.
        summary = ""
        for i in temp:
            if len(summary) > 400:
                break
            if i and type(i) != list:
                summary = summary + i + "\n"
        news['content'] = summary
        full_article = self.getContent(url)
        news['text'] = full_article
        news['labels'] = '雷锋网默认标签'
        # news['text'] = article
        news['service'] = 'Article.AddArticle'
    except Exception as e:
        writelog("leiphone: parsing failed, please check! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** print_exception:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        flag = 1
    if flag is None:
        writelog("leiphone: finished normally! url=" + url)
        return news
    return None
if __name__ == '__main__':
    spider = leiphone()
    url = 'https://www.leiphone.com/category/ai'
    inner_url_list = spider.get_inner_url_list(url)
    writelog("leiphone inner_url_list: " + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        writelog("leiphone news: " + json.dumps(spider.get_news(inner_url)))
                news['cover'] = i[0]
                break
        news['labels'] = u"产品中国"
    except Exception as e:
        writelog("pmtoo: parsing failed, please check! url=" + url)
        exc_type, exc_value, exc_traceback = sys.exc_info()
        writelog("*** print_exception:")
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=5, file=sys.stdout)
        writelog("-" * 100)
        flag = 1
    if flag is None:
        writelog("pmtoo: finished normally! url=" + url)
        return news
    return None

if __name__ == '__main__':
    spider = pmtoo()
    url = 'http://www.pmtoo.com/article/category/产品经理'
    inner_url_list = spider.get_inner_url_list(url)
    writelog("pmtoo inner_url_list: " + json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        writelog("pmtoo news: " + json.dumps(spider.get_news(inner_url)))
if __name__ == '__main__':
    spider = kr()
    url = 'https://36kr.com/api/search-column/23'
    inner_url_list = spider.get_inner_url_list_new(url)
    writelog(json.dumps(inner_url_list))
    for inner_url in inner_url_list:
        news = spider.get_news(inner_url['link'], inner_url['title'],
                               inner_url['desc'])
        writelog("Fetched 36kr article:\n" + json.dumps(news))