from common.getDataDef import get_data
from bs4 import BeautifulSoup

def get_lists():
    urls = []
    # Fetch the data (libid=04, 02, 03)
    url = 'http://english.gov.cn/policies/policywatch/'
    courtData = get_data(url)
    # Parse the data
    soup = BeautifulSoup(courtData, 'lxml')
    # print(soup)
    lists = soup.select(".list-container")[0].select("li")
    for item in lists:
        urls.append('http://english.gov.cn' + item.find('a').get('href'))
    print(len(urls))
    # print(urls)
    return urls
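# `get_data` is imported from common.getDataDef throughout but never shown.
# A minimal sketch of the contract the callers rely on -- fetch a URL and
# return the decoded HTML, or None on failure (do_insert below checks for
# None). This is an assumption about the helper, not its actual code.
import requests

def get_data(url):
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding  # these sites mix encodings
        return resp.text
    except requests.RequestException:
        return None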
from common.getDataDef import get_data
from bs4 import BeautifulSoup

def get_lists():
    urls = []
    # Fetch the data (libid=04, 02, 03)
    url = 'http://tv.cctv.com/lm/pflmj/videoset/index.shtml'
    # url = 'http://tv.cctv.com/lm/pingan365/videoset/index.shtml'
    courtData = get_data(url)
    # Parse the data
    soup = BeautifulSoup(courtData, 'lxml')
    # print(soup)
    lists = soup.select(".text")
    # lists = soup.find_all('li')
    for item in lists:
        href = item.find('a').get('href')
        urls.append(href)
    print(len(urls))
    return urls
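# The gov.cn variant above prepends the host by hand, while the CCTV pages
# happen to carry absolute links. A urljoin-based normalization would cover
# both cases uniformly (a sketch, not part of the original):
from urllib.parse import urljoin

def absolutize(base, href):
    # urljoin leaves absolute hrefs untouched and resolves relative ones
    return urljoin(base, href)

# absolutize('http://english.gov.cn/policies/policywatch/', '/news/x.htm')
#   -> 'http://english.gov.cn/news/x.htm'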
import pymysql
import uuid
from common.getDataDef import get_data
from bs4 import BeautifulSoup

def do_insert(href):
    print(href)
    wwzl_url = href
    url = 'http://english.gov.cn/policies/policywatch/'
    wwzl_source = url
    source_code = 3
    # Open a new connection
    db = pymysql.connect(host="192.168.5.210", user="******", password="******",
                         db="law", charset='utf8')
    cursor = db.cursor()
    zzzfw_data = get_data(href)
    if zzzfw_data != None:
        zzzfw_soup = BeautifulSoup(zzzfw_data, 'lxml')
        title = zzzfw_soup.find('h3')
        wwzl_title = title.get_text()
        zuozhe_riqi_info = zzzfw_soup.select(".adio")
        if len(zuozhe_riqi_info) == 2:
            wwzl_promulgator = zuozhe_riqi_info[0].get_text()
            wwzl_content = zzzfw_soup.find("content").get_text()
            print(len(wwzl_content))
            # Strip the two leading <span> labels, leaving only the date text
            tag = zuozhe_riqi_info[1]
            tag.span.extract()
            tag.span.extract()
            promulgation_date = tag.get_text().replace('\n', '')
            if len(wwzl_content) < 20000:
                sql = "insert into t_wwzl_info (`wwzl_id`,`wwzl_title`,`wwzl_promulgator`,`promulgation_date`,`wwzl_content`,`wwzl_url`,`wwzl_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql, (str(uuid.uuid1()), wwzl_title, wwzl_promulgator,
                                     promulgation_date, wwzl_content, wwzl_url,
                                     wwzl_source, source_code))
                db.commit()
                print("-----------------------------------------------")
            else:
                print('Field too long: ' + href)
        else:
            print('zuozhe_riqi_info error')
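# Each source pairs a get_lists() with a do_insert(); the files here do not
# show how the two are wired together, but presumably by a driver along these
# lines (a sketch; the error handling is an assumption):
if __name__ == '__main__':
    for href in get_lists():
        try:
            do_insert(href)
        except Exception as e:
            print(href, e)  # keep crawling past a bad page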
import pymysql
import uuid
import re
from common.getDataDef import get_data
from bs4 import BeautifulSoup

def do_insert(href):
    # Open a new connection
    db = pymysql.connect(host="192.168.5.210", user="******", password="******",
                         db="law", charset='utf8')
    cursor = db.cursor()
    cctv_data = get_data(href)
    print(href)
    # cctv_data = get_data("http://news.cntv.cn/2014/02/20/VIDE1392875460487310.shtml")
    cctv_soup = BeautifulSoup(cctv_data, 'lxml')
    title = cctv_soup.find('h3')
    video_url = href
    url = 'http://tv.cctv.com/lm/pflmj/videoset/index.shtml'
    video_source = url
    source_code = 23
    related_articles = cctv_soup.select('#content_body')
    if related_articles != []:
        related_articles = related_articles[0].get_text().replace('\n', '')
    else:
        # Fall back to the alternate article container
        related_articles = cctv_soup.select('.cnt_bd')
        if related_articles != []:
            related_articles = related_articles[0].get_text().replace('\n', '')
    wenben = cctv_soup.select(".text_box_02")
    if title != None and wenben != []:
        video_introduction = ''
        if len(wenben[0].find_all('p')) > 2:
            video_introduction = wenben[0].find_all('p')[2].get_text()
        title_info = title.get_text().split()
        if len(title_info) >= 3:
            video_name = title_info[2]
            if len(title_info) == 4:
                video_name = title_info[2] + title_info[3]
            recording_time = title_info[1]
            sql = "insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_introduction`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql, (str(uuid.uuid1()), video_name, related_articles, video_introduction, video_url, recording_time, video_source, source_code))
            db.commit()
            print("-----------------------------------------------")
        elif title.get_text().find("]") != -1 and title.get_text().find("(") != -1:
            # Titles shaped like "[series]name(date)": split on ']' or '('
            title_info = re.split('[](]', title.get_text())
            if len(title_info) == 3:
                video_name = title_info[1]
                recording_time = title_info[2]
                sql = "insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_introduction`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql, (str(uuid.uuid1()), video_name, related_articles, video_introduction, video_url, recording_time, video_source, source_code))
                db.commit()
                print("-----------------------------------------------")
    else:
        # No h3 title or text box: fall back to the h1 layout (no introduction)
        title = cctv_soup.find('h1')
        title_info = title.get_text().split()
        if len(title_info) >= 3:
            video_name = title_info[2]
            if len(title_info) == 4:
                video_name = title_info[2] + title_info[3]
            recording_time = title_info[1]
            sql = "insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql, (str(uuid.uuid1()), video_name, related_articles, video_url, recording_time, video_source, source_code))
            db.commit()
            print("-----------------------------------------------")
        elif title.get_text().find("]") != -1 and title.get_text().find("(") != -1:
            title_info = re.split('[](]', title.get_text())
            if len(title_info) == 3:
                video_name = title_info[1]
                recording_time = title_info[2]
                sql = "insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql, (str(uuid.uuid1()), video_name, related_articles, video_url, recording_time, video_source, source_code))
                db.commit()
                print("-----------------------------------------------")
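# A quick illustration of the bracketed-title branch above, on a made-up
# title in the "[series]name(date)" shape that branch tests for (the sample
# string is hypothetical, not taken from the site):
import re

sample = '[平安365]见义勇为(20140220)'
parts = re.split('[](]', sample)  # split on ']' or '('
# parts == ['[平安365', '见义勇为', '20140220)']
# parts[1] -> video_name, parts[2] -> recording_time (the trailing ')' is
# kept, matching the original's behavior)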
# -*- coding: utf-8 -*-
import pymysql
import uuid
import re
from common.getDataDef import get_data
from bs4 import BeautifulSoup

# Open a new connection
db = pymysql.connect(host="192.168.5.210", user="******", password="******",
                     db="law", charset='utf8')
cursor = db.cursor()
# Fetch the data
url = 'http://baike.baidu.com/cms/s/court/court-data.json?t=201891616'
courtNameData = get_data(url)
courtnames = re.findall(r"courtName\":\"(.+?)\"", courtNameData)
count = 0
for courtname in courtnames:
    if count > 2571:  # resume point: skip records already processed
        print(courtname)
    print(courtname)
    count += 1
    print(count)
    print("-----------------------------------------------")
db.close()
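# The `count > 2571` guard above is resume logic: skip records that an
# earlier run already handled. List slicing expresses the same idea without
# the manual counter (an equivalent sketch; the stand-in list is made up):
courtnames = ['法院A', '法院B', '法院C']  # stand-in data for illustration
RESUME_AT = 2572  # first unprocessed index, i.e. count > 2571

for courtname in courtnames[RESUME_AT:]:
    print(courtname)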
# -*- coding: utf-8 -*-
import re
from common.getDataDef import get_data
from bs4 import BeautifulSoup

# Fetch the data
url = 'http://baike.baidu.com/cms/s/court/court-data.json?t=201891616'
courtNameData = get_data(url)
courtnames = re.findall(r"courtName\":\"(.+?)\"", courtNameData)
count = 1
# for courtname in courtnames:
#     print(courtname)
courtname = '辽宁省高级人民法院'
baikeUrl = 'https://baike.baidu.com/item/' + courtname
courtData = get_data(baikeUrl)
# Parse the data
soup = BeautifulSoup(courtData, 'lxml')
main_content = soup.find("div", class_="main-content")
para_title = main_content.find(attrs={"label-module": "para-title"})
para = main_content.find(attrs={"label-module": "para"})
# print(para_title.previous_sibling)
print(para.next_element)
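# Building on the selectors above: gathering every "para" block under
# main-content yields the article body as plain text. The paragraph-joining
# step is an assumption about the intended next step, not from the source:
paras = main_content.find_all(attrs={"label-module": "para"})
body_text = '\n'.join(p.get_text(strip=True) for p in paras)
print(body_text)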
import pymysql
import uuid
import re
from common.getDataDef import get_data
from bs4 import BeautifulSoup

# Open a new connection
db = pymysql.connect(host="192.168.5.210", user="******", password="******",
                     db="law", charset='utf8')
cursor = db.cursor()
count = 0
# Fetch the data
for x in range(59, 67):
    url = 'http://peixun.court.gov.cn/index.php?m=special&c=stindex&a=show&sid=' + str(x)
    print(url)
    courtData = get_data(url)
    soup = BeautifulSoup(courtData, 'lxml')
    tbody = soup.find_all('table')[1]
    trs = tbody.find_all('tr')
    for tr in trs:
        # tr = trs[1]
        a = tr.find('a')
        if a != None:
            video_url = 'http://peixun.court.gov.cn/' + a.get('href')
            videoData = get_data(video_url)
            vsoup = BeautifulSoup(videoData, 'lxml')
            # print(vsoup)
            video_name = vsoup.find('h3').get('title')
            vtbody = vsoup.find('table')
            # No table means we have no permission to watch; skip those
            if vtbody != None:
                pass  # the rest of this branch is missing from the source
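# Most of these scripts never close their connection, and none do so when an
# exception escapes mid-crawl. A small pattern that guarantees cleanup,
# reusing the connection parameters above (the placeholder query is just for
# illustration):
import pymysql

db = pymysql.connect(host="192.168.5.210", user="******", password="******",
                     db="law", charset='utf8')
try:
    with db.cursor() as cursor:  # PyMySQL cursors support the with-statement
        cursor.execute("select 1")
    db.commit()
finally:
    db.close()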