Example #1
def get_lists():
    urls = []
    # Fetch the listing data (libid=04, 02, 03)
    url = 'http://english.gov.cn/policies/policywatch/'
    courtData = get_data(url)
    # Parse the page
    soup = BeautifulSoup(courtData, 'lxml')
    # print(soup)
    lists = soup.select(".list-container")[0].select("li")
    for item in lists:
        urls.append('http://english.gov.cn' + item.find('a').get('href'))
    print(len(urls))
    # print(urls)
    return urls
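
Every example here calls get_data from common.getDataDef, which is not shown. A minimal sketch of what such a helper might look like, assuming a plain requests fetch that returns the page text or None on failure (the headers-free call and the timeout are illustrative, not the real implementation):

import requests

def get_data(url):
    # Hypothetical stand-in for common.getDataDef.get_data: fetch a page
    # and return its text, or None so callers can test the result.
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return resp.text
    except requests.RequestException:
        return None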
Example #2
def get_lists():
    urls = []
    # Fetch the listing data (libid=04, 02, 03)
    url = 'http://tv.cctv.com/lm/pflmj/videoset/index.shtml'
    # url = 'http://tv.cctv.com/lm/pingan365/videoset/index.shtml'
    courtData = get_data(url)
    # Parse the page
    soup = BeautifulSoup(courtData, 'lxml')
    # print(soup)
    lists = soup.select(".text")
    # lists = soup.find_all('li')
    for item in lists:
        href = item.find('a').get('href')
        urls.append(href)
    print(len(urls))
    return urls
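
Note that item.find('a') returns None for a list item without a link, which would raise an AttributeError in the loop above. A guarded variant of the loop, using the same None check Example #7 applies:

    for item in lists:
        a = item.find('a')
        if a is not None and a.get('href'):
            urls.append(a.get('href'))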
Example #3
def do_insert(href):
    print(href)
    wwzl_url = href
    url = 'http://english.gov.cn/policies/policywatch/'
    wwzl_source = url
    source_code = 3

    # Open a new database connection
    db = pymysql.connect(host="192.168.5.210",
                         user="******",
                         password="******",
                         db="law",
                         charset='utf8')
    cursor = db.cursor()

    zzzfw_data = get_data(href)
    if zzzfw_data is not None:
        zzzfw_soup = BeautifulSoup(zzzfw_data, 'lxml')
        title = zzzfw_soup.find('h3')
        wwzl_title = title.get_text()
        zuozhe_riqi_info = zzzfw_soup.select(".adio")
        if len(zuozhe_riqi_info) == 2:
            wwzl_promulgator = zuozhe_riqi_info[0].get_text()
            wwzl_content = zzzfw_soup.find("content").get_text()
            print(len(wwzl_content))
            tag = zuozhe_riqi_info[1]
            # Strip the two <span> labels so only the date text remains
            tag.span.extract()
            tag.span.extract()
            promulgation_date = tag.get_text().replace('\n', '')
            if len(wwzl_content) < 20000:
                sql = "insert into t_wwzl_info (`wwzl_id`,`wwzl_title`,`wwzl_promulgator`,`promulgation_date`,`wwzl_content`,`wwzl_url`,`wwzl_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(
                    sql, (str(uuid.uuid1()), wwzl_title, wwzl_promulgator,
                          promulgation_date, wwzl_content, wwzl_url,
                          wwzl_source, source_code))
                db.commit()
                print("-----------------------------------------------")
            else:
                print('Content too long: ' + href)
        else:
            print('zuozhe_riqi_info parse error')
    db.close()
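
do_insert opens a fresh pymysql connection on every call; the db.close() added above stops the leak on the normal path, but contextlib.closing guarantees cleanup even when parsing or the INSERT raises. A sketch under that assumption (the function name is illustrative):

from contextlib import closing
import pymysql

def do_insert_managed(href):
    # closing() calls db.close() on exit, even if an exception occurs;
    # the cursor is closed the same way.
    with closing(pymysql.connect(host="192.168.5.210", user="******",
                                 password="******", db="law",
                                 charset='utf8')) as db:
        with closing(db.cursor()) as cursor:
            # ... same parsing and INSERT logic as in do_insert above ...
            db.commit()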
Example #4
def do_insert(href):
    # Open a new database connection
    db = pymysql.connect(host="192.168.5.210", user="******", password="******", db="law", charset='utf8')
    cursor = db.cursor()

    cctv_data = get_data(href)
    print(href)
    # cctv_data = get_data("http://news.cntv.cn/2014/02/20/VIDE1392875460487310.shtml")
    cctv_soup = BeautifulSoup(cctv_data, 'lxml')
    title = cctv_soup.find('h3')
    video_url = href
    url = 'http://tv.cctv.com/lm/pflmj/videoset/index.shtml'
    video_source = url
    source_code = 23
    related_articles = cctv_soup.select('#content_body')
    if related_articles:
        related_articles = related_articles[0].get_text().replace('\n', '')
    else:
        related_articles = cctv_soup.select('.cnt_bd')
        if related_articles:
            related_articles = related_articles[0].get_text().replace('\n', '')

    wenben = cctv_soup.select(".text_box_02")
    if title is not None and wenben:
        video_introduction = ''
        # find_all() returns a list (never None); take the third <p> if present
        paragraphs = wenben[0].find_all('p')
        if len(paragraphs) > 2:
            video_introduction = paragraphs[2].get_text()
        title_info = title.get_text().split()
        if len(title_info) >=3:
            video_name = title_info[2]
            if len(title_info) ==4:
                video_name = title_info[2]+title_info[3]
            recording_time = title_info[1]
            sql ="insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_introduction`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql,(str(uuid.uuid1()),video_name,related_articles,video_introduction,video_url,recording_time,video_source,source_code))
            db.commit()
            print("-----------------------------------------------")
        elif title.get_text().find("]") != -1 and title.get_text().find("(") != -1:
            # Split on ']' and '(' so a "[...]name(date)" title yields three
            # parts, matching the equivalent branch further below
            title_info = re.split('[](]', title.get_text())
            if len(title_info) == 3:
                video_name = title_info[1]
                recording_time = title_info[2]
                sql ="insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_introduction`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql,(str(uuid.uuid1()),video_name,related_articles,video_introduction,video_url,recording_time,video_source,source_code))
                db.commit()
                print("-----------------------------------------------")
    else:
        title = cctv_soup.find('h1')
        title_info = title.get_text().split()
        if len(title_info) >=3:
            video_name = title_info[2]
            if len(title_info) ==4:
                video_name = title_info[2]+title_info[3]
            recording_time = title_info[1]
            sql ="insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s)"
            cursor.execute(sql,(str(uuid.uuid1()),video_name,related_articles,video_url,recording_time,video_source,source_code))
            db.commit()
            print("-----------------------------------------------")
        elif title.get_text().find("]") != -1 and title.get_text().find("(") != -1:
            title_info = re.split('[](]', title.get_text())
            if len(title_info) ==3:
                video_name = title_info[1]
                recording_time = title_info[2]
                sql ="insert into t_law_video_info (`video_id`,`video_name`,`related_articles`,`video_url`,`recording_time`,`video_source`,`source_code`) values(%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql,(str(uuid.uuid1()),video_name,related_articles,video_url,recording_time,video_source,source_code))
                db.commit()
                print("-----------------------------------------------")
Example #5
# -*- coding: utf-8 -*-
import pymysql
import uuid
import re
from common.getDataDef import get_data
from bs4 import BeautifulSoup

# Open a new database connection
db = pymysql.connect(host="192.168.5.210",
                     user="******",
                     password="******",
                     db="law",
                     charset='utf8')
cursor = db.cursor()

# Fetch the data
url = 'http://baike.baidu.com/cms/s/court/court-data.json?t=201891616'
courtNameData = get_data(url)
courtnames = re.findall(r"courtName\":\"(.+?)\"", courtNameData)

count = 0
for courtname in courtnames:
    if count > 2571:
        print(courtname)
    print(courtname)
    count += 1
    print(count)
    print("-----------------------------------------------")

db.close()
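
The court names are pulled out of the JSON payload with a regular expression. If court-data.json is well-formed JSON, the json module is a sturdier alternative; the exact structure of the payload is an assumption here, so the path below would need adjusting to wherever courtName actually lives:

import json

data = json.loads(courtNameData)
# Assumed shape: dict records carrying a "courtName" key under a
# top-level "data" list; inspect the real payload to confirm.
courtnames = [rec['courtName'] for rec in data.get('data', [])
              if isinstance(rec, dict) and 'courtName' in rec]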
Example #6
# -*- coding: utf-8 -*-
import re
from common.getDataDef import get_data
from bs4 import BeautifulSoup

# Fetch the data
url = 'http://baike.baidu.com/cms/s/court/court-data.json?t=201891616'
courtNameData = get_data(url)
courtnames = re.findall(r"courtName\":\"(.+?)\"", courtNameData)

count = 1
# for courtname in courtnames:
# print(courtname)
courtname = '辽宁省高级人民法院'
baikeUrl = 'https://baike.baidu.com/item/' + courtname
courtData = get_data(baikeUrl)
# Parse the page
soup = BeautifulSoup(courtData, 'lxml')
main_content = soup.find("div", class_="main-content")
para_title = main_content.find(attrs={"label-module": "para-title"})
para = main_content.find(attrs={"label-module": "para"})
# print(para_title.previous_sibling)
print(para.next_element)
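
find() returns None when a Baike page has no main-content div (a missing or redirected entry), and the last lines above would then raise an AttributeError. A guarded version of the tail of this example:

main_content = soup.find("div", class_="main-content")
if main_content is not None:
    para = main_content.find(attrs={"label-module": "para"})
    if para is not None:
        print(para.next_element)
else:
    print('no main-content block found for ' + courtname)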
Example #7
import pymysql
import uuid
import re
from common.getDataDef import get_data
from bs4 import BeautifulSoup

# Open a new database connection
db = pymysql.connect(host="192.168.5.210", user="******", password="******", db="law", charset='utf8')
cursor = db.cursor()

count=0
# Fetch the data
for x in range(59,67):
    url='http://peixun.court.gov.cn/index.php?m=special&c=stindex&a=show&sid='+str(x)
    print(url)
    courtData=get_data(url)
    soup = BeautifulSoup(courtData,'lxml')
    tbody = soup.find_all('table')[1]
    trs = tbody.find_all('tr')
    for tr in trs:
        # tr = trs[1]
        a = tr.find('a')
        if a is not None:
            video_url = 'http://peixun.court.gov.cn/'+a.get('href')
            videoData = get_data(video_url)
            vsoup = BeautifulSoup(videoData,'lxml')
            # print(vsoup)
            video_name = vsoup.find('h3').get('title')
            vtbody = vsoup.find('table')
            # Skip videos the account has no permission to watch
            if vtbody is not None: