Esempio n. 1
0
def Fantojian(readfilename, writefilename, append):
    """Convert the records of a JSON dump from Traditional to Simplified Chinese.

    Every item returned by ``Readfromjson(readfilename)`` is passed through
    ``Traditional2Simplified``, parsed with ``json.loads`` and handed to
    ``writejson``. Items that cannot be processed are printed and skipped.

    :param readfilename: path given to ``Readfromjson``.
    :param writefilename: path given to ``writejson``.
    :param append: forwarded unchanged as the third ``writejson`` argument
        (presumably a file mode such as ``"a"`` -- confirm against writejson).
    """
    for traditional_sentence in Readfromjson(readfilename):
        simplified_sentence = Traditional2Simplified(traditional_sentence)
        try:
            # json.loads must be called without ``encoding``: the keyword was
            # ignored on Python 3 and was removed in 3.9, where it raises
            # TypeError. The former bare ``except`` hid that error, so every
            # record was printed and none was written.
            simplified_sentence = json.loads(simplified_sentence)
            writejson(simplified_sentence, writefilename, append)
        except Exception:
            # Best effort: report the offending record and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
            print(simplified_sentence)
Esempio n. 2
0
    # For every page except the last one in ``htmls``, extract the list of
    # participating teams and store it under dict["attendteams"]; the record
    # is then handed to writejson whether or not teams were extracted.
    if i<len(htmls)-1:
        attendteams = []# team links are not all inside <b> tags
        if i==0:
            attendteams = getAttendteams6(soup, "第一","第二")
        elif i>=12 and i<=20:
            attendteams = getAttendteams6(soup, "亞","亞")
        else:
            # Try the extractors in order until one returns a non-empty list.
            try:
                attendteams = getAttendteams5(soup,"[A-Z].*組","[A-Z].*組")# group stage only (groups A, B, C, ...); table without a rank-number td
                if attendteams==[]:
                    attendteams = getAttendteams5_b(soup, "[A-Z].*組", "[A-Z].*組")# group stage only (groups A, B, C, ...); table with a rank-number td
                if attendteams==[]:
                    attendteams = getAttendteams5(soup, "組", "組")# page without a group stage: use the first/second groups; table without a rank-number td
                if attendteams==[]:
                    # Print the page index when the three attempts above all returned [].
                    print(i)
                    attendteams = getAttendteams5_b(soup, "組", "組")# page without a group stage: use the first/second groups; table with a rank-number td
            except:
                    attendteams=getAttendteams5_c(soup, "[A-Z].*組", "[A-Z].*組")# one table has two tables nested inside it


        dict["attendteams"]=attendteams
        print(i,url,len(attendteams),attendteams)
    # Third argument "a" is presumably an append mode -- confirm against writejson.
    writejson(dict,"亚冠_output.json","a")



# # If the page is in Traditional Chinese, convert it to Simplified Chinese
# Fantojian("亚冠_output.json","亚冠_output(简).json","a")


Esempio n. 3
0
        attendteams = getAttendteams(time, soup)
        if time == "2008-09":
            attendteams = getAttendteams2(time, soup)  #08-09
        if time == "2018-19" or time == "2016-17":  #16-17的常规赛数据为个人数据
            attendteams = getAttendteams3(time, soup)  #18-19
        if time == "2005-06":
            attendteams = getAttendteams4(time, soup)  #05-06
        game["attendteams"] = attendteams

    Inforbox = getInforbox(soup)
    game["Inforbox"] = Inforbox

    brief = getBrief(soup)
    game["brief"] = brief

    return game


# Build the zh.wikipedia.org URL of each CBA page, parse it with parseCba and
# pass the parsed record to writejson("cba_output.json", "a").
urls = []  # URLs of the CBA pages that were requested
for i in range(5, 20):
    j = i + 1
    if i == 19:
        # Last iteration: no season prefix, i.e. the bare league title.
        time = ""
    else:
        # Season label such as "2005-06年".
        time = "20%02d-%02d年" % (i, j)
    url = str(time) + "中国男子篮球职业联赛"
    # safe='' percent-encodes every reserved character of the page title.
    url = "https://zh.wikipedia.org/wiki/" + quote(url, safe='')
    urls.append(url)
    game = parseCba(time, url)
    # Write the record that was just parsed. The previous code passed ``dict``,
    # which this script never assigns, so the parsed ``game`` was discarded.
    writejson(game, "cba_output.json", "a")
Esempio n. 4
0
    # Build one record for page ``htmls[i]`` and hand it to writejson.
    # "attendteams" is not pre-filled; it is set below for every page except the last.
    dict = {"name": "", "link": "", "brief": "", "Inforbox": ""}  # "attendteams": "",

    soup=htmls[i]

    url=geturlfromhtml(soup)
    dict["link"]=url

    # Page name: text of the first <title> element with the fixed site suffix removed.
    name=soup.select("title")[0].get_text().replace(" - 维基百科,自由的百科全书","")
    dict["name"]=name

    infobox=getInforbox(soup)
    dict["Inforbox"]=infobox

    # Fall back to getBrief2 when getBrief yields nothing but newlines.
    brief=getBrief(soup).strip("\n")
    if brief=="":
        brief=getBrief2(soup).strip("\n")
    dict["brief"]=brief

    if i<len(htmls)-1:
        attendteams = []
        try:# the order of getAttendteams2 and getAttendteams must not be swapped (the original note read "2 and 2", presumably a typo)
            attendteams = getAttendteams2(soup,"積分榜")

            print(url, attendteams)
        except:
            # Any failure of getAttendteams2 (or of the print) falls back to getAttendteams.
            attendteams = getAttendteams(soup,"積分榜")
            print(url,attendteams)
        dict["attendteams"]=attendteams

    writejson(dict,"ENgland/英超_output.json")
Esempio n. 5
0
    # Build one record for page ``htmls[i]`` and hand it to writejson.
    # "attendteams" is not pre-filled; it is set below for every page except the last.
    dict = {"name": "", "link": "", "brief": "", "Inforbox": ""}#"attendteams": "",

    soup=htmls[i]

    url=geturlfromhtml(soup)
    dict["link"]=url

    # Page name: text of the first <title> element with the fixed site suffix removed.
    name=soup.select("title")[0].get_text().replace(" - 维基百科,自由的百科全书","")
    dict["name"]=name

    infobox=getInforbox(soup)
    dict["Inforbox"]=infobox

    # Fall back to getBrief2 when getBrief yields nothing but newlines.
    brief=getBrief(soup).strip("\n")
    if brief=="":
        brief=getBrief2(soup).strip("\n")
    dict["brief"]=brief

    if i<len(htmls)-1:
        attendteams = []
        try:# the order of getAttendteams4 and getAttendteams3 must not be swapped
            attendteams = getAttendteams4(soup,"积分榜","積分榜")

            print(url, attendteams)
        except:
            # Any failure of getAttendteams4 (or of the print) falls back to getAttendteams3.
            attendteams = getAttendteams3(soup,"积分榜","積分榜")
            print(url,attendteams)
        dict["attendteams"]=attendteams

    writejson(dict, "CSL/中超_output.json")
Esempio n. 6
0
from langconv import *
from ReadfromJson import Readfromjson
from WritetoJson import writejson
import json


def Traditional2Simplified(sentence):
    """Convert the Traditional Chinese characters in ``sentence`` to Simplified.

    :param sentence: the sentence to convert
    :return: the sentence as returned by ``Converter('zh-hans').convert``
    """
    to_simplified = Converter('zh-hans')
    return to_simplified.convert(sentence)


if __name__ == "__main__":
    # Convert every record of AFC/亚冠_output.json to Simplified Chinese and
    # pass the parsed result to writejson for AFC/亚冠_output(简).json.
    for traditional_sentence in Readfromjson("AFC/亚冠_output.json"):
        simplified_sentence = Traditional2Simplified(traditional_sentence)
        try:
            # json.loads must be called without ``encoding``: the keyword was
            # ignored on Python 3 and was removed in 3.9, where it raises
            # TypeError. The former bare ``except`` hid that error, so every
            # record was printed and none was written.
            simplified_sentence = json.loads(simplified_sentence)
            writejson(simplified_sentence, "AFC/亚冠_output(简).json", "a")
        except Exception:
            # Best effort: report the offending record and keep going.
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
            print(simplified_sentence)