def Fantojian(readfilename, writefilename, append):
    """Convert traditional-Chinese JSON lines to simplified Chinese.

    Reads each entry from *readfilename*, converts traditional characters
    to simplified ones, and writes the re-parsed JSON to *writefilename*.

    :param readfilename: path of the JSON file to read
    :param writefilename: path of the output JSON file
    :param append: file-open mode forwarded to writejson (e.g. "a")
    """
    strs = Readfromjson(readfilename)
    for traditional_sentence in strs:
        simplified_sentence = Traditional2Simplified(traditional_sentence)
        try:
            # BUG FIX: json.loads() no longer accepts an `encoding` argument
            # (deprecated in 3.1, removed in Python 3.9 — it raised TypeError
            # on modern interpreters). The input is already a str.
            record = json.loads(simplified_sentence)
        except json.JSONDecodeError:
            # Report entries that are not valid JSON instead of silently
            # swallowing every exception with a bare `except:`.
            print(simplified_sentence)
        else:
            writejson(record, writefilename, append)
if i<len(htmls)-1: attendteams = []#球队的连接并非都在b标签中 if i==0: attendteams = getAttendteams6(soup, "第一","第二") elif i>=12 and i<=20: attendteams = getAttendteams6(soup, "亞","亞") else: try: attendteams = getAttendteams5(soup,"[A-Z].*組","[A-Z].*組")#仅计算小组赛(A,B,C,组),表格无序号为td if attendteams==[]: attendteams = getAttendteams5_b(soup, "[A-Z].*組", "[A-Z].*組")#仅计算小组赛(A,B,C,组),表格有序号为td if attendteams==[]: attendteams = getAttendteams5(soup, "組", "組")#无小组赛页面,计算第一、二组,表格无序号为td if attendteams==[]: print(i) attendteams = getAttendteams5_b(soup, "組", "組")#无小组赛页面,计算第一、二组,表格有序号为td except: attendteams=getAttendteams5_c(soup, "[A-Z].*組", "[A-Z].*組")#一个表格中嵌套了两个表格 dict["attendteams"]=attendteams print(i,url,len(attendteams),attendteams) writejson(dict,"亚冠_output.json","a") # #若页面为繁体,转化为简体 # Fantojian("亚冠_output.json","亚冠_output(简).json","a")
# NOTE(review): tail of a season-parsing function (parseCba, judging by the
# call in the driver loop below); the opening `def` lies before this view,
# so only comments are added here.
    attendteams = getAttendteams(time, soup)
    if time == "2008-09":
        # The 2008-09 season page uses a different table layout.
        attendteams = getAttendteams2(time, soup)
    if time == "2018-19" or time == "2016-17":
        # 2016-17 regular-season tables hold per-player (not per-team) data;
        # 2018-19 shares that layout.
        attendteams = getAttendteams3(time, soup)
    if time == "2005-06":
        # 2005-06 season needs its own parser.
        attendteams = getAttendteams4(time, soup)
    game["attendteams"] = attendteams
    Inforbox = getInforbox(soup)
    game["Inforbox"] = Inforbox
    brief = getBrief(soup)
    game["brief"] = brief
    return game


# Driver: build the CBA season article URLs and parse each one.
urls = []  # CBA season URL list
for i in range(5, 20):
    j = i + 1
    if i == 19:
        # The current season's article title carries no year prefix.
        time = ""
    else:
        time = "20%02d-%02d年" % (i, j)
    url = str(time) + "中国男子篮球职业联赛"
    # Percent-encode the Chinese article title for the wiki URL.
    url = "https://zh.wikipedia.org/wiki/" + quote(url, safe='')
    urls.append(url)
    game = parseCba(time, url)
    # NOTE(review): this writes `dict`, not the freshly parsed `game` —
    # looks suspicious; confirm against the rest of the file.
    writejson(dict, "cba_output.json", "a")
dict = {"name": "", "link": "", "brief": "", "Inforbox": ""} # "attendteams": "", soup=htmls[i] url=geturlfromhtml(soup) dict["link"]=url name=soup.select("title")[0].get_text().replace(" - 维基百科,自由的百科全书","") dict["name"]=name infobox=getInforbox(soup) dict["Inforbox"]=infobox brief=getBrief(soup).strip("\n") if brief=="": brief=getBrief2(soup).strip("\n") dict["brief"]=brief if i<len(htmls)-1: attendteams = [] try:#getAttendteams2和2的顺序不能颠倒 attendteams = getAttendteams2(soup,"積分榜") print(url, attendteams) except: attendteams = getAttendteams(soup,"積分榜") print(url,attendteams) dict["attendteams"]=attendteams writejson(dict,"ENgland/英超_output.json")
dict = {"name": "", "link": "", "brief": "", "Inforbox": ""}#"attendteams": "", soup=htmls[i] url=geturlfromhtml(soup) dict["link"]=url name=soup.select("title")[0].get_text().replace(" - 维基百科,自由的百科全书","") dict["name"]=name infobox=getInforbox(soup) dict["Inforbox"]=infobox brief=getBrief(soup).strip("\n") if brief=="": brief=getBrief2(soup).strip("\n") dict["brief"]=brief if i<len(htmls)-1: attendteams = [] try:#getAttendteams4和3的顺序不能颠倒 attendteams = getAttendteams4(soup,"积分榜","積分榜") print(url, attendteams) except: attendteams = getAttendteams3(soup,"积分榜","積分榜") print(url,attendteams) dict["attendteams"]=attendteams writejson(dict, "CSL/中超_output.json")
from langconv import *
from ReadfromJson import Readfromjson
from WritetoJson import writejson
import json


def Traditional2Simplified(sentence):
    '''
    Convert traditional Chinese characters in *sentence* to simplified Chinese.

    :param sentence: the sentence to convert
    :return: the sentence with traditional characters replaced by simplified ones
    '''
    # Converter is provided by langconv (star import above).
    sentence = Converter('zh-hans').convert(sentence)
    return sentence


if __name__ == "__main__":
    strs = Readfromjson("AFC/亚冠_output.json")
    for traditional_sentence in strs:
        simplified_sentence = Traditional2Simplified(traditional_sentence)
        try:
            # BUG FIX: json.loads() no longer accepts an `encoding` argument
            # (deprecated in 3.1, removed in Python 3.9 — it raised TypeError
            # on modern interpreters). The input is already a str.
            record = json.loads(simplified_sentence)
        except json.JSONDecodeError:
            # Report entries that are not valid JSON instead of silently
            # swallowing every exception with a bare `except:`.
            print(simplified_sentence)
        else:
            writejson(record, "AFC/亚冠_output(简).json", "a")