def parse_content_html(raw):
    """Pull province totals and per-city confirmed counts out of page HTML.

    The text inside the first ``<div class=TRS_Editor>`` element is scanned
    for the cumulative confirmed and healed figures, then the portion
    starting at the last occurrence of "确诊病例中" is scanned for
    ``<name> ... ><number>< ... 例`` entries.

    Returns a ``(province, city)`` tuple where ``province`` is a
    ``ProvinceData`` and ``city`` maps city id to ``CityData``.

    Raises ``AttributeError`` when the editor ``<div>`` is not present,
    because the failed search result is dereferenced without a check.
    """
    editor_match = re.search(r"(?s)<div class=TRS_Editor>(.*?)<\/div>", raw)
    content = editor_match.group(1)

    province = ProvinceData(provinceName, provinceKey)

    confirmed_match = re.search(r"累计.*?确诊病例.*?>(\d+)<.*?例", content)
    if confirmed_match is not None:
        province.Confirmed = int(confirmed_match.group(1))

    healed_match = re.search(r"治愈出院.*?>(\d+)<.*?例", content)
    if healed_match is not None:
        province.Healed = int(healed_match.group(1))

    city = {}
    # When the marker is absent rfind() yields -1, so only the final
    # character of the content is scanned.
    breakdown = content[content.rfind("确诊病例中"):]
    entry_re = r"[,、]([\u4E00-\u9FA5]+).*?>(\d+)<.*?例"
    for hit in re.finditer(entry_re, breakdown):
        raw_name, count_text = hit.group(1, 2)
        name = utils.remove_preposition(raw_name)
        if name not in cities:
            continue
        city_id = cities[name]
        # Only the first figure seen for a given city is kept.
        if city_id in city:
            continue
        record = CityData(name, city_id)
        record.Confirmed = int(count_text)
        city[city_id] = record

    return province, city
def parse_content_html(raw):
    """Pull province totals and per-city confirmed counts out of page HTML.

    The text between the ``<!--content begin -->`` and
    ``<!--content end -->`` markers is scanned for cumulative confirmed,
    discharged and death figures. The portion starting at the last
    occurrence of "累计确诊" is then scanned for ``<name><number>例``
    entries.

    Returns a ``(province, city)`` tuple where ``province`` is a
    ``ProvinceData`` and ``city`` maps city id to ``CityData``.

    Raises ``AttributeError`` when the content markers are not present,
    because the failed search result is dereferenced without a check.
    """
    wrapper_match = re.search(
        r"(?s)<!--content begin -->(.*)<!--content end -->", raw)
    content = wrapper_match.group(1)

    province = ProvinceData(provinceName, provinceKey)

    confirmed_match = re.search(r"累计确诊.*?(\d+)例", content)
    if confirmed_match is not None:
        province.Confirmed = int(confirmed_match.group(1))

    healed_match = re.search(r"出院(\d+)例", content)
    if healed_match is not None:
        province.Healed = int(healed_match.group(1))

    dead_match = re.search(r"死亡(\d+)例", content)
    if dead_match is not None:
        province.Dead = int(dead_match.group(1))

    city = {}
    # When the marker is absent rfind() yields -1, so only the final
    # character of the content is scanned.
    breakdown = content[content.rfind("累计确诊"):]
    entry_re = r"[。,、]([\u4E00-\u9FA5]+)(\d+)例"
    for hit in re.finditer(entry_re, breakdown):
        raw_name, count_text = hit.group(1, 2)
        name = utils.remove_preposition(raw_name)
        if name not in cities:
            continue
        city_id = cities[name]
        # Only the first figure seen for a given city is kept.
        if city_id in city:
            continue
        record = CityData(name, city_id)
        record.Confirmed = int(count_text)
        city[city_id] = record

    return province, city
def parse_content_html(raw):
    """Pull province totals and per-city confirmed counts out of page HTML.

    The text inside the ``ze-art`` ``<div>`` (greedy, up to the last
    ``</div>``) is scanned for cumulative confirmed and healed figures.
    City entries of the form ``<name><number>例`` are read from the slice
    between the last "累计报告" and the last "累计治愈".

    Returns a ``(province, city)`` tuple where ``province`` is a
    ``ProvinceData`` and ``city`` maps city id to ``CityData``.

    Raises ``AttributeError`` when the article ``<div>`` is not present,
    because the failed search result is dereferenced without a check.
    """
    article_match = re.search(
        r"(?s)<div class=\"ze-art\" style=\"width: 100%;\">(.*)<\/div>", raw)
    content = article_match.group(1)

    province = ProvinceData(provinceName, provinceKey)

    confirmed_match = re.search(r"累计.*?确诊.*?(\d+)例", content)
    if confirmed_match is not None:
        province.Confirmed = int(confirmed_match.group(1))

    healed_match = re.search(r"累计治愈出院(\d+)例", content)
    if healed_match is not None:
        province.Healed = int(healed_match.group(1))

    city = {}
    start = content.rfind("累计报告")
    stop = content.rfind("累计治愈")
    breakdown = content[start:stop]
    for hit in re.finditer(r"([\u4E00-\u9FA5]+)(\d+)例", breakdown):
        raw_name, count_text = hit.group(1, 2)
        name = utils.remove_preposition(raw_name)
        if name not in cities:
            continue
        city_id = cities[name]
        # Only the first figure seen for a given city is kept.
        if city_id in city:
            continue
        record = CityData(name, city_id)
        record.Confirmed = int(count_text)
        city[city_id] = record

    return province, city
def parse_content_html(raw):
    """Pull province totals and per-city confirmed counts out of page HTML.

    The text between the "mian开始" and "责任编辑相关" HTML comments is
    scanned for the cumulative confirmed figure and for ",出院病例N例"
    (the last occurrence wins). City entries of the form
    ``<name><number>例`` preceded by "." or a zero-width space are read
    from the slice between the last "累计报告" and the last "重症病例".

    A matched name containing "盟" or "市" before its final character is
    cut off right after the first such character. Counts for the same
    city id are summed.

    Returns a ``(province, city)`` tuple where ``province`` is a
    ``ProvinceData`` and ``city`` maps city id to ``CityData``.

    Raises ``AttributeError`` when the enclosing comments are not present,
    because the failed search result is dereferenced without a check.
    """
    main_re = r"(?s)<!------------------------- mian开始 ------------------------->(.*)<!--------责任编辑相关---------->"
    main_match = re.search(main_re, raw)
    content = main_match.group(1)

    province = ProvinceData(provinceName, provinceKey)

    confirmed_match = re.search(r"累计.*?确诊.*?(\d+)例", content)
    if confirmed_match is not None:
        province.Confirmed = int(confirmed_match.group(1))

    # Every occurrence overwrites the previous one, so the last match wins.
    for healed_match in re.finditer(r",出院病例(\d+)例", content):
        province.Healed = int(healed_match.group(1))

    city = {}
    start = content.rfind("累计报告")
    stop = content.rfind("重症病例")
    breakdown = content[start:stop]
    for hit in re.finditer(r"[\.\u200b]([\u4E00-\u9FA5]+)(\d+)例", breakdown):
        raw_name, count_text = hit.group(1, 2)
        name = utils.remove_preposition(raw_name)

        # Trim anything trailing an interior "盟" / "市"; the second pass
        # works on the result of the first.
        for marker in ('盟', '市'):
            if marker in name[:-1]:
                name = name[:name.find(marker) + 1]

        if name not in cities:
            continue
        city_id = cities[name]
        if city_id in city:
            city[city_id].Confirmed += int(count_text)
        else:
            record = CityData(name, city_id)
            record.Confirmed = int(count_text)
            city[city_id] = record

    return province, city
def parse_content_html(raw):
    """Pull province totals and per-city confirmed counts out of page HTML.

    The first ``<p>`` element whose text contains "累计" is scanned for
    confirmed, healed and death figures, and for city entries of the form
    ``<name><number>例`` preceded by one of ``( , 、 。``. Names are passed
    through ``utils.remove_preposition`` and then through the
    ``alia_cities`` alias table before being looked up in ``cities``.
    Counts for the same city id are summed.

    Special handling: when the death figure is immediately followed by a
    run of Chinese characters, that run is treated as a city name and the
    death count is added to that city's ``Confirmed`` value.
    NOTE(review): adding deaths into ``Confirmed`` is what the code does;
    confirm this is intended.

    Returns a ``(province, city)`` tuple where ``province`` is a
    ``ProvinceData`` and ``city`` maps city id to ``CityData``.

    Raises ``AttributeError`` when no matching ``<p>`` is present, because
    the failed search result is dereferenced without a check.
    """
    paragraph_match = re.search(r"<p>(.*?累计.*?)<\/p>", raw)
    content = paragraph_match.group(1)

    province = ProvinceData(provinceName, provinceKey)

    confirmed_match = re.search(r"确诊病例(\d+)例", content)
    if confirmed_match is not None:
        province.Confirmed = int(confirmed_match.group(1))

    healed_match = re.search(r"治愈出院(\d+)例", content)
    if healed_match is not None:
        province.Healed = int(healed_match.group(1))

    dead_match = re.search(r"死亡(\d+)例(([\u4E00-\u9FA5]+))", content)
    dead_by_place = {}
    if dead_match is not None:
        dead_text, dead_place = dead_match.group(1, 2)
        province.Dead = int(dead_text)
        # SPECIAL HANDLE: remember where the deaths were reported.
        dead_by_place[dead_place] = int(dead_text)

    city = {}
    for hit in re.finditer(r"[(,、。]([\u4E00-\u9FA5]+)(\d+)例", content):
        raw_name, count_text = hit.group(1, 2)
        name = utils.remove_preposition(raw_name)
        if name in alia_cities:
            name = alia_cities[name]
        if name not in cities:
            continue
        city_id = cities[name]
        if city_id in city:
            city[city_id].Confirmed += int(count_text)
        else:
            record = CityData(name, city_id)
            record.Confirmed = int(count_text)
            city[city_id] = record

    # SPECIAL HANDLE: fold the recorded death count into the city totals.
    for place, dead_count in dead_by_place.items():
        if place not in cities:
            continue
        city_id = cities[place]
        if city_id in city:
            city[city_id].Confirmed += dead_count
        else:
            record = CityData(place, city_id)
            record.Confirmed = dead_count
            city[city_id] = record

    return province, city
def test_remove_preposition(self):
    # The leading "其中" should be stripped, leaving only the city name.
    stripped = utils.remove_preposition('其中广州市')
    self.assertEqual(stripped, "广州市")