def main(save_path, file_path=""):
    # Search every code, either from the built-in keyword list or from a file
    # (one code per line).
    if file_path == "":
        code_list = keyword_list
    else:
        with open(file_path, "r") as f:
            code_list = [line.strip() for line in f]
    page_num = int(input(highlight("Enter the number of result pages to search: ")))
    crawler = baidu_crawler()
    # Codes that already have a result file in save_path are skipped.
    exist_list = []
    for filename in glob.glob(os.path.join(save_path, "*.json")):
        name = os.path.split(filename)[-1].split(".")[0]
        exist_list.append(name)
    for i, code in enumerate(code_list):
        print(highlight(f"[{i + 1} / {len(code_list)}] Searching {code} ..."))
        if code in exist_list:
            print(highlight("Skipping: already searched."))
            continue
        result = crawler.get_result(code + " site:gov.cn", pages=page_num)
        with open(os.path.join(save_path, code + ".json"), "w") as f:
            json.dump({"keyword": code, "data": result}, f)
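# Usage sketch for main() above: it writes one JSON file per searched code
# into save_path, each holding {"keyword": ..., "data": ...}. The directory
# name and code below are hypothetical.
def _example_load_result(save_path="baidu_results", code="G1234"):
    with open(os.path.join(save_path, code + ".json")) as f:
        saved = json.load(f)
    return saved["keyword"], len(saved["data"])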
def filter_xlsx(xlsx_path):
    df = pd.read_excel(xlsx_path, header=[0])
    matched = []
    unmatched = []
    df_remain = pd.DataFrame(columns=df.columns)
    for i in range(df.shape[0]):
        # Flatten the row into one string so find_codes can scan all cells.
        row = df.loc[i].values
        raw_str = ""
        for item in row:
            raw_str = raw_str + " | " + str(item)
        result = find_codes(raw_str)
        ok = False
        for code, rel_text, rel_type in result:
            db_result, n_rel_text = db.search(code, rel_text, reverse_check=True)
            if len(db_result) == 0:
                unmatched.append((code, n_rel_text, rel_type, db_result))
            else:
                matched.append((code, n_rel_text, rel_type, db_result))
                ok = True
        if not ok:
            # Rows without any matched code are kept for export.
            df_remain.loc[df_remain.shape[0]] = df.loc[i]
    while True:
        command = input(
            highlight(
                f"""
Search finished: {len(matched) + len(unmatched)} entries in total.
Enter u[c] for unmatched entries, m[c] for matched entries,
s to save the unmatched rows, q to quit: """, 31))
        print("")
        if "q" in command:
            break
        elif "u" in command:
            display(unmatched, "c" in command)
        elif "m" in command:
            display(matched, "c" in command)
        elif "s" in command:
            filepath = input(highlight("Enter the output file name (xlsx): "))
            with pd.ExcelWriter(filepath) as writer:
                df_remain.to_excel(writer, header=True)
def main():
    while True:
        command = input(
            highlight(
                """
1 - Enter URLs
2 - Import plain text
3 - Import an xlsx file
4 - Import an image
q - Quit
Enter a command: """, 31))
        if command == "1":
            print(highlight("Enter URLs, one per line, finish with #: "))
            url_data = []
            crawler = browser()
            while True:
                url = input()
                if url == "":
                    continue
                if url == "#":
                    break
                url_data.append({
                    "url": url,
                    "text": crawler.get_result(url, plain=True),
                })
            filter_urls(url_data)
        elif command == "2":
            path = input("Enter a text file: ")
            with open(path, "r") as f:
                text = f.read()
            filter(text)
        elif command == "3":
            path = input("Enter an xlsx file: ")
            filter_xlsx(path)
        elif command == "4":
            path = input("Enter an image file: ")
            text = ocr_text(path)
            print("\n" + text + "\n")
            filter(text)
        elif command == "q":
            break
        else:
            print("Please enter a valid command.")
def main():
    idx = input(highlight("Enter a task id for this run (used to avoid re-crawling): "))
    paths.update_path(idx)
    while True:
        command = input(
            highlight(
                """
1 - Crawl Weibo
2 - Index the crawled posts (clears all lists)
3 - Show unprocessed related news
4 - Show processed related news
5 - Show starred news
6 - Reverse-search government news for flights/trains
7 - Index the government news (clears all lists)
8 - Delete all downloaded content
q - Quit
Enter a command: """, 31))
        if command == "1":
            weibo_crawler.main(save_path=paths.weibo_path)
        elif command == "2":
            paths.clear(idx)
            process_weibo()
        elif command == "3":
            display(paths.unread_post_filepath, "unread")
        elif command == "4":
            display(paths.read_post_filepath, "read")
        elif command == "5":
            display(paths.star_post_filepath, "star")
        elif command == "q":
            break
        elif command == "6":
            code_file = input(highlight("Enter a train/flight list file (empty for the default): "))
            baidu_crawler.main(save_path=paths.baidu_path, file_path=code_file)
        elif command == "7":
            paths.clear(idx)
            process_baidu()
        elif command == "8":
            try:
                empty_dir(paths.baidu_path)
                empty_dir(paths.weibo_path)
                print(highlight("Done."))
            except OSError:
                print(highlight("Failed.", 31))
        else:
            print("Please enter a valid command.")
def ocr_text(image_path):
    # Call the Tencent AI general-OCR endpoint with a signed request and
    # stitch the recognized lines back into plain text.
    with open(image_path, "rb") as image_file:
        enc = base64.b64encode(image_file.read())
    data = {
        "app_id": app_id,
        "image": enc,
        "nonce_str": ''.join(random.sample(string.digits + string.ascii_letters, 32)),
        "time_stamp": int(time.time()),
    }
    data["sign"] = get_sign_str(data)
    ret_str = ""
    for i in range(5):  # retry up to five times
        url = "https://api.ai.qq.com/fcgi-bin/ocr/ocr_generalocr"
        r = requests.post(url, data=data)
        if r.json()["ret"] == 0:
            print(highlight("Image recognized."))
            result = r.json()["data"]["item_list"]
            # Start a new output line whenever the y coordinate jumps.
            last = result[0]["itemcoord"][0]["y"]
            for item in result:
                if item["itemcoord"][0]["y"] - last > 9:
                    last = item["itemcoord"][0]["y"]
                    ret_str += "\n"
                ret_str += item["itemstring"] + " "
            break
        else:
            print(highlight(f"Image recognition failed: {r.json()['msg']}"))
            time.sleep(1)
    return ret_str
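# get_sign_str is defined elsewhere in the repo; for reference, the Tencent
# AI open platform (api.ai.qq.com) documents its request signature as an
# uppercase MD5 over the sorted, URL-encoded parameters with app_key
# appended. A minimal sketch: the `app_key` argument is an assumption, and
# all values are assumed to be strings (e.g. data["image"] = enc.decode()).
def get_sign_str_sketch(data, app_key):
    import hashlib
    from urllib import parse
    pairs = sorted((k, v) for k, v in data.items() if v not in ("", None))
    query = parse.urlencode(pairs) + "&app_key=" + app_key
    return hashlib.md5(query.encode("utf-8")).hexdigest().upper()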
def db_unique(f_type):
    # Report likely duplicate db entries of the given t_type: same date and
    # at least one plain code shared between their t_no fields.
    def match(text1, text2):
        text1 = text1.upper()
        text2 = text2.upper()
        code_in_txt1 = list(find_plain_code(text1))
        code_in_txt2 = list(find_plain_code(text2))
        for code in code_in_txt1:
            if code in code_in_txt2:
                return True
        return False

    for i in range(len(db)):
        for j in range(i + 1, len(db)):
            data1 = db[i]
            data2 = db[j]
            if data1["t_type"] != f_type or data2["t_type"] != f_type:
                continue
            if match(data1["t_no"], data2["t_no"]) and data1["t_date"] == data2["t_date"]:
                # Type-2 entries must also share a sub-number to count.
                if data1["t_type"] == 2 and data2["t_type"] == 2 and not match(
                        data1["t_no_sub"], data2["t_no_sub"]):
                    continue
                # Skip pairs where either entry was already rejected.
                if data1["verified"] != 2 and data2["verified"] != 2:
                    print("=" * 100)
                    print(highlight(f"id = {data1['id']} :"))
                    print(data1)
                    print(highlight(f"id = {data2['id']} :"))
                    print(data2)
                    print("=" * 100)
def search(code, rel_text, reverse_check=False):
    # Look up `code` in the db. Returns formatted result lines plus a copy of
    # `rel_text` with matched dates highlighted. With reverse_check on, only
    # entries whose date also appears in `rel_text` are returned.
    result = []
    dates = []
    code = code.upper().replace(" ", "")
    for i, element in enumerate(db):
        code_in_no = find_codes(element["t_no"].replace(" ", ""))
        for t_no, _, _ in code_in_no:
            if code == t_no:
                date_ok, date_key = date_contain(element["t_date"], rel_text)
                if element["verified"] == 1:
                    verified = highlight("verified")
                elif element["verified"] == 0:
                    verified = highlight("pending", 31)
                else:
                    verified = highlight("rejected", 31)
                if date_ok:
                    s = (f"\t [#{i + 1}] {highlight(element['t_date'], 35)} | "
                         f"{highlight(element['t_no'], 32)} | {element['t_no_sub']} | "
                         f"Departure: {element['t_pos_start']} | "
                         f"Arrival: {element['t_pos_end']} | {verified}")
                    dates.append(date_key)
                else:
                    s = (f"\t [#{i + 1}] {element['t_date']} | "
                         f"{highlight(element['t_no'], 32)} | {element['t_no_sub']} | "
                         f"Departure: {element['t_pos_start']} | "
                         f"Arrival: {element['t_pos_end']} | {verified}")
                if date_ok or not reverse_check:
                    result.append(s)
                break
    new_rel_text = rel_text
    for x in dates:
        new_rel_text = new_rel_text.replace(x, highlight(x, 35))
    return result, new_rel_text
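# Usage sketch for search(); the code and context below are hypothetical,
# and the module-level db list must already be loaded.
def _example_search():
    rows, marked = search("G1234", "据通报,1月23日G1234次列车...", reverse_check=True)
    for row in rows:   # one formatted line per matching db entry
        print(row)
    print(marked)      # the context with matched dates highlighted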
def display(ls, single=False):
    sep = "=" * 76
    for i, (code, rel_text, rel_type, db_result) in enumerate(ls):
        if single:
            # Context-only mode.
            print(sep)
            print(rel_text)
        else:
            print(sep)
            print(f"> [{i + 1}/{len(ls)}]")
            print(highlight("Code: ", 33) + code)
            print(highlight("Context: ", 33) + rel_text)
            print(highlight("Type: ", 33) + rel_type)
            print(highlight("Existing records: ", 33))
            for s in db_result:
                print(s)
            print(sep)
def main():
    # Re-fetch every entry's source page and flag entries whose code no
    # longer appears there.
    fail_list = []
    for i, item in enumerate(db):
        print(f"{i + 1} / {len(db)} ... ", end="", flush=True)
        url = item["source"]
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'
        }
        r = requests.get(url, headers=headers)
        r.encoding = "utf-8"
        result = find_codes(item["t_no"])
        if len(result) > 0:
            idx, _, _ = result[0]
        else:
            idx = item["t_no"]
        codes = find_codes(r.text)
        print("\r", end="")
        # Accept the page if any code on it is loosely similar to the entry's.
        ok = False
        for code, _, _ in codes:
            if similarity(code, idx) > 0.3:
                ok = True
                break
        if ok:
            continue
        if idx in r.text:
            continue
        print(highlight(f"NO id = {item['id']} | ", 31) + " " + item["source"] + " " + item["t_no"])
        fail_list.append(item["id"])
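# similarity() is defined elsewhere in the repo. A minimal stand-in with the
# same contract (a ratio in [0, 1]) using difflib, offered as an assumption:
def similarity_sketch(a, b):
    import difflib
    return difflib.SequenceMatcher(None, a, b).ratio()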
def print_stat(data):
    # Count posts per source account and per date, then print both tallies
    # in ascending order of count.
    author_count = {}
    date_count = {}
    for element in data:
        author = element["name"]
        date = element["time"]
        if author not in author_count:
            author_count[author] = 0
        author_count[author] += 1
        if date not in date_count:
            date_count[date] = 0
        date_count[date] += 1
    print(highlight("\nSource statistics: ", 33))
    for author, count in sorted(author_count.items(), key=lambda item: item[1]):
        print(highlight(author, 31) + ": " + highlight(count, 32))
    print(highlight("\nDate statistics: ", 33))
    for date, count in sorted(date_count.items(), key=lambda item: item[1]):
        print(highlight(date, 31) + ": " + highlight(count, 32))
    # (The loop header is truncated in the original; it presumably reads
    # `for i, name in enumerate(name_list):` over the raw account names.)
    if name in final_name_list:
        continue
    for k in range(5):
        # Query Weibo's user search and pull the numeric uid out of the
        # first profile link.
        url = "https://s.weibo.com/user?q=" + parse.quote(name) + "&Refer=user_user"
        headers = {'User-Agent': random_ua()}
        r = requests.get(url, headers=headers)
        r.encoding = "utf-8"
        result = re.search(r"weibo\.com/u/(\d+)", r.text)
        if result:
            final_name_list.append(name)
            final_id_list.append(result.group(1))
            print(highlight(name, 32) + " " + result.group(1))
            break
        else:
            print(highlight(name, 31))
            time.sleep(1)
    # Rate-limit: pause between names, longer every 15th name.
    time.sleep(1 + random.randint(1, 2))
    if i % 15 == 0:
        time.sleep(7)

print(f"name_list = {final_name_list}")
print(f"id_list = {final_id_list}")
def display(filepath, catag):
    with open(filepath, "r") as f:
        data = json.load(f)
    filter_command = input(highlight("Apply filters? (y/n): "))
    if filter_command == "y":
        data = filter_posts(data)
    sep = "=" * 76
    while True:
        command = input(
            f"\nLoaded {len(data)} entries. Enter p - export to xlsx, "
            "0 - show statistics, 1 - show one by one, 2 - show all"
            + (", 3 - mark all as read" if catag == "unread" else "")
            + ", q - quit: ")
        if command == "p":
            export_xlsx(data)
        elif command == "0":
            print_stat(data)
        elif command in ("1", "2"):
            for i, clue in enumerate(data):
                db_result, n_rel_text = db.search(clue["code"], clue["rel_text"],
                                                  reverse_check=True)
                print(sep)
                print(f"> [{i + 1}/{len(data)}]")
                print(highlight("Starred: ", 33) + ("★" if is_star(clue["id"]) else ""))
                print(highlight("Post ID: ", 33) + clue["id"])
                print(highlight("Time: ", 33) + clue["time"])
                print(highlight("Source: ", 33) + clue["source"] + " | " + clue["name"])
                print(highlight("URL: ", 33) + clue["src_url"])
                print(highlight("Code: ", 33) + clue["code"])
                print(highlight("Context: ", 33) + n_rel_text)
                print(highlight("Type: ", 33) + clue["rel_type"])
                print(highlight("Existing records: ", 33))
                for s in db_result:
                    print(s)
                print(sep)
                if command == "1":
                    # Step through one entry at a time.
                    if catag == "unread":
                        print("Enter r to mark as read, ", end="")
                    next_step = input("s/u to star/unstar this entry, n for the next one, q to stop: ")
                    if next_step == "r" and catag == "unread":
                        mark_read_clue(clue)
                    elif next_step == "u":
                        unstar_clue(clue)
                    elif next_step == "s":
                        star_clue(clue)
                    elif next_step == "q":
                        break
        elif command == "3":
            if catag == "unread":
                for clue in data:
                    mark_read_clue(clue)
                print(f"Marked {len(data)} entries as read.")
            else:
                print("Invalid command.")
        elif command == "q":
            break
        else:
            print("Please enter a valid command.")
def filter_posts(data):
    print(highlight(f"{len(data)} clues in total."))
    author = input(highlight("Restrict to source accounts (empty for all, separate with spaces): "))
    author_list = author.split()
    start_date_str = input(highlight("Enter a start date (yyyy-mm-dd): "))
    while True:
        try:
            if start_date_str != "":
                start_date = datetime.datetime.strptime(start_date_str, "%Y-%m-%d")
            break
        except ValueError:
            start_date_str = input(highlight("Please enter a valid date (yyyy-mm-dd): "))
    end_date_str = input(highlight("Enter an end date (yyyy-mm-dd): "))
    while True:
        try:
            if end_date_str != "":
                end_date = datetime.datetime.strptime(end_date_str, "%Y-%m-%d")
            break
        except ValueError:
            end_date_str = input(highlight("Please enter a valid date (yyyy-mm-dd): "))
    keyword_filter = input(highlight("Enter f to drop clues without an epidemic keyword: "))
    filter_type = input(highlight("Enter u for unmatched clues only, m for matched only, a for all (default): "))
    result = []
    for element in data:
        # Source-account filter.
        if len(author_list) > 0:
            ok = False
            for author in author_list:
                if author in element["name"]:
                    ok = True
            if not ok:
                continue
        try:
            clue_date = datetime.datetime.strptime(element["time"], "%Y-%m-%d")
        except ValueError:
            clue_date = None
        # Keyword filter: keep only clues mentioning an epidemic keyword,
        # and highlight the keyword in the context.
        if keyword_filter == "f":
            contain_keyword = False
            for keyword in keyword_list:
                if keyword in element["rel_text"]:
                    contain_keyword = True
                    element["rel_text"] = element["rel_text"].replace(
                        keyword, highlight(keyword, 35))
            if not contain_keyword:
                continue
        # Date-range filter.
        if clue_date and start_date_str != "" and clue_date < start_date:
            continue
        if clue_date and end_date_str != "" and clue_date > end_date:
            continue
        # Matched/unmatched filter against the db.
        db_result, _ = db.search(element["code"], element["rel_text"], reverse_check=True)
        if filter_type == "u" and len(db_result) > 0:
            continue
        if filter_type == "m" and len(db_result) == 0:
            continue
        result.append(element)
    return result
from tools.common import highlight, find_codes, find_plain_code
import requests
import json

# Load the already-recorded entries, falling back to a mirror.
try:
    db = json.loads(
        requests.get("http://2019ncov2.toolmao.com/ncovadmin/list").text)["data"]
    print(highlight(f"Imported {len(db)} existing records."))
except Exception:
    try:
        db = json.loads(
            requests.get("http://2019ncov.nosugartech.com/data.json").text)["data"]
        print(highlight(f"Imported {len(db)} existing records."))
    except Exception:
        print(highlight("Failed to import existing records."))


def date_contain(date, text):
    # Collect the month/day spellings of `date` ("yyyy-mm-dd"), with and
    # without leading zeros.
    _, month, day = date.split("-")
    months = [month]
    days = [day]
    if month[0] == "0":
        months.append(month[1])
    if day[0] == "0":
        days.append(day[1])
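    # The rest of date_contain is truncated in the original. A hedged
    # reconstruction, based on how search() consumes it: it expects a
    # (matched, matched_substring) pair and highlights the substring.
    for m in months:
        for d in days:
            key = f"{m}月{d}日"   # e.g. "1月23日"
            if key in text:
                return True, key
    return False, ""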
def filter(text):
    # NB: shadows the builtin filter() inside this module.
    need_keyword = input(highlight("Filter by epidemic keywords? (y/n): "))
    print(highlight("Searching..."))
    result = find_codes(text)
    unmatched = []
    matched = []
    unmatched_r = []
    matched_r = []
    export_data = []
    for i, (code, rel_text, rel_type) in enumerate(result):
        if need_keyword == "y":
            ok = False
            for keyword in keyword_list:
                if keyword in rel_text:
                    ok = True
            if not ok:
                continue
        # Plain lookup.
        db_result, n_rel_text = db.search(code, rel_text)
        if len(db_result) == 0:
            unmatched.append((code, n_rel_text, rel_type, db_result))
        else:
            matched.append((code, n_rel_text, rel_type, db_result))
        # Reverse lookup: the entry's date must also appear in the context.
        db_result_r, n_rel_text_r = db.search(code, rel_text, True)
        if len(db_result_r) == 0:
            unmatched_r.append((code, n_rel_text_r, rel_type, db_result_r))
            export_data.append({"code": code, "rel_text": rel_text})
        else:
            matched_r.append((code, n_rel_text_r, rel_type, db_result_r))
    while True:
        command = input(
            highlight(
                f"""
Search finished: {len(result)} entries in total.
[c] - context only (off by default), [r] - reverse date check (off by default)
Enter u[c][r] for unmatched entries, m[c][r] for matched entries,
s to save the unmatched entries, q to quit: """, 31))
        print("")
        if "q" in command:
            break
        elif "u" in command:
            if "r" in command:
                display(unmatched_r, "c" in command)
            else:
                display(unmatched, "c" in command)
        elif "m" in command:
            if "r" in command:
                display(matched_r, "c" in command)
            else:
                display(matched, "c" in command)
        elif "s" in command:
            export_xlsx(export_data,
                        columns=["Context", "Code"],
                        ids=["rel_text", "code"])
new_end = "" for station in result: ret, new_start = equiv(station, start_station) if ret: start_time = result[station] ret, new_end = equiv(station, end_station) if ret: end_time = result[station] return start_time, end_time, new_start, new_end if __name__ == "__main__": print(highlight("请输入火车信息: 日期 - 车次 - 出发站 - 结束站,以#结束:")) infos = [] while True: text = input() if text == "#": break info = text.split() infos.append({ "date": info[0], "code": info[1], "start": info[2], "end": info[3] }) searcher = Searcher()
def filter_urls(data):
    need_keyword = input(highlight("Filter by epidemic keywords? (y/n): "))
    print(highlight("Searching..."))
    unmatched = []
    matched = []
    export_data = []
    for item in data:
        text = item["text"]
        url = item["url"]
        if url == "":
            continue
        result = find_codes(text)
        for i, (code, rel_text, rel_type) in enumerate(result):
            if need_keyword == "y":
                ok = False
                for keyword in keyword_list:
                    if keyword in rel_text:
                        ok = True
                if not ok:
                    continue
            db_result, n_rel_text = db.search(code, rel_text, reverse_check=True)
            if len(db_result) == 0:
                unmatched.append((code, n_rel_text, rel_type, db_result))
                export_data.append({
                    "code": code,
                    "rel_text": rel_text,
                    "src_url": url,
                })
            else:
                matched.append((code, n_rel_text, rel_type, db_result))
    while True:
        command = input(
            highlight(
                f"""
Search finished: {len(matched) + len(unmatched)} entries in total.
[c] - context only (off by default).
Enter u[c] for unmatched entries, m[c] for matched entries,
s to save the unmatched entries, q to quit: """, 31))
        print("")
        if "q" in command:
            break
        elif "u" in command:
            display(unmatched, "c" in command)
        elif "m" in command:
            display(matched, "c" in command)
        elif "s" in command:
            export_xlsx(export_data,
                        columns=["Context", "Code", "Source URL"],
                        ids=["rel_text", "code", "src_url"])
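# export_xlsx is defined elsewhere in the repo. A minimal sketch of one
# possible shape, assuming it maps each dict's `ids` keys onto the given
# column headers; the default path below is hypothetical.
def export_xlsx_sketch(data, columns, ids, path="export.xlsx"):
    import pandas as pd
    rows = [[item[k] for k in ids] for item in data]
    pd.DataFrame(rows, columns=columns).to_excel(path, index=False)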