def main(save_path, file_path=""):
    """Search Baidu for every keyword and cache each result as a JSON file.

    Args:
        save_path: Directory holding the ``<keyword>.json`` result files.
            Keywords that already have a file there are skipped.
        file_path: Optional text file with one keyword per line.  When it is
            the empty string the module-level ``keyword_list`` is used.

    The number of result pages to fetch is asked interactively.
    """
    if file_path == "":
        code_list = keyword_list
    else:
        with open(file_path, "r") as f:
            # Strip line endings (and skip blank lines): a trailing "\n" would
            # otherwise end up in the query, in the output file name and would
            # defeat the "already searched" lookup below.
            code_list = [line.strip() for line in f if line.strip()]

    page_num = int(input(highlight("请输入搜索页数: ")))

    crawler = baidu_crawler()

    # Names (without the .json suffix) of the results already on disk.
    # splitext keeps keywords that themselves contain a dot intact.
    exist_names = {
        os.path.splitext(os.path.basename(filename))[0]
        for filename in glob.glob(os.path.join(save_path, "*.json"))
    }

    for i, code in enumerate(code_list):
        print(highlight(f"[{i + 1} / {len(code_list)}] 正在搜索 {code} ..."))
        if code in exist_names:
            print(highlight("跳过,因为已经搜索过。"))
            # Actually skip: without this the keyword was crawled again and
            # its cached file overwritten despite the message above.
            continue
        result = crawler.get_result(code + " site:gov.cn", pages=page_num)
        with open(os.path.join(save_path, code + ".json"), "w") as f:
            json.dump({"keyword": code, "data": result}, f)
Esempio n. 2
0
def filter_xlsx(xlsx_path):
    """Scan every row of a spreadsheet for codes and check them against the db.

    Each row is flattened into one ``" | "``-separated string, ``find_codes``
    extracts the codes from it, and every code is looked up with
    ``db.search(..., reverse_check=True)``.  Rows for which no code produced a
    database hit are collected so they can be saved to a new xlsx file.

    After scanning, an interactive prompt lets the user list matched or
    unmatched entries, save the remaining rows, or quit.

    Args:
        xlsx_path: Path of the spreadsheet to read; the first row is the header.
    """
    df = pd.read_excel(xlsx_path, header=[0])

    matched = []
    unmatched = []

    # Rows without any database hit, with the same columns as the input.
    df_remain = pd.DataFrame(columns=df.columns)

    for i in range(df.shape[0]):

        # Positional access, so this does not depend on the index labels.
        row = df.iloc[i]
        raw_str = "".join(" | " + str(item) for item in row.values)

        ok = False

        for code, rel_text, rel_type in find_codes(raw_str):

            db_result, n_rel_text = db.search(code,
                                              rel_text,
                                              reverse_check=True)

            if len(db_result) == 0:
                unmatched.append((code, n_rel_text, rel_type, db_result))
            else:
                matched.append((code, n_rel_text, rel_type, db_result))
                ok = True

        if not ok:
            df_remain.loc[df_remain.shape[0]] = row

    while True:

        command = input(
            highlight(
                f"""
        \n检索完成。共检索条目{len(matched) + len(unmatched)}条。 
输入 u[c] 查看未匹配条目,输入 m[c] 查看匹配条目,输入 q 退出,输入 s 保存未匹配: """, 31))
        print("")

        # Commands are matched by substring, so "uc" means "u" plus option "c".
        if "q" in command:
            break
        elif "u" in command:
            display(unmatched, "c" in command)
        elif "m" in command:
            display(matched, "c" in command)
        elif "s" in command:
            filepath = input(highlight("请输入保存文件名 (xlsx): "))
            # The context manager writes and closes the file on exit;
            # ExcelWriter.save() no longer exists in pandas >= 2.0.
            with pd.ExcelWriter(filepath) as writer:
                df_remain.to_excel(writer, header=True)
Esempio n. 3
0
def main():
    """Run the interactive import menu until the user enters ``q``.

    Choices:
        1 - read URLs from stdin (terminated by ``#``), fetch each one as
            plain text with ``browser().get_result`` and hand the collected
            pages to ``filter_urls``.
        2 - read a text file and hand its content to ``filter``.
        3 - hand an xlsx path to ``filter_xlsx``.
        4 - run ``ocr_text`` on an image, echo the text and hand it to
            ``filter``.

    Any other input prints a hint and shows the menu again.
    """
    while True:

        choice = input(
            highlight(
                """\n1 - 输入url
2 - 直接导入文本
3 - 导入xlsx
4 - 导入图片
q - 退出:

请输入指令: """, 31))

        if choice == "q":
            break

        if choice == "1":
            print(highlight("输入url, 以#结束: "))

            pages = []
            fetcher = browser()

            # iter() with a sentinel keeps calling input() until "#" is typed;
            # empty lines are ignored.
            for url in iter(input, "#"):
                if url:
                    pages.append({
                        "url": url,
                        "text": fetcher.get_result(url, plain=True)
                    })

            filter_urls(pages)
        elif choice == "2":
            text_path = input("输入文本文件: ")
            with open(text_path, "r") as handle:
                content = handle.read()
            filter(content)
        elif choice == "3":
            filter_xlsx(input("输入 xlsx 文件: "))
        elif choice == "4":
            recognized = ocr_text(input("输入图片文件: "))
            print("\n" + recognized + "\n")
            filter(recognized)
        else:
            print("请输入正确指令。")
Esempio n. 4
0
def main():
    """Run the crawler/indexing menu for one task until the user enters ``q``.

    The task number entered first is passed to ``paths.update_path``; the
    menu then dispatches to the weibo/baidu crawlers, the indexing steps
    (which call ``paths.clear`` first), the three ``display`` views and the
    download clean-up.
    """
    idx = input(highlight("请输入本次任务编号(用来避免重复爬取):  "))

    paths.update_path(idx)

    while True:

        command = input(
            highlight(
                """\n1 - 爬取微博
2 - 对爬取微博进行索引(清空所有列表)
3 - 查看未处理相关新闻
4 - 查看已处理相关新闻
5 - 查看标星新闻
6 - 反向搜索航班/车次相关政府新闻
7 - 对政府新闻进行索引(清空所有列表)
8 - 清空下载内容
q - 退出: 

请输入指令: """, 31))

        if command == "q":
            break
        elif command == "1":
            weibo_crawler.main(save_path=paths.weibo_path)
        elif command == "2":
            paths.clear(idx)
            process_weibo()
        elif command == "3":
            display(paths.unread_post_filepath, "unread")
        elif command == "4":
            display(paths.read_post_filepath, "read")
        elif command == "5":
            display(paths.star_post_filepath, "star")
        elif command == "6":
            code_file = input(highlight("输入车次/航班列表 (留空则默认):"))
            baidu_crawler.main(save_path=paths.baidu_path, file_path=code_file)
        elif command == "7":
            paths.clear(idx)
            process_baidu()
        elif command == "8":
            try:
                empty_dir(paths.baidu_path)
                empty_dir(paths.weibo_path)
            except Exception:
                # Best effort: report and return to the menu.  Catching
                # Exception (not a bare except) lets Ctrl-C / SystemExit
                # still terminate the program.
                print(highlight("失败。", 31))
            else:
                print(highlight("成功。"))
        else:
            print("请重新输入指令。")
Esempio n. 5
0
def _join_ocr_items(items):
    """Concatenate OCR items into text, starting a new line on a y jump > 9."""
    if not items:
        # Nothing recognized: the original indexed items[0] and crashed here.
        return ""

    parts = []
    last_y = items[0]["itemcoord"][0]["y"]
    for item in items:
        y = item["itemcoord"][0]["y"]
        if y - last_y > 9:
            last_y = y
            parts.append("\n")
        parts.append(item["itemstring"] + " ")
    return "".join(parts)


def ocr_text(image_path):
    """Send an image to the api.ai.qq.com general OCR endpoint and return text.

    The image is base64-encoded and posted together with ``app_id``, a random
    nonce, a timestamp and the signature from ``get_sign_str``.  The request
    is tried up to five times, one second apart.

    Args:
        image_path: Path of the image file to recognize.

    Returns:
        The recognized text, each item followed by a space and a newline
        inserted whenever the item's y coordinate is more than 9 below the
        current line's; ``""`` if all attempts fail or nothing is recognized.
    """
    with open(image_path, "rb") as image_file:
        enc = base64.b64encode(image_file.read())

    data = {
        "app_id":
        app_id,
        "image":
        enc,
        "nonce_str":
        ''.join(random.sample(string.digits + string.ascii_letters, 32)),
        "time_stamp":
        int(time.time()),
    }

    data["sign"] = get_sign_str(data)

    url = "https://api.ai.qq.com/fcgi-bin/ocr/ocr_generalocr"
    attempts = 5

    for attempt in range(attempts):

        # Parse the response body once instead of on every access.
        payload = requests.post(url, data=data).json()

        if payload["ret"] == 0:
            print(highlight("识别图片成功。"))
            return _join_ocr_items(payload["data"]["item_list"])

        print(highlight(f"识别图片失败,原因:{payload['msg']}"))

        # Wait before retrying, but not after the final attempt.
        if attempt < attempts - 1:
            time.sleep(1)

    return ""
Esempio n. 6
0
def db_unique(f_type):
    """Print every pair of ``db`` records of type ``f_type`` that look alike.

    Two records form a pair when their ``t_no`` fields share at least one
    code found by ``find_plain_code`` (case-insensitive) and their ``t_date``
    values are equal.  When both records have ``t_type == 2`` their
    ``t_no_sub`` fields must share a code as well.  Pairs in which either
    record has ``verified == 2`` are not printed.
    """

    def shares_code(left, right):
        # True when any code found in `left` is also found in `right`.
        left_codes = list(find_plain_code(left.upper()))
        right_codes = list(find_plain_code(right.upper()))
        return any(code in right_codes for code in left_codes)

    for position, first in enumerate(db):

        for second in db[position + 1:]:

            if first["t_type"] != f_type or second["t_type"] != f_type:
                continue

            if (not shares_code(first["t_no"], second["t_no"])
                    or first["t_date"] != second["t_date"]):
                continue

            both_type_two = first["t_type"] == 2 and second["t_type"] == 2
            if both_type_two and not shares_code(first["t_no_sub"],
                                                 second["t_no_sub"]):
                continue

            if first["verified"] == 2 or second["verified"] == 2:
                continue

            banner = "=" * 100
            print(banner)
            print(highlight(f"id = {first['id']} :"))
            print(first)
            print(highlight(f"id = {second['id']} :"))
            print(second)
            print(banner)
Esempio n. 7
0
def search(code, rel_text, reverse_check=False):
    """Look up ``code`` in ``db`` and format one summary line per hit.

    A db element is a hit when one of the codes ``find_codes`` extracts from
    its ``t_no`` (spaces removed) equals ``code`` (upper-cased, spaces
    removed).  ``date_contain`` then tells whether the element's date occurs
    in ``rel_text``.

    Args:
        code: Code to search for.
        rel_text: Context text in which the code was found.
        reverse_check: When true, only hits whose date also occurs in
            ``rel_text`` are returned.

    Returns:
        ``(result, new_rel_text)``: the list of formatted summary lines, and
        ``rel_text`` with every matched date key highlighted.
    """
    result = []
    dates = []

    code = code.upper().replace(" ", "")

    for i, element in enumerate(db):

        code_in_no = find_codes(element["t_no"].replace(" ", ""))

        # Each element is reported at most once, even if its t_no yields the
        # code several times.
        if not any(t_no == code for t_no, _, _ in code_in_no):
            continue

        date_ok, date_key = date_contain(element["t_date"], rel_text)

        if element["verified"] == 1:
            verified = highlight("已审核")
        elif element["verified"] == 0:
            verified = highlight("未处理", 31)
        else:
            verified = highlight("未通过", 31)

        if date_ok:
            date_text = highlight(element["t_date"], 35)
            # Record each key once: replacing the same key twice below would
            # wrap already-highlighted text again, and an empty key would
            # inject markup between every character.
            if date_key and date_key not in dates:
                dates.append(date_key)
        else:
            date_text = element["t_date"]

        if date_ok or not reverse_check:
            result.append(
                f"\t [#{i + 1}]  {date_text} | {highlight(element['t_no'], 32)} | {element['t_no_sub']} | 出发: {element['t_pos_start']} | 到达: {element['t_pos_end']} | {verified}"
            )

    new_rel_text = rel_text

    for x in dates:
        new_rel_text = new_rel_text.replace(x, highlight(x, 35))

    return result, new_rel_text
Esempio n. 8
0
def display(ls, single=False):
    """Print a list of ``(code, rel_text, rel_type, db_result)`` entries.

    Args:
        ls: Entries to print.
        single: When true, print only a separator line and the context text
            of each entry.  Otherwise print the full record (position, code,
            context, type and the existing database lines) framed by
            separator lines.
    """
    rule = "============================================================================"

    for position, (code, rel_text, rel_type, db_result) in enumerate(ls,
                                                                     start=1):
        print(rule)

        if single:
            print(rel_text)
            continue

        print(f"> [{position}/{len(ls)}]")
        print(highlight("相关编号:", 33) + code)
        print(highlight("上下文:", 33) + rel_text)
        print(highlight("类型:", 33) + rel_type)
        print(highlight("已存在数据: ", 33))
        for line in db_result:
            print(line)
        print(rule)
Esempio n. 9
0
def main():
    """Check that each db record's source page still mentions its code.

    For every item in ``db`` the ``source`` URL is downloaded (decoded as
    UTF-8).  The reference code is the first code ``find_codes`` extracts
    from the item's ``t_no``, or the raw ``t_no`` when none is found.  The
    item passes when any code found in the page has ``similarity`` above 0.3
    with the reference, or when the reference occurs verbatim in the page.
    Items that do not pass are printed and their ids collected in
    ``fail_list``.
    """
    fail_list = []

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    for position, item in enumerate(db, start=1):

        # Progress indicator; overwritten via "\r" once the page is parsed.
        print(f"{position} / {len(db)} ... ", end="", flush=True)

        response = requests.get(item["source"], headers=request_headers)
        response.encoding = "utf-8"

        own_codes = find_codes(item["t_no"])
        if len(own_codes) > 0:
            idx, _, _ = own_codes[0]
        else:
            idx = item["t_no"]
        page_codes = find_codes(response.text)

        print("\r", end="")

        similar = any(similarity(code, idx) > 0.3 for code, _, _ in page_codes)
        if similar or idx in response.text:
            continue

        print(highlight(f"NO id = {item['id']} | ", 31) + " " + item["source"] + " " + item["t_no"])
        fail_list.append(item["id"])
Esempio n. 10
0
def _print_counts(title, counts):
    """Print ``title`` followed by the ``counts`` entries, fewest first."""
    print(highlight(title, 33))
    for key, count in sorted(counts.items(), key=lambda item: item[1]):
        print(highlight(key, 31) + ": " + highlight(count, 32))


def print_stat(data):
    """Print how many entries each author and each date contributes.

    Args:
        data: Iterable of dicts with at least the keys ``"name"`` (author)
            and ``"time"`` (date).

    Both tables are sorted by ascending count; ties keep first-seen order.
    """
    # The dicts themselves track which keys were seen (and in which order),
    # so no parallel "seen" lists with linear membership tests are needed.
    author_count = {}
    date_count = {}

    for element in data:
        author = element["name"]
        date = element["time"]
        author_count[author] = author_count.get(author, 0) + 1
        date_count[date] = date_count.get(date, 0) + 1

    _print_counts("\n来源统计: ", author_count)
    _print_counts("\n日期统计: ", date_count)
Esempio n. 11
0
        if name in final_name_list:
            continue

        for k in range(5):

            url = "https://s.weibo.com/user?q=" + parse.quote(
                name) + "&Refer=user_user"
            headers = {'User-Agent': random_ua()}
            r = requests.get(url, headers=headers)
            r.encoding = "utf-8"

            result = re.search(r"weibo.com/u/(\d+)", r.text)

            if result:
                final_name_list.append(name)
                final_id_list.append(result.group(1))
                print(highlight(name, 32) + " " + result.group(1))
                break
            else:
                print(highlight(name, 31))

            time.sleep(1)

        time.sleep(1 + random.randint(1, 2))

        if i % 15 == 0:
            time.sleep(7)

    print(f"name_list = {final_name_list}")
    print(f"id_list = {final_id_list}")
Esempio n. 12
0
def display(filepath, catag):
    """Browse the clues stored in a JSON file interactively.

    The file is loaded, optionally narrowed with ``filter_posts``, and a
    command prompt is shown until ``q`` is entered:

        p - export the data with ``export_xlsx``
        0 - print statistics with ``print_stat``
        1 - show the clues one by one, asking after each what to do next
        2 - show all clues without pausing
        3 - mark every clue as read (only when ``catag`` is ``"unread"``)

    Args:
        filepath: Path of the JSON file holding the list of clues.
        catag: Category of the list; ``"unread"`` enables the mark-as-read
            actions.
    """
    with open(filepath, "r") as f:
        data = json.load(f)

    if input(highlight("需要筛选吗 (y/n): ")) == "y":
        data = filter_posts(data)

    unread_view = catag == "unread"
    rule = "============================================================================"

    while True:

        mark_all_hint = ",3 - 全部标记为已读" if unread_view else ""
        command = input(
            f"\n载入 {len(data)} 条数据。输入 p - 导出到 xlsx,0 - 显示统计, 1 - 逐条显示,2 - 全部显示"
            + mark_all_hint + ",q - 退出: ")

        if command == "q":
            break

        if command == "p":
            export_xlsx(data)
            continue

        if command == "0":
            print_stat(data)
            continue

        if command == "3":
            if not unread_view:
                print("指令无效。")
                continue
            for clue in data:
                mark_read_clue(clue)
            print(f"已标记 {len(data)} 条数据为已读。")
            continue

        if command not in ("1", "2"):
            print("请输入正确的指令。")
            continue

        for number, clue in enumerate(data, start=1):

            db_result, n_rel_text = db.search(clue["code"],
                                              clue["rel_text"],
                                              reverse_check=True)

            print(rule)
            print(f"> [{number}/{len(data)}]")
            star_mark = "★" if is_star(clue["id"]) else ""
            print(highlight("是否标星", 33) + star_mark)
            print(highlight("Post ID:", 33) + clue["id"])
            print(highlight("时间:", 33) + clue["time"])
            print(
                highlight("来源:", 33) + clue["source"] + " | " + clue["name"])
            print(highlight("网页:", 33) + clue["src_url"])
            print(highlight("相关编号:", 33) + clue["code"])
            print(highlight("上下文:", 33) + n_rel_text)
            print(highlight("类型:", 33) + clue["rel_type"])
            print(highlight("已存在数据: ", 33))
            for line in db_result:
                print(line)
            print(rule)

            # Mode "2" dumps everything; only mode "1" pauses per clue.
            if command != "1":
                continue

            if unread_view:
                print("输入 r 标记为已读,", end="")
            next_step = input("(u)s (取消)标星该条数据,n 跳转到下一条,q 结束阅读。")

            if next_step == "q":
                break
            if next_step == "r" and unread_view:
                mark_read_clue(clue)
            elif next_step == "u":
                unstar_clue(clue)
            elif next_step == "s":
                star_clue(clue)
Esempio n. 13
0
def _prompt_date(prompt):
    """Ask for a yyyy-mm-dd date until it parses; empty input returns None."""
    while True:
        raw = input(highlight(prompt))
        if raw == "":
            return None
        try:
            return datetime.datetime.strptime(raw, "%Y-%m-%d")
        except ValueError:
            # Ask again instead of re-parsing the same bad string forever.
            print(highlight("请输入正确日期格式。"))


def filter_posts(data):
    """Interactively narrow a list of clues and return the ones that remain.

    The user is asked for, in order: source names (space separated, empty
    for any), a start date, an end date (both optional, ``yyyy-mm-dd``),
    whether to keep only clues whose context contains a ``keyword_list``
    entry (``f``), and whether to keep only unmatched (``u``) or matched
    (``m``) clues according to ``db.search(..., reverse_check=True)``.

    Args:
        data: List of clue dicts with at least ``"name"``, ``"time"``,
            ``"rel_text"`` and ``"code"``.

    Returns:
        A new list with the clues that pass every filter.  When the keyword
        filter is active, the ``"rel_text"`` of kept clues is updated in
        place with the keywords highlighted.
    """
    print(highlight(f"共 {len(data)} 条线索。"))

    author = input(highlight("指定来源微博 (不指定则为空,多个用空格隔开): "))
    author_list = author.split()

    start_date = _prompt_date("输入开始日期 (yyyy-mm-dd): ")
    # The original parsed the *start* string here, so the end bound was wrong.
    end_date = _prompt_date("输入结束日期 (yyyy-mm-dd): ")

    keyword_filter = input(highlight("输入 f 筛选不包含疫情关键字的线索: "))

    filter_type = input(highlight("输入 u 只显示未匹配线索,m 只显示匹配线索, a 显示所有线索(默认): "))

    result = []

    for element in data:

        if author_list and not any(name in element["name"]
                                   for name in author_list):
            continue

        try:
            clue_date = datetime.datetime.strptime(element["time"], "%Y-%m-%d")
        except (KeyError, TypeError, ValueError):
            # Clues without a parseable date are never excluded by date.
            clue_date = None

        if clue_date is not None:
            if start_date is not None and clue_date < start_date:
                continue
            if end_date is not None and clue_date > end_date:
                continue

        if keyword_filter == "f":

            contain_keyword = False

            for keyword in keyword_list:
                if keyword in element["rel_text"]:
                    contain_keyword = True
                    element["rel_text"] = element["rel_text"].replace(
                        keyword, highlight(keyword, 35))

            if not contain_keyword:
                continue

        db_result, _ = db.search(element["code"],
                                 element["rel_text"],
                                 reverse_check=True)

        if filter_type == "u" and len(db_result) > 0:
            continue

        if filter_type == "m" and len(db_result) == 0:
            continue

        result.append(element)

    return result
Esempio n. 14
0
from tools.common import highlight, find_codes, find_plain_code
import requests
import json

# Endpoints serving the already-recorded entries, tried in order.
_DB_SOURCES = (
    "http://2019ncov2.toolmao.com/ncovadmin/list",
    "http://2019ncov.nosugartech.com/data.json",
)

# Default to an empty list so that functions reading `db` still work (and
# simply find nothing) when every endpoint is unreachable, instead of
# failing later with a NameError.
db = []

for _source_url in _DB_SOURCES:
    try:
        # A timeout keeps an unresponsive server from hanging the import.
        db = json.loads(requests.get(_source_url, timeout=30).text)["data"]
    except Exception:
        # Network error, bad JSON or missing "data": try the next endpoint.
        continue
    print(highlight(f"导入已录入信息 {len(db)} 条。"))
    break
else:
    print(highlight("导入已录入信息失败。"))


def date_contain(date, text):
    """Collect the month/day spellings of ``date`` to look for in ``text``.

    ``date`` must split on ``"-"`` into exactly three parts; the first part
    is ignored.

    NOTE(review): the visible body ends after building ``months`` and
    ``days``; it never reads ``text`` and returns nothing, although ``search``
    unpacks a ``(date_ok, date_key)`` pair from this call.  The remainder of
    the function appears to be missing here - confirm against the full file.
    """
    _, month, day = date.split("-")

    # Candidate spellings: always the zero-padded form as given ...
    months = [month]
    days = [day]

    # ... plus the second character alone when the first one is "0"
    # (e.g. "02" -> "2").
    if month[0] == "0":
        months.append(month[1])

    if day[0] == "0":
        days.append(day[1])
Esempio n. 15
0
def filter(text):
    """Extract codes from ``text``, look them up and browse the outcome.

    Every code found by ``find_codes`` (optionally only those whose context
    contains a ``keyword_list`` entry) is searched twice with ``db.search``:
    once plainly and once with the reverse date check.  The user can then
    list matched/unmatched entries of either search or export the entries
    left unmatched by the reverse search.

    Args:
        text: Raw text to scan.
    """
    need_keyword = input(highlight("是否需要检索关键词? (y/n): "))

    print(highlight("开始检索..."))

    result = find_codes(text)

    unmatched, matched = [], []
    unmatched_r, matched_r = [], []
    export_data = []

    for code, rel_text, rel_type in result:

        if need_keyword == "y" and not any(keyword in rel_text
                                           for keyword in keyword_list):
            continue

        # Plain search: any database entry with this code counts as a match.
        hits, shown_text = db.search(code, rel_text)
        bucket = unmatched if len(hits) == 0 else matched
        bucket.append((code, shown_text, rel_type, hits))

        # Reverse search: the entry's date must also occur in the context.
        hits_r, shown_text_r = db.search(code, rel_text, True)
        if len(hits_r) == 0:
            unmatched_r.append((code, shown_text_r, rel_type, hits_r))
            export_data.append({"code": code, "rel_text": rel_text})
        else:
            matched_r.append((code, shown_text_r, rel_type, hits_r))

    while True:

        command = input(
            highlight(
                f"""
        \n检索完成。共检索条目{len(result)}条。
[c] - 只输出上下文,默认关闭, [r] - 反向检索日期,默认关闭 
输入u[c][r]查看未匹配条目,输入m[c][r]查看匹配条目,
输入 s 将保存未匹配条目
输入q退出: """, 31))

        print("")

        # Commands are matched by substring: "ucr" = unmatched + both options.
        context_only = "c" in command
        reverse = "r" in command

        if "q" in command:
            break
        elif "u" in command:
            display(unmatched_r if reverse else unmatched, context_only)
        elif "m" in command:
            display(matched_r if reverse else matched, context_only)
        elif "s" in command:
            export_xlsx(export_data,
                        columns=["相关文本", "相关编号"],
                        ids=["rel_text", "code"])
        new_end = ""

        for station in result:
            ret, new_start = equiv(station, start_station)
            if ret:
                start_time = result[station]
            ret, new_end = equiv(station, end_station)
            if ret:
                end_time = result[station]

        return start_time, end_time, new_start, new_end


if __name__ == "__main__":

    # Read whitespace-separated "date code start end" records from stdin
    # until a line consisting of "#" is entered.
    print(highlight("请输入火车信息: 日期 - 车次 - 出发站 - 结束站,以#结束:"))
    infos = []

    while True:
        text = input()
        if text != "#":
            info = text.split()
            infos.append(
                dict(date=info[0], code=info[1], start=info[2], end=info[3]))
            continue
        break

    searcher = Searcher()
Esempio n. 17
0
def filter_urls(data):
    """Extract codes from fetched pages, look them up and browse the outcome.

    For every page, each code found by ``find_codes`` (optionally only those
    whose context contains a ``keyword_list`` entry) is searched with
    ``db.search(..., reverse_check=True)``.  The user can then list matched
    or unmatched entries, or export the unmatched ones together with their
    source URL.

    Args:
        data: List of dicts with the keys ``"url"`` and ``"text"``; items
            with an empty URL are ignored.
    """
    need_keyword = input(highlight("是否需要检索关键词? (y/n): "))

    print(highlight("开始检索..."))

    unmatched, matched = [], []
    export_data = []

    for page in data:

        text = page["text"]
        url = page["url"]

        if url == "":
            continue

        for code, rel_text, rel_type in find_codes(text):

            if need_keyword == "y" and not any(keyword in rel_text
                                               for keyword in keyword_list):
                continue

            hits, shown_text = db.search(code, rel_text, reverse_check=True)

            if len(hits) != 0:
                matched.append((code, shown_text, rel_type, hits))
                continue

            unmatched.append((code, shown_text, rel_type, hits))
            export_data.append({
                "code": code,
                "rel_text": rel_text,
                "src_url": url
            })

    while True:

        command = input(
            highlight(
                f"""
        \n检索完成。共检索条目{len(matched) + len(unmatched)}条。
[c] - 只输出上下文,默认关闭。
输入 u[c] 查看未匹配条目,
输入 m[c] 查看匹配条目,
输入 s 将保存未匹配条目
输入q退出: """, 31))
        print("")

        # Commands are matched by substring: "uc" = unmatched, context only.
        context_only = "c" in command

        if "q" in command:
            break
        elif "u" in command:
            display(unmatched, context_only)
        elif "m" in command:
            display(matched, context_only)
        elif "s" in command:
            export_xlsx(export_data,
                        columns=["相关文本", "相关编号", "来源网址"],
                        ids=["rel_text", "code", "src_url"])