Example #1
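The reader below relies on pdfminer; a minimal sketch of the imports it assumes (the surrounding module may already pull these in elsewhere, so treat this as illustrative):

# Assumed pdfminer.six imports for read_pdf_to_text (illustrative sketch)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfpage import PDFPage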
def read_pdf_to_text(path):
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")

    # Return the text objects of each page, page by page
    source_pdf = open(path, 'rb')
    # Create a PDF resource manager to share resources
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object (page aggregator)
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Iterate over the pages, processing one page at a time
    # PDFPage.get_pages() returns the page list
    for page in PDFPage.get_pages(source_pdf):
        interpreter.process_page(page)
        # Get the LTPage object for this page
        layout = device.get_result()
        # layout is an LTPage object that holds the objects parsed from this page,
        # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.
        # To get the text, read the object's text attribute
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                results = x.get_text()
                print(results)
        print("-----------")
        # print("-------------------")

    function_logger.info("pdf文件读取文本完成!")
def build_template_by_infos(template_path: str,
                            fields_question_condition: list,
                            fields_question_target: list,
                            template_sentence_questions: list,
                            template_sentence_answers: list):
    """
    通过提供的信息构造问题模板
    :param template_path: 模板路径
    :param fields_question_condition: 问句条件词,例["school 学校", "year 年份", "major 专业", "district 省份", "classy 类别"]
    :param fields_question_target: 问句目标词,例["numbers 招生人数 招生计划 招多少人 招生计划是多少 招生人数是多少"]
    :param template_sentence_questions: 模板问题句
    :param template_sentence_answers: 模板答案句
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始构造%s的问题模板..." % template_path.split("\\")[-1])
    # Store the template dict as a pickle file
    template_dict = {}
    template_dict["fq_conditon"] = fields_question_condition
    template_dict["fq_target"] = fields_question_target
    template_dict["ts_answers"] = template_sentence_answers
    build_question_sentences = []
    # Collect the English field names of the question condition fields
    template_fields_en = []
    for fq_condition in fields_question_condition:
        template_fields_en.append(fq_condition.split(" ")[0])
    # Build subsets of the English condition fields; generate every question permutation by substitution
    fields_en_subset = get_subset_binary(template_fields_en)
    print(len(fields_en_subset))
    for i_question in range(len(template_sentence_questions)):
        # Find the question target field contained in the current template question
        match_question_target = []
        for fq_target in fields_question_target:
            if fq_target.split(
                    " ")[0] in template_sentence_questions[i_question]:
                match_question_target = fq_target.split(" ")
                # Stop at the first match; assume a question contains only one target field
                break
        # Substitute each phrasing of the matched target field
        target_word = match_question_target[0]
        for question_mode in match_question_target[1:]:
            # Apply the substitution for each subset of condition fields
            for subset in fields_en_subset:
                sentence = template_sentence_questions[i_question].replace("(" + target_word + ")", question_mode) \
                           + "--" + str(i_question)
                # Empty subset: no parameters omitted
                if not subset:
                    build_question_sentences.append(sentence)
                # Subset equals the full set: skip it
                elif len(subset) == len(template_fields_en):
                    continue
                # Proper subset: remove its fields from the sentence, then add it
                else:
                    for field_en in subset:
                        sentence = sentence.replace("(" + field_en + ")", "")
                    build_question_sentences.append(sentence)
    template_dict["ts_questions"] = build_question_sentences
    with open(template_path, "wb") as p_file:
        pickle.dump(template_dict, p_file)
    function_logger.info("%s的问题模板构建完成!" % template_path.split("\\")[-1])
def build_university_major_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("构造专业名称词典...")
    source_path = "Information/大学/大学学科(百度百科网页源码).txt"
    with open(source_path, "r", encoding="utf-8") as source_file:
        main_page_source = source_file.read()

    # Attempted bs4 parsing (kept for reference)
    # main_page_soup = BeautifulSoup(main_page_source, "lxml")
    # main_page_soup.prettify()
    #
    # source_major_list = main_page_soup.find_all("div", class_="para")
    # i=0
    # while source_major_list[i].text.find("01学科门类") == -1:
    #     i += 1
    # else:
    #     cut_index = i
    # source_major_list = source_major_list[cut_index:]
    # for item in source_major_list:
    #     print(item.text)

    # Extract the university major catalogue with a regex
    result = re.findall(r'\d{4,6}[KT]*\s*[\u4e00-\u9fa5]+', main_page_source)
    # Trim entries that are not part of the catalogue
    result = result[4:-4]
    # Build the dictionary file from the extracted entries
    with open(dictionary_path + "/major.txt", "w",
              encoding="utf-8") as major_dict:
        major_dict.truncate()
        for item in result:
            major_dict.write(re.findall(r'[\u4e00-\u9fa5]+', item)[0] + "\n")
    function_logger.info("构造专业名称词典完成")
Example #4
def load_table_content(file_path: str):
    """
    通过excel表格加载表格内容
    :param file_path:
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Load the Excel workbook
    function_logger.info("加载表格:%s" % file_path.split("\\")[-1])
    wb = load_workbook(file_path)
    sheet_names = wb.sheetnames
    sheet_first = wb[sheet_names[0]]
    table_head = []
    for item in range(1, sheet_first.max_column + 1):
        table_head.append(sheet_first.cell(row=1, column=item).value)
    function_logger.debug("表头:%s" % str(table_head))
    table_attr = {}
    for i_column in range(1, sheet_first.max_column + 1):
        column_name = sheet_first.cell(row=1, column=i_column).value
        column_value = set()
        for i_row in range(2, sheet_first.max_row + 1):
            column_value.add(
                sheet_first.cell(row=i_row, column=i_column).value)
        table_attr[column_name] = str(list(column_value))
    for key in table_attr:
        function_logger.debug(key)
        value_list = [
            value.replace("'", "").strip()
            for value in table_attr[key][1:-1].split(",")
        ]
        value_list.sort()
        function_logger.debug("列表长度:%d" % len(value_list))
        function_logger.debug(str(value_list))
    function_logger.info("加载表格:%s完成!" % file_path.split("\\")[-1])
Example #5
def build_mysql_string_by_template_and_keymap(template_question: str,
                                              template_question_type: str,
                                              keyword_dict: dict) -> str:
    """
    通过模板类型及关键词键值映射返回mysql语句
    :param template_question: 模板问题
    :param template_question_type: 模板问题类型
    :param keyword_dict: 关键词映射
    :return: SQL语句
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始构造MySQL语句...")
    search_table = template_question_type
    # Extract the slots from the template sentence
    pattern = re.compile(r"[(].*?[)]")
    slots = re.findall(pattern, template_question)
    # print(slots)
    # Build the SQL string
    mysql_string = ""
    for i_slot in range(len(slots)):
        # function_logger.debug("slot:"+slots[i_slot][1:-1])
        key = keyword_dict["search_" + slots[i_slot][1:-1]]
        # function_logger.debug("key"+key)
        if key == "":
            continue
        else:
            mysql_string += slots[i_slot][1:-1] + "='" + key + "' and "
    if mysql_string != "":
        mysql_string = "select * from " + search_table + " where " + mysql_string[:
                                                                                  -5] + ";"
    function_logger.info("MySQL语句构造完成!")
    return mysql_string
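A hedged usage sketch: the keyword_dict keys follow the "search_" + slot convention assumed by the loop above, and the values are illustrative.

# Hypothetical example showing the SQL string this function would return.
question = "(school)(year)在(district)招生人数"
keymap = {"search_school": "哈尔滨工业大学", "search_year": "2018", "search_district": "河南"}
sql = build_mysql_string_by_template_and_keymap(question, "admission_plan", keymap)
# -> "select * from admission_plan where school='哈尔滨工业大学' and year='2018' and district='河南';"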
Example #6
def build_mysql_string_by_template(template_question: str,
                                   template_question_type: str) -> str:
    """
    通过模板类型(槽位)构造MySQL语句
    :param template_question: 模板问句
    :param template_question_type: 模板问句类型
    :return: 对应的mysql语句
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始构造MySQL语句...")
    search_table = template_question_type
    # Extract the slots from the template sentence
    pattern = re.compile(r"[(].*?[)]")
    slots = re.findall(pattern, template_question)
    # print(slots)
    # Build the SQL template string
    mysql_string = "select * from " + search_table + " where "

    for i_slot in range(len(slots)):
        if i_slot == len(slots) - 1:
            mysql_string += "[" + slots[i_slot][1:-1] + "='" + slots[
                i_slot] + "'" + "]"
        else:
            mysql_string += "[" + slots[i_slot][1:-1] + "='" + slots[
                i_slot] + "' and " + "]"
    mysql_string += ";"
    function_logger.info("MySQL语句构造完成!")
    return mysql_string
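For illustration, a hypothetical call showing the bracketed SQL template this function produces; later stages are expected to fill in or drop the bracketed clauses.

tpl = build_mysql_string_by_template("(school)(year)在(district)招生人数", "admission_plan")
# -> "select * from admission_plan where [school='(school)' and ][year='(year)' and ][district='(district)'];"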
def pretreat_crawl_questions():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    data_dir = "Information/大学/常问问题集/Data"
    pickle_dir = "Information/大学/常问问题集/Pickle"
    file_list = os.listdir(data_dir)
    function_logger.debug("大学数量:%d" % len(file_list))
    for file in file_list:
        university_name = file[:-9]
        function_logger.debug(university_name)
        function_logger.info("开始读取%s的常问问题集..." % university_name)
        with open(data_dir + "/" + file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {}
                    line["title"] = row[0]
                    line["from"] = row[1]
                    line["time"] = row[2]
                    line["question"] = row[3]
                    line["answer"] = row[4]
                    fqa_lines.append(line)
            fqa_lines.pop(0)
        function_logger.info("读取%s的常用问题集完成!" % university_name)
        function_logger.info("开始写入%s的常用问题集..." % university_name)
        with open(pickle_dir + "/" + university_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("写入%s的常用问题集完成!" % university_name)
    function_logger.info("数据处理完成!")
def plan_doc_to_mysql_table_tuple(file_path, school):
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("插入文件" + file_path)
    file_content = read_file_content(file_path)
    file_name = file_path.split("\\")[-1]
    year = file_name.split("-")[0]
    district = file_name.split("-")[1]
    # mylogger.debug("年份:" + year + "地区:" + district)
    table_content = []
    for i in range(len(file_content)):
        file_content[i] = file_content[i].strip()
        temp = file_content[i].split("\t")
        table_content.append(temp)
    table_head = table_content[0]
    # mylogger.debug("表头:" + str(table_head))
    table_content = table_content[1:]
    # Drop summary rows and rows with no data
    # (iterate over a copy so removing items does not skip any)
    for item in table_content[:]:
        if item[0] == "无数据":
            table_content.remove(item)
        # elif item[1] == "统计":
        #     table_content.remove(item)
    mysql_content = []
    for item in table_content:
        major = item[0]
        classy = item[1]
        numbers = item[2]
        temp = (school, district, year, major, classy, numbers)
        mysql_content.append(temp)
    # mylogger.debug("构造后的数据表项如下:")
    # for item in mysql_content:
    #     mylogger.debug(str(item))
    return mysql_content
Example #9
def read_pdf_to_words(path):
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")
    with pdfplumber.open(path) as pdf:
        all_words = []
        for page in pdf.pages:
            words = page.extract_words()
            all_words.append(words)
    function_logger.info("pdf文件读取table完成!")
    return all_words
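A short usage sketch, assuming pdfplumber is available and using a hypothetical file path:

# Hypothetical usage: print the text of every word on the first page.
words_by_page = read_pdf_to_words("Information/sample.pdf")
for word in words_by_page[0]:
    print(word["text"])  # pdfplumber's extract_words() returns dicts with a "text" key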
def build_classy_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("构造类别名称词典...")
    classy = ["文科", "理科", "文史", "理工"]
    with open(dictionary_path + "/classy.txt", "w",
              encoding="utf-8") as classy_dict:
        classy_dict.truncate()
        for item in classy:
            classy_dict.write(item + "\n")
    function_logger.info("构造类别名称词典完成!")
def build_mysql_major_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("获取招生计划表中的专业字段...")
    # Major names in the admission plan table
    plan_sql_string = "SELECT major FROM admission_plan GROUP BY major;"
    myresult = mysql_query_sentence(plan_sql_string)
    function_logger.debug("招生计划表中专业数%d:" % len(myresult))
    pattern = re.compile(r"[(([].*?[))\]].*")
    plan_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        plan_major_set.add(temp)
    function_logger.debug("招生计划表中专业数(统计合并后):%d" % len(plan_major_set))
    function_logger.debug(
        str(
            sorted(list(plan_major_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))

    # Major names in the per-major admission score table
    score_sql_string = "SELECT major FROM admission_score_major GROUP BY major;"
    myresult = mysql_query_sentence(score_sql_string)
    function_logger.debug("录取分数表中专业数%d:" % len(myresult))
    score_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        score_major_set.add(temp)
    function_logger.debug("录取分数表中专业数(统计合并后):%d" % len(score_major_set))
    function_logger.debug(
        str(
            sorted(list(score_major_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))

    # Intersection of the two sets
    function_logger.debug("以上两者的交集为:")
    major_and_set = plan_major_set.intersection(score_major_set)
    function_logger.debug("交集长度为:%d", len(major_and_set))
    function_logger.debug(
        str(
            sorted(list(major_and_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))

    # Union of the two sets
    function_logger.debug("以上两者的并集为:")
    major_or_set = plan_major_set.union(score_major_set)
    function_logger.debug("并集长度为:%d", len(major_or_set))
    function_logger.debug(
        str(
            sorted(list(major_or_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))
Example #12
def read_pdf_to_tables(path):
    """
    Read a pdf and return a table list.
    :param path: the pdf path
    :return tables: table list
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")
    with pdfplumber.open(path) as pdf:
        all_tables = []
        for page in pdf.pages:
            tables = page.extract_tables()
            all_tables.append(tables)
    function_logger.info("pdf文件读取table完成!")
    return all_tables
def read_pdf_to_tables(file_path):
    """
    解析pdf文件中的表格
    :param file_path: pdf文件路径
    :return: 表格数据列表
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")
    with pdfplumber.open(file_path) as pdf:
        all_tables = []
        # Parse the tables on each pdf page and append them to the list
        for page in pdf.pages:
            tables = page.extract_tables()
            all_tables.append(tables)
    function_logger.info("pdf文件读取table完成!")
    return all_tables
def build_school_dict():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("构造学校名称词典...")
    c9 = [
        "北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学", "中国科学技术大学",
        "哈尔滨工业大学", "西安交通大学", "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"
    ]
    c9_j = [
        "北大", "清华", "复旦", "上交", "浙大", "南大", "中科大", "哈工大", "西交大", "北大医学部",
        "上交医学部", "复旦医学部"
    ]
    with open(dictionary_path + "/school.txt", "w",
              encoding="utf-8") as school_dict:
        school_dict.truncate()
        for item in c9:
            school_dict.write(item + "\n")
        for item in c9_j:
            school_dict.write(item + "\n")
    function_logger.info("构造学校名称词典完成!")
def create_database(db_name: str):
    """
    创建数据库university_admission
    :param db_name: 数据库名
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_without_db()
    mycursor = mydb.cursor()
    mycursor.execute("SHOW DATABASES")
    dbs = []
    function_logger.debug("数据库如下:")
    for db in mycursor:
        dbs.append(db[0])
        function_logger.debug(db[0])
    if db_name in dbs:
        function_logger.info("数据库" + db_name + "已存在!")
    else:
        mycursor.execute("CREATE DATABASE " + db_name)
        function_logger.info(db_name + "已创建!")
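A minimal usage sketch; because the database name is concatenated directly into the CREATE DATABASE statement, it should only come from trusted configuration.

# Hypothetical call creating the database used by the table-creation functions below.
create_database("university_admission")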
def get_plan_info_ustc():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # Fetch the category (index) page
    main_page_source = request_url(main_url + "/12993/list.htm")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    for area in main_page_soup.find_all("area"):
        page_url = area["href"]
        page_source = request_url(page_url)
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        page_soup.prettify()
        title = page_soup.find("h1", class_="arti_title").string
        year = title[:4]
        district = title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))
        all_lines = []
        for tr in page_soup.find("div",
                                 class_="wp_articlecontent").find_all("tr"):
            line = []
            for td in tr:
                line.append(td.text)
            all_lines.append(line)
        table_content = []
        for line in all_lines[1:]:
            if line[0] != "合计" and line[0] != "小计":
                if district == "浙江" or district == "上海":
                    table_content.append(
                        [line[0] + "(" + line[1] + ")", "理工", line[2]])
                else:
                    table_content.append([line[0], "理工", line[1]])
        for line in table_content:
            mylogger.debug(str(line))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info(year + district + "的招生计划已存入文件")
def get_undergraduate_university_info():
    # Home page of the university database
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    main_url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start="
    main_page_source = request_url(main_url + "0")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    page_count = int(
        main_page_soup.find("li", class_="lip dot").next_sibling.text)
    page_university_count = 20
    university_infos = []
    for i_page in range(page_count):
        page_url = main_url + str(i_page * page_university_count)
        function_logger.info("页面抓取进度(%d,%d)" % (i_page + 1, int(page_count)))
        function_logger.info("页面url%s" % page_url)
        browser = selenium_chrome(page_url)
        page_source = browser.find_element_by_class_name(
            "ch-table").get_attribute("innerHTML")
        browser.quit()
        page_soup = BeautifulSoup(page_source, "lxml")
        page_soup.prettify()
        head = [th.text for th in page_soup.find("tr").find_all("th")]
        print(head)
        for tr in page_soup.find_all("tr")[1:]:
            info = {}
            td_list = tr.find_all("td")
            info["url"] = "https://gaokao.chsi.com.cn" + td_list[0].find(
                "a")["href"]
            for i in [0, 1, 2, 3, 4, 7]:
                info[head[i]] = td_list[i].text.strip()
            info[head[5]] = td_list[5].text.strip().replace("\n", "").replace(
                " ", "").replace("\u2002", " ")
            info[head[6]] = td_list[6].text.strip().replace(
                "\ue664", "有") if td_list[6].text.strip() != "" else "无"
            university_infos.append(info)
    for info in university_infos:
        print(info)
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)
Example #18
def create_plan_score_folder_c9():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # C9 universities and their medical schools
    c9 = [
        "北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学", "中国科学技术大学",
        "哈尔滨工业大学", "西安交通大学", "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"
    ]
    catalog = ["招生计划", "录取分数"]
    root_path = "Information/九校联盟"
    for university in c9:
        function_logger.info("创建%s的文件夹" % university)
        if not os.path.exists(root_path + "/" + university):
            os.makedirs(root_path + "/" + university)
            for cat in catalog:
                if not os.path.exists(root_path + "/" + university + "/" +
                                      cat):
                    os.makedirs(root_path + "/" + university + "/" + cat)
                    # Create the source folder (stores the raw data crawled from the web)
                    if not os.path.exists(root_path + "/" + university + "/" +
                                          cat + "/source"):
                        os.makedirs(root_path + "/" + university + "/" + cat +
                                    "/source")
        function_logger.info("%s的文件夹创建完成!" % university)
def insert_all_school_table_admission_plan():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    c9 = [
        "北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学", "中国科学技术大学",
        "哈尔滨工业大学", "西安交通大学", "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"
    ]
    already_get = ["南京大学"]
    for school in already_get:
        mylogger.info("开始插入" + school + "的招生计划数据...")
        dir_path = "Information/九校联盟/" + school + "/招生计划"
        file_list = read_all_file_list(dir_path)
        for file in file_list:
            mylogger.info("构造数据项元组...")
            mysql_content = plan_doc_to_mysql_table_tuple(file, school)
            mylogger.info("将元组数据插入数据库...")
            insert_table_admission_plan(mysql_content)
            mylogger.info("元组数据插入完成!")
        time.sleep(5)
def create_admission_plan_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()

    if "admission_plan" in tables:
        function_logger.info("admission_plan表已存在!")
        function_logger.info("正在删除admission_plan表...")
        mycursor.execute("DROP TABLE admission_plan;")

    mycursor.execute("CREATE TABLE admission_plan("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL ,"
                     "school VARCHAR(30),"
                     "district VARCHAR(10),"
                     "year INT,"
                     "major VARCHAR(100),"
                     "classy varchar(10),"
                     "numbers varchar(10))")
    function_logger.info("admission_plan表已重新创建!")
def create_admission_score_pro_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_score_pro" in tables:
        function_logger.info("admission_score_pro表已存在!")
        function_logger.info("正在删除admission_score_pro表...")
        mycursor.execute("DROP TABLE admission_score_pro;")
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    # Provincial score lines for each university (school, district, year, classy, batch, line)
    mycursor.execute("CREATE TABLE admission_score_pro("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "year INT,"
                     "district VARCHAR(10),"
                     "batch varchar(30),"
                     "classy varchar(10),"
                     "line varchar(30))")
    function_logger.info("admission_score_pro表创建完成!")
def create_admission_score_major_table():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    if "admission_score_major" in tables:
        function_logger.info("admission_score_major表已存在!")
        function_logger.info("正在删除admission_score_major表...")
        mycursor.execute("DROP TABLE admission_score_major;")

    # Per-major score lines for each university (school, district, year, major, classy, highest, average, lowest, amount)
    mycursor.execute("CREATE TABLE admission_score_major("
                     "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                     "school VARCHAR(30),"
                     "district VARCHAR(10),"
                     "year INT,"
                     "major VARCHAR(100),"
                     "classy varchar(30),"
                     "highest varchar(10) NULL,"
                     "average varchar(10) NULL,"
                     "lowest varchar(10),"
                     "amount varchar(10) NULL)")
    function_logger.info("admission_score_major表创建完成!")
Example #23
# -*- coding: utf-8 -*-
"""
@File  : HanLPAPI.py
@Author: SangYu
@Date  : 2018/12/27 14:56
@Desc  : API of the HanLP platform
"""
from pyhanlp import *
from Log.Logger import MyLog


# Word segmentation (with POS tagging)
def hanlp_nlp_segmentor(sentence):
    nlp_tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    return str(nlp_tokenizer.analyze(sentence)).split(" ")


# Word segmentation (without POS tagging)
def hanlp_nlp_segmentor_without_nature(sentence):
    nlp_tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    word_list = str(nlp_tokenizer.analyze(sentence)).split(" ")
    return [word.split("/")[0] for word in word_list]


if __name__ == "__main__":
    mylogger = MyLog(logger=__name__).getlog()
    mylogger.info("start...")
    print(type(hanlp_nlp_segmentor("2015年哈工大软件工程在河南招多少人?")))
    print(hanlp_nlp_segmentor("一五年哈工大软件工程在河南招多少人?"))
    mylogger.info("end...")
Example #24
def get_question_yggk():
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # URL of the university consultation (Q&A) pages
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                   ["复旦大学", str(7243)], ["南京大学", str(4453)],
                   ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学",
                                                    str(53593)]]
    for school in school_urls:
        function_logger.info("开始抓取" + school[0] + "的招生问题数据...")
        # Create this school's question-collection csv and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        with open(file_path + "/" + school[0] + "常用问题集.csv",
                  "w",
                  encoding='utf-8') as csvfile:
            csvfile.truncate()
            writer = csv.writer(csvfile)
            writer.writerow(table_head)
        main_page_source = request_url(
            "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-"
            + school[1] + ",start-0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # Total number of pages
        page_count = main_page_soup.find(
            "li", class_="lip dot").next_sibling.a.string
        # Number of pinned questions
        top_question_count = len(
            main_page_soup.find("table", class_="ch-table zx-table").find_all(
                "span", class_="question_top_txt"))
        # Number of questions per page
        page_question_count = 15
        # Visit each page by constructing its url
        for i_page in list(range(10)) + list(range(11, int(page_count))):
            page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" + school[
                1] + ",start-" + str(i_page * page_question_count) + ".dhtml"
            # Base index for xls records (page questions + pinned questions + header)
            # if i_page == 0:
            #     base_count = 1
            # else:
            #     base_count = i_page * page_question_count + top_question_count + 1
            function_logger.info("页面抓取进度(%d,%d)" %
                                 (i_page + 1, int(page_count)))
            function_logger.info("页面url%s" % page_url)
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            tr_list = page_soup.find("table",
                                     class_="ch-table zx-table").contents
            for item in tr_list:
                if item == "\n":
                    tr_list.remove(item)
            records = []
            # Record the pinned Q&A only once
            if i_page == 0:
                start_index = 0
            else:
                start_index = top_question_count * 2
            for i_qa_pair in range(start_index, len(tr_list), 2):
                question_title = "q_title"
                question_from = ""
                question_time = ""
                question_text = "q_text"
                answer_text = "a_text"
                question_title = str(tr_list[i_qa_pair].find(
                    "a", class_="question_t_txt").string).strip()
                function_logger.debug("标题:%s" % question_title)
                question_from = str(tr_list[i_qa_pair].find(
                    "i", title="提问人").next_sibling.string).strip()
                function_logger.debug("来源:%s" % question_from)
                question_time = str(tr_list[i_qa_pair].find(
                    "td", class_="question_t ch-table-center").text).strip()
                function_logger.debug("时间:%s" % question_time)
                # The question or answer may be truncated on this page; follow the link to fetch the full text
                question_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find(
                        "a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text,
                                                   "lxml")
                    question_text = str(
                        turn_page_soup.find("div",
                                            class_="question").text).strip()
                function_logger.debug("问题:%s" % question_text)
                answer_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace(
                        "[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find(
                        "a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text,
                                                   "lxml")
                    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")
                    answer_text = re.sub(pattern, "", str(turn_page_soup.find("div", class_="question_a").text)) \
                        .replace("[回复]", "")
                function_logger.debug("回答:%s" % answer_text)
                records.append([
                    question_title, question_from, question_time,
                    question_text, answer_text
                ])
            with open(file_path + "/" + school[0] + "常用问题集.csv",
                      "a",
                      encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for record in records:
                    writer.writerow(record)
            time.sleep(3)
        function_logger.info("%s的常用问题集收集完毕!" % school[0])
def frequent_question_normalize(dir_path: str):
    """
    处理常用问题集(csv),问题和答案部分
    :param dir_path: 文件夹路径
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始进行数据处理...")
    file_list = read_all_file_list(dir_path + "/source")
    for file in file_list:
        function_logger.debug(file)
        school_name = file.split("\\")[-1][:-9]
        function_logger.info("开始读取%s的常问问题集..." % school_name)
        with open(file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {}
                    line["title"] = row[0].replace(" ", "")
                    line["from"] = row[1]
                    line["time"] = row[2]
                    line["question"] = row[3].replace("\u3000", "").replace(
                        "\n", ",").replace(" ", "")
                    line["answer"] = row[4].replace("\ue63c", "").replace("\u3000", "").replace("\n", ",")\
                        .replace(" ", "").lstrip(",")
                    fqa_lines.append(line)
            fqa_lines.pop(0)
        function_logger.info("读取%s的常用问题集完成!" % school_name)
        function_logger.info("开始写入%s的常用问题集..." % school_name)
        with open(dir_path + "/预处理/pickle/" + school_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("写入%s的常用问题集完成!" % school_name)
    function_logger.info("数据处理完成!")


if __name__ == '__main__':
    main_logger = MyLog(logger=__name__).getlog()
    main_logger.info("start...")
    question_set_dir = "../InformationGet/Information/大学/常问问题集"
    # frequent_question_normalize(question_set_dir)
    with open(question_set_dir + "/预处理/pickle/" + "上海交通大学医学院", "rb") as p_file:
        data = pickle.load(p_file)
    for q in data[:10]:
        print(q)
    main_logger.info("end...")
def get_plan_info_xjtu():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/西安交通大学/招生计划"
    # Fetching the individual pages one by one needs a lot of post-processing and is cumbersome
    # mylogger.info("开始获取网页源码...共五个网页")
    # with open(file_path+"/source/page_url_list","w",encoding="utf-8")as url_file:
    #     for i in range(1, 6):
    #         main_url = "http://zs.xjtu.edu.cn/lmy.jsp?a43639t=5&a43639p=" + str(i) \
    #                    + "&a43639c=10&urltype=tree.TreeTempUrl&wbtreeid=1005"
    #         # 获取分类信息
    #         main_page_source = requests.get(main_url).text
    #         main_page_soup = BeautifulSoup(main_page_source, "lxml")
    #         main_page_soup.prettify()
    #         for item in main_page_soup.find("div", id="fybt").find("ul").find_all("a"):
    #             url_file.write(str(item)+"\n")
    # mylogger.info("招生计划页面url获取完成")
    # mylogger.info("开始获取具体页面信息")
    # with open(file_path + "/source/page_url_list", "r", encoding="utf-8")as url_file:
    #     url_source = url_file.read()
    # url_soup = BeautifulSoup(url_source,"lxml")
    # url_soup.prettify()
    # for page_url in url_soup.find_all("a"):
    #     print(page_url)
    # Query the official site directly with a form submission
    # Get the years and districts that can be queried
    main_url = "http://zs.xjtu.edu.cn/bkscx/zsjhcx.htm"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select",
                                        id="sf").find_all("option")[1:]:
        districts.append(district.string)
    mylogger.debug("可查询的年份" + str(years))
    mylogger.debug("可查询的省份" + str(districts))
    search_url = "http://zs.xjtu.edu.cn/zsjg.jsp?wbtreeid=1168"
    for year in years:
        for district in districts:
            # x, y are the click coordinates on the search button (button size 54x22)
            params = {"nf": year, "sf": district, "x": "27", "y": "11"}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for tr in return_soup.find("div", id="fybt").find_all("tr"):
                line = []
                for td in tr:
                    if td.string != "\n":
                        line.append(str(td.string).strip())
                all_lines.append(line)
            table_name = year + "-" + district[:-1]
            table_head = ["专业", "类别", "人数"]
            table_content = []
            for line in all_lines[1:-1]:
                classy = line[2]
                if classy == "理":
                    classy = "理工"
                if classy == "文":
                    classy = "文史"
                table_content.append([line[0], classy, line[4]])
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + district + "的招生计划已存入文件")
def get_plan_info_nju():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://bkzs.nju.edu.cn"
    # Fetch the category information

    file_path = "Information/九校联盟/南京大学/招生计划"
    # Use selenium to fetch the hidden part of the page source
    # browser = selenium_chrome(main_url+"/4543/list.htm")
    # pro_list = browser.find_element_by_id("MapControl")
    # with open(file_path+"/source/"+"index","w",encoding="utf-8") as file:
    #     file.write(pro_list.get_attribute('innerHTML'))

    # Read the saved source back in and parse it with bs4
    # with open(file_path + "/source/" + "index", "r", encoding="utf-8") as file:
    #     source_code = file.read()
    # main_page_soup = BeautifulSoup(source_code, "lxml")
    # main_page_soup.prettify()
    # for li in main_page_soup.find_all("li"):
    #     url = li.a["href"]
    #     pro = li.span.text
    #     print(pro + "\t" + url)
    #     browser = selenium_chrome(main_url+url)
    #     page_source = browser.find_element_by_class_name("wp_articlecontent").get_attribute("innerHTML")
    #     year = re.findall("\d{4}",BeautifulSoup(page_source,"lxml").find("p").text)[0]
    #     with open(file_path + "/source/"+year+"-"+pro+".html","w",encoding="utf-8") as file:
    #         file.write(page_source)
    #     browser.quit()
    #     time.sleep(5)

    # Download the pdf files
    # file_list = read_all_file_list(file_path + "/source")
    # for file_name in file_list:
    #     pdf_name = file_name.split("\\")[-1][:-5]
    #     if file_name[-4:] == "html":
    #         print(file_name)
    #         with open(file_name, "r", encoding="utf-8") as file:
    #             page_source = file.read()
    #         page_soup = BeautifulSoup(page_source,"lxml")
    #         for item in page_soup.find_all("div",class_="wp_pdf_player"):
    #             pdf_url = item["pdfsrc"]
    #             pdf_source = request_url(main_url+pdf_url)
    #             with open(file_path + "/source/"+pdf_name+".pdf","wb")as pdf_file:
    #                 pdf_file.write(pdf_source.content)

    # Parse the pdf files
    file_list = read_all_file_list(file_path + "/source")
    for file_name in file_list:
        if file_name[-3:] == "pdf":
            pdf_name = file_name.split("\\")[-1][:-4]
            year = pdf_name.split("-")[0]
            pro = pdf_name.split("-")[-1]
            pages = read_pdf_to_tables(file_name)
            table_name = year + "-" + pro
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            all_lines = []
            for tables in pages:
                for table in tables:
                    for line in table:
                        all_lines.append(line)
            # Split into sub-tables
            all_tables = []
            table = []
            for line in all_lines:
                if line[0] == "科类":
                    if len(table) != 0:
                        all_tables.append(table)
                    table = []
                    table.append(line)
                else:
                    table.append(line)
            all_tables.append(table)
            # Write each sub-table's tag into all of its rows
            all_lines = []
            for table in all_tables:
                sign = table[1][0]
                if sign == "国家专项计划" or sign == "提前批":
                    for line in table:
                        all_lines.append([
                            line[0],
                            str(line[1]) + "(" + sign + ")", line[2]
                        ])
                else:
                    for line in table:
                        all_lines.append(line)

            table_content = []
            for line in all_lines:
                if line[0] == "科类" or line[0] == "总计" or line[1].find("小计") != -1 or line[1].find("None") != -1 \
                        or line[2] == "" or line[2] == "0" or line[2] is None:
                    continue
                classy = line[0]
                if classy == "理":
                    classy = "理工"
                elif classy == "文":
                    classy = "文史"
                table_content.append(
                    [line[1].replace("( )\n", ""), classy, line[2]])
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "招生计划已存入文件")
def get_plan_info_sjtu():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # main_url = "http://zsb.sjtu.edu.cn/web/jdzsb"
    # url = main_url + "/3810061.htm"
    # page_source = request_url(url)
    # page_source.encoding = page_source.apparent_encoding
    # page_soup = BeautifulSoup(page_source.text, "lxml")
    # page_soup.prettify()
    main_file_path = "Information/九校联盟/上海交通大学/招生计划"
    # Download the files
    # logger.info("开始下载文件")
    # for item in page_soup.find("ul", class_="infor_right02_cont").find_all("li"):
    #     logger.debug(item.a["title"])
    #     logger.debug(re.findall('\d{4}', item.a["title"]))
    #     year = re.findall('\d{4}', item.a["title"])[0]
    #     logger.debug(item.a["href"])
    #     specific_url = main_url + "/" + item.a["href"]
    #     sub_page_source = request_url(specific_url)
    #     sub_page_source.encoding = "utf-8"
    #     sub_page_soup = BeautifulSoup(sub_page_source.text, "lxml")
    #     sub_page_soup.prettify()
    #     image_index = 0
    #     for sub_item in sub_page_soup.find("div", class_="artical_box").find_all("img"):
    #         file_name = year + str(image_index) + sub_item["src"].split("/")[-1][-4:]
    #         file_url = sub_item["src"]
    #         if file_url[0] == "f":
    #             continue
    #         else:
    #             file_content = request_url("http://zsb.sjtu.edu.cn" + file_url)
    #         with open(main_file_path + "/source/" + file_name, "wb") as img:
    #             img.write(file_content.content)
    #         image_index += 1
    # logger.info("文件下载完成!")

    # Parse the files
    # logger.info("开始文件解析")
    # file_list = read_all_file_list(main_file_path + "/source")
    # file_2015 = []
    # file_2016 = []
    # file_2017 = []
    # file_2018 = []
    # for item in file_list:
    #     if item[-3:] == "jpg" or item[-3:] == "png":
    #         if item.find("2015") != -1:
    #             file_2015.append(item)
    #         elif item.find("2016") != -1:
    #             file_2016.append(item)
    #         elif item.find("2017") != -1:
    #             file_2017.append(item)
    #         elif item.find("2018") != -1:
    #             file_2018.append(item)
    # logger.info("图片文件转为pdf文件")
    # store_path = os.getcwd() + "/" + main_file_path + "/source"
    # image_to_pdf(file_2015, store_path, "2015.pdf")
    # image_to_pdf(file_2016, store_path, "2016.pdf")
    # image_to_pdf(file_2017, store_path, "2017.pdf")
    # image_to_pdf(file_2018, store_path, "2018.pdf")
    # logger.info("图片文件转成pdf完成!")
    # Re-read the file list
    file_list = read_all_file_list(main_file_path + "/source")
    for item in file_list:
        if item[-3:] == "pdf":
            if item.find("2015") != -1:
                # write_plan_info_sjtu_2015(main_file_path, item)
                mylogger.info("2015年数据解析完成!")
            elif item.find("2016") != -1:
                # write_plan_info_sjtu_2016(main_file_path, item)
                mylogger.info("2016年数据解析完成!")
            elif item.find("2017") != -1:
                # write_plan_info_sjtu_2017(main_file_path, item)
                mylogger.info("2017年数据解析完成!")
            elif item.find("2018") != -1:
                write_plan_info_sjtu_2018(main_file_path, item)
                mylogger.info("2018年数据解析完成!")
def get_plan_info_hit():
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://zsb.hit.edu.cn/information/plan"
    # Fetch the category information
    main_page_source = requests.get(main_url).text
    main_page_soup = BeautifulSoup(main_page_source, "lxml")
    main_page_soup.prettify()
    # Provinces in the admission plan
    mylogger.info("解析招生地区...")
    province = []
    for item in main_page_soup.find(class_="province").find_all(name='a'):
        province.append(item.string.strip())
    mylogger.debug("哈工大招生地区:" + str(province))
    # Years in the admission plan
    mylogger.info("解析招生年份...")
    years = []
    for item in main_page_soup.find_all(class_="year-select"):
        years.append(item.string.strip())
    mylogger.debug("哈工大招生年份:" + str(years))

    # Extract the data for each year and province
    mylogger.info("开始获取各年各地区数据...")
    for pro in province:
        for year in years:
            mylogger.info("开始获取" + year + pro + "的招生计划")
            # Build the query url
            specific_url = main_url + "?" + "year=" + year + "&" + "province=" + pro
            page_source = requests.get(specific_url).text
            page_soup = BeautifulSoup(page_source, "lxml")
            page_soup.prettify()
            # Table name
            table_name = year + "-" + pro
            mylogger.debug("表名:" + table_name)
            # Table header
            table_head = []
            for item in page_soup.find(class_="info_table").thead.find_all(
                    name="td"):
                table_head.append(item.string.strip())
            mylogger.debug("表头:" + str(table_head))
            # Table content
            table_content = []
            for item in page_soup.find(class_="info_table").tbody.find_all(
                    name="tr"):
                temp = []
                for sub_item in item.find_all(name="td"):
                    temp.append(sub_item.string.strip())
                table_content.append(temp)
            # Drop summary rows and rows with no data
            # (iterate over a copy so removing items does not skip any)
            for item in table_content[:]:
                if item[0] == "无数据":
                    table_content.remove(item)
                # elif item[1] == "统计":
                #     table_content.remove(item)
            mylogger.debug("表内容如下:")
            for item in table_content:
                mylogger.debug(item)
            # Write the table content to a text file
            file_path = "Information/九校联盟/哈尔滨工业大学/招生计划"
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "的招生计划已存入文件")