def plan_doc_to_mysql_table_tuple(file_path, school):
    """
    Convert a tab-separated admission-plan file into row tuples for MySQL.

    The year and district are taken from the file name, i.e. the last
    backslash-separated component of ``file_path`` split on "-"
    (``<year>-<district>...``). The first line of the file is treated as the
    table header and skipped; rows whose first column is "无数据" are dropped.

    :param file_path: path of the plan file (backslash-separated)
    :param school: school name stored in every tuple
    :return: list of (school, district, year, major, classy, numbers) tuples
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("插入文件" + file_path)
    file_content = read_file_content(file_path)
    name_parts = file_path.split("\\")[-1].split("-")
    year = name_parts[0]
    district = name_parts[1]
    rows = [line.strip().split("\t") for line in file_content]
    # Skip the header row, then filter into a new list. The previous version
    # removed items from the list it was iterating, which silently skipped
    # the row following every removed "无数据" row.
    data_rows = [row for row in rows[1:] if row[0] != "无数据"]
    return [(school, district, year, row[0], row[1], row[2])
            for row in data_rows]
 def __init__(self, university_id, page_id, page_count, top_question_count,
              record_queue):
     """
     Initialise the thread and store the job parameters on the instance.

     :param university_id: stored as ``self.university_id``
     :param page_id: stored as ``self.page_id``
     :param page_count: stored as ``self.page_count``
     :param top_question_count: stored as ``self.top_question_count``
     :param record_queue: stored as ``self.record_queue``
         (presumably a queue shared with a consumer -- TODO confirm)
     """
     Thread.__init__(self)
     self.university_id = university_id
     self.page_id = page_id
     self.page_count = page_count
     self.top_question_count = top_question_count
     self.record_queue = record_queue
     # The logger name uses the ident of the thread that runs __init__
     # (the creating thread), not the ident of this new thread object.
     self.thread_logger = MyLog(
         logger="thread" + str(threading.current_thread().ident)).getlog()
def write_plan_info_sjtu_2018(store_path, info_path):
    """
    Open the PDF at ``info_path`` and draw word rectangles on its first page.

    The rendered page image is only built in memory; the visible code neither
    saves it nor returns it.

    :param store_path: not used by the visible code
    :param info_path: path of the PDF opened with pdfplumber
    :return: None
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    year = "2018"
    with pdfplumber.open(info_path) as document:
        cover_page = document.pages[0]
        page_image = cover_page.to_image()
        word_boxes = cover_page.extract_words()
        page_image.draw_rects(word_boxes)
def score_pro_doc_to_mysql_table_tuple(file_path, school):
    """
    Convert a tab-separated provincial score-line file into MySQL row tuples.

    The year is the first "-"-separated field of the file name (last
    backslash-separated component of ``file_path``). Every "-" in a line is
    replaced by "NULL" before the line is split on tabs. The first line is
    the header and is not returned.

    :param file_path: path of the score file (backslash-separated)
    :param school: school name stored in every tuple
    :return: list of (school, year, district, batch, classy, line) tuples
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    raw_lines = read_file_content(file_path)
    name_fields = file_path.split("\\")[-1].split("-")
    year = name_fields[0]
    table_format = name_fields[-1]
    rows = []
    for idx, raw in enumerate(raw_lines):
        cleaned = raw.strip().replace("-", "NULL")
        raw_lines[idx] = cleaned
        rows.append(cleaned.split("\t"))
    # Reading the header keeps the failure on an empty file.
    table_head = rows[0]
    return [(school, year, row[0], row[1], row[2], row[3])
            for row in rows[1:]]
def insert_all_school_table_admission_plan():
    """
    Insert the admission-plan files of every school in ``already_get``.

    For each school, every file under ``Information/九校联盟/<school>/招生计划``
    is converted with ``plan_doc_to_mysql_table_tuple`` and passed to
    ``insert_table_admission_plan``. The function sleeps five seconds after
    each school.

    :return: None
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Full C9 list (plus medical schools); only `already_get` is processed.
    c9 = [
        "北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学", "中国科学技术大学",
        "哈尔滨工业大学", "西安交通大学", "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"
    ]
    already_get = ["南京大学"]
    for school in already_get:
        mylogger.info("开始插入" + school + "的招生计划数据...")
        plan_dir = "Information/九校联盟/" + school + "/招生计划"
        for plan_file in read_all_file_list(plan_dir):
            mylogger.info("构造数据项元组...")
            rows = plan_doc_to_mysql_table_tuple(plan_file, school)
            mylogger.info("将元组数据插入数据库...")
            insert_table_admission_plan(rows)
            mylogger.info("元组数据插入完成!")
        time.sleep(5)
def create_database(db_name: str):
    """
    Create the MySQL database ``db_name`` unless it already exists.

    The existing databases are listed with ``SHOW DATABASES`` and logged at
    debug level. The connection is now closed before returning; previously
    the cursor and connection were left open.

    :param db_name: name of the database to create
    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Assumes connect_mysql_without_db() returns a fresh DB-API connection
    # owned by this call -- TODO confirm it is not a shared singleton.
    mydb = connect_mysql_without_db()
    try:
        mycursor = mydb.cursor()
        mycursor.execute("SHOW DATABASES")
        function_logger.debug("数据库如下:")
        dbs = []
        for db in mycursor:
            dbs.append(db[0])
            function_logger.debug(db[0])
        if db_name in dbs:
            function_logger.info("数据库" + db_name + "已存在!")
        else:
            # NOTE(review): db_name is concatenated into the statement;
            # identifiers cannot be bound as parameters, so callers must
            # pass a trusted name.
            mycursor.execute("CREATE DATABASE " + db_name)
            function_logger.info(db_name + "已创建!")
        mycursor.close()
    finally:
        mydb.close()
def get_plan_info_ustc():
    """
    Crawl the USTC admission-plan pages and store one table per page.

    The index page ``/12993/list.htm`` is fetched first; every ``<area>``
    link on it is followed. The page title supplies the year (first four
    characters) and the district (``title[5:-4]``). Rows whose first cell is
    "合计" or "小计" are dropped, and each remaining row is written with the
    fixed category "理工" through ``write_table``.

    :return: None
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # Fetch the index page that links to the individual plan pages.
    index_response = request_url(main_url + "/12993/list.htm")
    index_response.encoding = index_response.apparent_encoding
    index_soup = BeautifulSoup(index_response.text, "lxml")
    index_soup.prettify()
    for area in index_soup.find_all("area"):
        detail_response = request_url(area["href"])
        detail_response.encoding = detail_response.apparent_encoding
        detail_soup = BeautifulSoup(detail_response.text, "lxml")
        detail_soup.prettify()
        title = detail_soup.find("h1", class_="arti_title").string
        year, district = title[:4], title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))
        article = detail_soup.find("div", class_="wp_articlecontent")
        all_lines = [[cell.text for cell in tr]
                     for tr in article.find_all("tr")]
        # For these two districts the second cell is appended to the major
        # name in parentheses and the count comes from the third cell.
        has_extra_column = district in ("浙江", "上海")
        table_content = []
        for cells in all_lines[1:]:
            if cells[0] in ("合计", "小计"):
                continue
            if has_extra_column:
                table_content.append(
                    [cells[0] + "(" + cells[1] + ")", "理工", cells[2]])
            else:
                table_content.append([cells[0], "理工", cells[1]])
        for row in table_content:
            mylogger.debug(str(row))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info(year + district + "的招生计划已存入文件")
Esempio n. 8
0
def build_mysql_string_by_template_and_keymap(template_question: str,
                                              template_question_type: str,
                                              keyword_dict: dict) -> str:
    """
    Build a SELECT statement from a template question and a keyword mapping.

    Every "(slot)" in the template is looked up in ``keyword_dict`` under the
    key ``"search_" + slot``. Slots whose value is the empty string are
    skipped; the rest become ``slot='value'`` conditions joined with "and".

    :param template_question: template question containing "(slot)" markers
    :param template_question_type: template type, used as the table name
    :param keyword_dict: mapping from "search_<slot>" to the keyword value
    :return: the SQL statement, or "" when no slot has a value
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始构造MySQL语句...")
    search_table = template_question_type
    # Slot names are the texts between parentheses in the template.
    slot_names = [
        slot[1:-1] for slot in re.findall(r"[(].*?[)]", template_question)
    ]
    # NOTE(review): values are concatenated into the SQL text unescaped;
    # if keyword_dict can hold user-supplied text this is injectable.
    conditions = []
    for slot_name in slot_names:
        value = keyword_dict["search_" + slot_name]
        if value != "":
            conditions.append(slot_name + "='" + value + "'")
    mysql_string = ""
    if conditions:
        mysql_string = ("select * from " + search_table + " where " +
                        " and ".join(conditions) + ";")
    function_logger.info("MySQL语句构造完成!")
    return mysql_string
Esempio n. 9
0
def build_mysql_string_by_template(template_question: str,
                                   template_question_type: str) -> str:
    """
    Build a SELECT statement template from the slots of a template question.

    Each "(slot)" becomes a bracketed fragment ``[slot='(slot)' and ]``; the
    last one has no trailing " and ". The fragments are concatenated after
    ``select * from <type> where`` and terminated with ";".

    :param template_question: template question containing "(slot)" markers
    :param template_question_type: template type, used as the table name
    :return: the SQL statement template
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始构造MySQL语句...")
    search_table = template_question_type
    # Slots keep their surrounding parentheses here.
    slots = re.findall(r"[(].*?[)]", template_question)
    last_index = len(slots) - 1
    fragments = []
    for index, slot in enumerate(slots):
        joiner = "" if index == last_index else " and "
        fragments.append("[" + slot[1:-1] + "='" + slot + "'" + joiner + "]")
    mysql_string = ("select * from " + search_table + " where " +
                    "".join(fragments) + ";")
    function_logger.info("MySQL语句构造完成!")
    return mysql_string
Esempio n. 10
0
def read_pdf_to_text(path):
    """
    Print the text of every horizontal text box in a PDF, page by page.

    A line of dashes is printed after each page. The file is opened in a
    ``with`` block so the handle is closed even if parsing fails; previously
    it was never closed.

    :param path: path of the PDF file
    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")

    # Resource manager, layout aggregator and interpreter for the pages.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as source_pdf:
        for page in PDFPage.get_pages(source_pdf):
            interpreter.process_page(page)
            # Layout of the page that was just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    print(results)
            print("-----------")

    function_logger.info("pdf文件读取文本完成!")
def build_template_by_infos(template_path: str,
                            fields_question_condition: list,
                            fields_question_target: list,
                            template_sentence_questions: list,
                            template_sentence_answers: list):
    """
    Expand template questions into concrete variants and pickle the result.

    :param template_path: path of the pickle file to write
    :param fields_question_condition: condition fields, each
        "<english_name> <label>", e.g. "school 学校"
    :param fields_question_target: target fields, each "<english_name>"
        followed by space-separated phrasings
    :param template_sentence_questions: template question sentences
    :param template_sentence_answers: template answer sentences
    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    template_name = template_path.split("\\")[-1]
    function_logger.info("开始构造%s的问题模板..." % template_name)
    # The dictionary is stored with pickle; key order is kept as before.
    template_dict = {
        "fq_conditon": fields_question_condition,
        "fq_target": fields_question_target,
        "ts_answers": template_sentence_answers,
    }
    # English field names of the condition words.
    template_fields_en = [
        condition.split(" ")[0] for condition in fields_question_condition
    ]
    # Each subset lists the condition fields to blank out of a sentence.
    fields_en_subset = get_subset_binary(template_fields_en)
    print(len(fields_en_subset))
    field_total = len(template_fields_en)
    build_question_sentences = []
    for i_question, question in enumerate(template_sentence_questions):
        # Use the first target whose English name occurs in the question;
        # a question is assumed to contain exactly one target word.
        match_question_target = []
        for fq_target in fields_question_target:
            target_words = fq_target.split(" ")
            if target_words[0] in question:
                match_question_target = target_words
                break
        target_word = match_question_target[0]
        suffix = "--" + str(i_question)
        for question_mode in match_question_target[1:]:
            base_sentence = question.replace("(" + target_word + ")",
                                             question_mode) + suffix
            for subset in fields_en_subset:
                if not subset:
                    # Empty subset: keep every condition slot.
                    build_question_sentences.append(base_sentence)
                    continue
                if len(subset) == field_total:
                    # Removing every condition is not emitted.
                    continue
                sentence = base_sentence
                for field_en in subset:
                    sentence = sentence.replace("(" + field_en + ")", "")
                build_question_sentences.append(sentence)
    template_dict["ts_questions"] = build_question_sentences
    with open(template_path, "wb") as p_file:
        pickle.dump(template_dict, p_file)
    function_logger.info("%s的问题模板构建完成!" % template_name)
def build_university_major_dict():
    """
    Build the major-name dictionary file from a saved Baidu Baike page.

    Entries made of a 4-6 digit code, optional K/T flags and a run of Chinese
    characters are extracted with a regular expression; the first and last
    four matches are discarded. The Chinese name of each remaining entry is
    written, one per line, to ``major.txt`` under ``dictionary_path``.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("构造专业名称词典...")
    source_path = "Information/大学/大学学科(百度百科网页源码).txt"
    with open(source_path, "r", encoding="utf-8") as source_file:
        main_page_source = source_file.read()

    # Regex extraction of the catalogue; the slice trims matches at both
    # ends that do not belong to it.
    coded_majors = re.findall(r'\d{4,6}[KT]*\s*[\u4e00-\u9fa5]+',
                              main_page_source)[4:-4]
    major_names = (re.findall(r'[\u4e00-\u9fa5]+', entry)[0] + "\n"
                   for entry in coded_majors)
    with open(dictionary_path + "/major.txt", "w",
              encoding="utf-8") as major_dict:
        major_dict.truncate()
        major_dict.writelines(major_names)
    function_logger.info("构造专业名称词典完成")
def label_data():
    """
    Count the records stored in every pickle file of the FAQ pickle folder.

    Each file name is printed, the pickled object is loaded and its length is
    added to a running total, which is printed at the end.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    pickle_dir = "Information/大学/常问问题集/Pickle"
    file_list = os.listdir(pickle_dir)
    function_logger.debug("大学数量:%d" % len(file_list))
    all_count = 0
    for university_name in file_list:
        print(university_name)
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this on files produced by this project.
        with open(pickle_dir + "/" + university_name, "rb") as p_file:
            lines = pickle.load(p_file)
        all_count += len(lines)
    print(all_count)
Esempio n. 14
0
def read_pdf_to_words(path):
    """
    Extract the words of every page of a PDF with pdfplumber.

    :param path: path of the PDF file
    :return: list with one ``extract_words()`` result per page
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")
    with pdfplumber.open(path) as document:
        words_per_page = [page.extract_words() for page in document.pages]
    function_logger.info("pdf文件读取table完成!")
    return words_per_page
def build_classy_dict():
    """
    Write the category-name dictionary file ``classy.txt``.

    The file is created under ``dictionary_path`` and holds one category
    name per line.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("构造类别名称词典...")
    classy = ["文科", "理科", "文史", "理工"]
    target_path = dictionary_path + "/classy.txt"
    with open(target_path, "w", encoding="utf-8") as classy_dict:
        classy_dict.truncate()
        classy_dict.writelines(name + "\n" for name in classy)
    function_logger.info("构造类别名称词典完成!")
def create_admission_plan_table():
    """
    (Re)create the ``admission_plan`` table in ``university_admission``.

    An existing table is dropped first. The connection is now closed before
    returning; previously the cursor and connection were left open.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    # Assumes connect_mysql_with_db() returns a fresh DB-API connection
    # owned by this call -- TODO confirm it is not a shared singleton.
    mydb = connect_mysql_with_db(db_name)
    try:
        mycursor = mydb.cursor()

        if "admission_plan" in tables:
            function_logger.info("admission_plan表已存在!")
            function_logger.info("正在删除admission_plan表...")
            mycursor.execute("DROP TABLE admission_plan;")

        mycursor.execute("CREATE TABLE admission_plan("
                         "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL ,"
                         "school VARCHAR(30),"
                         "district VARCHAR(10),"
                         "year INT,"
                         "major VARCHAR(100),"
                         "classy varchar(10),"
                         "numbers varchar(10))")
        mycursor.close()
    finally:
        mydb.close()
    function_logger.info("admission_plan表已重新创建!")
def insert_all_school_table_admission_score():
    """
    Insert the admission-score files of every school in ``already_get``.

    Files under ``Information/九校联盟/<school>/录取分数`` are dispatched on the
    text after the last "-" in their path: "major" files go to the major
    score table, "pro" files to the provincial score table. Other files are
    ignored.

    :return: None
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Full C9 list (plus medical schools); only `already_get` is processed.
    c9 = [
        "北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学", "中国科学技术大学",
        "哈尔滨工业大学", "西安交通大学", "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"
    ]
    already_get = ["复旦大学", "复旦大学上海医学部"]
    # table format -> (file converter, table inserter)
    handlers = {
        "major": (score_major_doc_to_mysql_table_tuple,
                  insert_table_admission_score_major),
        "pro": (score_pro_doc_to_mysql_table_tuple,
                insert_table_admission_score_pro),
    }
    for school in already_get:
        score_dir = "Information/九校联盟/" + school + "/录取分数"
        for score_file in read_all_file_list(score_dir):
            handler = handlers.get(score_file.split("-")[-1])
            if handler is None:
                continue
            convert, insert = handler
            insert(convert(score_file, school))
def create_admission_score_pro_table():
    """
    (Re)create the ``admission_score_pro`` table in ``university_admission``.

    An existing table is dropped first. A single connection is used for both
    statements and closed before returning; previously a second connection
    was opened for the CREATE and neither connection was closed.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    # Assumes connect_mysql_with_db() returns a fresh DB-API connection
    # owned by this call -- TODO confirm it is not a shared singleton.
    mydb = connect_mysql_with_db(db_name)
    try:
        mycursor = mydb.cursor()
        if "admission_score_pro" in tables:
            function_logger.info("admission_score_pro表已存在!")
            function_logger.info("正在删除admission_score_pro表...")
            mycursor.execute("DROP TABLE admission_score_pro;")
        # Provincial score lines of a school
        # (school, year, district, batch, category, score line).
        mycursor.execute("CREATE TABLE admission_score_pro("
                         "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                         "school VARCHAR(30),"
                         "year INT,"
                         "district VARCHAR(10),"
                         "batch varchar(30),"
                         "classy varchar(10),"
                         "line varchar(30))")
        mycursor.close()
    finally:
        mydb.close()
    function_logger.info("admission_score_pro表创建完成!")
Esempio n. 19
0
def read_pdf_to_tables(path):
    """
    Read a PDF and return the tables found on each page.

    :param path: the pdf path
    :return: list with one ``extract_tables()`` result per page
    """
    # The docstring must be the first statement of the function; it used to
    # follow the logger creation and was therefore not a docstring at all.
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")
    with pdfplumber.open(path) as pdf:
        all_tables = [page.extract_tables() for page in pdf.pages]
    function_logger.info("pdf文件读取table完成!")
    return all_tables
def search_table_in_db(db_name: str) -> list:
    """
    List the tables of a database.

    The table names are also logged at debug level. The connection is now
    closed before returning; previously the cursor and connection were left
    open.

    :param db_name: database name
    :return: list of table names in the database
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Assumes connect_mysql_with_db() returns a fresh DB-API connection
    # owned by this call -- TODO confirm it is not a shared singleton.
    mydb = connect_mysql_with_db(db_name)
    try:
        mycursor = mydb.cursor()
        mycursor.execute("SHOW TABLES")
        tables = []
        function_logger.debug(db_name + "数据库中有以下表:")
        for table in mycursor:
            tables.append(table[0])
            function_logger.debug(table[0])
        mycursor.close()
    finally:
        mydb.close()
    return tables
def read_pdf_to_tables(file_path):
    """
    Parse the tables contained in a PDF file.

    :param file_path: path of the pdf file
    :return: list with one ``extract_tables()`` result per page
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始读取pdf文件!")
    # Open the parameter itself; the body used to reference the undefined
    # name `path`, which raised NameError on every call.
    with pdfplumber.open(file_path) as pdf:
        # Collect the tables of every page, one list entry per page.
        all_tables = [page.extract_tables() for page in pdf.pages]
    function_logger.info("pdf文件读取table完成!")
    return all_tables
def create_admission_score_major_table():
    """
    (Re)create the ``admission_score_major`` table in ``university_admission``.

    An existing table is dropped first. The connection is now closed before
    returning; previously the cursor and connection were left open.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    db_name = "university_admission"
    tables = search_table_in_db(db_name)
    # Assumes connect_mysql_with_db() returns a fresh DB-API connection
    # owned by this call -- TODO confirm it is not a shared singleton.
    mydb = connect_mysql_with_db(db_name)
    try:
        mycursor = mydb.cursor()
        if "admission_score_major" in tables:
            function_logger.info("admission_score_major表已存在!")
            function_logger.info("正在删除admission_score_major表...")
            mycursor.execute("DROP TABLE admission_score_major;")

        # Per-major score lines of a school (school, district, year, major,
        # category, highest, average, lowest, number admitted).
        mycursor.execute("CREATE TABLE admission_score_major("
                         "id INT AUTO_INCREMENT PRIMARY KEY NOT NULL,"
                         "school VARCHAR(30),"
                         "district VARCHAR(10),"
                         "year INT,"
                         "major VARCHAR(100),"
                         "classy varchar(30),"
                         "highest varchar(10) NULL,"
                         "average varchar(10) NULL,"
                         "lowest varchar(10),"
                         "amount varchar(10) NULL)")
        mycursor.close()
    finally:
        mydb.close()
    function_logger.info("admission_score_major表创建完成!")
def score_major_doc_to_mysql_table_tuple(file_path, school):
    """
    Convert a tab-separated per-major score file into MySQL row tuples.

    The year and district are the first two "-"-separated fields of the file
    name (last backslash-separated component of ``file_path``). Every "-" in
    a line is replaced by "NULL" before the line is split on tabs. The first
    line is the header and is not returned.

    :param file_path: path of the score file (backslash-separated)
    :param school: school name stored in every tuple
    :return: list of (school, district, year, major, classy, highest,
        average, lowest, amount) tuples
    """
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    raw_lines = read_file_content(file_path)
    name_fields = file_path.split("\\")[-1].split("-")
    year = name_fields[0]
    district = name_fields[1]
    table_format = name_fields[-1]
    rows = []
    for idx, raw in enumerate(raw_lines):
        cleaned = raw.strip().replace("-", "NULL")
        raw_lines[idx] = cleaned
        rows.append(cleaned.split("\t"))
    # Reading the header keeps the failure on an empty file.
    table_head = rows[0]
    return [(school, district, year, row[0], row[1], row[2], row[3], row[4],
             row[5]) for row in rows[1:]]
def build_school_dict():
    """
    Write the school-name dictionary file ``school.txt``.

    The file is created under ``dictionary_path``; it lists the full school
    names first and their abbreviations afterwards, one per line.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("构造学校名称词典...")
    # Full names.
    c9 = [
        "北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学", "中国科学技术大学",
        "哈尔滨工业大学", "西安交通大学", "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"
    ]
    # Abbreviations, in the same order.
    c9_j = [
        "北大", "清华", "复旦", "上交", "浙大", "南大", "中科大", "哈工大", "西交大", "北大医学部",
        "上交医学部", "复旦医学部"
    ]
    target_path = dictionary_path + "/school.txt"
    with open(target_path, "w", encoding="utf-8") as school_dict:
        school_dict.truncate()
        school_dict.writelines(name + "\n" for name in c9 + c9_j)
    function_logger.info("构造学校名称词典完成!")
def get_undergraduate_university_info():
    """
    Crawl the chsi.com.cn university list and pickle it.

    The number of result pages is read from the first page; every page is
    then rendered through ``selenium_chrome`` and its "ch-table" table is
    parsed. One dict per university (detail url plus the table columns keyed
    by the header texts) is collected, printed and finally pickled to
    ``Information/大学/university_info``.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Search url of the university library; the start offset is appended.
    main_url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&xlcc=bk&start="
    main_page_source = request_url(main_url + "0")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    page_count = int(
        main_page_soup.find("li", class_="lip dot").next_sibling.text)
    page_university_count = 20
    university_infos = []
    for i_page in range(page_count):
        page_url = main_url + str(i_page * page_university_count)
        function_logger.info("页面抓取进度(%d,%d)" % (i_page + 1, page_count))
        function_logger.info("页面url%s" % page_url)
        browser = selenium_chrome(page_url)
        try:
            page_souce = browser.find_element_by_class_name(
                "ch-table").get_attribute("innerHTML")
        finally:
            # Always release the browser, even if the table is missing;
            # otherwise a failed lookup leaves the browser running.
            browser.quit()
        page_soup = BeautifulSoup(page_souce, "lxml")
        head = [th.text for th in page_soup.find("tr").find_all("th")]
        print(head)
        for tr in page_soup.find_all("tr")[1:]:
            info = {}
            td_list = tr.find_all("td")
            info["url"] = "https://gaokao.chsi.com.cn" + td_list[0].find(
                "a")["href"]
            for i in [0, 1, 2, 3, 4, 7]:
                info[head[i]] = td_list[i].text.strip()
            # Column 5: drop line breaks and spaces, then turn the en-space
            # separator (U+2002) into a normal space.
            info[head[5]] = td_list[5].text.strip().replace("\n", "").replace(
                " ", "").replace("\u2002", " ")
            # Column 6: an empty cell is stored as "无"; otherwise the
            # private-use character U+E664 is replaced with "有".
            info[head[6]] = td_list[6].text.strip().replace(
                "\ue664", "有") if td_list[6].text.strip() != "" else "无"
            university_infos.append(info)
    for info in university_infos:
        print(info)
    with open("Information/大学/university_info", "wb") as p_file:
        pickle.dump(university_infos, p_file)
Esempio n. 26
0
def create_plan_score_folder_c9():
    """
    Create the folder tree for the C9 schools and their medical schools.

    For every school, ``Information/九校联盟/<school>/<category>/source`` is
    created for both categories ("招生计划" and "录取分数"). Existing folders
    are left alone and missing ones at any level are created. Previously the
    sub-folders were only created when the parent folder did not exist yet,
    so a partially created tree was never completed.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # C9 and their medical schools.
    c9 = [
        "北京大学", "清华大学", "复旦大学", "上海交通大学", "浙江大学", "南京大学", "中国科学技术大学",
        "哈尔滨工业大学", "西安交通大学", "北京大学医学部", "上海交通大学医学部", "复旦大学上海医学部"
    ]
    catalog = ["招生计划", "录取分数"]
    root_path = "Information/九校联盟"
    for university in c9:
        function_logger.info("创建%s的文件夹" % university)
        for cat in catalog:
            # The "source" folder stores the raw crawled data; makedirs also
            # creates the school and category folders above it.
            os.makedirs(root_path + "/" + university + "/" + cat + "/source",
                        exist_ok=True)
        function_logger.info("%s的文件夹创建完成!" % university)
Esempio n. 27
0
# -*- coding: utf-8 -*-
"""
@File  : HanLPAPI.py
@Author: SangYu
@Date  : 2018/12/27 14:56
@Desc  : HanLP平台的API
"""
from pyhanlp import *
from Log.Logger import MyLog


# Word segmentation (with part-of-speech tags)
def hanlp_nlp_segmentor(sentence):
    """
    Segment a sentence with HanLP's NLPTokenizer.

    :param sentence: text to analyse
    :return: the analysis rendered as a string and split on single spaces
    """
    tokenizer_cls = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    analysis = tokenizer_cls.analyze(sentence)
    return str(analysis).split(" ")


# Word segmentation (without part-of-speech tags)
def hanlp_nlp_segmentor_without_nature(sentence):
    """
    Segment a sentence with HanLP's NLPTokenizer and drop the tags.

    :param sentence: text to analyse
    :return: list holding, for each token, the text before its first "/"
    """
    tokenizer_cls = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    tagged_tokens = str(tokenizer_cls.analyze(sentence)).split(" ")
    return [token.partition("/")[0] for token in tagged_tokens]


# Manual smoke test: segment two sample questions and print the results.
if __name__ == "__main__":
    mylogger = MyLog(logger=__name__).getlog()
    mylogger.info("start...")
    # First call prints only the type of the returned value.
    print(type(hanlp_nlp_segmentor("2015年哈工大软件工程在河南招多少人?")))
    # Second call prints the tokens themselves.
    print(hanlp_nlp_segmentor("一五年哈工大软件工程在河南招多少人?"))
    mylogger.info("end...")
Esempio n. 28
0
                else:
                    turn_page_url = main_url + answer_text_class.find(
                        "a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text,
                                                   "lxml")
                    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")
                    answer_text = re.sub(pattern, "", str(turn_page_soup.find("div", class_="question_a").text)) \
                        .replace("[回复]", "")
                function_logger.debug("回答:%s" % answer_text)
                records.append([
                    question_title, question_from, question_time,
                    question_text, answer_text
                ])
            with open(file_path + "/" + school[0] + "常用问题集.csv",
                      "a",
                      encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for record in records:
                    writer.writerow(record)
            time.sleep(3)
        function_logger.info("%s的常用问题集收集完毕!" % school[0])


# Script entry point: run the question crawler with start/end debug logs.
if __name__ == '__main__':
    main_logger = MyLog(__name__).getlog()
    main_logger.debug("start...")
    get_question_yggk()
    main_logger.debug("end...")
Esempio n. 29
0
def _fetch_yggk_soup(url):
    """Download ``url`` and parse its text with lxml, using the apparent encoding."""
    response = request_url(url)
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, "lxml")


def get_question_yggk():
    """
    Crawl the admission Q&A forum of each listed school into a CSV file.

    For every school a CSV named ``<school>常用问题集.csv`` is created under
    ``Information/大学/Test`` with a header row; the forum pages are then
    fetched one by one and their question/answer pairs appended. Pinned
    questions are only recorded from the first page. Texts truncated with a
    "[详细]" link are read from the linked detail page.

    :return: None
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Base url of the consulting site.
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                   ["复旦大学", str(7243)], ["南京大学", str(4453)],
                   ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学",
                                                    str(53593)]]
    table_head = ["标题", "来源", "时间", "问题", "回答"]
    # Number of questions per forum page.
    page_question_count = 15
    # Strips whitespace and a private-use icon character from long answers.
    answer_noise = re.compile(r"\s+|\n|\t|\v|\ue63c")
    for school in school_urls:
        function_logger.info("开始抓取" + school[0] + "的招生问题数据...")
        csv_path = file_path + "/" + school[0] + "常用问题集.csv"
        forum_url = (main_url +
                     "/zxdy/forum--method-listDefault,year-2005,forumid-" +
                     school[1] + ",start-")
        # Create the school's CSV and write the header row. newline="" is
        # what the csv module requires; without it extra blank rows are
        # written on platforms that translate "\n".
        with open(csv_path, "w", encoding='utf-8', newline="") as csvfile:
            csv.writer(csvfile).writerow(table_head)
        main_page_source = request_url(forum_url + "0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # Total number of pages.
        page_count = int(
            main_page_soup.find("li",
                                class_="lip dot").next_sibling.a.string)
        # Number of pinned questions (repeated on every page).
        top_question_count = len(
            main_page_soup.find("table", class_="ch-table zx-table").find_all(
                "span", class_="question_top_txt"))
        # NOTE(review): page index 10 is skipped and the first ten pages are
        # requested even when page_count is smaller -- kept as in the
        # original, confirm whether this is intentional.
        for i_page in list(range(10)) + list(range(11, page_count)):
            page_url = forum_url + str(
                i_page * page_question_count) + ".dhtml"
            function_logger.info("页面抓取进度(%d,%d)" % (i_page + 1, page_count))
            function_logger.info("页面url%s" % page_url)
            page_soup = _fetch_yggk_soup(page_url)
            table = page_soup.find("table", class_="ch-table zx-table")
            # Build a filtered list instead of removing "\n" nodes from the
            # list being iterated, which skipped the node after each removal.
            tr_list = [node for node in table.contents if node != "\n"]
            records = []
            # Pinned Q&A pairs are only recorded from the first page.
            if i_page == 0:
                start_index = 0
            else:
                start_index = top_question_count * 2
            # Rows come in pairs: a title row followed by a Q&A body row.
            for i_qa_pair in range(start_index, len(tr_list), 2):
                title_row = tr_list[i_qa_pair]
                body_row = tr_list[i_qa_pair + 1]
                question_title = str(
                    title_row.find("a",
                                   class_="question_t_txt").string).strip()
                function_logger.debug("标题:%s" % question_title)
                question_from = str(
                    title_row.find(
                        "i", title="提问人").next_sibling.string).strip()
                function_logger.debug("来源:%s" % question_from)
                question_time = str(
                    title_row.find(
                        "td",
                        class_="question_t ch-table-center").text).strip()
                function_logger.debug("时间:%s" % question_time)
                # A long question or answer is truncated on the list page
                # and must be read from its detail page.
                question_text_class = body_row.find("div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_soup = _fetch_yggk_soup(
                        main_url +
                        question_text_class.find("a", text='[详细]')["href"])
                    question_text = str(
                        turn_page_soup.find("div",
                                            class_="question").text).strip()
                function_logger.debug("问题:%s" % question_text)
                answer_text_class = body_row.find("div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace(
                        "[ 回复 ]", "").strip()
                else:
                    turn_page_soup = _fetch_yggk_soup(
                        main_url +
                        answer_text_class.find("a", text='[详细]')["href"])
                    answer_text = re.sub(
                        answer_noise, "",
                        str(
                            turn_page_soup.find(
                                "div", class_="question_a").text)).replace(
                                    "[回复]", "")
                function_logger.debug("回答:%s" % answer_text)
                records.append([
                    question_title, question_from, question_time,
                    question_text, answer_text
                ])
            with open(csv_path, "a", encoding='utf-8',
                      newline="") as csvfile:
                csv.writer(csvfile).writerows(records)
            # Pause between pages.
            time.sleep(3)
        function_logger.info("%s的常用问题集收集完毕!" % school[0])
def frequent_question_normalize(dir_path: str):
    """
    Clean the question and answer columns of the FAQ CSV files.

    Every file under ``<dir_path>/source`` is read; rows that do not have
    exactly five columns are ignored and the first kept row (the header) is
    dropped. The cleaned rows are pickled to
    ``<dir_path>/预处理/pickle/<school_name>``.

    :param dir_path: folder that holds the ``source`` sub-folder
    :return: None
    """

    def _clean_row(row):
        # Key order is kept so the pickled dicts stay unchanged.
        return {
            "title": row[0].replace(" ", ""),
            "from": row[1],
            "time": row[2],
            "question": row[3].replace("\u3000", "").replace(
                "\n", ",").replace(" ", ""),
            "answer": row[4].replace("\ue63c", "").replace(
                "\u3000", "").replace("\n", ",").replace(" ",
                                                         "").lstrip(","),
        }

    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始进行数据处理...")
    for file in read_all_file_list(dir_path + "/source"):
        function_logger.debug(file)
        # The file name without its last nine characters is the school name.
        school_name = file.split("\\")[-1][:-9]
        function_logger.info("开始读取%s的常问问题集..." % school_name)
        with open(file, "r", encoding="utf-8") as csvfile:
            fqa_lines = [
                _clean_row(row) for row in csv.reader(csvfile)
                if len(row) == 5
            ]
            # Drop the header row.
            fqa_lines.pop(0)
        function_logger.info("读取%s的常用问题集完成!" % school_name)
        function_logger.info("开始写入%s的常用问题集..." % school_name)
        with open(dir_path + "/预处理/pickle/" + school_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("写入%s的常用问题集完成!" % school_name)
    function_logger.info("数据处理完成!")