def pretreat_crawl_questions():
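    """
    Read the crawled FAQ CSV file of each university, keep the 5-column rows
    (title/source/time/question/answer), drop the header row and pickle the
    records into the Pickle directory.
    """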
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    data_dir = "Information/大学/常问问题集/Data"
    pickle_dir = "Information/大学/常问问题集/Pickle"
    file_list = os.listdir(data_dir)
    function_logger.debug("大学数量:%d" % len(file_list))
    for file in file_list:
        university_name = file[:-9]
        function_logger.debug(university_name)
        function_logger.info("开始读取%s的常问问题集..." % university_name)
        with open(data_dir + "/" + file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {}
                    line["title"] = row[0]
                    line["from"] = row[1]
                    line["time"] = row[2]
                    line["question"] = row[3]
                    line["answer"] = row[4]
                    fqa_lines.append(line)
            fqa_lines.pop(0)
        function_logger.info("读取%s的常用问题集完成!" % university_name)
        function_logger.info("开始写入%s的常用问题集..." % university_name)
        with open(pickle_dir + "/" + university_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("写入%s的常用问题集完成!" % university_name)
    function_logger.info("数据处理完成!")
def frequent_question_normalize(dir_path: str):
    """
    处理常用问题集(csv),问题和答案部分
    :param dir_path: 文件夹路径
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("开始进行数据处理...")
    file_list = read_all_file_list(dir_path + "/source")
    for file in file_list:
        function_logger.debug(file)
        school_name = file.split("\\")[-1][:-9]
        function_logger.info("开始读取%s的常问问题集..." % school_name)
        with open(file, "r", encoding="utf-8") as csvfile:
            csv_reader = csv.reader(csvfile)
            fqa_lines = []
            for row in csv_reader:
                if len(row) == 5:
                    line = {}
                    line["title"] = row[0].replace(" ", "")
                    line["from"] = row[1]
                    line["time"] = row[2]
                    line["question"] = row[3].replace("\u3000", "").replace(
                        "\n", ",").replace(" ", "")
                    line["answer"] = row[4].replace("\ue63c", "").replace("\u3000", "").replace("\n", ",")\
                        .replace(" ", "").lstrip(",")
                    fqa_lines.append(line)
            fqa_lines.pop(0)
        function_logger.info("读取%s的常用问题集完成!" % school_name)
        function_logger.info("开始写入%s的常用问题集..." % school_name)
        with open(dir_path + "/预处理/pickle/" + school_name, "wb") as p_file:
            pickle.dump(fqa_lines, p_file)
        function_logger.info("写入%s的常用问题集完成!" % school_name)
    function_logger.info("数据处理完成!")
Example #3
def load_table_content(file_path: str):
    """
    通过excel表格加载表格内容
    :param file_path:
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # Load the Excel workbook
    function_logger.info("加载表格:%s" % file_path.split("\\")[-1])
    wb = load_workbook(file_path)
    sheet_names = wb.sheetnames
    sheet_first = wb[sheet_names[0]]
    table_head = []
    for item in range(1, sheet_first.max_column + 1):
        table_head.append(sheet_first.cell(row=1, column=item).value)
    function_logger.debug("表头:%s" % str(table_head))
    table_attr = {}
    for i_column in range(1, sheet_first.max_column + 1):
        column_name = sheet_first.cell(row=1, column=i_column).value
        column_value = set()
        for i_row in range(2, sheet_first.max_row + 1):
            column_value.add(
                sheet_first.cell(row=i_row, column=i_column).value)
        table_attr[column_name] = str(list(column_value))
    for key in table_attr:
        function_logger.debug(key)
        value_list = [
            value.replace("'", "").strip()
            for value in table_attr[key][1:-1].split(",")
        ]
        value_list.sort()
        function_logger.debug("列表长度:%d" % len(value_list))
        function_logger.debug(str(value_list))
    function_logger.info("加载表格:%s完成!" % file_path.split("\\")[-1])
def search_table_in_db(db_name: str) -> list:
    """
    查询数据库中表名
    :param db_name: 数据库名
    :return: 数据库中表名列表
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_with_db(db_name)
    mycursor = mydb.cursor()
    mycursor.execute("SHOW TABLES")
    tables = []
    function_logger.debug(db_name + "数据库中有以下表:")
    for table in mycursor:
        tables.append(table[0])
        function_logger.debug(table[0])
    return tables
def create_database(db_name: str):
    """
    创建数据库university_admission
    :param db_name: 数据库名
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mydb = connect_mysql_without_db()
    mycursor = mydb.cursor()
    mycursor.execute("SHOW DATABASES")
    dbs = []
    function_logger.debug("数据库如下:")
    for db in mycursor:
        dbs.append(db[0])
        function_logger.debug(db[0])
    if db_name in dbs:
        function_logger.info("数据库" + db_name + "已存在!")
    else:
        mycursor.execute("CREATE DATABASE " + db_name)
        function_logger.info(db_name + "已创建!")
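# A minimal usage sketch (hypothetical; assumes the MySQL connection helpers
# used above are configured for a reachable server):
#     create_database("university_admission")
#     print(search_table_in_db("university_admission"))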
def label_data():
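    """
    Load the pickled FAQ records of every university and count the total
    number of Q&A records (data_dir, label_dir and the line_1..line_7 lists
    are currently unused).
    """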
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    data_dir = "Information/大学/常问问题集/Data"
    pickle_dir = "Information/大学/常问问题集/Pickle"
    label_dir = "Information/大学/常问问题集/label"
    file_list = os.listdir(pickle_dir)
    function_logger.debug("大学数量:%d" % len(file_list))
    line_1 = []
    line_2 = []
    line_3 = []
    line_4 = []
    line_5 = []
    line_6 = []
    line_7 = []
    all_count = 0
    for file in file_list:
        print(file)
        university_name = file
        with open(pickle_dir + "/" + university_name, "rb") as p_file:
            lines = pickle.load(p_file)
        lines_count = len(lines)
        all_count += lines_count
    print(all_count)
def get_plan_info_ustc():
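    """
    Crawl the USTC admission-plan pages (one per district) from zsb.ustc.edu.cn,
    parse the table on each page, and write a 专业/类别/人数 table for every
    year-district pair via write_table().
    """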
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/中国科学技术大学/招生计划"
    main_url = "https://zsb.ustc.edu.cn"
    # Fetch the category page
    main_page_source = request_url(main_url + "/12993/list.htm")
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    for area in main_page_soup.find_all("area"):
        page_url = area["href"]
        page_source = request_url(page_url)
        page_source.encoding = page_source.apparent_encoding
        page_soup = BeautifulSoup(page_source.text, "lxml")
        page_soup.prettify()
        title = page_soup.find("h1", class_="arti_title").string
        year = title[:4]
        district = title[5:-4]
        table_name = year + "-" + district
        table_head = ["专业", "类别", "人数"]
        mylogger.debug(table_name)
        mylogger.debug(str(table_head))
        all_lines = []
        for tr in page_soup.find("div",
                                 class_="wp_articlecontent").find_all("tr"):
            line = []
            for td in tr:
                line.append(td.text)
            all_lines.append(line)
        table_content = []
        for line in all_lines[1:]:
            if line[0] != "合计" and line[0] != "小计":
                if district == "浙江" or district == "上海":
                    table_content.append(
                        [line[0] + "(" + line[1] + ")", "理工", line[2]])
                else:
                    table_content.append([line[0], "理工", line[1]])
        for line in table_content:
            mylogger.debug(str(line))
        write_table(file_path, table_name, table_head, table_content)
        mylogger.info(year + district + "的招生计划已存入文件")
def get_plan_info_xjtu():
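    """
    Query Xi'an Jiaotong University's admission plan through the official
    search form for every available year and province, and write the results
    as 专业/类别/人数 tables via write_table().
    """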
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path = "Information/九校联盟/西安交通大学/招生计划"
    # Fetching each page individually would need extra post-processing and is cumbersome
    # mylogger.info("开始获取网页源码...共五个网页")
    # with open(file_path+"/source/page_url_list","w",encoding="utf-8")as url_file:
    #     for i in range(1, 6):
    #         main_url = "http://zs.xjtu.edu.cn/lmy.jsp?a43639t=5&a43639p=" + str(i) \
    #                    + "&a43639c=10&urltype=tree.TreeTempUrl&wbtreeid=1005"
    #         # 获取分类信息
    #         main_page_source = requests.get(main_url).text
    #         main_page_soup = BeautifulSoup(main_page_source, "lxml")
    #         main_page_soup.prettify()
    #         for item in main_page_soup.find("div", id="fybt").find("ul").find_all("a"):
    #             url_file.write(str(item)+"\n")
    # mylogger.info("招生计划页面url获取完成")
    # mylogger.info("开始获取具体页面信息")
    # with open(file_path + "/source/page_url_list", "r", encoding="utf-8")as url_file:
    #     url_source = url_file.read()
    # url_soup = BeautifulSoup(url_source,"lxml")
    # url_soup.prettify()
    # for page_url in url_soup.find_all("a"):
    #     print(page_url)
    # Query the data directly from the official site by submitting the search form
    # Get the years and districts that can be queried
    main_url = "http://zs.xjtu.edu.cn/bkscx/zsjhcx.htm"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select",
                                        id="sf").find_all("option")[1:]:
        districts.append(district.string)
    mylogger.debug("可查询的年份" + str(years))
    mylogger.debug("可查询的省份" + str(districts))
    search_url = "http://zs.xjtu.edu.cn/zsjg.jsp?wbtreeid=1168"
    for year in years:
        for district in districts:
            # x, y are the click coordinates on the search button (button size 54x22)
            params = {"nf": year, "sf": district, "x": "27", "y": "11"}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for tr in return_soup.find("div", id="fybt").find_all("tr"):
                line = []
                for td in tr:
                    if td.string != "\n":
                        line.append(str(td.string).strip())
                all_lines.append(line)
            table_name = year + "-" + district[:-1]
            table_head = ["专业", "类别", "人数"]
            table_content = []
            for line in all_lines[1:-1]:
                classy = line[2]
                if classy == "理":
                    classy = "理工"
                if classy == "文":
                    classy = "文史"
                table_content.append([line[0], classy, line[4]])
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + district + "的招生计划已存入文件")
def get_plan_info_nju():
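    """
    Build Nanjing University's admission-plan tables: the commented-out steps
    fetched the page sources and PDF files; the active part parses the
    downloaded PDFs and writes 专业/类别/人数 tables via write_table().
    """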
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://bkzs.nju.edu.cn"
    # Fetch the category page

    file_path = "Information/九校联盟/南京大学/招生计划"
    # Use selenium to fetch the hidden part of the page source
    # browser = selenium_chrome(main_url+"/4543/list.htm")
    # pro_list = browser.find_element_by_id("MapControl")
    # with open(file_path+"/source/"+"index","w",encoding="utf-8") as file:
    #     file.write(pro_list.get_attribute('innerHTML'))

    # Read the saved source back in and parse it with bs4 to get the HTML of each page
    # with open(file_path + "/source/" + "index", "r", encoding="utf-8") as file:
    #     source_code = file.read()
    # main_page_soup = BeautifulSoup(source_code, "lxml")
    # main_page_soup.prettify()
    # for li in main_page_soup.find_all("li"):
    #     url = li.a["href"]
    #     pro = li.span.text
    #     print(pro + "\t" + url)
    #     browser = selenium_chrome(main_url+url)
    #     page_source = browser.find_element_by_class_name("wp_articlecontent").get_attribute("innerHTML")
    #     year = re.findall("\d{4}",BeautifulSoup(page_source,"lxml").find("p").text)[0]
    #     with open(file_path + "/source/"+year+"-"+pro+".html","w",encoding="utf-8") as file:
    #         file.write(page_source)
    #     browser.quit()
    #     time.sleep(5)

    # Download the PDF files
    # file_list = read_all_file_list(file_path + "/source")
    # for file_name in file_list:
    #     pdf_name = file_name.split("\\")[-1][:-5]
    #     if file_name[-4:] == "html":
    #         print(file_name)
    #         with open(file_name, "r", encoding="utf-8") as file:
    #             page_source = file.read()
    #         page_soup = BeautifulSoup(page_source,"lxml")
    #         for item in page_soup.find_all("div",class_="wp_pdf_player"):
    #             pdf_url = item["pdfsrc"]
    #             pdf_source = request_url(main_url+pdf_url)
    #             with open(file_path + "/source/"+pdf_name+".pdf","wb")as pdf_file:
    #                 pdf_file.write(pdf_source.content)

    # Parse the PDF files
    file_list = read_all_file_list(file_path + "/source")
    for file_name in file_list:
        if file_name[-3:] == "pdf":
            pdf_name = file_name.split("\\")[-1][:-4]
            year = pdf_name.split("-")[0]
            pro = pdf_name.split("-")[-1]
            pages = read_pdf_to_tables(file_name)
            table_name = year + "-" + pro
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            all_lines = []
            for tables in pages:
                for table in tables:
                    for line in table:
                        all_lines.append(line)
            # Split the lines into separate tables (each table starts with a 科类 header row)
            all_tables = []
            table = []
            for line in all_lines:
                if line[0] == "科类":
                    if len(table) != 0:
                        all_tables.append(table)
                    table = []
                    table.append(line)
                else:
                    table.append(line)
            all_tables.append(table)
            # Write the table's tag (e.g. 国家专项计划) into every row of that table
            all_lines = []
            for table in all_tables:
                sign = table[1][0]
                if sign == "国家专项计划" or sign == "提前批":
                    for line in table:
                        all_lines.append([
                            line[0],
                            str(line[1]) + "(" + sign + ")", line[2]
                        ])
                else:
                    for line in table:
                        all_lines.append(line)

            table_content = []
            for line in all_lines:
                if line[0] == "科类" or line[0] == "总计" or line[1].find("小计") != -1 or line[1].find("None") != -1 \
                        or line[2] == "" or line[2] == "0" or line[2] is None:
                    continue
                classy = line[0]
                if classy == "理":
                    classy = "理工"
                elif classy == "文":
                    classy = "文史"
                table_content.append(
                    [line[1].replace("( )\n", ""), classy, line[2]])
            for line in table_content:
                mylogger.debug(str(line))
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "招生计划已存入文件")
def get_plan_info_hit():
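    """
    Crawl Harbin Institute of Technology's admission plan for every year and
    province from zsb.hit.edu.cn and write each table to a text file via
    write_table().
    """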
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    mylogger.info("开始获取网页源码...")
    main_url = "http://zsb.hit.edu.cn/information/plan"
    # Fetch the category page
    main_page_source = requests.get(main_url).text
    main_page_soup = BeautifulSoup(main_page_source, "lxml")
    main_page_soup.prettify()
    # Provinces covered by the admission plan
    mylogger.info("解析招生地区...")
    province = []
    for item in main_page_soup.find(class_="province").find_all(name='a'):
        province.append(item.string.strip())
    mylogger.debug("哈工大招生地区:" + str(province))
    # Years covered by the admission plan
    mylogger.info("解析招生年份...")
    years = []
    for item in main_page_soup.find_all(class_="year-select"):
        years.append(item.string.strip())
    mylogger.debug("哈工大招生年份:" + str(years))

    # Extract the data for every year and province
    mylogger.info("开始获取各年各地区数据...")
    for pro in province:
        for year in years:
            mylogger.info("开始获取" + year + pro + "的招生计划")
            # Build the query URL
            specific_url = main_url + "?" + "year=" + year + "&" + "province=" + pro
            page_source = requests.get(specific_url).text
            page_soup = BeautifulSoup(page_source, "lxml")
            page_soup.prettify()
            # Table name
            table_name = year + "-" + pro
            mylogger.debug("表名:" + table_name)
            # Table head
            table_head = []
            for item in page_soup.find(class_="info_table").thead.find_all(
                    name="td"):
                table_head.append(item.string.strip())
            mylogger.debug("表头:" + str(table_head))
            # Table content
            table_content = []
            for item in page_soup.find(class_="info_table").tbody.find_all(
                    name="tr"):
                temp = []
                for sub_item in item.find_all(name="td"):
                    temp.append(sub_item.string.strip())
                table_content.append(temp)
            # Remove the rows with no data (the commented line would also drop the
            # 统计 summary rows); filtering into a new list avoids skipping items
            # when removing while iterating
            table_content = [item for item in table_content if item[0] != "无数据"]
            # table_content = [item for item in table_content if item[1] != "统计"]
            mylogger.debug("表内容如下:")
            for item in table_content:
                mylogger.debug(item)
            # Write the table content to a text file
            file_path = "Information/九校联盟/哈尔滨工业大学/招生计划"
            write_table(file_path, table_name, table_head, table_content)
            mylogger.info(year + pro + "的招生计划已存入文件")
def get_plan_info_fudan():
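    """
    Query Fudan University's admission plan through the official search form
    for every year and district, split the results into main campus and
    Shanghai Medical College tables (recruited separately since 2013), and
    write them via write_table().
    """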
    mylogger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    file_path_benbu = "Information/九校联盟/复旦大学/招生计划"
    file_path_yixue = "Information/九校联盟/复旦大学上海医学部/招生计划"
    # Query the data directly from the official site by submitting the search form
    # Get the years and districts that can be queried
    main_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.html"
    main_page_source = request_url(main_url)
    main_page_source.encoding = main_page_source.apparent_encoding
    main_page_soup = BeautifulSoup(main_page_source.text, "lxml")
    main_page_soup.prettify()
    years = []
    districts = []
    for year in main_page_soup.find("select", id="nf").find_all("option"):
        years.append(year.string)
    for district in main_page_soup.find("select", id="ss").find_all("option"):
        districts.append(district.string)
    mylogger.debug("可查询的年份" + str(years))
    mylogger.debug("可查询的省份" + str(districts))
    search_url = "http://www.ao.fudan.edu.cn/index!enrollmentPlan.action"
    # Data is available for 2006-2015
    for year in years:
        for district in districts:
            params = {"lb": "plan", "nf": year, "ss": district}
            return_html = requests.post(search_url, data=params)
            return_soup = BeautifulSoup(return_html.text, "lxml")
            return_soup.prettify()
            all_lines = []
            for div in return_soup.find_all("div",
                                            class_="inquirytable_result"):
                for tr in div.find_all("tr"):
                    line = []
                    for td in tr:
                        if td.string != "\n":
                            line.append(str(td.string).strip())
                    all_lines.append(line)
            table_name = year + "-" + district
            table_head = ["专业", "类别", "人数"]
            mylogger.debug(table_name)
            mylogger.debug(str(table_head))
            # The query returned no data
            if len(all_lines) < 3:
                continue
            # Start extracting the data
            table_content_benbu = []
            table_content_yixue = []
            # From 2013 on, Fudan University and its Shanghai Medical College recruit separately
            if int(year) < 2013:
                for line in all_lines[1:-1]:
                    # Skip the 文史 and 理工 summary rows
                    if line[0] == "文史汇总" or line[0] == "理工汇总":
                        continue
                    # The Shanghai table uses a different header layout
                    if district == "上海":
                        table_content_benbu.append([line[0], line[1], line[5]])
                    else:
                        table_content_benbu.append([line[0], line[1], line[3]])
            else:
                # First split the rows into main campus and medical college parts
                index = 0
                for i_line in range(1, len(all_lines)):
                    if all_lines[i_line][0] == "专业名称":
                        index = i_line
                        break
                if index == 0:
                    all_lines_benbu = all_lines
                    all_lines_yixue = []
                else:
                    all_lines_benbu = all_lines[:index]
                    all_lines_yixue = all_lines[index:]
                for line in all_lines_benbu[1:-1]:
                    # Skip the 文史 and 理工 summary rows
                    if line[0] == "文史汇总" or line[0] == "理工汇总":
                        continue
                    # The Shanghai table uses a different header layout
                    if district == "上海":
                        table_content_benbu.append([line[0], line[1], line[5]])
                    else:
                        table_content_benbu.append([line[0], line[1], line[3]])
                if len(all_lines_yixue) != 0:
                    for line in all_lines_yixue[1:-1]:
                        # Skip the 文史 and 理工 summary rows
                        if line[0] == "文史汇总" or line[0] == "理工汇总":
                            continue
                        # The Shanghai table uses a different header layout
                        if district == "上海":
                            table_content_yixue.append(
                                [line[0], line[1], line[5]])
                        else:
                            table_content_yixue.append(
                                [line[0], line[1], line[3]])
            mylogger.debug("本部招生计划:")
            for line in table_content_benbu:
                mylogger.debug(str(line))
            mylogger.debug("医学院招生计划:")
            for line in table_content_yixue:
                mylogger.debug(str(line))
            write_table(file_path_benbu, table_name, table_head,
                        table_content_benbu)
            mylogger.info("本部" + year + district + "的招生计划已存入文件")
            if len(table_content_yixue) != 0:
                write_table(file_path_yixue, table_name, table_head,
                            table_content_yixue)
                mylogger.info("医学院" + year + district + "的招生计划已存入文件")
def build_mysql_major_dict():
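    """
    Collect the major names from the admission_plan and admission_score_major
    tables, strip bracketed suffixes to merge variants, and log the two sets
    together with their intersection and union.
    """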
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    function_logger.info("获取招生计划表中的专业字段...")
    # Major names in the admission plan table
    plan_sql_string = "SELECT major FROM admission_plan GROUP BY major;"
    myresult = mysql_query_sentence(plan_sql_string)
    function_logger.debug("招生计划表中专业数%d:" % len(myresult))
    pattern = re.compile(r"[(([].*?[))\]].*")
    plan_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        plan_major_set.add(temp)
    function_logger.debug("招生计划表中专业数(统计合并后):%d" % len(plan_major_set))
    function_logger.debug(
        str(
            sorted(list(plan_major_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))

    # Major names in the admission score table
    score_sql_string = "SELECT major FROM admission_score_major GROUP BY major;"
    myresult = mysql_query_sentence(score_sql_string)
    function_logger.debug("录取分数表中专业数%d:" % len(myresult))
    score_major_set = set()
    for major in myresult:
        temp = re.sub(pattern, "", major[0])
        score_major_set.add(temp)
    function_logger.debug("录取分数表中专业数(统计合并后):%d" % len(score_major_set))
    function_logger.debug(
        str(
            sorted(list(score_major_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))

    # Compute the intersection of the two sets
    function_logger.debug("以上两者的交集为:")
    major_and_set = plan_major_set.intersection(score_major_set)
    function_logger.debug("交集长度为:%d", len(major_and_set))
    function_logger.debug(
        str(
            sorted(list(major_and_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))

    # Compute the union of the two sets
    function_logger.debug("以上两者的并集为:")
    major_or_set = plan_major_set.union(score_major_set)
    function_logger.debug("并集长度为:%d", len(major_or_set))
    function_logger.debug(
        str(
            sorted(list(major_or_set),
                   key=lambda x: lazy_pinyin(x.lower())[0][0])))
Example #13
def test_frequent_question(file_path: str):
    """
    读取常用问题集文件,并进行测试
    :param file_path: 文件(csv)路径
    :return:
    """
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    school_name = file_path.split("\\")[-1][:-14]
    print(school_name)
    function_logger.info("开始读取%s的常问问题集" % school_name)
    with open(file, "r", encoding="utf-8") as csvfile:
        csv_reader = csv.reader(csvfile)
        table_lines = []
        for row in csv_reader:
            table_lines.append(row)
    table_head = table_lines[0]
    table_content = table_lines[1:]
    # print(table_head)
    # for line in table_content:
    #     print(line)
    # Start the test run
    function_logger.info("读取完成,开始进行测试!")
    question_count = len(table_content)
    print(question_count)
    answer_count = 0
    answer_null_count = 0
    mysql_string_null_count = 0
    mysql_string_only_school_count = 0
    current_index = 1
    os.chdir(os.path.split(os.path.realpath(__file__))[0])
    if not os.path.exists("record/" + school_name):
        os.makedirs("record/" + school_name)
    with open("record/" + school_name + "/mysql_string_null", "w", encoding="utf-8") as msn_file, \
            open("record/" + school_name + "/mysql_string_only_school", "w", encoding="utf-8") as msos_file, \
            open("record/" + school_name + "/answer_null", "w", encoding="utf-8") as an_file, \
            open("record/" + school_name + "/answer_not_null", "w", encoding="utf-8") as ann_file:
        for record in table_content:
            function_logger.info("%s测试进度%d/%d" %
                                 (school_name, question_count, current_index))
            if len(record) == 5:
                question = record[3]
                function_logger.debug(question)
                start_time = time.time()
                mid_result, answer = answer_question_by_template(
                    question, 1, school_name)
                end_time = time.time()
                function_logger.debug("查询时间:%s" % (end_time - start_time))
                function_logger.debug(mid_result["mysql_string"])
                function_logger.debug(answer[0])

                if answer == ["问句条件词为空,无法构建查询语句!"]:
                    mysql_string_null_count += 1
                    msn_file.write(question + "\n")
                    msn_file.write(mid_result["mysql_string"] + "\n")
                    msn_file.write(str(answer) + "\n")
                elif answer == ["问句条件词只有学校,查询过宽!"]:
                    mysql_string_only_school_count += 1
                    msos_file.write(question + "\n")
                    msos_file.write(mid_result["mysql_string"] + "\n")
                    msos_file.write(str(answer) + "\n")
                elif answer == ["查询结果为空!"]:
                    answer_null_count += 1
                    an_file.write(question + "\n")
                    an_file.write(mid_result["mysql_string"] + "\n")
                    an_file.write(str(answer) + "\n")
                else:
                    answer_count += 1
                    # For more than three answer records, keep only the first three plus the total count
                    ann_file.write(question + "\n")
                    ann_file.write(mid_result["mysql_string"] + "\n")
                    ann_file.write("答案条数:" + str(len(answer)) + "\t")
                    if len(answer) > 3:
                        ann_file.write(str(answer[:3]) + "\n")
                    else:
                        ann_file.write(str(answer) + "\n")
            current_index += 1
        for input_file in [msn_file, msos_file, an_file, ann_file]:
            input_file.write(
                "总问题数%d\t查询语句构造为空数%d\t只有学校关键词数%d\t查询结果为空数%d\t有回答数%d\n" %
                (question_count, mysql_string_null_count,
                 mysql_string_only_school_count, answer_null_count,
                 answer_count))
    os.chdir(os.path.split(os.path.realpath(__file__))[0])
    with open("record/all.txt", "a", encoding="utf-8") as record_file:
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        record_file.write(now_time + "\t" + school_name + "\n")
        record_file.write(
            "总问题数%d\t查询语句构造为空数%d\t只有学校关键词数%d\t查询结果为空数%d\t有回答数%d\n" %
            (question_count, mysql_string_null_count,
             mysql_string_only_school_count, answer_null_count, answer_count))
def get_question_yggk():
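    """
    Crawl the admission Q&A forum on gaokao.chsi.com.cn for every 985/211
    university that has a forum id, using batches of 10 downloader threads
    plus one writer thread, and save the records to a per-university CSV file
    (the allready_get list is not used).
    """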
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # URL of the university consultation pages
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    allready_get = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                    ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                    ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                    ["复旦大学", str(7243)], ["南京大学", str(4453)],
                    ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                    ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学",
                                                     str(53593)]]
    university_formid = []
    with open("Information/大学/university_info", "rb") as p_file:
        university_infos = pickle.load(p_file)
    for info in university_infos:
        if "985" in info["院校特性"] or "211" in info["院校特性"]:
            if info["forum_id"] != "":
                university_formid.append([info["院校名称"], info["forum_id"]])
    function_logger.info("共有%d所985、211大学" % len(university_formid))
    for university in university_formid:
        begin = time.time()
        function_logger.info("开始抓取" + university[0] + "的招生问题数据...")
        main_page_url = "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-" + university[
            1] + ",start-0.dhtml"
        try:
            main_page_source = request_url(main_page_url)
            main_page_source.encoding = main_page_source.apparent_encoding
            main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
            # Get the total page count; the pager is parsed differently depending on whether it contains an ellipsis
            if main_page_soup.find("li", class_="lip dot"):
                page_count = main_page_soup.find(
                    "li", class_="lip dot").next_sibling.a.string
            else:
                page_count = main_page_soup.find(
                    "ul",
                    class_="ch-page clearfix").find_all("li")[-2].a.string
            # Number of pinned questions
            top_question_count = len(
                main_page_soup.find("table",
                                    class_="ch-table zx-table").find_all(
                                        "span", class_="question_top_txt"))
            function_logger.debug("页面总数:%d 置顶问题个数:%d" %
                                  (int(page_count), int(top_question_count)))
        except Exception as e:
            # The consultation page has no data (three universities)
            function_logger.error("%s咨询界面没有数据,页面链接为:%s" %
                                  (university[0], main_page_url))
            function_logger.error("错误信息:%s" % e)
            continue
        # Create the FAQ collection file for this school and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        csvfile = open(file_path + "/" + university[0] + "常用问题集.csv",
                       "w",
                       newline="",
                       encoding='utf-8')
        csvfile.truncate()
        writer = csv.writer(csvfile)
        writer.writerow(table_head)
        record_queue = Queue()
        # Start 10 threads per batch to download and store the data
        start_index = 0
        end_index = 10
        while True:
            if start_index > int(page_count):
                break
            else:
                dThread = [
                    DownloadPageInfo(university[1], page_id, int(page_count),
                                     top_question_count, record_queue)
                    for page_id in range(start_index, end_index)
                ]
                sThread = SavePageInfo(record_queue, writer)
                for d in dThread:
                    d.start()
                sThread.start()
                for d in dThread:
                    d.join()
                record_queue.put(-1)
                sThread.join()
                start_index += 10
                end_index += 10
                if end_index > int(page_count):
                    end_index = int(page_count)

        csvfile.close()
        function_logger.info("抓取%s的信息用时:%ds" %
                             (university[0], time.time() - begin))


if __name__ == '__main__':
    main_logger = MyLog(__name__).getlog()
    main_logger.debug("start...")
    get_question_yggk()
    main_logger.debug("end...")
Example #16
def get_question_yggk():
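    """
    Crawl the admission Q&A forum on gaokao.chsi.com.cn for a fixed list of
    schools, page by page, following the [详细] links when a question or
    answer is truncated, and append the records to a per-school CSV file.
    """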
    function_logger = MyLog(logger=sys._getframe().f_code.co_name).getlog()
    # URL of the university consultation pages
    main_url = "https://gaokao.chsi.com.cn"
    file_path = "Information/大学/Test"
    school_urls = [["北京大学", str(26232)], ["哈尔滨工业大学", str(26617)],
                   ["北京大学医学部", str(6405529)], ["上海交通大学", str(6217)],
                   ["上海交通大学医学院", str(61811)], ["清华大学", str(36710)],
                   ["复旦大学", str(7243)], ["南京大学", str(4453)],
                   ["浙江大学", str(43617)], ["中国科学技术大学", str(6280)],
                   ["哈尔滨工业大学(威海)", str(62646117)], ["西安交通大学",
                                                    str(53593)]]
    for school in school_urls:
        function_logger.info("开始抓取" + school[0] + "的招生问题数据...")
        # Create the FAQ collection sheet for this school and write the header row
        table_head = ["标题", "来源", "时间", "问题", "回答"]
        with open(file_path + "/" + school[0] + "常用问题集.csv",
                  "w",
                  encoding='utf-8') as csvfile:
            csvfile.truncate()
            writer = csv.writer(csvfile)
            writer.writerow(table_head)
        main_page_source = request_url(
            "https://gaokao.chsi.com.cn/zxdy/forum--method-listDefault,year-2005,forumid-"
            + school[1] + ",start-0.dhtml")
        main_page_source.encoding = main_page_source.apparent_encoding
        main_page_soup = BeautifulSoup(main_page_source.content, "lxml")
        # Total page count
        page_count = main_page_soup.find(
            "li", class_="lip dot").next_sibling.a.string
        # Number of pinned questions
        top_question_count = len(
            main_page_soup.find("table", class_="ch-table zx-table").find_all(
                "span", class_="question_top_txt"))
        # Questions per page
        page_question_count = 15
        # Visit each page by constructing its URL
        for i_page in list(range(10)) + list(range(11, int(page_count))):
            page_url = main_url + "/zxdy/forum--method-listDefault,year-2005,forumid-" + school[
                1] + ",start-" + str(i_page * page_question_count) + ".dhtml"
            # Base row index for the xls record (per-page questions + pinned questions + header)
            # if i_page == 0:
            #     base_count = 1
            # else:
            #     base_count = i_page * page_question_count + top_question_count + 1
            function_logger.info("页面抓取进度(%d,%d)" %
                                 (i_page + 1, int(page_count)))
            function_logger.info("页面url%s" % page_url)
            page_source = request_url(page_url)
            page_source.encoding = page_source.apparent_encoding
            page_soup = BeautifulSoup(page_source.text, "lxml")
            tr_list = page_soup.find("table",
                                     class_="ch-table zx-table").contents
            # Filter out the whitespace-only text nodes; building a new list
            # avoids skipping items when removing while iterating
            tr_list = [item for item in tr_list if item != "\n"]
            records = []
            # Record the pinned Q&A pairs only once (on the first page)
            if i_page == 0:
                start_index = 0
            else:
                start_index = top_question_count * 2
            for i_qa_pair in range(start_index, len(tr_list), 2):
                question_title = "q_title"
                question_from = ""
                question_time = ""
                question_text = "q_text"
                answer_text = "a_text"
                question_title = str(tr_list[i_qa_pair].find(
                    "a", class_="question_t_txt").string).strip()
                function_logger.debug("标题:%s" % question_title)
                question_from = str(tr_list[i_qa_pair].find(
                    "i", title="提问人").next_sibling.string).strip()
                function_logger.debug("来源:%s" % question_from)
                question_time = str(tr_list[i_qa_pair].find(
                    "td", class_="question_t ch-table-center").text).strip()
                function_logger.debug("时间:%s" % question_time)
                # A question or answer may be truncated on this page; follow the link to fetch the full text
                question_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question")
                if question_text_class.find(text='[详细]') is None:
                    question_text = str(question_text_class.text).strip()
                else:
                    turn_page_url = main_url + question_text_class.find(
                        "a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text,
                                                   "lxml")
                    question_text = str(
                        turn_page_soup.find("div",
                                            class_="question").text).strip()
                function_logger.debug("问题:%s" % question_text)
                answer_text_class = tr_list[i_qa_pair + 1].find(
                    "div", class_="question_a")
                if answer_text_class.find(text='[详细]') is None:
                    answer_text = str(answer_text_class.text).replace(
                        "[ 回复 ]", "").strip()
                else:
                    turn_page_url = main_url + answer_text_class.find(
                        "a", text='[详细]')["href"]
                    turn_page_source = request_url(turn_page_url)
                    turn_page_source.encoding = turn_page_source.apparent_encoding
                    turn_page_soup = BeautifulSoup(turn_page_source.text,
                                                   "lxml")
                    pattern = re.compile(r"\s+|\n|\t|\v|\ue63c")
                    answer_text = re.sub(pattern, "", str(turn_page_soup.find("div", class_="question_a").text)) \
                        .replace("[回复]", "")
                function_logger.debug("回答:%s" % answer_text)
                records.append([
                    question_title, question_from, question_time,
                    question_text, answer_text
                ])
            with open(file_path + "/" + school[0] + "常用问题集.csv",
                      "a",
                      encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                for record in records:
                    writer.writerow(record)
            time.sleep(3)
        function_logger.info("%s的常用问题集收集完毕!" % school[0])
Example #17
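# Fragment: this appears to be the tail of answer_question_by_template(), which
# runs the generated MySQL query and builds the answer list used in the
# __main__ block below.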
        result_edit.append("问句条件词为空,无法构建查询语句!")
    else:
        # If the school is the only condition keyword
        if "and" not in mysql_string:
            result_edit.append("问句条件词只有学校,查询过宽!")
        else:
            result = mysql_query_sentence(mysql_string)
            if len(result) == 0:
                result_edit.append("查询结果为空!")
            else:
                mid_result["search_result"] = result
                for item in result:
                    answer_string = build_mysql_answer_string_by_template(
                        match_template_answer, item)
                    result_edit.append(answer_string)
    return mid_result, result_edit


if __name__ == '__main__':
    main_logger = MyLog(logger=__name__).getlog()
    main_logger.info("start...")
    test_question = "哈工大前年软件工程石家庄招生人数?"
    main_logger.debug(test_question)
    test_mid_result, test_result = answer_question_by_template(test_question)
    for mid in test_mid_result:
        main_logger.debug(str(mid) + ":" + str(test_mid_result[mid]))
    main_logger.debug("查询结果:")
    for result in test_result:
        main_logger.debug(str(result))
    main_logger.info("end...")