Esempio n. 1
0
    def parse_multi_select_question(self, item_html, response):
        """Build one multi-select ``QuestionItem`` (``question_type`` 4).

        Args:
            item_html: Node for a single question. It is called with CSS
                selectors and must expose ``children``/``items`` (presumably
                a PyQuery object -- confirm against the caller).
            response: Not used by this method.

        Returns:
            A ``QuestionItem`` holding the title text, the list of option
            HTML fragments, the concatenated answers and the explanation
            HTML.
        """
        # Stem and options.
        option_block = item_html(".question").children(".question_option")
        title_text = option_block(".q_tit").text()
        options = []
        for option_node in option_block.children(".ops").items():
            label_html = option_node("label").html()
            dot_at = label_html.find(".")
            if dot_at != -1:
                # Keep only what follows the first "." (presumably drops an
                # "A."-style option marker -- confirm with real pages).
                label_html = label_html[dot_at + 1:]
            options.append(label_html)

        # Answers and explanation.
        explain_box = item_html(".tea_explain_content")
        answers = [
            answer_node("span").text()
            for answer_node in explain_box(".exp_correct_answer_desc ul").children().items()
        ]
        resolve_html = explain_box(".tea_explain_text").html()

        question = QuestionItem()
        question["question_title"] = title_text
        question["question_answer"] = "".join(answers)
        question["question_resolve_content"] = resolve_html
        question["question_content"] = options
        question["question_type"] = 4
        question["question_content_file_url_list"] = []
        return question
Esempio n. 2
0
 def parse_table_question(self, item_html, response):
     """Build one table-style ``QuestionItem`` (``question_type`` 3).

     Args:
         item_html: Node for a single question; queried with CSS selectors
             (presumably a PyQuery object -- confirm against the caller).
         response: Not used by this method.

     Returns:
         A ``QuestionItem`` whose title is the concatenated ``<p>`` stems,
         whose content is a one-element list with the first cell of every
         table row after the first, and whose answer is the comma-joined
         ``.answer`` texts.
     """
     question_node = item_html(".question")

     # Every direct <p> child of the question becomes part of the title.
     title_parts = [
         "<p>" + paragraph.html() + "</p>"
         for paragraph in question_node.children("p").items()
     ]

     # Only the first cell of each row is collected; the first row is skipped.
     row_labels = []
     table_rows = question_node(".check_table").children("tr").items()
     for row_number, row in enumerate(table_rows):
         if row_number == 0:
             continue
         for first_cell in row.children("td").items():
             row_labels.append("<p>" + first_cell.text() + "</p>")
             break

     explain_box = item_html(".tea_explain_content")
     answer_items = explain_box(
         ".exp_correct_answer_content .exp_correct_answer_desc ul").children("li").items()
     answers = [answer_item(".answer").text() for answer_item in answer_items]
     resolve_html = explain_box(".exp_tea_explain_content .tea_explain_text").html()

     question = QuestionItem()
     question['question_title'] = "".join(title_parts)
     question['question_answer'] = ",".join(answers)
     question['question_content'] = ["".join(row_labels)]
     question['question_resolve_content'] = resolve_html
     question['question_type'] = 3
     question["question_content_file_url_list"] = []
     return question
Esempio n. 3
0
 def parse_judgement_question(self, item_html, response):
     """Build one judgement ``QuestionItem`` (``question_type`` 2).

     Args:
         item_html: Node for a single question; queried with CSS selectors
             (presumably a PyQuery object -- confirm against the caller).
         response: Not used by this method.

     Returns:
         A ``QuestionItem`` whose title is the concatenated ``<p>`` stems
         (a single string), whose content is a one-element list with every
         statement followed by ``[BlankArea]``, and whose answer is the
         comma-joined answer texts.
     """
     question_title = []
     for question_content_html in item_html(".question").children("p").items():
         paragraph_html = question_content_html.html()
         if paragraph_html is not None:
             question_title.append("<p>" + paragraph_html + "</p>")

     question_content = []
     for question_content_html in item_html(".question").children(
             ".question_option").items():
         question_content.append(
             "<p>" + str(question_content_html(".q_tit").text()) +
             "[BlankArea]" + "</p>")

     question_answer_container = item_html(".tea_explain_content")
     question_answer = [
         question_answer_html("span").text()
         for question_answer_html in question_answer_container(
             ".exp_correct_answer_content .exp_correct_answer_desc ul"
         ).children().items()
     ]
     question_resolve = question_answer_container(".tea_explain_text").html()

     question = QuestionItem()
     # Store the title as one string; it used to be assigned as a raw list,
     # unlike every other parser in this file.
     question["question_title"] = "".join(question_title)
     question["question_content"] = ["".join(question_content)]
     question["question_answer"] = ",".join(question_answer)
     question["question_resolve_content"] = question_resolve
     question["question_type"] = 2
     question["question_content_file_url_list"] = []
     return question
Esempio n. 4
0
 def parse_single_select_question(self, item_html, response):
     """Split one question group into single-choice ``QuestionItem`` objects.

     Each ``.question_option`` child of ``.question`` becomes one item
     (``question_type`` 1). Answers and ordinals are read from the answer
     list inside ``.tea_explain_content``. When there are fewer explanation
     fragments than questions, the joined explanation text is sliced
     between consecutive ordinals to recover the part for each question.

     Args:
         item_html: Node for the question group; queried with CSS selectors
             (presumably a PyQuery object -- confirm against the caller).
         response: Not used by this method.

     Returns:
         A list of ``QuestionItem`` objects, one per question.

     Raises:
         IndexError: If the page lists fewer answers than questions.
     """
     question_content_list = []
     question_option_list = []
     for question_content_html in item_html(".question").children(".question_option").items():
         question_content_list.append(question_content_html(".q_tit ").text())
         option_items = []
         for question_content_option in question_content_html.children(".ops").items():
             # A label without HTML or without a "." used to raise here.
             option_item = question_content_option("label").html() or ""
             dot_index = option_item.find(".")
             if dot_index != -1:
                 option_item = option_item[dot_index + 1:]
             option_items.append(option_item)
         question_option_list.append(option_items)

     question_answer_container = item_html(".tea_explain_content")
     question_answer_list = []
     question_answer_num = []
     for question_answer_html in question_answer_container(".exp_correct_answer_desc ul").children().items():
         question_answer_list.append(question_answer_html("span").text())
         # The ordinal is whatever precedes the first "." of the entry text.
         answer_num = question_answer_html.text()
         dot_index = answer_num.find(".")
         if dot_index != -1:
             answer_num = answer_num[:dot_index]
         question_answer_num.append(answer_num)

     explain_text = question_answer_container(".tea_explain_text")
     if explain_text.children().html() is not None:
         question_resolve_list = [
             paragraph.html() or ""
             for paragraph in explain_text.children("p").items()
         ]
     else:
         question_resolve_list = [explain_text.text()]

     # Fewer fragments than questions: all questions share one joined text.
     shared_resolve = len(question_resolve_list) < len(question_content_list)
     joined_resolve = "".join(question_resolve_list)
     last_index = len(question_content_list) - 1

     question_list = []
     for i, value in enumerate(question_content_list):
         question = QuestionItem()
         question["question_title"] = value
         question["question_answer"] = question_answer_list[i]
         question["question_order"] = question_answer_num[i]
         if shared_resolve:
             resolve_content = joined_resolve
             start_str = question_answer_num[i]
             if i != last_index:
                 end_str = question_answer_num[i + 1]
                 # Escape the ordinals so they are matched literally.
                 pattern = re.compile(
                     re.escape(str(start_str)) + '(.*?)' + re.escape(str(end_str)), re.S)
                 match = pattern.search(joined_resolve)
                 if match is not None:
                     resolve_content = match.group(1)
                     dot_index = resolve_content.find(".")
                     if dot_index != -1:
                         resolve_content = resolve_content[dot_index + 1:]
             else:
                 # Last question: keep everything after its ordinal. Falls
                 # back to the whole text when the ordinal is absent, and
                 # skips the whole ordinal rather than only its first char.
                 start_at = joined_resolve.find(start_str)
                 if start_at != -1:
                     resolve_content = joined_resolve[start_at + max(len(start_str), 1):]
         else:
             resolve_content = question_resolve_list[i]
             dot_index = resolve_content.find(".")
             if dot_index != -1:
                 resolve_content = resolve_content[dot_index + 1:]
         question["question_resolve_content"] = resolve_content
         question["question_content"] = question_option_list[i]
         question["question_type"] = 1
         question["question_content_file_url_list"] = []
         question_list.append(question)
     return question_list
Esempio n. 5
0
 def parse_single_select_question(self, item_html, response):
     """Turn one question group into a list of single-choice items.

     Each ``.question_option`` child of ``.question`` yields one
     ``QuestionItem`` (``question_type`` 1). If there are fewer explanation
     fragments than questions, every item receives the joined explanation;
     otherwise each item gets the fragment at its own position.

     Args:
         item_html: Node for the question group; queried with CSS selectors
             (presumably a PyQuery object -- confirm against the caller).
         response: Not used by this method.

     Returns:
         A list of ``QuestionItem`` objects, one per question.
     """
     titles = []
     options_per_question = []
     for option_block in item_html(".question").children(".question_option").items():
         titles.append(option_block(".q_tit ").text())
         labels = []
         for option_node in option_block.children(".ops").items():
             label_html = option_node("label").html()
             dot_at = label_html.find(".")
             if dot_at != -1:
                 # Keep only the part after the first ".".
                 label_html = label_html[dot_at + 1:]
             labels.append(label_html)
         options_per_question.append(labels)

     explain_box = item_html(".tea_explain_content")
     answers = []
     ordinals = []
     for answer_node in explain_box(".exp_correct_answer_desc ul").children().items():
         answers.append(answer_node("span").text())
         entry_text = answer_node.text()
         dot_at = entry_text.find(".")
         # The ordinal is the text before the first ".", if there is one.
         ordinals.append(entry_text if dot_at == -1 else entry_text[:dot_at])

     explain_text = explain_box(".tea_explain_text")
     if explain_text.children().html() is None:
         # No child markup: use the plain text as the only fragment.
         resolves = [explain_text.text()]
     else:
         resolves = [paragraph.html() for paragraph in explain_text.children("p").items()]

     share_resolve = len(resolves) < len(titles)
     question_list = []
     for position, title in enumerate(titles):
         question = QuestionItem()
         question["question_title"] = title
         question["question_answer"] = answers[position]
         question["question_order"] = ordinals[position]
         print(title)
         if share_resolve:
             question["question_resolve_content"] = "".join(resolves)
         else:
             question["question_resolve_content"] = resolves[position]
         question["question_content"] = options_per_question[position]
         question["question_type"] = 1
         question["question_content_file_url_list"] = []
         question_list.append(question)
     return question_list
Esempio n. 6
0
    def parse_judgement_question(self, item_html, response):
        """Build one judgement ``QuestionItem`` (``question_type`` 2).

        The ordinal of each answer entry (the text before its first ".") is
        replaced in the explanation HTML by the entry's 1-based position,
        and the first ordinal is stored as ``question_order``.

        Args:
            item_html: Node for a single question; queried with CSS
                selectors (presumably a PyQuery object -- confirm against
                the caller).
            response: Not used by this method.

        Returns:
            The populated ``QuestionItem``.
        """
        question = QuestionItem()

        question_title = []
        for question_content_html in item_html(".question").children("p").items():
            paragraph_html = question_content_html.html()
            if paragraph_html is not None:
                question_title.append("<p>" + paragraph_html + "</p>")

        question_content = []
        for question_content_html in item_html(".question").children(".question_option").items():
            q_title = question_content_html(".q_tit").text()
            dot_index = q_title.find(".")
            if dot_index != -1:
                q_title = q_title[dot_index + 1:]
            question_content.append("<p>" + q_title + "[BlankArea]" + "</p>")

        question_answer_container = item_html(".tea_explain_content")
        question_answer = []
        question_answer_num = []
        for question_answer_html in question_answer_container(
                ".exp_correct_answer_content .exp_correct_answer_desc ul").children().items():
            question_answer.append(question_answer_html("span").text())
            question_answer_num.append(question_answer_html.text())

        question_resolve = question_answer_container(".tea_explain_text").html()
        for index, value in enumerate(question_answer_num):
            dot_index = value.find(".")
            if dot_index == -1:
                continue
            ordinal = value[:dot_index]
            # The old check compared the bound method ``value.strip`` with ''
            # and was always true. A blank ordinal must be skipped because
            # ``str.replace("", x)`` inserts x between every character.
            if not ordinal.strip():
                continue
            if index == 0:
                question["question_order"] = ordinal
            if question_resolve is not None:
                # NOTE(review): this rewrites every occurrence of the
                # ordinal in the explanation, not only the numbering.
                question_resolve = question_resolve.replace(ordinal, str(index + 1))

        question["question_title"] = "".join(question_title)
        question["question_content"] = ["".join(question_content)]
        question["question_answer"] = ",".join(question_answer)
        question["question_resolve_content"] = question_resolve
        question["question_type"] = 2
        question["question_content_file_url_list"] = []
        return question
Esempio n. 7
0
    def parse_question(self, response):
        """Parse a reading question page and yield one ``QuestionItem``.

        Reads ``title``, ``knowledge_name``, ``article_id`` and
        ``question_order`` from ``response.meta``, extracts the stem,
        options, answer and explanation, then walks the article chunks
        (split on ``<br/><br/>``) to collect text and translations. When a
        ``#ParagraphAr`` marker and an ``.insert-area`` are both present, the
        question becomes type 6 and each ``[■]`` in the marked paragraph is
        labelled ``(A)``, ``(B)``, ... and added as an option.

        Args:
            response: Response object exposing ``meta``, ``css`` and
                ``body`` (presumably a Scrapy response -- confirm).

        Yields:
            The populated ``QuestionItem``.
        """
        title = response.meta['title']
        knowledge_name = response.meta['knowledge_name']
        article_id = response.meta['article_id']
        question = QuestionItem()

        def strip_prefix(text):
            # Drop everything through the first "."; unchanged if no ".".
            dot_at = text.find(".")
            return text[dot_at + 1:] if dot_at != -1 else text

        # NOTE(review): html.unescape is handed a PyQuery object here, not a
        # str, so it probably has no effect -- confirm before relying on it.
        content_str = html.unescape(
            PyQuery(
                response.css(
                    ".question .question_desc .question_option .q_tit .left").
                extract_first())).html() or ""
        # A stem without "." used to raise ValueError via str.index.
        content = strip_prefix(content_str)

        answer = response.css(
            ".question .question_desc .answer_content .correctAnswer span::text"
        ).extract_first()
        resolve_content = response.css(
            ".resolve_content .desc::text").extract_first()

        option_list = []
        ops_selector = ".question .question_desc .question_option .ops"
        ops_class = html.unescape(PyQuery(response.body))(ops_selector).attr("class") or ""
        if "empis" in ops_class:
            content += "<p>" + (
                response.css(ops_selector + "::text").extract_first() or "") + "</p>"
        else:
            for option_selector in response.css(ops_selector):
                option_str = option_selector.css("label::text").extract_first()
                if option_str is None:
                    option_str = option_selector.xpath("text()").extract_first()
                # str.index never returns -1 (it raises), so the old
                # "index(...) != -1" test crashed on options without ".".
                # An empty string keeps the option positions aligned.
                option_list.append(strip_prefix(option_str or ""))

        # Collect the article content.
        article_title = response.css(
            ".word_content  .article_tit::text").extract_first()
        article_paragraph_html_str_array = PyQuery(
            response.css(".article").extract_first())('div').html().split(
                "<br/><br/>")
        article_content_list = []
        article_content_translation_list = []
        paragraph_marker_found = False
        marked_content_index = None  # position in article_content_list
        insert_flag = False
        for i, article_paragraph_html in enumerate(
                article_paragraph_html_str_array):
            if not article_paragraph_html.strip():
                continue
            paragraph_content_list = []
            paragraph_content_translation_list = []
            article_paragraph_html = html.unescape(article_paragraph_html)
            # Unescaping can leave only whitespace (e.g. "&nbsp;").
            if article_paragraph_html.strip():
                for phase_span in PyQuery(article_paragraph_html).children(
                        "span").items():
                    translation = phase_span.attr("data-translation")
                    if translation is not None:
                        paragraph_content_translation_list.append(translation)
                    paragraph_content_list.append(phase_span(".text").text())
                    if phase_span(".text").children(".insert-area").html():
                        insert_flag = True
                    if phase_span(".text #ParagraphAr").attr("src") is not None:
                        paragraph_marker_found = True
                        question['question_belong_paragraph'] = i + 1
                        # Blank chunks are skipped above, so the raw chunk
                        # index can differ from the position in
                        # article_content_list; remember the latter.
                        marked_content_index = len(article_content_list)
            article_content_list.append(
                "<p>" + "".join(paragraph_content_list) + "</p>")
            if paragraph_content_translation_list:
                article_content_translation_list.append(
                    "".join(paragraph_content_translation_list))

        question['question_type'] = 1
        if not paragraph_marker_found:
            question['question_belong_paragraph'] = len(
                article_paragraph_html_str_array)
        elif insert_flag:
            question['question_type'] = 6
            marker = "[■]"
            pieces = article_content_list[marked_content_index].split(marker)
            rebuilt = [pieces[0]]
            select_index = 'A'
            for piece in pieces[1:]:
                option_list.append(select_index)
                rebuilt.append(marker + "(" + select_index + ")" + piece)
                select_index = chr(ord(select_index) + 1)
            question['question_insert_content'] = "".join(rebuilt)

        question['name'] = title
        question['question_title'] = content
        question['question_knowledge_name'] = knowledge_name
        question['question_answer'] = answer
        question['question_content'] = option_list
        question['question_resolve_content'] = resolve_content
        question['question_article_title'] = article_title
        question['question_article_id'] = article_id
        question['question_article_content'] = article_content_list
        question[
            'question_article_content_translation'] = article_content_translation_list
        question['question_module_type'] = 1
        question['question_content_file_url_list'] = []
        question["question_order"] = response.meta["question_order"]
        yield question
Esempio n. 8
0
    def parse_question_html(self, question_html, page_index_to_num, response):
        """Build a ``QuestionItem`` from one question node.

        Only fill-in-the-blank questions (an ``input`` whose class contains
        ``fillBlankData``) are parsed in depth; any other node yields an item
        with type 0 and empty content, answer and explanation.

        Args:
            question_html: Node for one question; queried with CSS selectors
                (presumably a PyQuery object -- confirm against the caller).
            page_index_to_num: Mapping iterated with ``.items()``; its keys
                index the parsed answers and its values are used as
                paragraph labels (exact schema not visible here).
            response: Passed through to ``parse_fill_blank_data`` /
                ``parse_fill_blank_page_data``.

        Returns:
            The populated ``QuestionItem``.

        Raises:
            KeyError: If a key of ``page_index_to_num`` has no parsed answer.
        """
        question = QuestionItem()
        question_answer_content_str = ""
        question_content_str = []
        question_content_file = []
        # Defined up front so non-fill-blank questions no longer hit an
        # unbound local when the item is filled in.
        question_resolve_html = ""

        # 1 = choice, 2 = judgement, 3 = fill-in-the-blank
        question_type = 0
        # attr() may return None and str.index raises when the substring is
        # missing, so use a membership test on a safe default instead.
        input_class = question_html("input").attr("class") or ""
        if "fillBlankData" in input_class:
            question_type = 3

        # Question stem.
        question_content_title = []
        for paragraph in question_html.children("p").items():
            paragraph_html = paragraph.html()
            if paragraph_html is not None:
                question_content_title.append(paragraph_html)

        # Fill-in-the-blank handling.
        if question_type == 3:
            is_fill_blank_page = False
            if question_html(".fillBlank_content").html() is not None:
                question_list = self.parse_fill_blank_data(question_html(".fillBlank_content "), response)
                question_content = question_list[0]
                question_content_file = question_list[1]
            else:
                question_content = self.parse_fill_blank_page_data(question_html(".drag_content"), response)
                is_fill_blank_page = True

            # Parse the answers, keyed by ordinal (".pnum" text minus its
            # last character).
            question_answer_container = question_html(".tea_explain_content")
            question_answer_temp = {}
            for question_answer_html in question_answer_container(
                    ".exp_correct_answer_content .exp_correct_answer_desc ul").children("li").items():
                answer_num = question_answer_html(".pnum").text()[:-1]
                question_answer_temp[answer_num] = question_answer_html(".answer").text()

            if question_answer_temp:
                question["question_order"] = next(iter(question_answer_temp))

            # Parse the explanation and renumber the ordinals from 1.
            question_resolve_html = question_answer_container(
                ".exp_tea_explain_content .tea_explain_text").html() or ""
            if is_fill_blank_page:
                question_answer = {}
                question_content.append("<p>")
                for num_index, (key, value) in enumerate(page_index_to_num.items(), start=1):
                    question_content.append(str(num_index) + " Paragraph " + value + " [BlankArea],")
                    question_answer[value] = question_answer_temp[key]
                    if str(key):  # replace("", x) would corrupt the text
                        question_resolve_html = question_resolve_html.replace(str(key), str(num_index))
                question_content.append("</p>")
            else:
                question_answer = question_answer_temp
                for num_index, key in enumerate(question_answer_temp, start=1):
                    if str(key):
                        question_resolve_html = question_resolve_html.replace(str(key), str(num_index))

            question_content_str.append("".join(question_content))
            question_answer_content_str = ",".join(question_answer.values())

        question["question_content"] = question_content_str
        question["question_answer"] = question_answer_content_str
        question["question_resolve_content"] = question_resolve_html
        question["question_title"] = "".join(question_content_title)
        question["question_type"] = question_type
        question["question_content_file_url_list"] = question_content_file
        return question
    def parse_question(self, response):
        """Parse a listening question page and yield ``QuestionItem`` objects.

        Reads ``title``, ``knowledge_name``, ``article_id``,
        ``article_title``, ``url`` and ``question_order`` from
        ``response.meta``. A regular question yields one item. A table
        question yields one item per table row after the first, all sharing
        the column names of the first row as options.

        Args:
            response: Response object exposing ``meta``, ``css`` and
                ``body`` (presumably a Scrapy response -- confirm).

        Yields:
            One ``QuestionItem`` per parsed question.
        """
        title = response.meta['title']
        knowledge_name = response.meta['knowledge_name']
        article_id = response.meta['article_id']

        def strip_prefix(text):
            # Drop everything through the first "."; unchanged if no ".".
            dot_at = text.find(".")
            return text[dot_at + 1:] if dot_at != -1 else text

        # Listening question stem.
        # NOTE(review): html.unescape is handed a PyQuery object here, not a
        # str, so it probably has no effect -- confirm before relying on it.
        content_str = html.unescape(PyQuery(response.body))(
            ".question .question_desc .question_option .q_tit .left").text()
        content = strip_prefix(content_str) if content_str is not None else content_str

        answer = response.css(
            ".question .question_desc .answer_content .correctAnswer span::text"
        ).extract_first()
        resolve_content = response.css(
            ".resolve_content .desc::text").extract_first()

        option_list = []
        question_type = 0
        content_list = []
        option_block_selector = ".question .question_desc .question_option"
        if response.css(option_block_selector).extract_first() is not None:
            for option_selector in response.css(option_block_selector + " .ops"):
                option_str = option_selector.css("label::text").extract_first()
                if option_str is None:
                    option_str = option_selector.xpath("text()").extract_first()
                # str.index never returns -1 (it raises), so the old
                # "index(...) != -1" test crashed on options without ".".
                # An empty string keeps the option positions aligned.
                option_list.append(strip_prefix(option_str or ""))
        elif response.css(".question .question_desc table::text"
                          ).extract_first() is not None:
            content_str = html.unescape(PyQuery(response.body))(
                ".question .question_desc .toefl_listen_table .q_tit .left"
            ).text()
            question_type = 2
            table_html = html.unescape(PyQuery(
                response.body))(".question .question_desc table tbody")
            for tr_index, table_tr in enumerate(
                    table_html.children("tr").items()):
                cells = table_tr.children("td").items()
                if tr_index > 0:
                    # Data row: its first cell is the sub-question text.
                    for table_td in cells:
                        content_list.append("<p>" + content_str +
                                            "</p><p>" +
                                            table_td(".ops").text() +
                                            "</p>")
                        break
                else:
                    # First row: every cell but the first names an option.
                    for td_index, table_td in enumerate(cells):
                        if td_index > 0:
                            option_list.append(table_td(".name").text())

        # Listening transcript.
        article_title = response.meta['article_title']
        page_body_html = html.unescape(
            PyQuery(
                response.css(
                    ".ielts_listen_review_scroll  .nano-content .nano-content_in .article"
                ).extract_first()))
        page_article_content = []
        page_article_content_translation = []
        for page_html_p in page_body_html.children("span").items():
            translation = page_html_p.attr("data-translation")
            if translation is not None:
                page_article_content_translation.append(
                    "<p>" + translation + "</p>")
            page_article_content.append("<p>" + page_html_p(".text").text() +
                                        "</p>")

        # Listening audio: first .mp3 URL found in the raw body, cut at the
        # last "http" inside that match.
        mp3_url = re.findall(r'https:\/\/.*?\.mp3', str(response.body))
        question_audio_url = ""
        if mp3_url:
            first_match = str(mp3_url[0])
            question_audio_url = first_match[first_match.rindex('http'):]

        def build_item(question_title, question_answer):
            # Fill the fields shared by both question layouts.
            item = QuestionItem()
            item['name'] = title
            item['question_title'] = question_title
            item['question_knowledge_name'] = knowledge_name
            item['question_answer'] = question_answer
            item['question_content'] = option_list
            item['question_resolve_content'] = resolve_content
            item['question_article_title'] = article_title
            item['question_article_id'] = article_id
            item['question_article_content'] = ["".join(page_article_content)]
            item['question_article_content_translation'] = [
                "".join(page_article_content_translation)]
            item['question_type'] = 1
            item['question_module_type'] = 5
            item['question_content_file_url_list'] = []
            item["question_audio_refer"] = response.meta["url"]
            item["question_audio_url"] = question_audio_url
            item["question_order"] = response.meta["question_order"]
            return item

        if question_type == 2:
            for index, row_title in enumerate(content_list):
                # NOTE(review): the answer is indexed per character, one per
                # table row -- confirm that this matches the page format.
                has_answer = answer is not None and index < len(answer)
                yield build_item(row_title, answer[index] if has_answer else None)
        else:
            yield build_item(content, answer)
Esempio n. 10
0
    def parse_question_html(self, question_html, response):
        """Build a ``QuestionItem`` from one question node.

        Only fill-in-the-blank questions (an ``input`` whose class contains
        ``fillBlankData``) are parsed in depth; any other node yields an item
        with type 0 and empty content, answer and explanation. The title is
        always the fixed instruction text below, not taken from the page.

        Args:
            question_html: Node for one question; queried with CSS selectors
                (presumably a PyQuery object -- confirm against the caller).
            response: Passed through to ``parse_fill_blank_data`` /
                ``parse_fill_blank_page_data``.

        Returns:
            The populated ``QuestionItem``.
        """
        question = QuestionItem()
        question_answer_content_str = ""
        question_content_str = []
        question_resolve_list = []
        question_content_file = []
        # Defined up front so non-fill-blank questions no longer hit an
        # unbound local when question_order is filled in.
        question_answer = {}

        # 1 = choice, 2 = judgement, 3 = fill-in-the-blank
        question_type = 0
        # attr() may return None and str.index raises when the substring is
        # missing, so use a membership test on a safe default instead.
        input_class = question_html("input").attr("class") or ""
        if "fillBlankData" in input_class:
            question_type = 3

        # Question stem.
        question_content_title = [
            "Complete the notes below.Write ONE WORD AND/OR A NUMBER for each answer."
        ]

        # Fill-in-the-blank handling.
        if question_type == 3:
            if question_html(".fillBlank_content").html() is not None:
                question_list = self.parse_fill_blank_data(
                    question_html(".fillBlank_content "), response)
                question_content = question_list[0]
                question_content_file = question_list[1]
            else:
                question_content = self.parse_fill_blank_page_data(
                    question_html(".drag_content"), response)

            # Parse the answers, keyed by ordinal (".pnum" text minus its
            # last character).
            question_answer_container = question_html(".tea_explain_content")
            for question_answer_html in question_answer_container(
                    ".exp_correct_answer_content .exp_correct_answer_desc ul"
            ).children("li").items():
                answer_num = question_answer_html(".pnum").text()[:-1]
                question_answer[answer_num] = question_answer_html(".answer").text()

            # Parse the explanation; html() returns None when it is missing,
            # which "".join cannot handle.
            question_resolve_list.append(
                question_answer_container(
                    ".exp_tea_explain_content .tea_explain_text").html() or "")
            question_content_str.append("".join(question_content))
            question_answer_content_str = ",".join(question_answer.values())

        # Indexing the first key used to raise when no answer was parsed.
        if question_answer:
            question["question_order"] = next(iter(question_answer))
        question["question_content"] = question_content_str
        question["question_answer"] = question_answer_content_str
        question["question_resolve_content"] = "".join(question_resolve_list)
        question["question_title"] = "".join(question_content_title)
        question["question_type"] = question_type
        question["question_content_file_url_list"] = question_content_file
        return question