def parse_multi_select_question(self, item_html, response):
    """Build a multiple-choice ``QuestionItem`` from one question node.

    The title and option labels are read from ``.question > .question_option``
    and the answers and explanation from ``.tea_explain_content``.
    ``response`` is accepted but not used by this method.

    Returns:
        A ``QuestionItem`` with ``question_type`` set to 4.
    """
    # Title and options.
    option_block = item_html(".question").children(".question_option")
    title = option_block(".q_tit").text()
    options = []
    for option_node in option_block.children(".ops").items():
        label_html = option_node("label").html()
        # ``find`` gives -1 when there is no ".", which makes the slice start
        # at 0 and keep the label whole; otherwise everything up to and
        # including the first "." is dropped.
        options.append(label_html[label_html.find(".") + 1:])

    # Answers and explanation.
    explain_block = item_html(".tea_explain_content")
    answers = [
        answer_node("span").text()
        for answer_node in explain_block(".exp_correct_answer_desc ul").children().items()
    ]
    explanation_html = explain_block(".tea_explain_text").html()

    question = QuestionItem()
    question["question_title"] = title
    question["question_answer"] = "".join(answers)
    question["question_resolve_content"] = explanation_html
    question["question_content"] = options
    question["question_type"] = 4
    question["question_content_file_url_list"] = []
    return question
def parse_table_question(self, item_html, response):
    """Build a table-style ``QuestionItem`` from one question node.

    The title is the concatenation of every direct ``<p>`` child of
    ``.question``; the content is the text of the first cell of every row of
    ``.check_table`` except the first row.  Answers come from the ``.answer``
    elements of the correct-answer list and are joined with ",".
    ``response`` is accepted but not used by this method.

    Returns:
        A ``QuestionItem`` with ``question_type`` set to 3.
    """
    question_node = item_html(".question")

    title_parts = []
    for paragraph in question_node.children("p").items():
        paragraph_html = paragraph.html()
        # ``html()`` can be None for a paragraph without content; skip it
        # instead of failing on ``str + None`` (the judgement parser applies
        # the same guard).
        if paragraph_html is not None:
            title_parts.append("<p>" + paragraph_html + "</p>")

    row_parts = []
    for row_index, row in enumerate(question_node(".check_table").children("tr").items()):
        if row_index == 0:
            # The first row is not turned into question content.
            continue
        # Only the first cell of each remaining row is used.
        for first_cell in row.children("td").items():
            row_parts.append("<p>" + first_cell.text() + "</p>")
            break

    explain_block = item_html(".tea_explain_content")
    answers = [
        answer_node(".answer").text()
        for answer_node in explain_block(
            ".exp_correct_answer_content .exp_correct_answer_desc ul").children("li").items()
    ]
    explanation_html = explain_block(".exp_tea_explain_content .tea_explain_text").html()

    question = QuestionItem()
    question['question_title'] = "".join(title_parts)
    question['question_answer'] = ",".join(answers)
    # All rows are merged into a single content entry.
    question['question_content'] = ["".join(row_parts)]
    question['question_resolve_content'] = explanation_html
    question['question_type'] = 3
    question["question_content_file_url_list"] = []
    return question
def parse_judgement_question(self, item_html, response):
    """Build a judgement-style ``QuestionItem`` from one question node.

    The title is made of the direct ``<p>`` children of ``.question``.  Every
    ``.question_option`` contributes one ``<p>...[BlankArea]</p>`` statement,
    and all statements are merged into a single content entry.  Answers are
    the ``span`` texts of the correct-answer list, joined with ",".
    ``response`` is accepted but not used by this method.

    Returns:
        A ``QuestionItem`` with ``question_type`` set to 2.
    """
    question_node = item_html(".question")

    title_parts = []
    for paragraph in question_node.children("p").items():
        paragraph_html = paragraph.html()
        if paragraph_html is not None:
            title_parts.append("<p>" + paragraph_html + "</p>")

    statement_parts = [
        "<p>" + str(option_block(".q_tit").text()) + "[BlankArea]" + "</p>"
        for option_block in question_node.children(".question_option").items()
    ]

    explain_block = item_html(".tea_explain_content")
    answers = [
        answer_node("span").text()
        for answer_node in explain_block(
            ".exp_correct_answer_content .exp_correct_answer_desc ul").children().items()
    ]
    explanation_html = explain_block(".tea_explain_text").html()

    question = QuestionItem()
    # Store the title as one string, like the other question parsers do,
    # rather than as a list of fragments.
    question["question_title"] = "".join(title_parts)
    question["question_content"] = ["".join(statement_parts)]
    question["question_answer"] = ",".join(answers)
    question["question_resolve_content"] = explanation_html
    question["question_type"] = 2
    question["question_content_file_url_list"] = []
    return question
def parse_single_select_question(self, item_html, response):
    """Split one question node into single-choice ``QuestionItem`` objects.

    Every ``.question_option`` under ``.question`` becomes one item.  Answers
    and question numbers are read from the correct-answer list, where the
    number is the text before the first "." of each list entry.

    When there are fewer explanation paragraphs than questions, the
    paragraphs are joined and the part belonging to a question is cut out
    between its own number and the number of the next question.  Otherwise
    the paragraph at the same position is used, without its leading
    "<number>." prefix.

    ``response`` is accepted but not used by this method.

    Returns:
        A list of ``QuestionItem`` objects with ``question_type`` set to 1.

    Raises:
        IndexError: if there are fewer answers than questions.
    """
    titles = []
    options_per_question = []
    for option_block in item_html(".question").children(".question_option").items():
        titles.append(option_block(".q_tit ").text())
        options = []
        for option_node in option_block.children(".ops").items():
            label_html = option_node("label").html()
            # ``find`` returns -1 when there is no ".", so the slice keeps the
            # whole label instead of raising ValueError like ``index`` did.
            options.append(label_html[label_html.find(".") + 1:])
        options_per_question.append(options)

    explain_block = item_html(".tea_explain_content")
    answers = []
    order_numbers = []
    for answer_node in explain_block(".exp_correct_answer_desc ul").children().items():
        answers.append(answer_node("span").text())
        # Text before the first "." (or the whole text when there is none).
        order_numbers.append(answer_node.text().partition(".")[0])

    explain_text = explain_block(".tea_explain_text")
    if explain_text.children().html() is not None:
        # ``html()`` may be None for an empty paragraph; use "" so that the
        # join and the string operations below cannot fail on it.
        explanations = [
            paragraph.html() or "" for paragraph in explain_text.children("p").items()
        ]
    else:
        explanations = [explain_text.text() or ""]

    shared_explanation = len(explanations) < len(titles)
    joined_explanation = "".join(explanations) if shared_explanation else ""
    last_position = len(titles) - 1

    question_list = []
    for position, title in enumerate(titles):
        question = QuestionItem()
        question["question_title"] = title
        question["question_answer"] = answers[position]
        question["question_order"] = order_numbers[position]
        if shared_explanation:
            explanation = joined_explanation
            start_marker = order_numbers[position]
            if position != last_position:
                end_marker = order_numbers[position + 1]
                # The markers are scraped text, so escape them before using
                # them as regex delimiters.
                pattern = re.compile(
                    re.escape(str(start_marker)) + '(.*?)' + re.escape(str(end_marker)),
                    re.S)
                matches = pattern.findall(explanation)
                if matches:
                    explanation = matches[0]
                    explanation = explanation[explanation.find(".") + 1:]
            else:
                # Last question: keep what follows the first occurrence of
                # its number; leave the text untouched when the number does
                # not occur at all.
                marker_position = explanation.find(start_marker)
                if marker_position != -1:
                    explanation = explanation[marker_position + 1:]
        else:
            explanation = explanations[position]
            explanation = explanation[explanation.find(".") + 1:]
        question["question_resolve_content"] = explanation
        question["question_content"] = options_per_question[position]
        question["question_type"] = 1
        question["question_content_file_url_list"] = []
        question_list.append(question)
    return question_list
def parse_single_select_question(self, item_html, response):
    """Split one question node into single-choice ``QuestionItem`` objects.

    Every ``.question_option`` under ``.question`` becomes one item.  Answers
    and question numbers come from the correct-answer list; the number is
    the text before the first "." of each list entry.  When there are fewer
    explanation entries than questions, every item receives all of them
    joined together; otherwise each item receives the entry at its position.

    ``response`` is accepted but not used by this method.  Each question
    title is also printed to stdout.

    Returns:
        A list of ``QuestionItem`` objects with ``question_type`` set to 1.
    """
    titles = []
    options_by_question = []
    for option_block in item_html(".question").children(".question_option").items():
        titles.append(option_block(".q_tit ").text())
        labels = []
        for option_node in option_block.children(".ops").items():
            label_html = option_node("label").html()
            # -1 from ``find`` turns the slice start into 0, which keeps a
            # label without "." unchanged.
            labels.append(label_html[label_html.find(".") + 1:])
        options_by_question.append(labels)

    explain_block = item_html(".tea_explain_content")
    answers = []
    order_numbers = []
    for answer_node in explain_block(".exp_correct_answer_desc ul").children().items():
        answers.append(answer_node("span").text())
        order_numbers.append(answer_node.text().partition(".")[0])

    explain_text = explain_block(".tea_explain_text")
    if explain_text.children().html() is None:
        # No child markup: fall back to the plain text of the container.
        explanations = [explain_text.text()]
    else:
        explanations = [paragraph.html() for paragraph in explain_text.children("p").items()]

    questions = []
    for position, title in enumerate(titles):
        question = QuestionItem()
        question["question_title"] = title
        question["question_answer"] = answers[position]
        question["question_order"] = order_numbers[position]
        print(title)
        question["question_resolve_content"] = (
            explanations[position]
            if len(explanations) >= len(titles)
            else "".join(explanations)
        )
        question["question_content"] = options_by_question[position]
        question["question_type"] = 1
        question["question_content_file_url_list"] = []
        questions.append(question)
    return questions
def parse_judgement_question(self, item_html, response):
    """Build a judgement-style ``QuestionItem`` from one question node.

    The title is made of the direct ``<p>`` children of ``.question``.  Every
    ``.question_option`` contributes one ``<p>...[BlankArea]</p>`` statement
    (without its leading "<number>." prefix), and all statements are merged
    into a single content entry.  The number of the first answer entry is
    stored as ``question_order``, and each answer number found in the
    explanation HTML is replaced by the 1-based position of that answer.

    ``response`` is accepted but not used by this method.

    Returns:
        A ``QuestionItem`` with ``question_type`` set to 2.
    """
    question = QuestionItem()
    question_node = item_html(".question")

    title_parts = []
    for paragraph in question_node.children("p").items():
        paragraph_html = paragraph.html()
        if paragraph_html is not None:
            title_parts.append("<p>" + paragraph_html + "</p>")

    statement_parts = []
    for option_block in question_node.children(".question_option").items():
        statement = option_block(".q_tit").text()
        # Drop everything up to and including the first "."; a statement
        # without "." is kept whole (``find`` returns -1, slice starts at 0).
        statement = statement[statement.find(".") + 1:]
        statement_parts.append("<p>" + statement + "[BlankArea]" + "</p>")

    explain_block = item_html(".tea_explain_content")
    answers = []
    answer_labels = []
    for answer_node in explain_block(
            ".exp_correct_answer_content .exp_correct_answer_desc ul").children().items():
        answers.append(answer_node("span").text())
        answer_labels.append(answer_node.text())
    explanation = explain_block(".tea_explain_text").html()

    # NOTE(review): the nesting of this loop was reconstructed from collapsed
    # source; the number is extracted when a "." is present and then used
    # unconditionally, as the other parsers in this file do - confirm.
    for index, label in enumerate(answer_labels):
        number = label.partition(".")[0]
        if index == 0:
            question["question_order"] = number
        # The original test ``value.strip != ''`` compared the bound method
        # itself and was always true.  Skip blank numbers explicitly because
        # ``str.replace('', x)`` would insert ``x`` between every character,
        # and skip a missing explanation, which has nothing to rewrite.
        if explanation is not None and number.strip() != '':
            explanation = explanation.replace(number, str(index + 1))

    question["question_title"] = "".join(title_parts)
    question["question_content"] = ["".join(statement_parts)]
    question["question_answer"] = ",".join(answers)
    question["question_resolve_content"] = explanation
    question["question_type"] = 2
    question["question_content_file_url_list"] = []
    return question
def parse_question(self, response):
    """Parse a question page with its article and yield one ``QuestionItem``.

    Reads ``title``, ``knowledge_name``, ``article_id`` and
    ``question_order`` from ``response.meta``.  The question stem, answer,
    explanation and options are taken from the ``.question`` markup; the
    article is split into paragraphs on ``<br/><br/>`` and each non-blank
    paragraph is stored as ``<p>...</p>`` together with the translations
    found in ``data-translation`` attributes.

    ``question_type`` is 1 unless a paragraph marker (``#ParagraphAr``) and
    an ``.insert-area`` were both found, in which case it is 6 and every
    ``[■]`` placeholder of the marked paragraph is labelled ``(A)``,
    ``(B)``, ... with the labels doubling as the option list.

    Yields:
        A single ``QuestionItem`` with ``question_module_type`` set to 1.
    """

    def strip_label(text):
        # Drop everything up to and including the first "."; text without a
        # "." is kept whole because ``find`` returns -1 and the slice then
        # starts at 0.
        return text[text.find(".") + 1:]

    title = response.meta['title']
    knowledge_name = response.meta['knowledge_name']
    article_id = response.meta['article_id']
    question = QuestionItem()

    options_css = ".question .question_desc .question_option .ops"
    stem_html = html.unescape(
        PyQuery(
            response.css(
                ".question .question_desc .question_option .q_tit .left"
            ).extract_first())).html()
    content = strip_label(stem_html)
    answer = response.css(
        ".question .question_desc .answer_content .correctAnswer span::text"
    ).extract_first()
    resolve_content = response.css(
        ".resolve_content .desc::text").extract_first()

    option_list = []
    # ``attr`` gives None when there is no ``.ops`` element; treat that as
    # "no class" instead of failing on ``None.find``.
    ops_class = html.unescape(PyQuery(response.body))(options_css).attr("class") or ""
    if "empis" in ops_class:
        content += "<p>" + response.css(
            options_css + "::text").extract_first() + "</p>"
        print(ops_class)
    else:
        for option_selector in response.css(options_css):
            option_text = option_selector.css("label::text").extract_first()
            if option_text is None:
                # No label text: fall back to the option's own text node.
                option_text = option_selector.xpath("text()").extract_first()
            if option_text is None:
                continue
            # The original tested ``index(".") != -1``; ``index`` never
            # returns -1, it raises ValueError, so options without a "."
            # crashed instead of being kept unchanged.
            option_list.append(strip_label(option_text))

    # Article content.
    article_title = response.css(
        ".word_content .article_tit::text").extract_first()
    paragraph_chunks = PyQuery(
        response.css(".article").extract_first())('div').html().split(
            "<br/><br/>")
    article_content_list = []
    article_content_translation_list = []
    paragraph_found = False
    has_insert_area = False
    # Position of the marked paragraph inside ``article_content_list``.  It
    # can differ from the raw chunk index because blank chunks are skipped.
    insert_paragraph_index = None
    for chunk_index, chunk in enumerate(paragraph_chunks):
        if chunk.strip() == "":
            continue
        chunk = html.unescape(chunk)
        if chunk.strip() == "":
            # NOTE(review): reconstructed from collapsed source - a chunk that
            # becomes blank only after unescaping is assumed to be skipped.
            continue
        sentences = []
        translations = []
        for phase_span in PyQuery(chunk).children("span").items():
            translation = phase_span.attr("data-translation")
            if translation is not None:
                translations.append(translation)
            sentences.append(phase_span(".text").text())
            insert_area = phase_span(".text").children(".insert-area").html()
            if insert_area is not None and insert_area != '':
                has_insert_area = True
                print(insert_area)
            if phase_span(".text #ParagraphAr").attr("src") is not None:
                paragraph_found = True
                question['question_belong_paragraph'] = chunk_index + 1
                insert_paragraph_index = len(article_content_list)
        article_content_list.append("<p>" + "".join(sentences) + "</p>")
        if translations:
            article_content_translation_list.append("".join(translations))

    question['question_type'] = 1
    if not paragraph_found:
        question['question_belong_paragraph'] = len(paragraph_chunks)
    elif has_insert_area:
        question['question_type'] = 6
        # Label each "[■]" placeholder in order: "[■](A)", "[■](B)", ...
        pieces = article_content_list[insert_paragraph_index].split("[■]")
        labelled = [pieces[0]]
        for offset, piece in enumerate(pieces[1:]):
            select_index = chr(ord('A') + offset)
            option_list.append(select_index)
            labelled.append("[■]" + "(" + select_index + ")" + piece)
        question['question_insert_content'] = "".join(labelled)

    question['name'] = title
    question['question_title'] = content
    question['question_knowledge_name'] = knowledge_name
    question['question_answer'] = answer
    question['question_content'] = option_list
    question['question_resolve_content'] = resolve_content
    question['question_article_title'] = article_title
    question['question_article_id'] = article_id
    question['question_article_content'] = article_content_list
    question['question_article_content_translation'] = article_content_translation_list
    question['question_module_type'] = 1
    question['question_content_file_url_list'] = []
    question["question_order"] = response.meta["question_order"]
    yield question
def parse_question_html(self, question_html, page_index_to_num, response):
    """Build a ``QuestionItem`` from one question node.

    Only the fill-in-the-blank layout is handled: it is detected by an
    ``input`` element whose class contains ``fillBlankData`` and gives
    ``question_type`` 3.  Any other node yields an item with
    ``question_type`` 0 and empty content, answer and explanation.

    Args:
        question_html: PyQuery node of the question.
        page_index_to_num: mapping from the answer number shown on the page
            to a paragraph label; only read for the drag-style layout (when
            ``.fillBlank_content`` is absent).
        response: passed through to ``parse_fill_blank_data`` /
            ``parse_fill_blank_page_data``.

    Returns:
        The populated ``QuestionItem``.

    Raises:
        KeyError: if a key of ``page_index_to_num`` has no matching answer.
    """
    question = QuestionItem()
    question_answer_content_str = ""
    question_content_str = []
    question_content_file = []
    question_resolve = ""
    # 1 = choice, 2 = judgement, 3 = fill-in-the-blank
    question_type = 0
    # ``attr`` returns None when there is no input element.  The original
    # ``index(...) != -1`` test could never be false: ``index`` raises
    # ValueError instead of returning -1.
    input_class = question_html("input").attr("class")
    if input_class is not None and "fillBlankData" in input_class:
        question_type = 3

    # Question stem.
    question_content_title = []
    for paragraph in question_html.children("p").items():
        paragraph_html = paragraph.html()
        if paragraph_html is not None:
            question_content_title.append(paragraph_html)

    # Fill-in-the-blank handling.
    if question_type == 3:
        is_fill_blank_page = False
        if question_html(".fillBlank_content").html() is not None:
            question_list = self.parse_fill_blank_data(
                question_html(".fillBlank_content "), response)
            question_content = question_list[0]
            question_content_file = question_list[1]
        else:
            question_content = self.parse_fill_blank_page_data(
                question_html(".drag_content"), response)
            is_fill_blank_page = True

        # Answers, keyed by the number shown on the page (``.pnum`` text
        # without its last character).
        question_answer_container = question_html(".tea_explain_content")
        question_answer_temp = {}
        for answer_node in question_answer_container(
                ".exp_correct_answer_content .exp_correct_answer_desc ul"
        ).children("li").items():
            answer_num = answer_node(".pnum").text()[:-1]
            question_answer_temp[answer_num] = answer_node(".answer").text()
        if question_answer_temp:
            # The first answer number is the order of the whole question.
            question["question_order"] = next(iter(question_answer_temp))

        # Explanation; "" when the page has none, so ``replace`` is safe.
        question_resolve = question_answer_container(
            ".exp_tea_explain_content .tea_explain_text").html() or ""

        if is_fill_blank_page:
            question_answer = {}
            question_content.append("<p>")
            for num_index, (key, value) in enumerate(page_index_to_num.items(), start=1):
                question_content.append(
                    str(num_index) + " Paragraph " + value + " [BlankArea],")
                question_answer[value] = question_answer_temp[key]
                # Renumber references in the explanation to 1-based positions.
                question_resolve = question_resolve.replace(str(key), str(num_index))
            question_content.append("</p>")
        else:
            question_answer = question_answer_temp
            for num_index, key in enumerate(question_answer_temp, start=1):
                question_resolve = question_resolve.replace(str(key), str(num_index))

        question_content_str.append("".join(question_content))
        question_answer_content_str = ",".join(question_answer.values())

    question["question_content"] = question_content_str
    question["question_answer"] = question_answer_content_str
    question["question_resolve_content"] = question_resolve
    question["question_title"] = "".join(question_content_title)
    question["question_type"] = question_type
    question["question_content_file_url_list"] = question_content_file
    return question
def parse_question(self, response):
    """Parse a listening question page and yield its ``QuestionItem`` objects.

    Reads ``title``, ``knowledge_name``, ``article_id``, ``article_title``,
    ``url`` and ``question_order`` from ``response.meta``.

    Two layouts are handled:

    * a ``.question_option`` block: one item is yielded with the stem,
      the options and the answer;
    * a table: the cells after the first one in the first row supply the
      option names, every later row yields one item whose title combines
      the table stem with the row's first cell, and whose answer is the
      character of the answer string at the row's position.

    The transcript, its translation and the first ``.mp3`` URL found in the
    response body are attached to every item.

    Yields:
        ``QuestionItem`` objects with ``question_module_type`` set to 5.
    """

    def strip_label(text):
        # Drop everything up to and including the first "."; text without a
        # "." is kept whole because ``find`` returns -1 and the slice then
        # starts at 0.
        return text[text.find(".") + 1:]

    title = response.meta['title']
    knowledge_name = response.meta['knowledge_name']
    article_id = response.meta['article_id']

    # Question data.  The body is parsed once and reused for every lookup.
    page = html.unescape(PyQuery(response.body))
    content_str = page(
        ".question .question_desc .question_option .q_tit .left").text()
    content = strip_label(content_str) if content_str is not None else content_str
    answer = response.css(
        ".question .question_desc .answer_content .correctAnswer span::text"
    ).extract_first()
    resolve_content = response.css(
        ".resolve_content .desc::text").extract_first()
    option_list = []
    question_type = 0
    content_list = []
    if response.css(".question .question_desc .question_option"
                    ).extract_first() is not None:
        for option_selector in response.css(
                ".question .question_desc .question_option .ops"):
            option_text = option_selector.css("label::text").extract_first()
            if option_text is None:
                # No label text: fall back to the option's own text node.
                option_text = option_selector.xpath("text()").extract_first()
            if option_text is None:
                continue
            # The original tested ``index(".") != -1``; ``index`` never
            # returns -1, it raises ValueError, so options without a "."
            # crashed instead of being kept unchanged.
            option_list.append(strip_label(option_text))
    elif response.css(".question .question_desc table::text"
                      ).extract_first() is not None:
        content_str = page(
            ".question .question_desc .toefl_listen_table .q_tit .left"
        ).text()
        question_type = 2
        table_body = page(".question .question_desc table tbody")
        for row_index, row in enumerate(table_body.children("tr").items()):
            cells = row.children("td").items()
            if row_index == 0:
                # First row: every cell after the first names an option.
                for cell_index, cell in enumerate(cells):
                    if cell_index > 0:
                        option_list.append(cell(".name").text())
            else:
                # Later rows: the first cell holds the statement of the row.
                for cell in cells:
                    content_list.append("<p>" + content_str + "</p><p>" +
                                        cell(".ops").text() + "</p>")
                    break
    else:
        print("Sss")

    # Transcript.
    article_title = response.meta['article_title']
    page_body_html = html.unescape(
        PyQuery(
            response.css(
                ".ielts_listen_review_scroll .nano-content .nano-content_in .article"
            ).extract_first()))
    page_article_content = []
    page_article_content_translation = []
    for sentence_span in page_body_html.children("span").items():
        translation = sentence_span.attr("data-translation")
        if translation is not None:
            page_article_content_translation.append("<p>" + translation + "</p>")
        page_article_content.append("<p>" + sentence_span(".text").text() + "</p>")

    # Audio material.
    mp3_urls = re.findall(r'https:\/\/.*?\.mp3', str(response.body))
    question_audio_url = ""
    if mp3_urls:
        first_match = str(mp3_urls[0])
        # The lazy match can still begin at an earlier "https://"; keep only
        # the part starting at the last "http".
        question_audio_url = first_match[first_match.rindex('http'):]
    else:
        print("sss")

    def build_item(question_title, question_answer):
        # Assemble one item; everything except title and answer is shared.
        item = QuestionItem()
        item['name'] = title
        item['question_title'] = question_title
        item['question_knowledge_name'] = knowledge_name
        item['question_answer'] = question_answer
        item['question_content'] = option_list
        item['question_resolve_content'] = resolve_content
        item['question_article_title'] = article_title
        item['question_article_id'] = article_id
        item['question_article_content'] = ["".join(page_article_content)]
        item['question_article_content_translation'] = [
            "".join(page_article_content_translation)]
        item['question_type'] = 1
        item['question_module_type'] = 5
        item['question_content_file_url_list'] = []
        item["question_audio_refer"] = response.meta["url"]
        item["question_audio_url"] = question_audio_url
        item["question_order"] = response.meta["question_order"]
        return item

    if question_type == 2:
        # One item per table row; ``answer`` is indexed by row position.
        for row_position, row_title in enumerate(content_list):
            yield build_item(row_title, answer[row_position])
    else:
        yield build_item(content, answer)
def parse_question_html(self, question_html, response):
    """Build a ``QuestionItem`` from one question node.

    Only the fill-in-the-blank layout is handled: it is detected by an
    ``input`` element whose class contains ``fillBlankData`` and gives
    ``question_type`` 3.  Any other node yields an item with
    ``question_type`` 0 and empty content, answer and explanation.  The
    title is always the fixed instruction sentence assigned below.

    Args:
        question_html: PyQuery node of the question.
        response: passed through to ``parse_fill_blank_data`` /
            ``parse_fill_blank_page_data``.

    Returns:
        The populated ``QuestionItem``.
    """
    question = QuestionItem()
    question_answer_content_str = ""
    question_content_str = []
    question_resolve_list = []
    question_content_file = []
    # 1 = choice, 2 = judgement, 3 = fill-in-the-blank
    question_type = 0
    # ``attr`` returns None when there is no input element.  The original
    # ``index(...) != -1`` test could never be false: ``index`` raises
    # ValueError instead of returning -1.
    input_class = question_html("input").attr("class")
    if input_class is not None and "fillBlankData" in input_class:
        question_type = 3

    # Question stem: a fixed instruction is used as the title.
    question_content_title = [
        "Complete the notes below.Write ONE WORD AND/OR A NUMBER for each answer."
    ]

    # Fill-in-the-blank handling.
    if question_type == 3:
        if question_html(".fillBlank_content").html() is not None:
            question_list = self.parse_fill_blank_data(
                question_html(".fillBlank_content "), response)
            question_content = question_list[0]
            question_content_file = question_list[1]
        else:
            question_content = self.parse_fill_blank_page_data(
                question_html(".drag_content"), response)

        # Answers, keyed by the number shown on the page (``.pnum`` text
        # without its last character).
        question_answer_container = question_html(".tea_explain_content")
        question_answer = {}
        for answer_node in question_answer_container(
                ".exp_correct_answer_content .exp_correct_answer_desc ul"
        ).children("li").items():
            answer_num = answer_node(".pnum").text()[:-1]
            question_answer[answer_num] = answer_node(".answer").text()

        # Explanation; skipped when missing so the join below cannot fail
        # on a None entry.
        resolve_html = question_answer_container(
            ".exp_tea_explain_content .tea_explain_text").html()
        if resolve_html is not None:
            question_resolve_list.append(resolve_html)

        question_content_str.append("".join(question_content))
        question_answer_content_str = ",".join(question_answer.values())
        if question_answer:
            # The first answer number is the order of the whole question.
            question["question_order"] = next(iter(question_answer))

    question["question_content"] = question_content_str
    question["question_answer"] = question_answer_content_str
    question["question_resolve_content"] = "".join(question_resolve_list)
    question["question_title"] = "".join(question_content_title)
    question["question_type"] = question_type
    question["question_content_file_url_list"] = question_content_file
    return question