def parse_detail_mongo(self, response):
    """Build a ``CorpusHealthItem`` from a question-detail page.

    Fields written on the item:

    * ``url`` - ``response.url``.
    * ``question`` - ``{'askText': ..., 'askDesc': ...}``. The title is the
      ``<h1>`` inside ``div.ask_title``; the description is the first ``<p>``
      of the first following ``div.wd_cont_s`` sibling, or ``''`` when that
      lookup fails.
    * ``answer`` - a list with one cleaned string per ``div.angle`` marker,
      taken from the first ``<p>`` that follows the marker.

    Every extracted HTML fragment is passed through
    ``self.filter_tags_blank`` (defined elsewhere).

    Yields:
        The populated item. Nothing is yielded when the title cannot be
        extracted or any other unexpected error occurs; the error is printed
        and logged instead.
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath(
            '//div[@class="ask_title"]/h1').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # The description is optional: any failure leaves it empty.
        try:
            desc_html = response.xpath(
                '//div[@class="ask_title"]'
                '/following-sibling::div[@class="wd_cont_s"][1]/p[1]'
            ).extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        try:
            answers = []
            for marker in response.xpath('//div[@class="angle"]'):
                paragraphs = marker.xpath(
                    './/following-sibling::p[1]').extract()
                if not paragraphs:
                    # A marker without a following <p> used to raise
                    # IndexError and wipe out every answer collected so
                    # far; skip just this marker instead.
                    continue
                answers.append(self.filter_tags_blank(paragraphs[0]))
            item['answer'] = answers
        except Exception:
            # Unexpected failure while cleaning: keep the original fallback.
            item['answer'] = []

        yield item
    except Exception as e:
        print(e)
        logger.info("匹配信息出错。错误原因:")
        logger.info(e)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    The title comes from ``h1#d_askH1`` and the optional description from
    ``//p[@class="crazy_new"][1]``; both are cleaned with
    ``self.filter_tags_blank``. Each ``div.b_anscont_cont`` block contributes
    one answer string: the text nodes of its ``div.crazy_new/p`` elements
    joined together with all whitespace removed.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath('//h1[@id="d_askH1"]').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # Description is optional; fall back to an empty string.
        try:
            desc_html = response.xpath(
                '//p[@class="crazy_new"][1]').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        try:
            text_path = './/div[@class="crazy_new"]/p/text()'
            item['answer'] = [
                "".join("".join(block.xpath(text_path).extract()).split())
                for block in response.xpath('//div[@class="b_anscont_cont"]')
            ]
        except Exception:
            item['answer'] = ''

        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Build a ``CorpusHealthItem`` from a question-detail page.

    Fields written on the item:

    * ``url`` - ``response.url``.
    * ``question`` - ``{'askText': ..., 'askDesc': ...}``. The title is the
      first ``<ul>`` under ``div.iask_detail01a``; the description is the
      ``<dd>`` of the second ``<dl>`` in ``div.iask_detail01b1``, or ``''``
      when that lookup fails.
    * ``answer`` - a list with one cleaned string per ``div.iask_answer02a``
      block, taken from the first ``<dd>`` inside the block.

    Every extracted HTML fragment is passed through
    ``self.filter_tags_blank`` (defined elsewhere).

    Yields:
        The populated item. Nothing is yielded when the title cannot be
        extracted or any other unexpected error occurs; the error is printed
        and logged instead.
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath(
            '//div[@class="iask_detail01a"]//ul').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # The description is optional: any failure leaves it empty.
        try:
            desc_html = response.xpath(
                '//div[@class="iask_detail01b1"]/dl[2]/dd').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        try:
            answers = []
            for block in response.xpath('//div[@class="iask_answer02a"]'):
                dd_nodes = block.xpath('.//dd').extract()
                if not dd_nodes:
                    # A block without <dd> used to raise IndexError and
                    # replace every collected answer with ''; skip just
                    # this block instead.
                    continue
                answers.append(self.filter_tags_blank(dd_nodes[0]))
            item['answer'] = answers
        except Exception:
            # NOTE(review): fallback is a string while the success path
            # stores a list; kept as-is because downstream consumers are
            # not visible from here.
            item['answer'] = ''

        yield item
    except Exception as e:
        print(e)
        logger.info("匹配信息出错。错误原因:")
        logger.info(e)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    The title comes from ``p.ask_article_title_p1`` and the optional
    description from ``p.ask_article_nr1_p2``. Every
    ``div.answer_content2_1`` element becomes one entry of the answer list.
    All fragments are cleaned with ``self.filter_tags_blank``.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath(
            '//p[@class="ask_article_title_p1"]').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # Description is optional; fall back to an empty string.
        try:
            desc_html = response.xpath(
                '//p[@class="ask_article_nr1_p2"]').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        try:
            item['answer'] = [
                self.filter_tags_blank(node.extract())
                for node in response.xpath('//div[@class="answer_content2_1"]')
            ]
        except Exception:
            item['answer'] = ''

        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    Only the title is scraped: the first ``<h1>`` whose class contains
    ``fyahei``, cleaned with ``self.filter_tags_blank``. The description and
    the answer are both stored as empty strings.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath(
            '//h1[contains(@class, "fyahei")]').extract()[0]
        item['question'] = {
            'askText': self.filter_tags_blank(title_html),
            'askDesc': '',
        }
        item['answer'] = ''
        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    The title is the first ``<h1>`` on the page and the optional description
    is the first ``div.descip``. Each ``div.descip.paint1`` directly under
    ``div.dorawer`` (matched by exact class strings) becomes one entry of
    the answer list. All fragments are cleaned with
    ``self.filter_tags_blank``.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath('//h1').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # Description is optional; fall back to an empty string.
        try:
            desc_html = response.xpath('//div[@class="descip"]').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        try:
            answer_fragments = response.xpath(
                '//div[@class="dorawer"]/div[@class="descip paint1"]'
            ).extract()
            item['answer'] = [
                self.filter_tags_blank(fragment)
                for fragment in answer_fragments
            ]
        except Exception:
            item['answer'] = ''

        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Build a ``CorpusHealthItem`` from a question-detail page.

    Fields written on the item:

    * ``url`` - ``response.url``.
    * ``question`` - ``{'askText': ..., 'askDesc': ...}``. The title is the
      ``<h1>`` whose class is exactly ``four font-16 u_tit``; the
      description is the first ``p.k_questiond``, or ``''`` when that lookup
      fails.
    * ``answer`` - a list with one cleaned string per ``div.k_answerli``
      directly under ``div.k_answerlist``, taken from the first
      ``div.crazy_new`` inside it.

    Every extracted HTML fragment is passed through
    ``self.filter_tags_blank`` (defined elsewhere).

    Yields:
        The populated item. Nothing is yielded when the title cannot be
        extracted or any other unexpected error occurs; the error is printed
        and logged instead.
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath(
            '//h1[@class="four font-16 u_tit"]').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # The description is optional: any failure leaves it empty.
        try:
            desc_html = response.xpath(
                '//p[@class="k_questiond"]').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        try:
            answers = []
            answer_nodes = response.xpath(
                '//div[@class="k_answerlist"]/div[@class="k_answerli"]')
            for node in answer_nodes:
                bodies = node.xpath('.//div[@class="crazy_new"]').extract()
                if not bodies:
                    # An entry without div.crazy_new used to raise
                    # IndexError and replace every collected answer with
                    # ''; skip just this entry instead.
                    continue
                answers.append(self.filter_tags_blank(bodies[0]))
            item['answer'] = answers
        except Exception:
            # NOTE(review): fallback is a string while the success path
            # stores a list; kept as-is because downstream consumers are
            # not visible from here.
            item['answer'] = ''

        yield item
    except Exception as e:
        print(e)
        logger.info("匹配信息出错。错误原因:")
        logger.info(e)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    The title is the ``<dt>`` of ``dl.iask13_title``; the optional
    description is the ``ul.iask13_con`` directly under the question
    container (class ``iask13 iask13_q``). For every answer container (class
    ``iask13 iask13_a``) the cleaned text of all its ``ul.iask13_con``
    descendants is concatenated into a single string, giving one list entry
    per container. All fragments are cleaned with
    ``self.filter_tags_blank``.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath(
            '//dl[@class="iask13_title"]/dt').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # Description is optional; fall back to an empty string.
        try:
            desc_html = response.xpath(
                '//div[@class="iask13 iask13_q"]/ul[@class="iask13_con"]'
            ).extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        try:
            collected = []
            for container in response.xpath('//div[@class="iask13 iask13_a"]'):
                parts = container.xpath('.//ul[@class="iask13_con"]')
                collected.append(''.join(
                    self.filter_tags_blank(part.extract()) for part in parts
                ))
            item['answer'] = collected
        except Exception:
            item['answer'] = ''

        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Assemble a ``CorpusHealthItem`` from a detail page and print it.

    The title is the ``<h1>`` under ``div.why``, the optional description is
    the first ``p.pd_txt`` and the optional answer is the first ``<dt>``
    under ``div.an_cont/dl``. All fragments are cleaned with
    ``self.filter_tags_blank``.

    NOTE(review): the item is only printed to stdout, never yielded, so this
    method returns ``None``. It looks like a debugging state; confirm
    whether the item should be yielded.
    """
    print('==============')
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath('//div[@class="why"]/h1').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # Description is optional; fall back to an empty string.
        try:
            desc_html = response.xpath('//p[@class="pd_txt"]').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        # Answer is optional as well.
        try:
            answer_html = response.xpath(
                '//div[@class="an_cont"]/dl/dt').extract()[0]
            item['answer'] = self.filter_tags_blank(answer_html)
        except Exception:
            item['answer'] = ''

        print(item)
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail(self, response):
    """Turn a question-detail response into a flat ``CorpusHealthItem``.

    ``question`` is the second text node found under ``p.crazy_new``
    elements, stripped of surrounding whitespace. ``answer`` is built from
    the text nodes matched by
    ``//div[@class="b_anscont_cont"][1]/div[@class="crazy_new"]/p/text()``,
    joined together with all whitespace removed.

    Yields:
        The populated item, or nothing when the question text node is
        missing or another error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url

        question_nodes = response.xpath(
            '//p[@class="crazy_new"]/text()').extract()
        item['question'] = question_nodes[1].strip()

        answer_nodes = response.xpath(
            '//div[@class="b_anscont_cont"][1]/div[@class="crazy_new"]/p/text()'
        ).extract()
        joined = "".join(answer_nodes)
        item['answer'] = "".join(joined.split())

        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    Only the title is scraped: the first ``span.title``, cleaned with
    ``self.filter_tags_blank``. The description and the answer are both
    stored as empty strings.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath('//span[@class="title"]').extract()[0]
        item['question'] = {
            'askText': self.filter_tags_blank(title_html),
            'askDesc': '',
        }
        item['answer'] = ''
        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    The title is the first ``<h1>`` with the literal ``<span>问</span>``
    badge removed; the optional description is the first ``<p>`` under
    ``div.wenti_dec``. Both are cleaned with ``self.filter_tags_blank``. The
    answer is always stored as an empty string.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        heading_html = response.xpath('//h1').extract()[0]
        title_html = heading_html.replace('<span>问</span>', '')
        ask_text = self.filter_tags_blank(title_html)

        # Description is optional; fall back to an empty string.
        try:
            desc_html = response.xpath(
                '//div[@class="wenti_dec"]/p').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''

        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}
        item['answer'] = ''
        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)
def parse_detail_mongo(self, response):
    """Turn a question-detail response into a ``CorpusHealthItem``.

    The title is the first ``<p>`` whose class is exactly ``fl dib fb``, the
    optional description is ``div#qdetailc`` and the optional answer is the
    first ``<div>`` whose class is exactly ``pt15 f14 graydeep pl20 pr20``.
    All fragments are cleaned with ``self.filter_tags_blank``; the answer is
    stored as a single string.

    Yields:
        The populated item, or nothing when the title is missing or another
        unexpected error occurs (the error is printed and logged).
    """
    item = CorpusHealthItem()
    try:
        item['url'] = response.url
        title_html = response.xpath('//p[@class="fl dib fb"]').extract()[0]
        ask_text = self.filter_tags_blank(title_html)

        # Description is optional; fall back to an empty string.
        try:
            desc_html = response.xpath('//div[@id="qdetailc"]').extract()[0]
            ask_desc = self.filter_tags_blank(desc_html)
        except Exception:
            ask_desc = ''
        item['question'] = {'askText': ask_text, 'askDesc': ask_desc}

        # Answer is optional as well.
        try:
            answer_html = response.xpath(
                '//div[@class="pt15 f14 graydeep pl20 pr20"]').extract()[0]
            item['answer'] = self.filter_tags_blank(answer_html)
        except Exception:
            item['answer'] = ''

        yield item
    except Exception as err:
        print(err)
        logger.info("匹配信息出错。错误原因:")
        logger.info(err)