# Shared imports for the scraper methods below; ChoiceUAIP (the project's
# random User-Agent / proxy picker) is defined elsewhere in this repo.
from urllib.request import Request
from urllib import parse

import chardet  # used to sanity-check page encodings
from lxml import etree


def process_url_page(self, keshi_name):
    """Fetch a department listing page and return its total page count."""
    # Listing URL pattern: http://jbk.39.net/bw/{keshi}_p{page}#ps
    all_page_url = 'http://jbk.39.net/bw/{0}_p0#ps'.format(keshi_name)
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(all_page_url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    allcontent = response.decode('gb2312', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    # Pagination links look like /bw/{keshi}_p{n}#ps; take the second "sp-a"
    # link, drop the "#ps" anchor, and keep the trailing page number.
    urlpage = selector.xpath(
        '//*[@id="res_tab_1"]/div[@class="site-pages"]/a[@class="sp-a"]/@href')[1]
    print('pages:{}'.format(urlpage))
    all_pages = urlpage.replace('#ps', '').split('p')[-1]
    return all_pages
def process_disease_cause(self, disease):
    """Slice the disease-cause section out of the wiki-style article text."""
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(self.url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    print(chardet.detect(response))  # sanity-check the page encoding
    allcontent = response.decode('utf-8', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    disease_all_infos = selector.xpath('//*[@id="content"]//text()')
    disease_all_info = ' '.join(disease_all_infos)
    # The cause section sits between the literal "病因详情 /{病名} 编辑" header
    # and the "症状 /{病名} 编辑" header; slice the flattened text between them.
    first_split_str = "病因详情 /{} 编辑".format(disease)
    disease_cause_first = disease_all_info.split(first_split_str)[1]
    sec_split_str = "症状 /{} 编辑".format(disease)
    disease_cause = disease_cause_first.split(sec_split_str)[0]
    return disease_cause
def process_disease_check_detail(self):
    """Scrape a disease's examination page: common checks, details, metadata."""
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(self.url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    allcontent = response.decode('gb2312')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    check_url = self.url
    # Join the text nodes with a '++' sentinel, collapse all whitespace runs,
    # then drop the sentinel to get one clean string (illustrated below).
    common_check = selector.xpath(
        '//div[@class="content clearfix"]//div[@class="chi-know chi-int"]'
        '/div[@class="checkbox"]/div//text()')
    common_check = '++'.join(common_check)
    common_check = ' '.join(common_check.split()).replace('++', '')
    checks = selector.xpath(
        '//div[@class="content clearfix"]//div[@class="chi-know chi-int"]'
        '/div[@class="art-box"]/p//text()')
    checks = ' '.join(checks)
    # "更新" is the literal "updated" label shown next to the timestamp.
    check_updatetime = selector.xpath(
        '//div[@class="content clearfix"]//div[@class="chi-know chi-int"]'
        '/dl[@class="intro"]/dd[@class="i3"]/span/text()')[0].replace('更新', '')
    browse_count = selector.xpath('//dd[@class="i3"]/span[2]/span/text()')[0]
    collect_count = selector.xpath('//dd[@class="i3"]/span[3]/span/text()')[0]
    keys_list = ['check_url', 'common_check', 'checks', 'check_updatetime',
                 'browse_count', 'collect_count']
    vals_list = [check_url, common_check, checks, check_updatetime,
                 browse_count, collect_count]
    check_dict = dict(zip(keys_list, vals_list))
    print(check_dict)
    return check_dict
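# Worked illustration of the '++' sentinel trick used above (invented values):
#   nodes = ['发热', ' ', '咳嗽\n']     # raw text nodes returned by xpath()
#   '++'.join(nodes)                    # -> '发热++ ++咳嗽\n'
#   ' '.join(_.split())                 # -> '发热++ ++咳嗽'
#   _.replace('++', '')                 # -> '发热 咳嗽'
# The sentinel marks node boundaries: fragments with no whitespace between them
# are glued back together, while real whitespace collapses to a single space.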
def process_url_page(disease):
    """Return the number of result pages for a drug search (15 results per page)."""
    # Search URL pattern: http://ypk.39.net/search/{name}-p{page}/
    all_page_url = "http://ypk.39.net/search/{}-p1".format(parse.quote(disease))
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(all_page_url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    allcontent = response.decode('gb2312', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    # Total result count shown in the "search_tips" banner.
    urlpage = selector.xpath(
        '//div[@class="page"]/div[@class="search_right"]'
        '/div[@class="search_tips"]/i/text()')[0]
    print('pages:{}'.format(urlpage))
    # 15 results per page; the +2 presumably compensates for floor division and
    # an exclusive upper bound in the caller's range() loop.
    return int(int(urlpage) / 15 + 2)
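# An alternative sketch of the page-count arithmetic with explicit ceiling
# division (without the original's +2 overshoot); PER_PAGE = 15 comes from the
# code above, while the helper name is hypothetical, not part of this project:
import math

PER_PAGE = 15

def result_count_to_pages(total_results):
    """ceil(total/15): e.g. 100 results -> 7 pages, 30 results -> 2 pages."""
    return math.ceil(int(total_results) / PER_PAGE)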
def process_qa_corpus_detail(self):
    """Collect the Q&A expert items as one whitespace-normalized string."""
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(self.url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    allcontent = response.decode('gb2312', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    # Note the trailing space in "chi-exp-item ": it matches the page's class
    # attribute literally.
    qa_corpus = selector.xpath(
        '//div[@class="content clearfix"]//div[@class="chi-exp-item "]//text()')
    # '++' sentinel join, whitespace collapse, sentinel removal (see above).
    qa_corpus = '++'.join(qa_corpus)
    qa_corpus = ' '.join(qa_corpus.split()).replace('++', '')
    return qa_corpus
def process_symptom(self):
    """Scrape a symptom page: common symptoms, related links, details, metadata."""
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(self.symptoms_url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    allcontent = response.decode('gb2312')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    symptoms_url = self.symptoms_url
    common_symptoms = selector.xpath(
        '//div[@class="content clearfix"]//dl[@class="links"]/dd//text()')
    common_symptoms = '++'.join(common_symptoms)
    # Normalize whitespace, then split on the literal "相关症状:" ("related
    # symptoms") label: part 0 is the common-symptom text, part 1 the links.
    common_symptoms_str = ' '.join(
        common_symptoms.split()).replace('++', '').split('相关症状:')
    common_symptoms = common_symptoms_str[0]
    links_symptoms = common_symptoms_str[1]
    symptoms = selector.xpath(
        '//div[@class="content clearfix"]//div[@class="art-box"]/p//text()')
    symptoms = ' '.join(symptoms)
    symptoms_updatetime = selector.xpath(
        '//div[@class="content clearfix"]//dl[@class="intro"]'
        '/dd[@class="i3"]/span/text()')[0].replace('更新', '')
    browse_count = selector.xpath('//dd[@class="i3"]/span[2]/span/text()')[0]
    collect_count = selector.xpath('//dd[@class="i3"]/span[3]/span/text()')[0]
    keys_list = ["symptoms_url", "common_symptoms", "links_symptoms", "symptoms",
                 "symptoms_updatetime", "browse_count", "collect_count"]
    vals_list = [symptoms_url, common_symptoms, links_symptoms, symptoms,
                 symptoms_updatetime, browse_count, collect_count]
    symptom_dict = dict(zip(keys_list, vals_list))
    print(symptom_dict)
    return symptom_dict
def process_drugs_overview_detail(self, drugurl):
    """Drug overview details: dosage form, specification, treated diseases."""
    viewurl = drugurl.replace('manual', '')
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(viewurl, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    allcontent = response.decode('gb2312', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    # xpath() returns a list, never None, so the original `is None` test always
    # failed and the "null" branch always ran; check list truthiness instead.
    form_nodes = selector.xpath(
        '//div[@class="gaisu"]//ul[@class="showlis"]/li[1]/text()')
    if form_nodes:
        drug_form = form_nodes[0]
        drug_spec = selector.xpath(
            '//div[@class="gaisu"]//ul[@class="showlis"]/li[2]/text()')[0]
    else:
        drug_form = "null"
        drug_spec = "null"
    therapeutic_diseases = selector.xpath(
        '//div[@class="gs_right"]/ul[@class="whatsthis clearfix"]/li//text()')
    print('commonly treated diseases: {}'.format(therapeutic_diseases))
    key_list = ['drug_form', 'drug_spec', 'therapeutic_diseases']
    val_list = [drug_form, drug_spec, therapeutic_diseases]
    return key_list, val_list
def process_drug_url(self):
    """Collect drug links from a search-result page, pointed at the manual tab."""
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(self.url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return []
    allcontent = response.decode('gb2312', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    hrefs = selector.xpath('//div[@class="msgs"]/strong/a/@href')
    # Turn relative hrefs into absolute manual-page URLs.
    return ['http://ypk.39.net{}manual'.format(href) for href in hrefs]
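# A sketch of how these pieces appear intended to fit together; the driver
# loop and the `spider` instance are hypothetical, not part of the original:
#
#   pages = process_url_page('阿司匹林')        # result pages for a search term
#   for page in range(1, pages):
#       spider.url = 'http://ypk.39.net/search/{}-p{}'.format(
#           parse.quote('阿司匹林'), page)
#       for drug_url in spider.process_drug_url():   # .../xxx/manual URLs
#           keys, vals = spider.process_drugs_overview_detail(drug_url)
#           manual = spider.process_drugs_manual_detail(
#               drug_url.replace('manual', ''))      # helper re-appends 'manual'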
def process_disease_wiki(self, disease):
    """Return the full wiki article text for a disease."""
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(self.url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    print(chardet.detect(response))  # sanity-check the page encoding
    allcontent = response.decode('utf-8', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    disease_wiki_all_infos = selector.xpath('//*[@id="content"]//text()')
    disease_wiki_data = ' '.join(disease_wiki_all_infos)
    return disease_wiki_data
def process_drugs_manual_detail(self, drugurl):
    """Fetch a drug's manual tab and return its text, whitespace-normalized."""
    druginfo_url = drugurl + 'manual'
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(druginfo_url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    allcontent = response.decode('gb2312', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    drugs_info = selector.xpath('//div[@class="tab_box"]/div//text()')
    # '++' sentinel join, whitespace collapse, sentinel removal (see above).
    strs = '++'.join(drugs_info)
    strs = ' '.join(strs.split()).replace('++', '')
    return strs
def get_pages(self):
    """Read the paginator's last link and return the final page number."""
    # e.g. http://ask.familydoctor.com.cn/jbk/d369?page=0&
    url_page = self.url_start_page + '?page=0&'
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(url_page, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    print(chardet.detect(response))  # sanity-check the page encoding
    allcontent = response.decode('utf-8', 'ignore')
    # Parse the page source so it can be queried with XPath.
    selector = etree.HTML(allcontent)
    # The 13th paginator anchor holds an href like "?page=42&"; drop the "&"
    # and keep the value after "=".
    page_str = selector.xpath(
        '//*[@id="anpSelectData_Settings"]/a[13]/@href')[0]
    page_num = page_str.replace('&', '').split('=')[1]
    return page_num
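# Every method above repeats the same fetch-and-parse sequence. A minimal
# consolidation sketch, reusing ChoiceUAIP from this project; the function
# name and its `encoding` parameter are hypothetical additions:
def fetch_selector(url, encoding='gb2312'):
    """Fetch `url` with a random UA and proxy; return an lxml selector or None."""
    header = {'User-Agent': ChoiceUAIP().choice_ua()}
    request = Request(url, headers=header)
    opener = ChoiceUAIP().choice_proxy()
    response = opener.open(request).read()
    if not response:
        return None
    return etree.HTML(response.decode(encoding, 'ignore'))

# Hypothetical usage: each method body then reduces to its XPath logic, e.g.
#   selector = fetch_selector(self.url)            # gb2312 pages
#   selector = fetch_selector(self.url, 'utf-8')   # utf-8 pages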