def parse_list_page(self, response):
    """Parse a supplier list page and yield one shop item per result cell."""
    multi_xpath = '//div[@class="supply-cell" or @class="supply-cell supply-cell-bg"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta['query']
    list_url = response.url
    for cell in page_hxs.select(multi_xpath):
        shop_name = clean_string(''.join(cell.select('./div/div/span/a//text()').extract()))
        shop_site_url = ''.join(cell.select('./div/div/span/a[1]/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        detail_url = shop_site_url
        doc = {
            'shop_name': shop_name,
            'shop_site_url': shop_site_url,
        }
        if shop_site_url:
            # Some sites refuse requests without a plausible referer.
            next_request = Request(detail_url,
                                   headers={'referer': shop_site_url},
                                   callback=self.parse_about_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request, list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse a Weibo search-result list; yield a detail request per post
    (when ``self.visit_detail`` is set) or the item itself.

    ``pubtime`` is deliberately left as None for now: the raw value (e.g.
    "3 minutes ago - Sina Weibo") still needs normalisation (TODO).
    """
    multi_xpath = '//*[@id="weibo"]/li'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    list_url = response.url
    query = response.meta.get('query')
    for entry in page_hxs.select(multi_xpath):
        # The ".m" div reads "<relative time> - <site name>".
        meta_text = ''.join(entry.select('.//div[@class="m"]/text()').extract())
        pubtime, site_name = meta_text.split(' - ', 1)
        # TODO: convert pubtime to a real timestamp; dropped until then.
        pubtime = None
        title = None
        # Post permalink, e.g. http://weibo.com/2440278962/zwZ7p6yUK?type=comment
        url = ''.join(entry.select('.//a[@name="weibo_ping"]/@href').extract())
        url = url_clean(url)
        # Counter anchors carry text like "comments(0)" / "reposts(0)".
        comment_count = extract_number(
            ''.join(entry.select('.//a[@name="weibo_ping"]/text()').extract()))
        repost_count = extract_number(
            ''.join(entry.select('.//a[@name="weibo_trans"]/text()').extract()))
        url = urllib.unquote(url).strip()
        doc = {
            'site_name': site_name,
            'title': title,
            'pubtime': pubtime,
            'url': url,
            'reply_num': comment_count,
            'retweet_num': repost_count,
            'data_type': self.data_type,
        }
        detail_url = url
        item = LegItem(collection='web_data', doc=doc, detail_url=detail_url,
                       list_url=list_url, query=query)
        if detail_url and self.visit_detail:
            detail_request = Request(detail_url, callback=self.parse_detail_page)
            detail_request.meta['item'] = item
            detail_request.meta['query'] = query
            yield detail_request
        else:
            yield item
def parse_list_page(self, response):
    """Parse a result list and queue each shop's about page."""
    multi_xpath = '//li[@class="plResultTerms clearfix"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta['query']
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        # -- extraction --
        shop_name = clean_string(''.join(entry.select('./div/h2/a//text()').extract()))
        shop_site_url = ''.join(entry.select('./div/h2/a/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        # NOTE: os.path.join is used as a cheap URL join here; it only
        # appends a path segment (no urljoin-style resolution).
        about_url = os.path.join(shop_site_url, 'about.html')
        contact_url = os.path.join(shop_site_url, 'contact.html')
        doc = {
            'shop_name': shop_name.strip(),
            'shop_site_url': shop_site_url.strip(),
            'about_url': about_url,
            'contact_url': contact_url,
        }
        if shop_site_url:
            next_request = Request(about_url, callback=self.parse_about_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request, list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Merge products, launch time, contact details, email and QQ from a
    shop detail page into the item's doc, then emit the item.

    Fix: the original used ``lstrip('mailto::')`` which strips any of the
    characters {m,a,i,l,t,o,:} from the left and can eat the beginning of
    the address itself (e.g. ``mailto:tom@x.com`` -> ``m@x.com``). We now
    remove only the literal ``mailto:`` scheme prefix.
    """
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    shop_products_hxs = page_hxs.select('//div[@id="about"]//text()')
    shop_products = parse_products(shop_products_hxs, junks=['更多'])
    shop_launch_time = ''.join(page_hxs.select('//div[@id="about"]/div[2]/table/tbody/tr[2]/td[2]//text()').extract())
    shop_launch_time = shop_launch_time.strip()
    contact2_hxs = page_hxs.select('//div[@id="contact"]//table//td//text()')
    contact_dic = parse_contact2(contact2_hxs)
    shop_email = ''.join(page_hxs.select('//div[@id="contact"]/div[2]/table/tbody/tr[8]/td[2]/a/@href').extract())
    shop_email = shop_email.strip()
    # Remove the URI scheme as a literal prefix (not a character-set strip);
    # a stray extra ':' (as in 'mailto::') is dropped afterwards.
    if shop_email.startswith('mailto:'):
        shop_email = shop_email[len('mailto:'):].lstrip(':')
    qq_hxs = page_hxs.select('//div[@id="contact"]')
    shop_qq = parse_qq_num(qq_hxs)
    doc.update(contact_dic)
    doc['shop_products'] = shop_products
    doc['shop_launch_time'] = shop_launch_time
    doc['shop_email'] = shop_email
    doc['shop_qq'] = shop_qq
    yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse Google patent search results into PatentItems."""
    multi_xpath = '//li[@class="g"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta.get('query')
    list_url = response.url
    for result in page_hxs.select(multi_xpath):
        # e.g. //li[@class="g"][1]//h3/a/@href
        url = ''.join(result.select('.//h3/a/@href').extract())
        url = urllib.unquote(url).strip()
        doc = {
            'data_source': 'google专利搜索',
            'url': url,
        }
        detail_url = fix_possible_missing_scheme(url)
        if detail_url:
            next_request = Request(detail_url, callback=self.parse_detail_page)
        else:
            next_request = None
        item = PatentItem(doc=doc, next_request=next_request, list_url=list_url,
                          query=query, attachments=[], attachment_urls=[])
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse a shop list and queue each shop's 'company' about page."""
    multi_xpath = '//div[@class="border_b list_tiao"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta['query']
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        shop_name = clean_string(''.join(entry.select('.//div[@class="c_name"]/a//text()').extract()))
        shop_site_url = ''.join(entry.select('.//div[@class="c_name"]/a/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        # os.path.join only appends path segments; good enough for these urls.
        about_url = os.path.join(shop_site_url, 'company')
        contact_url = os.path.join(shop_site_url, 'contact')
        doc = {
            'shop_name': shop_name.strip(),
            'shop_site_url': shop_site_url.strip(),
            'about_url': about_url,
            'contact_url': contact_url,
        }
        if shop_site_url:
            next_request = Request(about_url,
                                   headers={'referer': shop_site_url},
                                   callback=self.parse_about_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request, list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse the shop-list table; the shop's own site is the detail page."""
    multi_xpath = '//div[@class="shopListCon"]//tr'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta['query']
    list_url = response.url
    for row in page_hxs.select(multi_xpath):
        shop_name = clean_string(''.join(row.select('./td[2]//a//text()').extract()))
        shop_site_url = ''.join(row.select('./td[2]//a/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        doc = {
            'shop_name': shop_name,
            'shop_site_url': shop_site_url,
        }
        detail_url = shop_site_url
        if shop_site_url:
            next_request = Request(detail_url, callback=self.parse_detail_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request, list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_about_page(self, response):
    """Extract the product list from an about page, then queue the contact
    page (if a contact_url was collected earlier)."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    shop_products_hxs = page_hxs.select('//div[@class="side_2"]//text()')
    # Inquiry-basket UI boilerplate to drop from the product text.
    junks = [
        '加入失败',
        '您的询盘篮内信息已满0条!',
        '达到信息添加上限',
        '加入成功',
        '已成功添加到询盘篮!',
        '您的询盘篮中共有0家公司的0个产品',
        '继续浏览',
    ]
    shop_products = parse_products(shop_products_hxs, junks=junks)
    doc['shop_products'] = shop_products.strip()
    contact_url = doc.get('contact_url', '')
    if contact_url:
        next_request = Request(contact_url, callback=self.parse_contact_page)
    else:
        next_request = None
    item['next_request'] = next_request
    yield self.item_or_request(item)
def parse_about_page(self, response):
    """Extract products and company-info contacts from an about page, then
    queue the contact page when available."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    shop_products_hxs = page_hxs.select('//div[@class="content" or @class="m-content"]//text()')
    # Breadcrumb / hint boilerplate to drop from the product text.
    junks = [
        '当前位置:首页>企业简介',
        '温馨提示:绿色字体部分为已审核企业信息',
    ]
    shop_products = parse_products(shop_products_hxs, junks=junks)
    contact2_hxs = page_hxs.select('//div[@class="companyInfo"]//td//text()')
    doc.update(parse_contact2(contact2_hxs))
    doc['shop_products'] = shop_products
    contact_url = doc.get('contact_url', '')
    if contact_url:
        next_request = Request(contact_url, callback=self.parse_contact_page)
    else:
        next_request = None
    item['next_request'] = next_request
    yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Extract abstract, agent institution and claims, snapshot the page as
    an attachment, then queue the PDF-listing page for this application."""
    item = response.meta['item']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    doc = item['doc']
    doc['abstract'] = ''.join(page_hxs.select('//span[@id="txtAbstr"]//text()').extract())
    doc['agent_institution'] = ''.join(page_hxs.select('//span[@id="tdANM"]//text()').extract())
    doc['claims'] = ''.join(page_hxs.select('//span[@id="txtClaim"]//text()').extract())
    dic = doc['dic']
    # StrANX identifies the application on patentstar's document service.
    pno = 'APP%s' % dic['StrANX']
    pdf_url = 'http://searchtel.patentstar.com.cn/CPRS2010/Docdb/GetBns.aspx?PNo=%s' % pno
    item['next_request'] = Request(pdf_url, callback=self.parse_pdf)
    # Keep a raw snapshot of this page alongside the parsed fields.
    item['attachments'].append({
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    })
    yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse a shop list and queue each shop's profile page."""
    multi_xpath = '//div[@class="xia_xuxian list_tiao"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta['query']
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        shop_name = clean_string(''.join(entry.select('.//ul[@class="list_link"]/li/a//text()').extract()))
        shop_site_url = ''.join(entry.select('.//ul[@class="list_link"]/li/a/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        # os.path.join only appends path segments; good enough here.
        about_url = os.path.join(shop_site_url, 'profile.html')
        contact_url = os.path.join(shop_site_url, 'contact.html')
        doc = {
            'shop_name': shop_name,
            'shop_site_url': shop_site_url,
            'about_url': about_url,
            'contact_url': contact_url,
        }
        if shop_site_url:
            next_request = Request(about_url,
                                   headers={'referer': shop_site_url},
                                   callback=self.parse_about_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request, list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse Qihoo forum search results into BbsItems."""
    multi_xpath = '//div[@id="module-list"]/dl'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta.get('query')
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        title = ''.join(entry.select('./dt[@class="title"]/a//text()').extract())
        pub_time = ''.join(entry.select('./dt[@class="title"]/span//text()').extract())
        overview = ''.join(entry.select('./dd[@class="content"]//text()').extract())
        url = ''.join(entry.select('./dt[@class="title"]/a/@href').extract())
        # The "info" line bundles author / site / view count / reply count.
        info_misc_hxs = entry.select('./dd[@class="info"]//text()')
        author, site_name, view_count, reply_count = self._ana_info_misc(info_misc_hxs)
        url = urllib.unquote(url).strip()
        doc = {
            'data_source': '奇虎论坛搜索',
            'site_name': site_name,
            'title': title,
            'pub_time': pub_time,
            'overview': overview,
            'url': url,
            'author': author,
            'view_count': view_count,
            'reply_count': reply_count,
        }
        detail_url = url
        if detail_url:
            next_request = Request(detail_url, callback=self.parse_detail_page)
        else:
            next_request = None
        item = BbsItem(doc=doc, next_request=next_request,
                       list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse Baidu video special-area results; only entries whose
    highlighted name exactly matches the query are followed."""
    multi_xpath = '//div[@class="special-area-cont"]/div'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta.get('query')
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        # e.g. //li[@class="g"][1]//h3/a/@href
        title = ''.join(entry.select('.//h3//text()').extract()).strip()
        video_name = ''.join(entry.select('.//h3//span//font//text()').extract())
        if video_name != query:
            continue
        url = ''.join(entry.select('.//h3//a/@href').extract())
        _id = get_url_query(url)['id']
        doc = {
            'data_source': 'baidu视频搜索',
            'url': url,
            'title': title,
            'id': _id,
        }
        json_list_url = 'http://video.baidu.com/htvshowsingles/?id=%s' % _id
        next_request = Request(json_list_url, callback=self.parse_site_list)
        item = VideoZjcmItem(doc=doc, next_request=next_request, list_url=list_url,
                             query=query, attachments=[], attachment_urls=[])
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse Baidu news search results into NewsItems."""
    multi_xpath = '//*[@id="r"]/table'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta.get('query')
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        # <nobr> carries "<site name> <publish time>".
        site_name, pub_time = ''.join(entry.select('.//nobr//text()').extract()).split(' ', 1)
        title = ''.join(entry.select('.//span/b//text()').extract())
        overview = ''.join(entry.select('.//font[@size="-1"]//text()').extract())
        url = ''.join(entry.select('.//span/../@href').extract())
        url = urllib.unquote(url).strip()
        doc = {
            'data_source': '百度新闻搜索',
            'site_name': site_name,
            'pub_time': pub_time,
            'title': title,
            'overview': overview,
            'url': url,
        }
        detail_url = fix_possible_missing_scheme(url)
        if detail_url:
            next_request = Request(detail_url, callback=self.parse_detail_page)
        else:
            next_request = None
        item = NewsItem(doc=doc, next_request=next_request,
                        list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse a company list and queue each shop's product listing page."""
    multi_xpath = '//div[@class="tc_qytitle1"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta['query']
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        # -- extraction --
        shop_name = clean_string(''.join(entry.select('./div/dl/dt/a//text()').extract()))
        shop_site_url = ''.join(entry.select('./div/dl/dt/a/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        # os.path.join only appends a path segment to the shop url.
        detail_url = os.path.join(shop_site_url, 'clist--.html')
        doc = {
            'shop_name': shop_name.strip(),
            'shop_site_url': shop_site_url.strip(),
            'detail_url': detail_url,
        }
        if shop_site_url:
            next_request = Request(detail_url,
                                   headers={'referer': shop_site_url},
                                   callback=self.parse_detail_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request, list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse Sogou forum search results into BbsItems."""
    multi_xpath = '//div[@class="results"]/div'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta.get('query')
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        title = ''.join(entry.select('./h3//text()').extract())
        # The snippet may live in the ".ft" div or inside a "vrbox" table.
        overview_parts = (entry.select('./div[@class="ft"]//text()').extract()
                          + entry.select('./table[@class="vrbox"]//text()').extract())
        overview = ''.join(overview_parts)
        url = ''.join(entry.select('./h3/a/@href').extract())
        url = urllib.unquote(url).strip()
        info_misc_hxs = entry.select('.//cite//text()')
        site_name, pub_time = self._ana_info_misc(info_misc_hxs)
        doc = {
            'data_source': '搜狗论坛搜索',
            'site_name': site_name,
            'title': title,
            'pub_time': pub_time,
            'overview': overview,
            'url': url,
        }
        detail_url = url
        if detail_url:
            next_request = Request(detail_url, callback=self.parse_detail_page)
        else:
            next_request = None
        item = BbsItem(doc=doc, next_request=next_request,
                       list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Merge blur-analysed patent fields from the detail page into the
    item's doc and snapshot the page (plus its images) as attachments.

    Fix: ``.lstrip('/专利号: ')`` handed a non-ASCII *byte* string to
    ``unicode.lstrip``; under Python 2 that forces an implicit ASCII decode
    of the argument and raises UnicodeDecodeError whenever a value is
    present. A unicode literal strips the label characters as intended.
    """
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    texts1 = hxs.select('//table[@class="tb"]//td//text()').extract()
    texts2 = hxs.select('//div[@class="t2"]//text()').extract()
    result_doc1 = blur_ana_patent(texts1)
    result_doc2 = blur_ana_patent(texts2)
    patent_name = ''.join(hxs.select('//div[@class="t1"]//text()').extract())
    abstract = ''.join(hxs.select('//div[@class="con2"]//text()').extract())
    doc = item['doc']
    doc.update(result_doc1)
    doc.update(result_doc2)
    doc['patent_name'] = patent_name
    doc['abstract'] = abstract
    # Strip the leading "/专利号: " label characters (character-set strip
    # is intended here: the number itself contains none of these chars).
    doc['application_number'] = doc['application_number'].lstrip(u'/专利号: ')
    attachments = item['attachments']
    attach1 = {
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    }
    attachments.append(attach1)
    image_urls = get_image_urls(response)
    item['attachment_urls'] += image_urls
    yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse a wjw.cn result list; derive the shop id from the shop url and
    queue the about-us page.

    Fix: the original indexed ``shop_site_url.split('/')[4]`` without a
    guard, raising IndexError whenever the href was empty or not of the
    expected ``http://host/x/<id>/...`` shape. We now fall back to '' — and
    then, exactly as before for an empty id, no follow-up request is made.
    """
    multi_xpath = '//div[@class="jieguo"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta['query']
    list_url = response.url
    for entry in page_hxs.select(multi_xpath):
        shop_name = clean_string(''.join(entry.select('./div[2]/ul/li[1]/a//text()').extract()))
        shop_site_url = ''.join(entry.select('./div[2]/ul/li[1]/a/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        # The shop id is the fifth '/'-separated component of the url.
        url_parts = shop_site_url.split('/')
        _shop_id = url_parts[4] if len(url_parts) > 4 else ''
        about_url = 'http://www.wjw.cn/companyprofile/%s/aboutus.xhtml' % _shop_id
        contact_url = 'http://www.wjw.cn/cardview/%s/card.xhtml' % _shop_id
        doc = {
            'shop_name': shop_name,
            'shop_site_url': shop_site_url,
            'about_url': about_url,
            'contact_url': contact_url,
        }
        if _shop_id:
            headers = {'referer': shop_site_url}
            next_request = Request(about_url, headers=headers,
                                   callback=self.parse_about_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request, list_url=list_url, query=query)
        yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Merge Wanfang patent-detail fields into the item's doc and snapshot
    the page (plus images) as attachments."""
    item = response.meta['item']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    raw_texts = page_hxs.select('//table[@id="perildical2_dl"]//text()').extract()
    cleaned_texts = clean_wanfang_texts(raw_texts)
    doc = item['doc']
    doc.update(blur_ana_patent(cleaned_texts))
    doc['abstract'] = ''.join(page_hxs.select('//div[@class="abstracts"]//text()').extract())
    item['attachments'].append({
        'url': response.url,
        'data': response.body_as_unicode(),
        'mime_type': get_mime_type_in_response(response),
    })
    item['attachment_urls'] += get_image_urls(response)
    # hotfix: patent_type lives in the row whose <th> text names the type.
    patent_type = ''.join(page_hxs.select('//th[contains(.//text(),"专利类型")]/../td//text()').extract())
    doc['patent_type'] = patent_type
    yield self.item_or_request(item)
def parse_list_page(self, response):
    """Parse Soopat result blocks into PatentItems."""
    multi_xpath = '//div[@class="PatentBlock"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    query = response.meta.get('query')
    list_url = response.url
    for block in page_hxs.select(multi_xpath):
        # Hrefs are site-relative; prepend the host.
        url = ''.join(block.select('./div[2]/h2/a/@href').extract())
        url = 'http://www2.soopat.com%s' % url
        url = urllib.unquote(url).strip()
        doc = {
            'data_source': 'soopat中国专利搜索',
            'url': url,
        }
        detail_url = url
        if detail_url:
            next_request = Request(detail_url, callback=self.parse_detail_page)
        else:
            next_request = None
        item = PatentItem(doc=doc, next_request=next_request, list_url=list_url,
                          query=query, attachments=[], attachment_urls=[])
        yield self.item_or_request(item)
def parse_contact_page(self, response):
    """Parse the contact section and merge it into the item's doc."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact_hxs = page_hxs.select('//div[@class="contactInfo" or @class="contact-detail"]//text()')
    doc.update(parse_contact(contact_hxs))
    yield self.item_or_request(item)
def parse_pdf(self, response):
    """Collect the published-document PDF link (if any), then queue the
    abstract-images page for this application.

    Fix: the original joined the selector list itself —
    ``''.join(hxs.select(...))`` with no ``.extract()`` — which raises
    TypeError whenever the link exists (it only "worked" when the
    selection was empty). Extract the href strings before joining.
    """
    item = response.meta['item']
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    pdf_url = ''.join(hxs.select('//a[text()="公开文本"]/@href').extract())
    if pdf_url:
        item['attachment_urls'].append(pdf_url)
    doc = item['doc']
    dic = doc['dic']
    # StrANX indexes the application on patentstar's image service.
    idx = dic['StrANX']
    images_url = 'http://searchtel.patentstar.com.cn/CPRS2010/comm/getzhaiyao.aspx?idx=%s' % idx
    item['next_request'] = Request(images_url, callback=self.parse_images)
    yield self.item_or_request(item)
def parse_contact_page(self, response):
    """Parse the contact table plus the card-holder name into the doc."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact_hxs = page_hxs.select('//table//tr//text()')
    contact_dic = parse_contact(contact_hxs)
    shop_contacts = ''.join(page_hxs.select('//div[@class="card-detail"]/h3/a//text()').extract()).strip()
    doc['shop_contacts'] = shop_contacts
    doc.update(contact_dic)
    yield self.item_or_request(item)
def parse_contact_page(self, response):
    """Parse contact details and the QQ number into the item's doc."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact2_hxs = page_hxs.select('//div[@class="contact"][1]//div//text()')
    contact2_dic = parse_contact2(contact2_hxs)
    qq_hxs = page_hxs.select('//div[@class="contact"][1]')
    doc.update(contact2_dic)
    doc['shop_qq'] = parse_qq_num(qq_hxs)
    yield self.item_or_request(item)
def parse_contact_page(self, response):
    """Parse the contact block (div#b740) and the QQ number into the doc."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact_hxs = page_hxs.select('//div[@id="b740"]/div/dl//text()')
    contact_dic = parse_contact(contact_hxs)
    qq_hxs = page_hxs.select('//div[@id="b740"]')
    doc.update(contact_dic)
    doc['shop_qq'] = parse_qq_num(qq_hxs)
    yield self.item_or_request(item)
def parse_detail_page(self, response):
    """Parse contact info and the product list from a shop detail page."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact_hxs = page_hxs.select('//div[@id="contrt"]/div[1]/dl/dd[1]/table//td//text()')
    contact_dic = parse_contact(contact_hxs)
    # The first three text nodes of the product pane are header chrome.
    shop_products_hxs = page_hxs.select('//div[@id="tdsub_1"]//text()')[3:]
    shop_products = parse_products(shop_products_hxs)
    doc.update(contact_dic)
    doc['shop_products'] = shop_products.strip()
    yield self.item_or_request(item)
def parse_contact_page(self, response):
    """Parse the contact div; set shop_contacts only when the (fragile,
    position-based) XPath actually finds a name."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact_hxs = page_hxs.select('//div[@class="contact"]//text()')
    contact_dic = parse_contact(contact_hxs)
    shop_contacts = ''.join(page_hxs.select('/html/body/div[9]/div[2]/div/div[2]/strong[2]//text()').extract()).strip()
    doc.update(contact_dic)
    if shop_contacts:
        doc['shop_contacts'] = shop_contacts
    yield self.item_or_request(item)
def parse_contact_page(self, response):
    """Parse both contact variants from the com_contact div; values from
    parse_contact1 win over parse_contact2 on key collisions."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact1_hxs = page_hxs.select('//div[@class="com_contact"]/text()')
    contact2_hxs = page_hxs.select('//div[@class="com_contact"]//span//text()')
    merged = {}
    merged.update(parse_contact2(contact2_hxs))
    merged.update(parse_contact1(contact1_hxs))
    doc.update(merged)
    yield self.item_or_request(item)
def parse_contact_page(self, response):
    """Parse contact lines, contact-person name and QQ number into the doc."""
    item = response.meta['item']
    doc = item['doc']
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    contact1_hxs = page_hxs.select('//div[@class="MpBox MpBoxBg"]//p//text()')
    contact1_dic = parse_contact1(contact1_hxs)
    # First paragraph of the box carries the contact-person name.
    shop_contacts = clean_string(
        ''.join(page_hxs.select('//div[@class="MpBox MpBoxBg"]/p[1]//text()').extract()))
    qq_hxs = page_hxs.select('//div[@class="MainLeftBox1"]')
    doc.update(contact1_dic)
    doc['shop_contacts'] = shop_contacts
    doc['shop_qq'] = parse_qq_num(qq_hxs)
    yield self.item_or_request(item)
def get_next_page_request(self, response):
    """Build the Request for the next list page, or return None implicitly
    when the last page has been reached.

    The total page count is scraped once and cached on self._total_page
    (0 when the counter cannot be parsed).
    """
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    if not self._total_page:
        total_page_text = ''.join(
            page_hxs.select('//div[@id="tc_menu"]/div[2]/span[2]//text()').extract())
        try:
            self._total_page = int(total_page_text)
        except ValueError:
            self._total_page = 0
    sub_url = ''.join(page_hxs.select('//div[@id="tc_menu"]/div[2]/a[1]/@href').extract())
    base_url = 'http://china.chemnet.com/company'
    page_num = response.meta['page_num']
    if page_num < self._total_page - 1:
        next_page = page_num + 1
        url = os.path.join(base_url, sub_url)
        # fast hack: drop the embedded ';p=' marker, then append our page no.
        url = url.replace(';p=', '', 1)
        url += ';p=%d' % next_page
        return Request(url, callback=self.query_callback)