    def parse_list_page(self, response):
        multi_xpath = '//div[@class="supply-cell" or @class="supply-cell supply-cell-bg"]'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            shop_name = ''.join(hxs.select('./div/div/span/a//text()').extract())
            shop_name = clean_string(shop_name)
            shop_site_url = ''.join(hxs.select('./div/div/span/a[1]/@href').extract())
            shop_site_url = urllib.unquote(shop_site_url).strip()

            detail_url = shop_site_url

            doc = {
                'shop_name': shop_name,
                'shop_site_url': shop_site_url,
            }

            query = response.meta['query']
            list_url = response.url

            if not shop_site_url:
                next_request = None
            else:
                headers = {
                    'referer': shop_site_url
                }
                next_request = Request(detail_url, headers=headers, callback=self.parse_about_page)
            item = LegItem(collection=self.collection, doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
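
Every example here finishes with `yield self.item_or_request(item)`, but the dispatcher itself is not shown. A minimal sketch of what it might look like, assuming the item stores its pending follow-up in `next_request` (as the constructors above suggest):

    def item_or_request(self, item):
        # Sketch: if a follow-up request is pending, hand the item along in
        # request meta so the next callback keeps enriching the same doc;
        # otherwise the item is complete and goes to the pipelines.
        next_request = item.get('next_request')
        if next_request is None:
            return item
        item['next_request'] = None
        next_request.meta['item'] = item
        return next_request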
Example #2
    def parse_list_page(self, response):
        multi_xpath = '//*[@id="weibo"]/li'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        for hxs in multi_hxs:
            # user_nick = ''.join(hxs.select('.//a[@name="weibo_rootnick"]/text()').extract())
            # user_url = ''.join(hxs.select('.//a[@name="weibo_rootnick"]/@href').extract())
            # at_users = []
            # for at_user_hxs in hxs.select('.//a[@name="weibo_nick"]'):
            #     # e.g. @Riffraff雞東
            #     at_user_nick = at_user_hxs.select('./text()')
            #     # e.g. http://weibo.com/n/Riffraff雞東
            #     at_user_url = at_user_hxs.select('./@href')
            #     at_users.append((at_user_nick, at_user_url))
            # The info line reads like "3分钟前 - 新浪微博": relative time, then site name.
            pubtime, site_name = ''.join(hxs.select('.//div[@class="m"]/text()').extract()).split(' - ', 1)
            # TODO: convert the relative pubtime to an absolute timestamp.
            pubtime = None
            title = None
            # Sample raw content: ":我还是大大方方的告诉了 私信太麻烦了。就是他爱我 我不爱他 她爱他 他不爱她 她爱他 她的他爱我 我爱她爱她的他 懂了吗.?"
            # TODO: decide whether content should keep @-mentions before enabling this.
            # content = ''.join(hxs.select('./div/text()').extract())
            # e.g. http://weibo.com/2440278962/zwZ7p6yUK?type=comment
            url = ''.join(hxs.select('.//a[@name="weibo_ping"]/@href').extract())
            url = url_clean(url)
            # e.g. "评论(0)" (comment count)
            comment_count = ''.join(hxs.select('.//a[@name="weibo_ping"]/text()').extract())
            comment_count = extract_number(comment_count)
            # e.g. "转发(0)" (repost count)
            repost_count = ''.join(hxs.select('.//a[@name="weibo_trans"]/text()').extract())
            repost_count = extract_number(repost_count)

            # TODO: normalize field data types.
            url = urllib.unquote(url).strip()
            doc = {
                'site_name': site_name,
                'title': title,
                'pubtime': pubtime,
                #todo'content': content,
                'url': url,
                'reply_num': comment_count,
                'retweet_num': repost_count,
                'data_type': self.data_type
            }
            detail_url = url
            list_url = response.url
            query = response.meta.get('query')
            item = LegItem(collection='web_data', doc=doc,
                           detail_url=detail_url, list_url=list_url, query=query)
            if detail_url and self.visit_detail:
                detail_request = Request(detail_url, callback=self.parse_detail_page)
                detail_request.meta['item'] = item
                detail_request.meta['query'] = query
                yield detail_request
            else:
                yield item
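
The counters above arrive as text like "评论(0)" / "转发(12)", and `extract_number` is imported from elsewhere in the project. A plausible stand-in, shown only as a sketch:

import re

def extract_number(text):
    # Pull the first run of digits out of strings like u'评论(0)';
    # return 0 when nothing numeric is found.
    match = re.search(r'\d+', text or '')
    return int(match.group()) if match else 0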
    def parse_list_page(self, response):
        """
        """
        multi_xpath = '//li[@class="plResultTerms clearfix"]'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            # extraction: pull the fields out of each result row
            shop_name = ''.join(hxs.select('./div/h2/a//text()').extract())
            shop_name = clean_string(shop_name)
            shop_site_url = ''.join(hxs.select('./div/h2/a/@href').extract())
            shop_site_url = urllib.unquote(shop_site_url).strip()
            about_url = os.path.join(shop_site_url, 'about.html')
            contact_url = os.path.join(shop_site_url, 'contact.html')

            doc = {
                'shop_name': shop_name.strip(),
                'shop_site_url': shop_site_url.strip(),
                'about_url': about_url,
                'contact_url': contact_url,
            }

            query = response.meta['query']
            list_url = response.url

            if not shop_site_url:
                next_request = None
            else:
                next_request = Request(about_url, callback=self.parse_about_page)
            item = LegItem(collection=self.collection, doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
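
`clean_string` is applied to every scraped shop name but is not defined in these snippets. A minimal stand-in, assuming it only normalizes whitespace:

import re

def clean_string(text):
    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', text or '').strip()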
    def parse_detail_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        shop_products_hxs = page_hxs.select('//div[@id="about"]//text()')
        shop_products = parse_products(shop_products_hxs, junks=['更多'])

        shop_launch_time = ''.join(page_hxs.select('//div[@id="about"]/div[2]/table/tbody/tr[2]/td[2]//text()').extract())
        shop_launch_time = shop_launch_time.strip()

        contact2_hxs = page_hxs.select('//div[@id="contact"]//table//td//text()')
        contact_dic = parse_contact2(contact2_hxs)

        shop_email = ''.join(page_hxs.select('//div[@id="contact"]/div[2]/table/tbody/tr[8]/td[2]/a/@href').extract())
        # str.lstrip takes a character set, not a prefix, so lstrip('mailto::')
        # would also eat leading letters of the address; strip the prefix explicitly.
        shop_email = shop_email.strip()
        if shop_email.startswith('mailto:'):
            shop_email = shop_email[len('mailto:'):]

        qq_hxs = page_hxs.select('//div[@id="contact"]')
        shop_qq = parse_qq_num(qq_hxs)

        doc.update(contact_dic)
        doc['shop_products'] = shop_products
        doc['shop_launch_time'] = shop_launch_time
        doc['shop_email'] = shop_email
        doc['shop_qq'] = shop_qq

        yield self.item_or_request(item)
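
`parse_qq_num` receives a selector for the whole contact region rather than text nodes, presumably so it can scan the area around a "QQ" label. A hedged sketch (the regex and the empty-string fallback are assumptions):

import re

def parse_qq_num(region_hxs):
    # Scan all text in the selected region for a 5-11 digit number
    # appearing shortly after "QQ".
    text = ''.join(region_hxs.select('.//text()').extract())
    match = re.search(r'QQ\D{0,10}(\d{5,11})', text)
    return match.group(1) if match else ''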
Example #5
    def parse_list_page(self, response):
        multi_xpath = '//li[@class="g"]'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        for hxs in multi_hxs:
            #//li[@class="g"][1]//h3/a/@href
            url = ''.join(hxs.select('.//h3/a/@href').extract())

            url = urllib.unquote(url).strip()
            doc = {
                'data_source': 'google专利搜索',
                'url': url,
            }
            detail_url = fix_possible_missing_scheme(url)
            list_url = response.url
            query = response.meta.get('query')
            if not detail_url:
                next_request = None
            else:
                # detail_url = detail_url.replace('_free', '')
                next_request = Request(detail_url, callback=self.parse_detail_page)
            item = PatentItem(doc=doc,
                              next_request=next_request, list_url=list_url, query=query,
                              attachments=[], attachment_urls=[])
            yield self.item_or_request(item)
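
`fix_possible_missing_scheme` guards against hrefs that `Request` would reject. A sketch, assuming plain-HTTP targets:

def fix_possible_missing_scheme(url):
    # Scheme-relative ('//example.com/x') and bare ('example.com/x')
    # hrefs are not valid Request URLs; default them to http.
    if not url:
        return url
    if url.startswith('//'):
        return 'http:' + url
    if '://' not in url:
        return 'http://' + url
    return url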
    def parse_list_page(self, response):
        """
        """
        multi_xpath = '//div[@class="border_b list_tiao"]'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            shop_name = "".join(hxs.select('.//div[@class="c_name"]/a//text()').extract())
            shop_name = clean_string(shop_name)
            shop_site_url = "".join(hxs.select('.//div[@class="c_name"]/a/@href').extract())
            shop_site_url = urllib.unquote(shop_site_url).strip()
            about_url = os.path.join(shop_site_url, "company")
            contact_url = os.path.join(shop_site_url, "contact")

            doc = {
                "shop_name": shop_name.strip(),
                "shop_site_url": shop_site_url.strip(),
                "about_url": about_url,
                "contact_url": contact_url,
            }

            query = response.meta["query"]
            list_url = response.url

            if not shop_site_url:
                next_request = None
            else:
                headers = {"referer": shop_site_url}
                next_request = Request(about_url, headers=headers, callback=self.parse_about_page)
            item = LegItem(
                collection=self.collection, doc=doc, next_request=next_request, list_url=list_url, query=query
            )
            yield self.item_or_request(item)
    def parse_list_page(self, response):
        multi_xpath = '//div[@class="shopListCon"]//tr'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            shop_name = ''.join(hxs.select('./td[2]//a//text()').extract())
            shop_name = clean_string(shop_name)
            shop_site_url = ''.join(hxs.select('./td[2]//a/@href').extract())
            shop_site_url = urllib.unquote(shop_site_url).strip()

            doc = {
                'shop_name': shop_name,
                'shop_site_url': shop_site_url,
            }

            detail_url = shop_site_url
            query = response.meta['query']
            list_url = response.url
            if not shop_site_url:
                next_request = None
            else:
                next_request = Request(detail_url, callback=self.parse_detail_page)
            item = LegItem(collection=self.collection, doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
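
Every callback first runs the response through `response_html5parse`. The likely idea, sketched here under the assumption that it uses html5lib with the lxml treebuilder:

import html5lib
from lxml import etree

def response_html5parse(response):
    # html5lib repairs tag soup the way browsers do; re-serializing the
    # tree gives HtmlXPathSelector a well-formed document to query.
    root = html5lib.parse(response.body_as_unicode(), treebuilder='lxml')
    return response.replace(body=etree.tostring(root, encoding='utf-8'))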
    def parse_about_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        shop_products_hxs = page_hxs.select('//div[@class="side_2"]//text()')
        junks = [
            '加入失败',
            '您的询盘篮内信息已满0条!',
            '达到信息添加上限',
            '加入成功',
            '已成功添加到询盘篮!',
            '您的询盘篮中共有0家公司的0个产品',
            '继续浏览',
        ]
        shop_products = parse_products(shop_products_hxs, junks=junks)

        doc['shop_products'] = shop_products.strip()

        contact_url = doc.get('contact_url', '')
        if not contact_url:
            next_request = None
        else:
            next_request = Request(contact_url, callback=self.parse_contact_page)
        item['next_request'] = next_request
        yield self.item_or_request(item)
    def parse_about_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        shop_products_hxs = page_hxs.select('//div[@class="content" or @class="m-content"]//text()')
        junks = [
            '当前位置:首页>企业简介',
            '温馨提示:绿色字体部分为已审核企业信息',
        ]
        shop_products = parse_products(shop_products_hxs, junks=junks)

        contact2_hxs = page_hxs.select('//div[@class="companyInfo"]//td//text()')
        contact2_dic = parse_contact2(contact2_hxs)

        doc.update(contact2_dic)
        doc['shop_products'] = shop_products

        contact_url = doc.get('contact_url', '')
        if not contact_url:
            next_request = None
        else:
            next_request = Request(contact_url, callback=self.parse_contact_page)
        item['next_request'] = next_request
        yield self.item_or_request(item)
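
Both `parse_about_page` variants feed text nodes plus a `junks` blacklist into `parse_products`. A rough sketch of that helper, assuming it returns one free-text description:

def parse_products(text_hxs, junks=()):
    # Join the about-page text nodes, dropping empty fragments and any
    # boilerplate strings listed in junks.
    texts = (t.strip() for t in text_hxs.extract())
    return ' '.join(t for t in texts if t and t not in junks)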
Example #10
    def parse_detail_page(self, response):
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        abstract = ''.join(hxs.select('//span[@id="txtAbstr"]//text()').extract())
        agent_institution = ''.join(hxs.select('//span[@id="tdANM"]//text()').extract())
        claims = ''.join(hxs.select('//span[@id="txtClaim"]//text()').extract())

        doc = item['doc']
        doc['abstract'] = abstract
        doc['agent_institution'] = agent_institution
        doc['claims'] = claims

        dic = doc['dic']
        pno = 'APP%s' % dic['StrANX']
        pdf_url = 'http://searchtel.patentstar.com.cn/CPRS2010/Docdb/GetBns.aspx?PNo=%s' % pno
        next_request = Request(pdf_url, callback=self.parse_pdf)
        item['next_request'] = next_request

        attachments = item['attachments']
        attach1 = {
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        yield self.item_or_request(item)
    def parse_list_page(self, response):
        multi_xpath = '//div[@class="xia_xuxian list_tiao"]'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            shop_name = ''.join(hxs.select('.//ul[@class="list_link"]/li/a//text()').extract())
            shop_name = clean_string(shop_name)
            shop_site_url = ''.join(hxs.select('.//ul[@class="list_link"]/li/a/@href').extract())
            shop_site_url = urllib.unquote(shop_site_url).strip()
            about_url = os.path.join(shop_site_url, 'profile.html')
            contact_url = os.path.join(shop_site_url, 'contact.html')

            doc = {
                'shop_name': shop_name,
                'shop_site_url': shop_site_url,
                'about_url': about_url,
                'contact_url': contact_url,
            }

            query = response.meta['query']
            list_url = response.url

            if not shop_site_url:
                next_request = None
            else:
                headers = {
                    'referer': shop_site_url
                }
                next_request = Request(about_url, headers=headers, callback=self.parse_about_page)
            item = LegItem(collection=self.collection, doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
Example #12
    def parse_list_page(self, response):
        multi_xpath = '//div[@id="module-list"]/dl'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        for hxs in multi_hxs:
            title = ''.join(hxs.select('./dt[@class="title"]/a//text()').extract())
            pub_time = ''.join(hxs.select('./dt[@class="title"]/span//text()').extract())
            overview = ''.join(hxs.select('./dd[@class="content"]//text()').extract())
            url = ''.join(hxs.select('./dt[@class="title"]/a/@href').extract())
            info_misc_hxs = hxs.select('./dd[@class="info"]//text()')
            author, site_name, view_count, reply_count = self._ana_info_misc(info_misc_hxs)
            url = urllib.unquote(url).strip()
            doc = {
                'data_source': '奇虎论坛搜索',
                'site_name': site_name,
                'title': title,
                'pub_time': pub_time,
                'overview': overview,
                'url': url,
                'author': author,
                'view_count': view_count,
                'reply_count': reply_count,
            }
            detail_url = url
            list_url = response.url
            query = response.meta.get('query')
            if not detail_url:
                next_request = None
            else:
                next_request = Request(detail_url, callback=self.parse_detail_page)
            item = BbsItem(doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
Example #13
    def parse_list_page(self, response):

        multi_xpath = '//div[@class="special-area-cont"]/div'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        query = response.meta.get('query')
        for hxs in multi_hxs:
            title = ''.join(hxs.select('.//h3//text()').extract()).strip()
            _video_name = ''.join(hxs.select('.//h3//span//font//text()').extract())
            if _video_name != query:
                continue
            url = ''.join(hxs.select('.//h3//a/@href').extract())
            _id = get_url_query(url)['id']
            doc = {
                'data_source': 'baidu视频搜索',
                'url': url,
                'title': title,
                'id': _id,
            }
            list_url = response.url
            json_list_url = 'http://video.baidu.com/htvshowsingles/?id=%s' % _id
            next_request = Request(json_list_url, callback=self.parse_site_list)
            item = VideoZjcmItem(doc=doc,
                                 next_request=next_request, list_url=list_url, query=query,
                                 attachments=[], attachment_urls=[])
            yield self.item_or_request(item)
Example #14
    def parse_list_page(self, response):
        multi_xpath = '//*[@id="r"]/table'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        for hxs in multi_hxs:
            site_name, pub_time = ''.join(hxs.select('.//nobr//text()').extract()).split(' ', 1)
            title = ''.join(hxs.select('.//span/b//text()').extract())
            overview = ''.join(hxs.select('.//font[@size="-1"]//text()').extract())
            url = ''.join(hxs.select('.//span/../@href').extract())
            url = urllib.unquote(url).strip()
            doc = {
                'data_source': '百度新闻搜索',
                'site_name': site_name,
                'pub_time': pub_time,
                'title': title,
                'overview': overview,
                'url': url,
            }
            detail_url = fix_possible_missing_scheme(url)
            list_url = response.url
            query = response.meta.get('query')
            if not detail_url:
                next_request = None
            else:
                next_request = Request(detail_url, callback=self.parse_detail_page)
            item = NewsItem(doc=doc,
                            next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
    def parse_list_page(self, response):
        """
        """
        multi_xpath = '//div[@class="tc_qytitle1"]'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            # extraction: pull the fields out of each result row
            shop_name = ''.join(hxs.select('./div/dl/dt/a//text()').extract())
            shop_name = clean_string(shop_name)
            shop_site_url = ''.join(hxs.select('./div/dl/dt/a/@href').extract())
            shop_site_url = urllib.unquote(shop_site_url).strip()
            detail_url = os.path.join(shop_site_url, 'clist--.html')

            doc = {
                'shop_name': shop_name.strip(),
                'shop_site_url': shop_site_url.strip(),
                'detail_url': detail_url,
            }

            query = response.meta['query']
            list_url = response.url

            if not shop_site_url:
                next_request = None
            else:
                headers = {
                    'referer': shop_site_url
                }
                next_request = Request(detail_url, headers=headers, callback=self.parse_detail_page)
            item = LegItem(collection=self.collection, doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
Example #16
    def parse_list_page(self, response):
        multi_xpath = '//div[@class="results"]/div'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        for hxs in multi_hxs:
            title = ''.join(hxs.select('./h3//text()').extract())
            overview = ''.join(hxs.select('./div[@class="ft"]//text()').extract()
                               + hxs.select('./table[@class="vrbox"]//text()').extract())
            url = ''.join(hxs.select('./h3/a/@href').extract())
            url = urllib.unquote(url).strip()
            info_misc_hxs = hxs.select('.//cite//text()')
            site_name, pub_time = self._ana_info_misc(info_misc_hxs)
            doc = {
                'data_source': '搜狗论坛搜索',
                'site_name': site_name,
                'title': title,
                'pub_time': pub_time,
                'overview': overview,
                'url': url,
            }
            detail_url = url
            list_url = response.url
            query = response.meta.get('query')
            if not detail_url:
                next_request = None
            else:
                next_request = Request(detail_url, callback=self.parse_detail_page)
            item = BbsItem(doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
Example #17
    def parse_detail_page(self, response):
        item = response.meta['item']

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts1 = hxs.select('//table[@class="tb"]//td//text()').extract()
        texts2 = hxs.select('//div[@class="t2"]//text()').extract()
        result_doc1 = blur_ana_patent(texts1)
        result_doc2 = blur_ana_patent(texts2)
        patent_name = ''.join(hxs.select('//div[@class="t1"]//text()').extract())
        abstract = ''.join(hxs.select('//div[@class="con2"]//text()').extract())

        doc = item['doc']
        doc.update(result_doc1)
        doc.update(result_doc2)
        doc['patent_name'] = patent_name
        doc['abstract'] = abstract
        # lstrip takes a character set, not a prefix; strip the label explicitly.
        application_number = doc['application_number'].lstrip('/ ')
        if application_number.startswith('专利号:'):
            application_number = application_number[len('专利号:'):].strip()
        doc['application_number'] = application_number
        attachments = item['attachments']
        attach1 = {
            'url': response.url,
            'data': response.body_as_unicode(),
            'mime_type': get_mime_type_in_response(response)
        }
        attachments.append(attach1)
        image_urls = get_image_urls(response)
        item['attachment_urls'] += image_urls
        yield self.item_or_request(item)
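
`get_image_urls` collects page images for the attachment queue. A sketch built from stock Scrapy/stdlib pieces (Python 2, matching the `urllib.unquote` calls above):

import urlparse

def get_image_urls(response):
    # Resolve every <img src> against the page URL so relative
    # attachment links become fetchable.
    hxs = HtmlXPathSelector(response)
    srcs = hxs.select('//img/@src').extract()
    return [urlparse.urljoin(response.url, src) for src in srcs]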
    def parse_list_page(self, response):
        multi_xpath = '//div[@class="jieguo"]'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            shop_name = ''.join(hxs.select('./div[2]/ul/li[1]/a//text()').extract())
            shop_name = clean_string(shop_name)
            shop_site_url = ''.join(hxs.select('./div[2]/ul/li[1]/a/@href').extract())
            shop_site_url = urllib.unquote(shop_site_url).strip()
            # Guard against short or empty URLs before indexing path segments.
            _shop_id = shop_site_url.split('/')[4] if shop_site_url.count('/') >= 4 else ''
            about_url = 'http://www.wjw.cn/companyprofile/%s/aboutus.xhtml' % _shop_id
            contact_url = 'http://www.wjw.cn/cardview/%s/card.xhtml' % _shop_id

            doc = {
                'shop_name': shop_name,
                'shop_site_url': shop_site_url,
                'about_url': about_url,
                'contact_url': contact_url,
            }

            query = response.meta['query']
            list_url = response.url

            if not _shop_id:
                next_request = None
            else:
                headers = {
                    'referer': shop_site_url
                }
                next_request = Request(about_url, headers=headers, callback=self.parse_about_page)
            item = LegItem(collection=self.collection, doc=doc,
                           next_request=next_request, list_url=list_url, query=query)
            yield self.item_or_request(item)
Example #19
    def parse_detail_page(self, response):
        item = response.meta["item"]

        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)

        texts = hxs.select('//table[@id="perildical2_dl"]//text()').extract()
        texts = clean_wanfang_texts(texts)
        result_doc = blur_ana_patent(texts)

        abstract = "".join(hxs.select('//div[@class="abstracts"]//text()').extract())

        doc = item["doc"]
        doc.update(result_doc)
        doc["abstract"] = abstract
        attachments = item["attachments"]
        attach1 = {
            "url": response.url,
            "data": response.body_as_unicode(),
            "mime_type": get_mime_type_in_response(response),
        }
        attachments.append(attach1)
        image_urls = get_image_urls(response)
        item["attachment_urls"] += image_urls
        # more_url = response.url.replace('_free', '')
        # next_request = Request(more_url, callback=self.parse_more_page)
        # item['next_request'] = next_request

        # hotfix for patent_type
        patent_type = "".join(hxs.select('//th[contains(.//text(),"专利类型")]/../td//text()').extract())
        doc["patent_type"] = patent_type

        yield self.item_or_request(item)
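
`get_mime_type_in_response` fills each attachment's `mime_type`; presumably it just reads the Content-Type header. A sketch:

def get_mime_type_in_response(response):
    # Strip any "; charset=..." suffix from the Content-Type header.
    content_type = response.headers.get('Content-Type', 'text/html')
    return content_type.split(';')[0].strip()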
Example #20
    def parse_list_page(self, response):
        multi_xpath = '//div[@class="PatentBlock"]'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)

        for hxs in multi_hxs:
            url = ''.join(hxs.select('./div[2]/h2/a/@href').extract())
            url = 'http://www2.soopat.com%s' % url
            url = urllib.unquote(url).strip()
            doc = {
                'data_source': 'soopat中国专利搜索',
                'url': url,
            }
            detail_url = url
            list_url = response.url
            query = response.meta.get('query')
            if not detail_url:
                next_request = None
            else:
                next_request = Request(detail_url, callback=self.parse_detail_page)
            item = PatentItem(doc=doc,
                              next_request=next_request, list_url=list_url, query=query,
                              attachments=[], attachment_urls=[])
            yield self.item_or_request(item)
    def parse_contact_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact_hxs = page_hxs.select('//div[@class="contactInfo" or @class="contact-detail"]//text()')
        contact_dic = parse_contact(contact_hxs)
        doc.update(contact_dic)

        yield self.item_or_request(item)
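
`parse_contact` turns loose "label: value" text nodes into doc fields. A sketch; the label-to-field mapping is illustrative, not the project's real schema:

def parse_contact(text_hxs):
    label_map = {
        u'联系人': 'shop_contacts',
        u'电话': 'shop_phone',
        u'传真': 'shop_fax',
        u'地址': 'shop_address',
    }
    doc = {}
    for line in text_hxs.extract():
        line = line.strip()
        # Split on the first full-width or ASCII colon.
        for sep in (u'：', u':'):
            if sep not in line:
                continue
            label, value = line.split(sep, 1)
            field = label_map.get(label.strip())
            if field and value.strip():
                doc[field] = value.strip()
            break
    return doc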
Example #22
    def parse_pdf(self, response):
        item = response.meta['item']
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        # .extract() was missing here: joining selector objects instead of
        # the extracted strings would fail at runtime.
        pdf_url = ''.join(hxs.select('//a[text()="公开文本"]/@href').extract())
        if pdf_url:
            item['attachment_urls'].append(pdf_url)
        doc = item['doc']
        dic = doc['dic']
        idx = dic['StrANX']
        images_url = 'http://searchtel.patentstar.com.cn/CPRS2010/comm/getzhaiyao.aspx?idx=%s' % idx
        next_request = Request(images_url, callback=self.parse_images)
        item['next_request'] = next_request
        yield self.item_or_request(item)
    def parse_contact_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact_hxs = page_hxs.select('//table//tr//text()')
        contact_dic = parse_contact(contact_hxs)

        shop_contacts = ''.join(page_hxs.select('//div[@class="card-detail"]/h3/a//text()').extract())
        shop_contacts = shop_contacts.strip()

        doc['shop_contacts'] = shop_contacts
        doc.update(contact_dic)
        yield self.item_or_request(item)
    def parse_contact_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact2_hxs = page_hxs.select('//div[@class="contact"][1]//div//text()')
        contact2_dic = parse_contact2(contact2_hxs)

        qq_hxs = page_hxs.select('//div[@class="contact"][1]')
        shop_qq = parse_qq_num(qq_hxs)

        doc.update(contact2_dic)
        doc['shop_qq'] = shop_qq

        yield self.item_or_request(item)
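
`parse_contact2` shows up wherever contact details render as two-column tables, so labels and values likely arrive as alternating text nodes. A hedged sketch, with the same caveat about field names:

def parse_contact2(text_hxs):
    label_map = {u'联系人': 'shop_contacts', u'电话': 'shop_phone',
                 u'地址': 'shop_address', u'邮编': 'shop_zipcode'}
    texts = [t.strip() for t in text_hxs.extract() if t.strip()]
    doc = {}
    # Pair alternating label/value cells.
    for label, value in zip(texts[::2], texts[1::2]):
        field = label_map.get(label.rstrip(u':：'))
        if field:
            doc[field] = value
    return doc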
    def parse_contact_page(self, response):
        item = response.meta["item"]
        doc = item["doc"]
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact_hxs = page_hxs.select('//div[@id="b740"]/div/dl//text()')
        contact_dic = parse_contact(contact_hxs)

        qq_hxs = page_hxs.select('//div[@id="b740"]')
        shop_qq = parse_qq_num(qq_hxs)

        doc.update(contact_dic)
        doc["shop_qq"] = shop_qq

        yield self.item_or_request(item)
    def parse_detail_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact_hxs = page_hxs.select('//div[@id="contrt"]/div[1]/dl/dd[1]/table//td//text()')
        contact_dic = parse_contact(contact_hxs)

        shop_products_hxs = page_hxs.select('//div[@id="tdsub_1"]//text()')
        shop_products_hxs = shop_products_hxs[3:]
        shop_products = parse_products(shop_products_hxs)

        doc.update(contact_dic)
        doc['shop_products'] = shop_products.strip()

        yield self.item_or_request(item)
    def parse_contact_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact_hxs = page_hxs.select('//div[@class="contact"]//text()')
        contact_dic = parse_contact(contact_hxs)

        shop_contacts = ''.join(page_hxs.select('/html/body/div[9]/div[2]/div/div[2]/strong[2]//text()').extract())
        shop_contacts = shop_contacts.strip()

        doc.update(contact_dic)
        if shop_contacts:
            doc['shop_contacts'] = shop_contacts

        yield self.item_or_request(item)
Example #28
    def parse_contact_page(self, response):
        item = response.meta['item']
        doc = item['doc']

        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact_dic = {}
        contact1_hxs = page_hxs.select('//div[@class="com_contact"]/text()')
        contact1_dic = parse_contact1(contact1_hxs)
        contact2_hxs = page_hxs.select('//div[@class="com_contact"]//span//text()')
        contact2_dic = parse_contact2(contact2_hxs)

        contact_dic.update(contact2_dic)
        contact_dic.update(contact1_dic)

        doc.update(contact_dic)

        yield self.item_or_request(item)
    def parse_contact_page(self, response):
        item = response.meta['item']
        doc = item['doc']
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)

        contact1_hxs = page_hxs.select('//div[@class="MpBox MpBoxBg"]//p//text()')
        contact1_dic = parse_contact1(contact1_hxs)

        shop_contacts = ''.join(page_hxs.select('//div[@class="MpBox MpBoxBg"]/p[1]//text()').extract())
        shop_contacts = clean_string(shop_contacts)
        qq_hxs = page_hxs.select('//div[@class="MainLeftBox1"]')
        shop_qq = parse_qq_num(qq_hxs)

        doc.update(contact1_dic)
        doc['shop_contacts'] = shop_contacts
        doc['shop_qq'] = shop_qq

        yield self.item_or_request(item)
    def get_next_page_request(self, response):
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        if not self._total_page:
            total_page = ''.join(page_hxs.select('//div[@id="tc_menu"]/div[2]/span[2]//text()').extract())
            try:
                self._total_page = int(total_page)
            except ValueError:
                self._total_page = 0
        sub_url = ''.join(page_hxs.select('//div[@id="tc_menu"]/div[2]/a[1]/@href').extract())
        base_url = 'http://china.chemnet.com/company'
        page_num = response.meta['page_num']
        if page_num < self._total_page - 1:
            p = page_num + 1
            url = os.path.join(base_url, sub_url)
            # fast hack: drop the page marker from the scraped href, then
            # append the page number we actually want.
            url = url.replace(';p=', '', 1)
            url += ';p=%d' % p
            # Carry page_num in meta so the next response can read it back
            # (without this, response.meta['page_num'] above would KeyError).
            return Request(url, meta={'page_num': p}, callback=self.query_callback)
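
For this pagination to work, whatever issues the first list request must seed `page_num` in meta, and `query_callback` must both parse the current page and chase the next one. A sketch of that wiring (the flow, not the project's actual method):

    def query_callback(self, response):
        # Emit items/requests from the current list page, then queue the
        # next page; get_next_page_request reads response.meta['page_num'].
        for item_or_req in self.parse_list_page(response):
            yield item_or_req
        next_request = self.get_next_page_request(response)
        if next_request:
            yield next_request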