def parse(self, response):
     url = response.url
     if "research" in url:
         categories = response.xpath(".//*[@class='catec']")
     for i in xrange(len(categories) - 1):  # last catec block skipped, presumably not a real category
             large_categories = categories[i].xpath(".//*[@class='fl']")
             large_category_name = clean_text(
                 large_categories.xpath(".//text()").extract()[0].strip())
             mid_categories = categories[i].xpath(".//span")
             for mid_category in mid_categories:
                 mid_category_name = clean_text(
                     mid_category.xpath(".//text()").extract()[0].strip())
                 page_url = mid_category.xpath(".//@href").extract()[0]
                 request = FormRequest(page_url,
                                       callback=self._parse_page_research,
                                       dont_filter=True)
                 request.meta["large_category"] = large_category_name
                 request.meta["mid_category"] = mid_category_name
                 request.meta["first_url"] = page_url
                 yield request
     elif "free" in url:
         large_categories = response.xpath(".//*[@class='tul2']//h2//a")
         for i in xrange(len(large_categories)):
             large_category_name = clean_text(large_categories[i].xpath(
                 ".//text()").extract()[0].strip())
             page_url = large_categories[i].xpath("./@href").extract()[0]
             request = FormRequest(page_url,
                                   callback=self._parse_page_free,
                                   dont_filter=True)
             request.meta["large_category"] = large_category_name
             request.meta["first_url"] = page_url
             yield request
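
Every example in this listing leans on a clean_text helper that is never shown. A minimal sketch of what it plausibly does, assuming it only normalizes whitespace (the real project helper may do more):

import re

def clean_text(text):
    # Hypothetical stand-in for the project's helper: collapse runs of
    # whitespace (including newlines) and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()
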
 def _parse_research(self, response):
     reports = response.xpath(".//*[@id='ulNewsList']//li")
     if len(reports) > 0:
         for report in reports:
             item = IndustryReportSpiderItem()
             item["industry_large_category"] = response.meta[
                 "large_category"]
             item["industry_mid_category"] = response.meta["mid_category"]
             item["report_name"] = clean_text(
                 report.xpath(".//dt//text()").extract()[0].strip())
             industry_small_chs_name = parseIndustryName(
                 item["report_name"])
             if industry_small_chs_name is not None:
                 item["industry_small_chs_name"] = industry_small_chs_name
             page_url = report.xpath(".//@href").extract()[0]
             item["report_link"] = page_url
             report_time = clean_text(
                 report.xpath(".//*[@class='time']").extract()[0].strip())
             if len(report_time) > 0:
                 item["report_revision_time"] = report_time.split(u":")[1]
                 date, date_precision = parse_date(
                     item["report_revision_time"])
                 try:
                     item["report_revision_time_standard"] = date.replace(
                         tzinfo=pytz.timezone('Asia/Shanghai'))
                 except:
                     pass
             item["source_domain"] = self.allowed_domains[0]
             item["source_name"] = u"中国产业洞察网"
             item["price_free"] = False
             yield item
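
A caveat on the date handling above, which recurs throughout these examples: with pytz, datetime.replace(tzinfo=pytz.timezone('Asia/Shanghai')) attaches the zone's base LMT offset (+08:06) rather than the expected +08:00. The documented pytz idiom is localize(). A small sketch:

import pytz
from datetime import datetime

shanghai = pytz.timezone('Asia/Shanghai')
naive = datetime(2015, 6, 1)
# replace() picks the zone's first (LMT) offset; localize() applies the correct one.
aware = shanghai.localize(naive)   # 2015-06-01 00:00:00+08:00
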
Example #3
    def _parse_item(self, response):
        domain_url = "http://www.chinairn.com/"
        reports = response.xpath("//p[@class='maintittle']")
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(report.xpath(".//text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:  # call once, reuse the result
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(domain_url, page_url)
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国行业研究网"
            try:
                self.report_para(item, report)
            except:
                log.msg("Report revision time missed: %s"%item["report_link"], level=log.WARNING)
            item["price_free"] = False
            yield item

        current_page = int(clean_text(response.xpath(".//*[@class='hover']/text()").extract()[0]))
        if Page_Limit > 0 and current_page > Page_Limit:  # Page_Limit: assumed module-level crawl cap
            return

        nextPage = response.xpath("//a[contains(@class,'down')]")[0]
        lastPageurl = nextPage.xpath("./following-sibling::a[1]/@href").extract()[0]
        nextPageurl = nextPage.xpath("./@href").extract()[0]
        if lastPageurl != nextPageurl:
            url = urljoin(self.base_url, nextPageurl)
            request = FormRequest(url, callback=self._parse_item, dont_filter=True)
            request.meta["large_category"] = response.meta["large_category"]
            request.meta["mid_category"] = response.meta["mid_category"]
            yield request
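
A note on the request class used throughout this listing: FormRequest with no formdata argument issues a plain GET, so these spiders could equally use scrapy.Request. The two calls below are interchangeable (url and cb are placeholders):

from scrapy.http import FormRequest, Request

url, cb = "http://example.com/list.html", lambda response: None
# Without formdata, FormRequest falls through to Request's default GET.
r1 = FormRequest(url, callback=cb, dont_filter=True)
r2 = Request(url, callback=cb, dont_filter=True)
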
Example #4
 def parse_item(self, response):
     item = IndustryReportSpiderItem()
     item['report_link'] = response.url
     item['source_name'] = u"艾瑞网"
     item['source_domain'] = self.allowed_domains[0]
     item['report_name'] = clean_text(response.xpath("//*[@class='content_title']/text()").extract()[0].strip())
     price = response.xpath(u"//*[contains(text(), '价格')]/text()").extract()[0]
     item['price_free'] = u"免费" in price  # u"免费" = "free of charge"
     infodatas = response.xpath("//*[@class='content_titleinfoa']/span//text()").extract()
     for text in infodatas:
         try:
             if u"页数" in text:  # u"页数" = page count
                 item['report_page_count'] = re.findall(ur'([0-9]+)', text)[0]
         except:
             pass
         try:
             if u"图表" in text:  # u"图表" = chart count
                 item['report_graph_count'] = re.findall(ur'([0-9]+)', text)[0]
         except:
             pass
         try:
             if u"-" in text:  # a dash marks the revision-date field
                 item['report_revision_time'] = text
                 # parse_date returns (date, precision) in the other examples;
                 # unpack it here too instead of storing the whole tuple
                 date, date_precision = parse_date(item['report_revision_time'])
                 item['report_revision_time_standard'] = date
         except:
             pass
     item['industry_large_category'] = u"信息传输、软件和信息技术服务业"
     try:
         item['industry_mid_category'] = clean_text(response.xpath("//*[@class='content_titleinfoa']//a/text()").extract()[0].strip())
     except:
         pass
     # if item['price_free']:
         # self.browser.get(response.url)
         # self.browser.find_element_by_xpath("//*[@class='download']/a").click()
         # WebDriverWait(self.browser, 20).until(EC.presence_of_element_located((By.XPATH, ".//*[@id='ButtonBox']/input")))
         # Confirm = self.browser.find_element_by_xpath(".//*[@id='ButtonBox']/input")
         # Confirm.click()
         # WebDriverWait(self.browser, 20).until(EC.staleness_of(Confirm))
         # if ".pdf" in self.browser.current_url:item['pdf_Link'] = self.browser.current_url
     return item
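
The IndustryReportSpiderItem class itself never appears in the listing. Reconstructed from the fields these examples populate, it would look roughly like this (sketch only; the real definition may carry more fields):

import scrapy

class IndustryReportSpiderItem(scrapy.Item):
    # Fields inferred from the assignments in the surrounding examples.
    industry_large_category = scrapy.Field()
    industry_mid_category = scrapy.Field()
    industry_small_chs_name = scrapy.Field()
    report_name = scrapy.Field()
    report_type = scrapy.Field()
    report_link = scrapy.Field()
    report_content = scrapy.Field()
    report_revision_time = scrapy.Field()
    report_revision_time_standard = scrapy.Field()
    report_page_count = scrapy.Field()
    report_graph_count = scrapy.Field()
    source_domain = scrapy.Field()
    source_name = scrapy.Field()
    price_free = scrapy.Field()
    pdf_Link = scrapy.Field()
    content_Link = scrapy.Field()
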
Example #5
    def _parse_hg(self, response):
        reports = response.xpath(".//*[@class='yahei f14']")
        if len(reports) > 0:
            for report in reports:
                item = IndustryReportSpiderItem()
                item["industry_large_category"] = response.meta[
                    "large_category"]
                item["report_name"] = clean_text(
                    report.xpath(".//a/text()").extract()[0].strip())
                page_url = report.xpath(".//a//@href").extract()[0]
                item["report_link"] = page_url
                report_time = clean_text(
                    report.xpath(".//*[@name='deliveddate']/text()").extract()[0].strip())
                if report_time:  # clean_text yields a string; test for emptiness, not None
                    item["report_revision_time"] = report_time
                    # parse only when a time was found, so a missing date can
                    # no longer raise KeyError below
                    date, date_precision = parse_date(report_time)
                    item["report_revision_time_standard"] = date.replace(
                        tzinfo=pytz.timezone('Asia/Shanghai'))
                item["source_domain"] = self.allowed_domains[0]
                item["source_name"] = u"国研网"

                content = self.parseContent(page_url)  # renamed: "dict" shadows the builtin
                if not content["free"]:
                    item["price_free"] = False
                else:
                    item["price_free"] = True
                    if content["url"][0] == "pdf":
                        item["pdf_Link"] = content["url"][1]
                    else:
                        item["content_Link"] = content["url"][1]
                yield item
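
parseContent is not shown anywhere in the listing; from the branch above it must return a mapping with a free flag and a (kind, link) pair. A hypothetical method stub matching that usage:

def parseContent(self, page_url):
    # Hypothetical stub: the real method presumably fetches page_url and
    # inspects it for a downloadable report before shaping this dict.
    return {"free": True, "url": ("pdf", page_url + ".pdf")}
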
 def _parse_free(self, response):
     reports = response.xpath(".//*[@class='tul3']//li")
      if len(reports) > 0:
         for report in reports:
             item = IndustryReportSpiderItem()
             item["industry_large_category"] = response.meta["large_category"]
             item["report_name"] = clean_text(report.xpath(".//a//text()").extract()[0].strip())
             industry_small_chs_name = parseIndustryName(item["report_name"])
              if industry_small_chs_name is not None:
                 item["industry_small_chs_name"] = industry_small_chs_name
             page_url = report.xpath(".//@href").extract()[0]
             item["report_link"] = page_url
             report_content = self.parseTimeContent(page_url)
              if report_content is not None:
                 item["report_content"] = report_content
             report_time = clean_text(report.xpath(".//span//text()").extract()[0].strip())
              if len(report_time) > 0:
                 item["report_revision_time"] = report_time
                 date, date_precision = parse_date(item["report_revision_time"])
                 try:
                     item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
                 except:
                     pass
             item["source_domain"] = self.allowed_domains[0]
             item["source_name"] = u"中国产业洞察网"
             item["price_free"] = True
             yield item
Example #10
 def parse(self, response):
     large_categories = response.xpath("//*[@class='tabContent bluelink']//*[contains(@style, 'padding')]/a")
     for large_category in large_categories:
         large_category_name = clean_text(large_category.xpath(".//text()").extract()[0].strip())
         mid_categories = large_category.xpath("./parent::*/following-sibling::*[1]/a")
         for mid_category in mid_categories:
             mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0])
             mid_category_url = urljoin(self.base_url, mid_category.xpath("./@href").extract()[0])
             request = FormRequest(mid_category_url, callback=self.parse_middle_category, dont_filter=True)
             request.meta["large_category_name"] = large_category_name
             request.meta["mid_category_name"] = mid_category_name
             yield request
Example #11
 def parse(self, response):
     large_categories = response.xpath(".//*[@class='shopleft_bt']//a")
     middle_categories = response.xpath(".//*[@class='shopnav2']")
     for i in xrange(len(large_categories)):
         large_category_name = clean_text(large_categories[i].xpath("./text()").extract()[0].strip())
         middle_category_list = middle_categories[i].xpath(".//*[@class='shopleft_wt']")
         for middle_category in middle_category_list:
             middle_category_name = clean_text(middle_category.xpath(".//a/text()").extract()[0].strip())  # take the first link's text, as the other examples do
             page_url = middle_category.xpath(".//a//@href").extract()[0]
             url = urljoin(self.base_url, page_url)
             request = FormRequest(url, callback=self._parse_item, dont_filter=True)
             request.meta["large_category"] = large_category_name
             request.meta["mid_category"] = middle_category_name
             yield request
Example #12
    def _parse_item(self, response):
        reports = response.xpath(".//*[@class='info']")
        if len(reports) > 0:
            for report in reports:
                item = IndustryReportSpiderItem()
                item["industry_large_category"] = response.meta[
                    "large_category"]
                item["industry_mid_category"] = response.meta["mid_category"]
                item["report_name"] = clean_text(
                    report.xpath(".//h3//a/text()").extract()[0].strip())
                industry_small_chs_name = parseIndustryName(
                    item["report_name"])
                if industry_small_chs_name is not None:
                    item["industry_small_chs_name"] = industry_small_chs_name
                page_url = report.xpath(".//@href").extract()[0]
                url = urljoin(self.base_url, page_url)
                item["report_link"] = url
                # ".//" (not a leading " //") keeps the query scoped to this
                # report; an absolute "//" would search the whole document
                string = clean_text(
                    report.xpath(".//*[@class='rdate']//span/text()").extract()[0].strip())
                temp = self.parseItem(string)
                if len(temp) == 1:
                    item["report_revision_time"] = temp[0][0]
                    item["report_page_count"] = temp[0][1]
                    item["report_graph_count"] = temp[0][2]
                    date, date_precision = parse_date(
                        item["report_revision_time"])
                    item["report_revision_time_standard"] = date.replace(
                        tzinfo=pytz.timezone('Asia/Shanghai'))

                item["source_domain"] = self.allowed_domains[0]
                item["source_name"] = u"中国投资咨询网"
                item["price_free"] = False
                yield item

            if_nextpage = response.xpath(".//*[@class='zw']")

            if len(if_nextpage) > 0:
                if if_nextpage.xpath(".//text()").extract()[-1] == u'下一页':  # u'下一页' = "next page": more pages exist
                    page_url = if_nextpage.xpath(".//@href").extract()[-1]
                    url = urljoin(self.base_url, page_url)
                    request = FormRequest(url,
                                          callback=self._parse_item,
                                          dont_filter=True)
                    request.meta["large_category"] = response.meta[
                        "large_category"]
                    request.meta["mid_category"] = response.meta[
                        "mid_category"]
                    yield request
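
parseItem is another unshown spider method; judging by the temp[0][0..2] unpacking above, it runs a three-group findall over the rdate caption. A hypothetical reconstruction, assuming the caption carries date, page count, and chart count in that order (the exact wording is a guess):

import re

# Illustrative only: assumes captions like u"出版日期：2015年6月 页数：350 图表：120".
pattern_item = re.compile(u'(\\d{4}\\S*)\\D+(\\d+)\\D+(\\d+)')

def parseItem(string):
    # findall returns e.g. [(u'2015年6月', u'350', u'120')],
    # matching the temp[0][0], temp[0][1], temp[0][2] reads above.
    return pattern_item.findall(string)
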
Example #13
    def _parse_item(self, response):
        reports = response.xpath(".//*[@class='img_des']/a")
        if len(reports) > 0:
            for report in reports:
                item = IndustryReportSpiderItem()
                item["industry_large_category"] = response.meta[
                    "large_category"]
                item["industry_mid_category"] = response.meta["mid_category"]
                item["report_name"] = clean_text(
                    report.xpath("./text()").extract()[0].strip())
                industry_small_chs_name = parseIndustryName(
                    item["report_name"])
                if industry_small_chs_name is not None:
                    item["industry_small_chs_name"] = industry_small_chs_name
                page_url = report.xpath(".//@href").extract()[0]
                url = urljoin(self.base_url, page_url)
                item["report_link"] = url
                report_time = self.parseTime(item["report_link"])
                if report_time is not None:
                    item["report_revision_time"] = report_time
                    # parse inside the guard, so a missing time no longer
                    # raises KeyError on report_revision_time
                    date, date_precision = parse_date(report_time)
                    item["report_revision_time_standard"] = date.replace(
                        tzinfo=pytz.timezone('Asia/Shanghai'))
                item["source_domain"] = self.allowed_domains[0]
                item["source_name"] = u"欧咨网"
                item["price_free"] = False
                yield item

            if len(response.xpath(".//*[@class='page']//@href")) > 1:  #存在翻页
                page_len = clean_text(
                    response.xpath(
                        ".//*[@class='page']//*[@class='fl_l']/text()").
                    extract()[0].strip())
                nextPageurl = response.xpath(
                    ".//*[@class='page']//@href").extract()[-1]
                finds = self.pattern_page.findall(page_len)
                currentPage = finds[0][0]
                totalPage = finds[0][1]
                if currentPage != totalPage:
                    url = urljoin(self.base_url, nextPageurl)
                    request = FormRequest(url,
                                          callback=self._parse_item,
                                          dont_filter=True)
                    request.meta["large_category"] = response.meta[
                        "large_category"]
                    request.meta["mid_category"] = response.meta[
                        "mid_category"]
                    yield request
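
pattern_page, used here and again in _parse_firstPage further down, is also never defined in the listing. The finds[0][0] / finds[0][1] access implies a two-group current/total pattern over a pager caption; a hypothetical stand-in, assuming text like u"第1页/共12页":

import re

# Hypothetical: grabs the first two numbers in the pager caption,
# e.g. u"第1页/共12页" -> [(u'1', u'12')].
pattern_page = re.compile(u'(\\d+)\\D+(\\d+)')
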
Example #14
 def report_para(self, item, report):
     revision_time = clean_text(report.xpath("..//*[@class='sp1']/text()").extract()[0].split(u":")[1].strip())
     if self.pattern.match(revision_time):
         item["report_revision_time"] = revision_time
     else:
         textlst = report.xpath("../*[@class='main']/text()").extract()[0].replace(u"】 ", u"【").split(u"【")
          for idx in range(len(textlst) - 1):  # stop one early: textlst[idx + 1] is read below
              if textlst[idx].endswith(u"日期"):  # u"日期" = "date"
                  item["report_revision_time"] = clean_text(textlst[idx + 1].strip())
                  break
     try:
         date, date_precision = parse_date(item["report_revision_time"])
         dateTimezone = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
         item["report_revision_time_standard"] = dateTimezone
     except:
         pass
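
parse_date, called all through this listing, returns a (datetime, precision) pair. A simplified stand-in, assuming it only needs to understand plain year-month-day strings (the real helper evidently also copes with Chinese and coarser date formats):

from datetime import datetime

def parse_date(text):
    # Simplified stand-in: reports how precise the parsed string was
    # alongside the datetime, mirroring the (date, date_precision) unpacking.
    return datetime.strptime(text.strip(), "%Y-%m-%d"), "day"
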
 def parse(self, response):
     large_categories = response.xpath(".//*[@id='cateitems']//h3//a")
     for large_category in large_categories:
         large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip())
         page_url = large_category.xpath("./@href").extract()[0]
         request = FormRequest(page_url, callback=self.parse_middle_category, dont_filter=True)
         request.meta["large_category"] = large_category_name
         yield request
Example #16
 def parse(self, response):
     large_categories = response.xpath(".//*[@class='rptmap']//strong//a")
     for large_category in large_categories:
         large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip())
         page_url = large_category.xpath("./@href").extract()[0]
         url = urljoin(self.base_url, page_url)
         request = FormRequest(url, callback=self.parse_middle_category, dont_filter=True)
         request.meta["large_category"] = large_category_name
         yield request
Example #17
 def parse_middle_category(self, response):
      report_types = response.xpath(u"//li[contains(text(),'报告')]")  # u'报告' = "report"
     for report_type in report_types:
         mid_category_url = urljoin(self.base_url, report_type.xpath(u"./preceding-sibling::span[1]/a/@href").extract()[0])
         request = FormRequest(mid_category_url, callback=self.parse_page, dont_filter=True)
         request.meta["large_category_name"] = response.meta["large_category_name"]
         request.meta["mid_category_name"] = response.meta["mid_category_name"]
         request.meta["report_type"] = clean_text(report_type.xpath("./text()").extract()[0].strip())
         request.meta["page_base_url"] = mid_category_url
         yield request
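
The preceding-sibling axis above assumes markup where each category link sits in a span directly before its run of report-type entries, something like this (hypothetical):

# Assumed markup:
#   <span><a href="/energy/">能源</a></span>
#   <li>行业报告</li>
#   <li>市场报告</li>
# preceding-sibling::span[1] walks back from each <li> to the nearest span.
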
Example #19
 def parse(self, response):
     large_categories = response.xpath("//a[contains(@id, 'xTrade')][not(contains(@id, 'All'))]")
     for large_category in large_categories:
         large_category_name = clean_text(large_category.xpath("./text()").extract()[0].strip())
          link_id = large_category.xpath("./@id").extract()[0]  # renamed: "id" shadows the builtin
          if link_id[-1] == u'l':
              continue
         page_url = large_category.xpath("./@href").extract()[0]
         url = urljoin(self.base_url, page_url)
         request = FormRequest(url, callback=self.parse_middle_category, dont_filter=True)
         request.meta["large_category"] = large_category_name
         yield request
Example #20
 def parse_middle_category(self, response):
     mid_categories = response.xpath(".//*[@class='report2']//h2//a")
     for mid_category in mid_categories:
         mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0].strip())
         page_url = mid_category.xpath("./@href").extract()[0]
         url = urljoin(self.base_url, page_url)
         request = FormRequest(url, callback=self._parse_item, dont_filter=True)
         request.meta["large_category"] = response.meta["large_category"]
         request.meta["mid_category"] = mid_category_name
         request.meta["first_url"] = url
         yield request
Example #22
 def parse_item(self, response):
     item = IndustryReportSpiderItem()
     item["industry_large_category"] = response.meta["large_category_name"]
     item["industry_mid_category"] = response.meta["mid_category_name"]
     item["report_name"] = clean_text(response.xpath("//h1/text()").extract()[0].strip())
     item["report_type"] = response.meta["report_type"]
     item["industry_small_chs_name"] = parseIndustryName(item["report_name"])
     item["price_free"] = self._parse_price(response)
     item["report_link"] = response.url
     item["source_domain"] = self.base_url
     item["source_name"] = u"中国产业发展研究网"
     yield item
 def _parse_page_free(self, response):
     total_pages = int(clean_text(response.xpath(".//*[@class='pages']//a//text()").extract()[-2].strip()))
     first_url = response.meta["first_url"]
     request = FormRequest(first_url, callback=self._parse_free, dont_filter=True)
     request.meta["large_category"] = response.meta["large_category"]
     yield request
      if total_pages > 1:
          for i in xrange(1, total_pages):
              next_page = first_url[:-5] + '-p' + str(i + 1) + '.html'
             request = FormRequest(next_page, callback=self._parse_free, dont_filter=True)
             request.meta["large_category"] = response.meta["large_category"]
             yield request
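
The slicing above rewrites the list URL for each page; for example (URL shape assumed):

# first_url[:-5] drops the ".html" suffix, so a hypothetical
#   http://example/free/list.html       covers page 1 (first_url itself),
#   http://example/free/list-p2.html    page 2,
#   http://example/free/list-p3.html    page 3, and so on.
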
Example #24
 def parse_middle_category(self, response):
     mid_categories = response.xpath(".//*[@id='catgory_container']//a")
     for mid_category in mid_categories:
         mid_category_name = clean_text(mid_category.xpath("./text()").extract()[0].strip())
         page_url = mid_category.xpath("./@href").extract()[0]
          if (mid_category_name != u'不限') and ("report" in page_url):  # u'不限' = "no limit": skip the catch-all link
             url = urljoin(self.base_url, page_url)
             request = FormRequest(url, callback=self._parse_firstPage, dont_filter=True)
             request.meta["large_category"] = response.meta["large_category"]
             request.meta["mid_category"] = mid_category_name
             request.meta["first_url"] = url
             yield request
Example #27
    def parse_index_page(self, response):
        self.current_page += 1
        industry = response.xpath('//*[@id="DataList1"]//a')
        for r in industry:
            industry_small_chs_names = clean_text(r.xpath('./text()').extract()[0].strip())
            self.industryList.append(industry_small_chs_names.encode("utf-8"))

        if self.current_page % 30 == 0:
            self.f.write("\n".join(self.industryList))
            self.f.write("\n")
            print "!" * 50, "写入文件", "!" * 50
            self.industryList = []
        print "*" * 30, response.meta["page"], "*" * 30
        print "*" * 30, "第", self.current_page, "页", "*" * 30
Example #29
 def _parse_item(self, response):
     reports = response.xpath(".//*[@class='clistdl']")
     for report in reports:
         item = IndustryReportSpiderItem()
         item["industry_large_category"] = response.meta["large_category"]
         item["industry_mid_category"] = response.meta["mid_category"]
         item["report_name"] = clean_text(report.xpath(".//dt/a/text()").extract()[0].strip())
         if len(report.xpath(".//dd//*[@class='cxgrep']//@title"))>0:
             industry = clean_text(report.xpath(".//dd//*[@class='cxgrep']//@title").extract()[0].strip())
         else:
             industry = item["report_name"]
         industry_small_chs_name = parseIndustryName(industry)
          if industry_small_chs_name is not None:
              item["industry_small_chs_name"] = industry_small_chs_name
         page_url = report.xpath(".//@href").extract()[0]
         item["report_link"] = page_url
         item["report_revision_time"] = clean_text(report.xpath(".//dt/span/text()").extract()[0].strip())
         item["source_domain"] = self.allowed_domains[0]
         item["source_name"] = u"中国报告大厅"
         date, date_precision = parse_date(item["report_revision_time"])
         item["report_revision_time_standard"] = date.replace(tzinfo=pytz.timezone('Asia/Shanghai'))
         item["price_free"] = False
         yield item
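
parseIndustryName, the last of the shared helpers, maps a report title (or category caption) to a fine-grained industry name and returns None when nothing matches. A hypothetical stub:

INDUSTRY_KEYWORDS = (u"汽车", u"医药", u"房地产")  # illustrative subset only

def parseIndustryName(name):
    # Hypothetical stub: the real helper presumably matches against a full
    # industry vocabulary rather than this tiny keyword list.
    for keyword in INDUSTRY_KEYWORDS:
        if keyword in name:
            return keyword
    return None
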
Example #31
 def _parse_hy_large(self, response):
     large_categories = response.xpath(".//*[@class='yahei f16 fB']")
     for large_category in large_categories:
         large_category_name = clean_text(
             large_category.xpath(".//text()").extract()[0].strip())
         if u"区域重点行业中小企业季报" not in large_category_name:
             page_url = large_category.xpath(".//@href").extract()[0]
             url = urljoin(self.base_url, page_url)
             request = FormRequest(url,
                                   callback=self._parse_hg_mid,
                                   dont_filter=True)
             request.meta["large_category"] = parseIndustryName(
                 large_category_name)
             yield request
Example #32
 def _parse_first(self, response):
     total_pages = int(clean_text(
         response.xpath(
             ".//*[@id='Content_WebPageDocumentsByUId1_span_totalpage']//text()"
         ).extract()[0].strip()))
     if total_pages >= 1:  # convert first: the counter arrives as text, and comparing str to int is meaningless
         for i in xrange(0, total_pages):
             next_page = response.url + '&curpage=' + str(i + 1)
             request = FormRequest(next_page,
                                   callback=response.meta["callback"],
                                   dont_filter=True)
             request.meta["large_category"] = response.meta[
                 "large_category"]
             yield request
Example #35
 def _parse_firstPage(self, response):
     if len(response.xpath(".//*[@class='counter']/text()"))>=1: #存在翻页
         first_url = response.meta["first_url"]
         page_len = clean_text(response.xpath(".//*[@class='counter']/text()").extract()[0].strip())
         finds = self.pattern_page.findall(page_len)
          totalPage = finds[0][1]
          for i in xrange(1, int(totalPage)):
             nextPageurl = "index_" + str(i+1) + ".html"
             url = urljoin(first_url, nextPageurl)
             request = FormRequest(url, callback=self._parse_item, dont_filter=True)
             request.meta["large_category"] = response.meta["large_category"]
             request.meta["mid_category"] = response.meta["mid_category"]
             request.meta["first_url"] = first_url
             yield request
     else:
         request = FormRequest(response.url, callback=self._parse_item, dont_filter=True)
         request.meta["large_category"] = response.meta["large_category"]
         request.meta["mid_category"] = response.meta["mid_category"]
         yield request