def _parse_item(self, response):
    """Parse one chinairn.com listing page.

    Yields one IndustryReportSpiderItem per report row, then — unless the
    module-level Page_Limit has been reached or the next-page link equals
    the last-page link — a FormRequest for the following listing page.
    """
    domain_url = "http://www.chinairn.com/"
    reports = response.xpath("//p[@class='maintittle']")
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//text()").extract()[0].strip())
        # Compute once instead of calling parseIndustryName() twice.
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = urljoin(domain_url, page_url)
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国行业研究网"
        try:
            self.report_para(item, report)
        except Exception:
            # Best-effort enrichment: leave revision-time fields unset.
            log.msg("Report revision time missed: %s" % item["report_link"],
                    level=log.WARNING)
        item["price_free"] = False
        yield item
    Current_Page = clean_text(
        response.xpath(".//*[@class='hover']/text()").extract()[0])
    if Page_Limit > 0 and int(Current_Page) > Page_Limit:
        return
    nextPage = response.xpath("//a[contains(@class,'down')]")[0]
    lastPageurl = nextPage.xpath("./following-sibling::a[1]/@href").extract()[0]
    nextPageurl = nextPage.xpath("./@href").extract()[0]
    # On the final page the "next" link points at the "last" link's target.
    if lastPageurl != nextPageurl:
        url = urljoin(self.base_url, nextPageurl)
        request = FormRequest(url, callback=self._parse_item, dont_filter=True)
        request.meta["large_category"] = response.meta["large_category"]
        request.meta["mid_category"] = response.meta["mid_category"]
        yield request
def _parse_free(self, response):
    """Parse a listing page of free reports; yield one item per entry.

    Fixes: bare ``except:``/``!= None`` idioms, and timezone attachment —
    ``date.replace(tzinfo=pytz.timezone(...))`` yields the historical LMT
    offset (+08:06 for Asia/Shanghai); ``tz.localize(date)`` is correct.
    """
    reports = response.xpath(".//*[@class='tul3']//li")
    # Iterating an empty selector list is a no-op, so no length guard needed.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["report_name"] = clean_text(
            report.xpath(".//a//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_content = self.parseTimeContent(page_url)
        if report_content is not None:
            item["report_content"] = report_content
        report_time = clean_text(
            report.xpath(".//span//text()").extract()[0].strip())
        if report_time:
            item["report_revision_time"] = report_time
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # parse_date may yield None for unparseable strings;
                # leave the standardized timestamp unset in that case.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = True
        yield item
def _parse_research(self, response):
    """Parse a research-report listing page; yield one paid item per entry.

    Fixes: bare ``except:``/``!= None`` idioms, and uses
    ``tz.localize(date)`` instead of ``date.replace(tzinfo=...)`` (the
    latter attaches the wrong LMT offset with pytz zones).
    """
    reports = response.xpath(".//*[@id='ulNewsList']//li")
    # Iterating an empty selector list is a no-op, so no length guard needed.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//dt//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_time = clean_text(
            report.xpath(".//*[@class='time']").extract()[0].strip())
        if report_time:
            # The cell reads "label：value"; keep only the value part.
            item["report_revision_time"] = report_time.split(u"：")[1]
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # parse_date may yield None; leave the field unset then.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = False
        yield item
def _parse_research(self, response):
    """Yield a paid report item for every entry in the #ulNewsList listing.

    Fixes over the original: narrow ``except Exception:`` instead of a
    bare ``except:``, ``is not None`` comparisons, and timezone handling —
    pytz zones must be attached with ``localize()``, not ``replace()``.
    """
    reports = response.xpath(".//*[@id='ulNewsList']//li")
    # An empty selector list simply yields nothing; no explicit guard.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        item["report_name"] = clean_text(
            report.xpath(".//dt//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_time = clean_text(
            report.xpath(".//*[@class='time']").extract()[0].strip())
        if report_time:
            # Drop the "label：" prefix from the fullwidth-colon cell.
            item["report_revision_time"] = report_time.split(u"：")[1]
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # Unparseable date (parse_date returned None): skip field.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = False
        yield item
def _parse_free(self, response):
    """Yield a free report item for every entry in the .tul3 listing.

    Fixes over the original: narrow ``except Exception:`` instead of a
    bare ``except:``, ``is not None`` comparisons, and pytz timezone
    attachment via ``localize()`` rather than ``replace(tzinfo=...)``.
    """
    reports = response.xpath(".//*[@class='tul3']//li")
    # An empty selector list simply yields nothing; no explicit guard.
    for report in reports:
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["report_name"] = clean_text(
            report.xpath(".//a//text()").extract()[0].strip())
        industry_small_chs_name = parseIndustryName(item["report_name"])
        if industry_small_chs_name is not None:
            item["industry_small_chs_name"] = industry_small_chs_name
        page_url = report.xpath(".//@href").extract()[0]
        item["report_link"] = page_url
        report_content = self.parseTimeContent(page_url)
        if report_content is not None:
            item["report_content"] = report_content
        report_time = clean_text(
            report.xpath(".//span//text()").extract()[0].strip())
        if report_time:
            item["report_revision_time"] = report_time
            date, date_precision = parse_date(item["report_revision_time"])
            try:
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            except Exception:
                # Unparseable date (parse_date returned None): skip field.
                pass
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国产业洞察网"
        item["price_free"] = True
        yield item
def parse_item(self, response):
    """Build and yield one report item from a report detail page."""
    meta = response.meta
    title = clean_text(response.xpath("//h1/text()").extract()[0].strip())
    item = IndustryReportSpiderItem()
    item["industry_large_category"] = meta["large_category_name"]
    item["industry_mid_category"] = meta["mid_category_name"]
    item["report_name"] = title
    item["report_type"] = meta["report_type"]
    item["industry_small_chs_name"] = parseIndustryName(title)
    item["price_free"] = self._parse_price(response)
    item["report_link"] = response.url
    item["source_domain"] = self.base_url
    item["source_name"] = u"中国产业发展研究网"
    yield item
def parse_item(self, response):
    """Yield a single report item scraped from a detail page."""
    item = IndustryReportSpiderItem()
    name = clean_text(response.xpath("//h1/text()").extract()[0].strip())
    small_category = parseIndustryName(name)
    item["industry_large_category"] = response.meta["large_category_name"]
    item["industry_mid_category"] = response.meta["mid_category_name"]
    item["report_name"] = name
    item["report_type"] = response.meta["report_type"]
    item["industry_small_chs_name"] = small_category
    item["price_free"] = self._parse_price(response)
    item["report_link"] = response.url
    item["source_domain"] = self.base_url
    item["source_name"] = u"中国产业发展研究网"
    yield item
def _parse_hy_large(self, response):
    """Follow each large-category link, skipping the regional SME quarterly."""
    for anchor in response.xpath(".//*[@class='yahei f16 fB']"):
        name = clean_text(anchor.xpath(".//text()").extract()[0].strip())
        if u"区域重点行业中小企业季报" in name:
            continue  # not an industry category; skip it
        href = anchor.xpath(".//@href").extract()[0]
        request = FormRequest(urljoin(self.base_url, href),
                              callback=self._parse_hg_mid,
                              dont_filter=True)
        request.meta["large_category"] = parseIndustryName(name)
        yield request
def _parse_item(self, response):
    """Yield paid report items from a listing page and follow the
    '下一页' (next page) link when one is present."""
    entries = response.xpath(".//*[@class='info']")
    if len(entries) > 0:
        for entry in entries:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                entry.xpath(".//h3//a/text()").extract()[0].strip())
            small_name = parseIndustryName(item["report_name"])
            if small_name is not None:
                item["industry_small_chs_name"] = small_name
            item["report_link"] = urljoin(
                self.base_url, entry.xpath(".//@href").extract()[0])
            rdate_text = clean_text(
                entry.xpath(
                    " //*[@class='rdate']//span/text()").extract()[0].strip())
            parsed = self.parseItem(rdate_text)
            if len(parsed) == 1:
                item["report_revision_time"] = parsed[0][0]
                item["report_page_count"] = parsed[0][1]
                item["report_graph_count"] = parsed[0][2]
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = date.replace(
                    tzinfo=pytz.timezone('Asia/Shanghai'))
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国投资咨询网"
            item["price_free"] = False
            yield item
        pager = response.xpath(".//*[@class='zw']")
        if len(pager) > 0:
            # A trailing '下一页' anchor means more pages remain.
            if (pager.xpath(".//text()").extract()[-1]) == u'下一页':
                next_href = pager.xpath(".//@href").extract()[-1]
                request = FormRequest(urljoin(self.base_url, next_href),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
def _parse_item(self, response):
    """Parse a listing page of 欧咨网 reports; yield items and the next page.

    Bug fix: the original computed report_revision_time_standard
    unconditionally, raising KeyError whenever parseTime() returned None
    (so "report_revision_time" was never set). The timestamp fields are
    now only populated when a revision time actually exists. The pytz
    zone is also attached with localize() instead of replace(), which
    would pin the wrong LMT offset.
    """
    reports = response.xpath(".//*[@class='img_des']/a")
    if len(reports) > 0:
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath("./text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)
            report_time = self.parseTime(item["report_link"])
            if report_time is not None:
                item["report_revision_time"] = report_time
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"欧咨网"
            item["price_free"] = False
            yield item
        # More than one pager link means pagination exists.
        if len(response.xpath(".//*[@class='page']//@href")) > 1:
            page_len = clean_text(
                response.xpath(
                    ".//*[@class='page']//*[@class='fl_l']/text()"
                ).extract()[0].strip())
            nextPageurl = response.xpath(
                ".//*[@class='page']//@href").extract()[-1]
            finds = self.pattern_page.findall(page_len)
            currentPage = finds[0][0]
            totlePage = finds[0][1]
            # Stop when the "current/total" counter says this is the last page.
            if currentPage != totlePage:
                url = urljoin(self.base_url, nextPageurl)
                request = FormRequest(url, callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
def _parse_item(self, response):
    """Extract paid report items from an .info listing and queue the
    next page when a '下一页' link is present."""
    info_blocks = response.xpath(".//*[@class='info']")
    if len(info_blocks) > 0:
        for block in info_blocks:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            title = clean_text(
                block.xpath(".//h3//a/text()").extract()[0].strip())
            item["report_name"] = title
            chs_name = parseIndustryName(title)
            if chs_name is not None:
                item["industry_small_chs_name"] = chs_name
            link = urljoin(self.base_url,
                           block.xpath(".//@href").extract()[0])
            item["report_link"] = link
            meta_text = clean_text(
                block.xpath(
                    " //*[@class='rdate']//span/text()").extract()[0].strip())
            fields = self.parseItem(meta_text)
            if len(fields) == 1:
                item["report_revision_time"] = fields[0][0]
                item["report_page_count"] = fields[0][1]
                item["report_graph_count"] = fields[0][2]
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = date.replace(
                    tzinfo=pytz.timezone('Asia/Shanghai'))
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"中国投资咨询网"
            item["price_free"] = False
            yield item
        nav = response.xpath(".//*[@class='zw']")
        if len(nav) > 0:
            # The last text node being '下一页' signals another page.
            if (nav.xpath(".//text()").extract()[-1]) == u'下一页':
                href = nav.xpath(".//@href").extract()[-1]
                request = FormRequest(urljoin(self.base_url, href),
                                      callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request
def _parse_item(self, response):
    """Yield one paid report item for each .clistdl entry on the page."""
    for entry in response.xpath(".//*[@class='clistdl']"):
        item = IndustryReportSpiderItem()
        item["industry_large_category"] = response.meta["large_category"]
        item["industry_mid_category"] = response.meta["mid_category"]
        title = clean_text(entry.xpath(".//dt/a/text()").extract()[0].strip())
        item["report_name"] = title
        # Prefer the explicit industry @title attribute; fall back to
        # the report title itself when the attribute is absent.
        title_attrs = entry.xpath(".//dd//*[@class='cxgrep']//@title")
        if len(title_attrs) > 0:
            industry_text = clean_text(title_attrs.extract()[0].strip())
        else:
            industry_text = title
        chs_name = parseIndustryName(industry_text)
        if chs_name is not None:
            item["industry_small_chs_name"] = chs_name
        item["report_link"] = entry.xpath(".//@href").extract()[0]
        revision = clean_text(
            entry.xpath(".//dt/span/text()").extract()[0].strip())
        item["report_revision_time"] = revision
        item["source_domain"] = self.allowed_domains[0]
        item["source_name"] = u"中国报告大厅"
        date, date_precision = parse_date(revision)
        item["report_revision_time_standard"] = date.replace(
            tzinfo=pytz.timezone('Asia/Shanghai'))
        item["price_free"] = False
        yield item
def _parse_item(self, response):
    """Yield 欧咨网 report items from a listing page, then the next page.

    Bug fix: the original read item["report_revision_time"]
    unconditionally for parse_date(), which raised KeyError whenever
    self.parseTime() returned None and the key was never assigned; the
    standardized timestamp is now only computed when a revision time is
    available. The pytz zone is attached with localize() rather than
    replace(tzinfo=...), which would record the obsolete LMT offset.
    """
    reports = response.xpath(".//*[@class='img_des']/a")
    if len(reports) > 0:
        for report in reports:
            item = IndustryReportSpiderItem()
            item["industry_large_category"] = response.meta["large_category"]
            item["industry_mid_category"] = response.meta["mid_category"]
            item["report_name"] = clean_text(
                report.xpath("./text()").extract()[0].strip())
            industry_small_chs_name = parseIndustryName(item["report_name"])
            if industry_small_chs_name is not None:
                item["industry_small_chs_name"] = industry_small_chs_name
            page_url = report.xpath(".//@href").extract()[0]
            item["report_link"] = urljoin(self.base_url, page_url)
            report_time = self.parseTime(item["report_link"])
            if report_time is not None:
                item["report_revision_time"] = report_time
                date, date_precision = parse_date(
                    item["report_revision_time"])
                item["report_revision_time_standard"] = pytz.timezone(
                    'Asia/Shanghai').localize(date)
            item["source_domain"] = self.allowed_domains[0]
            item["source_name"] = u"欧咨网"
            item["price_free"] = False
            yield item
        # More than one pager link means pagination exists.
        if len(response.xpath(".//*[@class='page']//@href")) > 1:
            page_len = clean_text(
                response.xpath(
                    ".//*[@class='page']//*[@class='fl_l']/text()"
                ).extract()[0].strip())
            nextPageurl = response.xpath(
                ".//*[@class='page']//@href").extract()[-1]
            finds = self.pattern_page.findall(page_len)
            currentPage = finds[0][0]
            totlePage = finds[0][1]
            # Only follow the link while current page != total pages.
            if currentPage != totlePage:
                url = urljoin(self.base_url, nextPageurl)
                request = FormRequest(url, callback=self._parse_item,
                                      dont_filter=True)
                request.meta["large_category"] = response.meta[
                    "large_category"]
                request.meta["mid_category"] = response.meta["mid_category"]
                yield request