Example #1
    def parse_item(self, response):
        if 'json' in response.url:
            rows = json.loads(response.text)['data']
            for row in rows:
                item = GGFundNoticeItem()
                item['sitename'] = self.sitename
                item['channel'] = self.channel
                item['url_entry'] = self.url_entry
                url = row['weburl']
                item['url'] = 'http://www.ixzzcgl.com' + url
                item['title'] = row['short_title']
                publish_time = row['notice_date']
                item['publish_time'] = datetime.strptime(
                    publish_time, '%Y-%m-%d') if publish_time else None

                yield item
        else:
            rows = response.xpath('//div[@class="dync_state dashed_bd"][2]/a')
            for row in rows:
                item = GGFundNoticeItem()
                item['sitename'] = self.sitename
                item['channel'] = self.channel
                item['url_entry'] = self.url_entry
                url = row.xpath('./@href').extract_first()
                item['url'] = 'http://www.ixzzcgl.com' + url
                item['title'] = row.xpath('./b/text()').extract_first()
                publish_time = row.xpath('./em/text()').extract_first()
                item['publish_time'] = datetime.strptime(
                    publish_time, '%Y-%m-%d') if publish_time else None
                yield item
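
This and all the following snippets assume the same surrounding Scrapy project: a GGFundNoticeItem with the six fields they populate, plus a handful of imports (json, re, datetime, urljoin, get_base_url). A minimal reconstruction of that shared context, sketched from what the snippets use rather than taken from the original source:

    # Sketch of the shared context (inferred, not from the original project).
    import json
    import re
    from datetime import datetime
    from urllib.parse import urljoin

    import scrapy
    from scrapy.utils.response import get_base_url

    class GGFundNoticeItem(scrapy.Item):
        # The six fields every example assigns.
        sitename = scrapy.Field()
        channel = scrapy.Field()
        url_entry = scrapy.Field()
        url = scrapy.Field()
        title = scrapy.Field()
        publish_time = scrapy.Field()
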
Example #2
    def parse_item(self, response):
        rows = response.xpath('//div[@class="list"]')
        for row in rows:
            title = row.xpath('./div[@class="l"]/a/@title').extract_first()
            url = row.xpath('./div[@class="l"]/a/@href').extract_first()
            url = urljoin(get_base_url(response), url)
            publish_time = row.xpath('./div[@class="r"]/text()').re_first(
                r'\d+-\d+-\d+')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['title'] = title
            item['url'] = url
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        next_url = response.xpath(
            '//div[@class="page"]/a[contains(text(),"下一页")]/@href'
        ).extract_first()
        if next_url:
            self.ips.append({
                'url': urljoin(get_base_url(response), next_url),
                'ref': response.url
            })
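
Example #1 guards its strptime call against a missing date; most of the later snippets do not, so a None from extract_first or re_first would raise. A defensive wrapper, as a hypothetical helper (parse_date is my name, not from these spiders; datetime as imported in the sketch above):

    def parse_date(text, fmt='%Y-%m-%d'):
        # Return None rather than raising when the date cell is missing or malformed.
        if not text:
            return None
        try:
            return datetime.strptime(text.strip(), fmt)
        except ValueError:
            return None
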
Example #3
 def parse_item(self, response):
     rows = response.xpath(
         '//*[@id="discover"]//div[@class="newslist"]/ul/li')
     next_url = response.xpath(
         '//*[@id="discover"]/div[1]//a[text()="下一页"]/@href').extract_first(
         )
     last_url = response.xpath(
         '//*[@id="discover"]/div[1]//a[text()="末页"]/@href').extract_first(
         )
     for row in rows:
         url = row.xpath('./div[2]/h3/a/@href').extract_first()
         url = urljoin(get_base_url(response), url)
         title = row.xpath('./div[2]/h3/a/text()').extract_first()
         publish_year = row.xpath('./div[1]/span[2]/text()').extract_first()
         publish_day = row.xpath('./div[1]/span[1]/text()').extract_first()
         publish_time = str(publish_year) + '-' + str(publish_day)
         publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
         item = GGFundNoticeItem()
         item['sitename'] = self.sitename
         item['channel'] = self.channel
         item['url_entry'] = self.entry
         item['url'] = url
         item['title'] = title
         item['publish_time'] = publish_time
         yield item
     if next_url and next_url != last_url:
         next_url = urljoin(get_base_url(response), next_url)
         self.ips.append({'url': next_url, 'ref': response.url})
Example #4
    def parse_item(self, response):
        rows = response.xpath('//table[@class="views-view-grid cols-1"]//tr')

        for row in rows:
            url = row.xpath('./td/span/span/a/@href').extract_first()
            url = urljoin(get_base_url(response), url)

            title = row.xpath('./td/span/span/a/text()').extract_first().replace('\t', '').replace('\r', '').replace('\n', '')

            publish_time = row.xpath('./td/div/div/span/@content').extract_first()
            publish_time = publish_time[0:10]
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time

            yield item

        next_url = response.xpath('//ul[@class="pager"]/li/a[contains(text(),"下一页")]/@href').extract_first()
        if next_url:
            next_url = urljoin(get_base_url(response), next_url)
            self.ips.append({
                'url': next_url,
                'ref': response.url
            })
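
Example #4, like several snippets below, strips tabs and line breaks with three chained replace calls. An equivalent one-liner, sketched as a hypothetical helper (re as imported in the sketch after Example #1):

    def clean_text(text):
        # Same effect as .replace('\t', '').replace('\r', '').replace('\n', '')
        return re.sub(r'[\t\r\n]', '', text) if text else text
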
Example #5

 def parse_item(self, response):
     ext = response.meta['ext']
     page = int(ext['page'])
     rows = json.loads(response.text)
     total_pages = int(rows['totalPages'])
     rows = rows['recordList']
     for row in rows:
         url = row['href']
         url = urljoin(get_base_url(response), url)
         title = row['subject']
         publish_time = row['createTime']
         publish_time = datetime.strptime(publish_time, '%Y-%m-%d %H:%M:%S')
         item = GGFundNoticeItem()
         item['sitename'] = self.sitename
         item['channel'] = self.channel
         item['url_entry'] = self.entry
         item['url'] = url
         item['title'] = title
         item['publish_time'] = publish_time
         yield item
     if page < total_pages:
         url = 'http://www.nanhua.net/jSearch/queryNewsListByTypeForJson.shtm?site=newnanhua&type=201201&limit=16&start='
         url = url + str(page + 1)
         self.ips.append({
             'url': url,
             'ref': response.url,
             'ext': {
                 'page': str(page + 1)
             }
         })
Example #6
    def parse_item(self, response):
        rows = response.xpath('//div[@class="simu-site-list"]/ul/a')

        for row in rows:
            url = row.xpath('./@href').extract_first()
            url = urljoin(get_base_url(response), url)

            title = row.xpath(
                './li/div/div[1]/text()').extract_first().replace(
                    '\t', '').replace('\r', '').replace('\n', '')

            publish_time = row.xpath(
                './li/div/div[3]/div[2]/text()').extract_first()
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time

            yield item

        next_url = response.xpath(
            '//div[@class="simu-site-pagination"]/ul/li/a[contains(text(),"下一页")]/@href'
        ).extract_first()
        if next_url and next_url != 'javascript:;':
            next_url = urljoin(get_base_url(response), next_url)
            self.ips.append({'url': next_url, 'ref': response.url})
        yield self.request_next()
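
Examples #6, #7, #17, and #24 each compare next_url against a different javascript: placeholder before queuing it. A consolidated guard could look like this; a sketch only, with JS_PLACEHOLDERS and queue_next being my names (urljoin and get_base_url as in the snippets):

    JS_PLACEHOLDERS = {'javascript:', 'javascript:;',
                       'javascript:void(0)', 'javascript:void(0);'}

    def queue_next(self, response, next_href):
        # Queue a follow-up request only for a real, non-placeholder href.
        if next_href and next_href not in JS_PLACEHOLDERS:
            self.ips.append({
                'url': urljoin(get_base_url(response), next_href),
                'ref': response.url,
            })
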
Example #7
    def parse_list(self, response):
        notices = response.xpath('/html/body/div[4]//div[@class="trends"]/dl')
        for notice in notices:
            url = notice.xpath('./dd/a/@href').extract_first().strip()
            url = urljoin(get_base_url(response), url)
            title = notice.xpath('./dd/a/h3/text()').extract_first()
            publish_time_year = notice.xpath('./dt/text()').extract_first()
            publish_time_day = notice.xpath('./dt/b/text()').extract_first()
            publish_time = publish_time_year + '-' + publish_time_day

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time.strip(),
                                                     '%Y-%m-%d')
            yield item
        next_page = response.xpath(
            '/html/body/div[4]/div[2]/div[@class="paging"]//a[text()="下一页"]/@href'
        ).extract_first()
        if next_page is not None and next_page != 'javascript:':
            next_url = urljoin(get_base_url(response), next_page)
            self.lps.append({
                'url': next_url,
                'ref': response.url,
            })
Example #8
    def parse_item(self, response):
        rows = response.xpath('//ul[@class="cmnList"]/li')

        for row in rows:
            url = row.xpath('./a/@href').extract_first()
            if url == 'javascript:void(0)':
                # <a href="javascript:void(0)"  onclick="getPdf('/home','/projectText/201510201386/1010/101002/144643001676888.pdf')">
                # r",\'(\S+)\'" 意思是: 以逗号开始的截取后面的单引号里面的多个非空字符,单引号里面的东西用()定位
                on_click = row.xpath('./a/@onclick').re_first(r",\'(\S+)\'")
                on_click = str('/uploads/fore') + on_click
                url = on_click
            url = urljoin(get_base_url(response), url)

            title = row.xpath('./a/text()').extract_first().replace(
                '\t', '').replace('\r', '').replace('\n', '')

            publish_time = row.xpath('./span/text()').extract_first()
            publish_time = datetime.strptime(publish_time, '%Y/%m/%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time

            yield item

        next_url = response.xpath(
            '//div[@class="pdtPaging"]/a[text()="下一页"]/@href').extract_first()
        if next_url:
            next_url = urljoin(get_base_url(response), next_url)
            self.ips.append({'url': next_url, 'ref': response.url})
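
The re_first pattern in Example #8 can be exercised standalone against the onclick value quoted in its comment; a minimal check:

    onclick = "getPdf('/home','/projectText/201510201386/1010/101002/144643001676888.pdf')"
    m = re.search(r",\'(\S+)\'", onclick)
    # m.group(1) == '/projectText/201510201386/1010/101002/144643001676888.pdf'
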
Example #9

    def parse_item(self, response):
        rows = response.xpath('//ul[@class="jList fadeUp"]/li')

        for row in rows:
            url = row.xpath('./a[1]/@href').extract_first()
            url = urljoin(get_base_url(response), url)

            title = row.xpath('./a[1]/text()').extract_first()

            publish_time = row.xpath('./a[2]/text()').extract_first()
            if publish_time == '0201-12-24':
                # Special case: the page itself carries the malformed date 0201-12-24.
                publish_time = '2010-12-24'
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time

            yield item

        next_url = response.xpath(
            '//div[@class="pages fadeUp"]/a[text()=">"]/@href').extract_first(
            )
        if next_url:
            next_url = urljoin(get_base_url(response), next_url)
            self.ips.append({'url': next_url, 'ref': response.url})
        yield self.request_next()
Example #10
    def parse_item(self, response):
        rows = response.xpath('//div[@class="news_list"]/ul/li')
        for row in rows:
            url = row.xpath('./a/@href').extract_first()
            url = urljoin(get_base_url(response), url)

            title = row.xpath('./a/text()').extract_first().strip().replace(
                '\t', '').replace('\r', '').replace('\n', '')

            publish_time = row.xpath(
                './span/text()').extract_first().strip().replace(
                    '\t', '').replace('\r', '').replace('\n', '')
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time
            yield item

        next_url = response.xpath(
            '//div[@class="page"]/a[text()="下一页"]/@href').extract_first()
        if next_url:
            next_url = urljoin(get_base_url(response), next_url)
            if response.url != next_url:
                self.ips.append({
                    'url': next_url,
                })
Example #11
    def parse_item(self, response):
        rows = response.xpath('//div[@class="news_list team_box"]/ul/li')

        for row in rows:
            url = row.xpath('./a/@href').extract_first()
            url = urljoin(get_base_url(response), url)

            title = row.xpath('./a/@title').extract_first().replace('\t', '').replace('\r', '').replace('\n', '')

            publish_time = row.xpath('./a/em/text()').extract_first()  # e.g. 2018-05-28
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time

            yield item

        next_url = response.xpath('//div[@class="page_list f_r"]/div/a[contains(text(),"下一页")]/@href').extract_first()
        if next_url:
            next_url = urljoin(get_base_url(response), next_url)
            self.ips.append({
                'url': next_url,
                'ref': response.url
            })
Example #12

 def parse_list(self, response):
     noticeList = response.xpath(
         '/html/body/div/div[2]/div[2]/div[3]//div[@class="zp11_1 mg2 xi14"]'
     )
     next_page = response.xpath(
         '/html/body/div/div[2]/div[2]/div[3]/div[@class="fy cen xi14"]/a[text()="下一页"]/@href'
     ).extract_first()
     cur_page = int(
         response.xpath(
             '/html/body/div/div[2]/div[2]/div[3]/div[@class="fy cen xi14"]/span/text()'
         ).extract_first())
     next_page_index = (int(re.search(r'page=(\d+)', next_page).group(1))
                        if next_page else None)
     for notice in noticeList:
         noticeLink = notice.xpath('./a/@href').extract_first().strip()
         title = notice.xpath('./a/text()').extract_first()
         publish_time = notice.xpath('./span/text()').extract_first()
         item = GGFundNoticeItem()
         item['sitename'] = self.sitename
         item['channel'] = self.channel
         item['url_entry'] = self.entry
         item['url'] = urljoin(get_base_url(response), noticeLink)
         item['title'] = title
         item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
         yield item
     if next_page and cur_page != next_page_index:
         next_url = urljoin(get_base_url(response), next_page)
         self.lps.append({
             'url': next_url,
             'ref': response.url,
         })
Example #13
    def parse_item(self, response):
        datas = response.xpath('//div[@class="lh-160p"]/div[@class="m-b-10"]')
        for notice in datas[1:]:
            href = notice.xpath('./div[1]/a/@href').extract_first()
            url = urljoin(get_base_url(response), href)
            title = notice.xpath('./div[1]/a/text()').extract_first()
            publish_time = notice.xpath('./div[2]/text()').extract_first()

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        next_url = response.xpath(
            '//div[@class="pager"]/table/tr/td[3]/image/@onclick').re_first(
                r'\d+')
        if next_url is not None:
            self.ips.append({
                'url':
                'http://www.gwxstrust.com/cn/page.jsp?id=23&pageIndex=' +
                next_url,
                'ref':
                response.url
            })
Example #14
    def parse_item(self, response):
        datas = json.loads(response.text)
        rows = datas['collection']
        for row in rows:
            title = row['title']
            url = '/news/detail/' + str(row['articleId'])
            url = urljoin(get_base_url(response), url)
            publish_time = row['publishTime']
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time
            yield item

        tp = datas['property']['pages']
        cp = response.meta['pg']
        if cp < tp:
            cp = cp + 1
            self.ips.append({
                'url':
                'http://www.szhvc.com/api/pc/message/news/paging?tagl2=REPORTS&pageSize=3&pageNum='
                + str(cp) + '&_=1527477855432',
                'ref':
                response.url,
                'pg':
                cp
            })
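
The JSON shape Example #14 expects can be read off its field accesses; a hypothetical minimal payload that satisfies the code:

    # Hypothetical payload; field names inferred from the accesses above.
    sample = ('{"collection": [{"articleId": 1, "title": "t",'
              ' "publishTime": "2018-05-28"}],'
              ' "property": {"pages": 7}}')
    datas = json.loads(sample)
    assert datas['property']['pages'] == 7
    datetime.strptime(datas['collection'][0]['publishTime'], '%Y-%m-%d')
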
Example #15
    def parse_item(self, response):
        datas = response.xpath('//div[contains(@class, "newsList")]')
        for notice in datas:
            href = notice.xpath(
                './div[@class="newsName"]/a/@href').extract_first()
            url = urljoin(get_base_url(response), href)
            title = notice.xpath(
                './div[@class="newsName"]/a/text()').extract_first()
            publish_time = notice.xpath(
                './div[@class="newsDate"]/text()').extract_first()

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        next_url = response.xpath(
            '//a[contains(text(), "下一页")]/@href').extract_first()
        if next_url is not None and next_url != '':
            url = self.entry + next_url
            self.ips.append({'url': url, 'ref': response.url})
Example #16
 def parse_list(self, response):
     funds = response.xpath(
         '//td[@class="TDbgcolor5"]/table/tbody/tr[4]/td/table/tbody/tr/td[2]/a'
     )
     for fund in funds:
         url = fund.xpath('./@href').extract_first()
         title = fund.xpath('./text()').extract_first()
         publish_time = re.findall(r'[P0|t]+(\d{8})', url)[0]
         if 'pdf' not in url:
             self.ips.append({
                 'url': urljoin(get_base_url(response), url),
                 'ref': response.url,
             })
         else:
             item = GGFundNoticeItem()
             item['sitename'] = self.sitename
             item['channel'] = self.channel
             item['url_entry'] = self.entry
             item['url'] = url
             item['title'] = title
             item['publish_time'] = datetime.strptime(
                 publish_time, '%Y%m%d')
             yield item
     tp = response.css('script').re_first(r'[1-9]\d+')
     pg = response.meta['pg']
     next_pg = int(pg) + 1
     if tp and next_pg < int(tp):
         next_url = urljoin(get_base_url(response),
                            'index_' + str(next_pg) + '.htm')
         self.lps.append({
             'url': next_url,
             'ref': response.url,
             'pg': next_pg,
         })
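
Note that '[P0|t]+' in Example #16 is a character class (a run of the characters P, 0, |, t), not the alternation '(?:P0|t)' the author probably intended; it still works because the class covers those prefixes. A standalone check against a hypothetical filename:

    # Hypothetical URL; the real hrefs presumably embed a YYYYMMDD stamp.
    url = '/upload/P020180528000123.pdf'
    re.findall(r'[P0|t]+(\d{8})', url)
    # -> ['20180528']: the run 'P0' satisfies the class, then eight digits follow.
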
Example #17
    def parse_item(self, response):
        rows = response.xpath('//ul[@class="xcpl"]/li[not(@class)]')
        for row in rows:
            url = row.xpath('./span[@class="xmcn_ry"]/a/@href').extract_first()
            if 'javascript:void(0);' != url:
                url = urljoin(get_base_url(response), url)
            else:
                # Encrypted notices: fall back to the URL of the current page
                url = response.url
            title = row.xpath(
                './span[@class="xmcn_ry"]/a/text()').extract_first()
            publish_time = row.xpath(
                './span[@class="xmcn_njy"]/text()').re_first('\d+-\d+-\d+')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        next_url = response.xpath(
            '//div[@class="pages"]/a[contains(text(),"下一页")]/@href'
        ).extract_first()
        if next_url and next_url != 'javascript:void(0);':
            self.ips.append({
                'url': urljoin(get_base_url(response), next_url),
                'ref': response.url
            })
Example #18
    def parse_item(self, response):
        rows = response.xpath('//div[@class="cont_rig"]/ul/li')
        for row in rows:
            url = row.xpath('./a/@href').extract_first()
            url = urljoin(get_base_url(response), url)
            title = row.xpath(
                './a/span[@class="title"]/text()').extract_first()
            publish_time = row.xpath(
                './a/span[@class="time"]/text()').re_first('\d+-\d+-\d+')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        tp = response.xpath('//script/text()').re_first(
            r'var PAGE_COUNT =  (\d+)')
        if tp:
            pg = response.meta['pg'] + 1
            if pg <= int(tp):
                self.ips.append({
                    'url':
                    'http://www.ajxt.com.cn/ajxt/gywm/cpgg/index_{0}.shtml?page={1}'
                    .format(pg - 1, pg),
                    'ref':
                    response.url,
                    'pg':
                    pg
                })
Example #19
    def parse_item(self, response):
        datas = response.xpath(
            '/html/body/section[2]/div//ul[@class="bulletin_main clearfix"]/li'
        )
        next_url = response.xpath(
            '/html/body/section[2]/div//div[@class="page"]/div/a[last()]/@href'
        ).extract_first()
        for notice in datas:
            href = notice.xpath('./a/@href').extract_first()
            url = urljoin(get_base_url(response), href)
            title = notice.xpath('./a/span/text()').extract_first().strip()
            publish_time = notice.xpath('./a/time/text()').extract_first()

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item
        if next_url is not None and next_url != '':
            url = self.entry + next_url
            self.ips.append({'url': url, 'ref': response.url})
        yield self.request_next()
Example #20
    def parse_item(self, response):
        line = response.text

        titles = re.findall(r'\\"\\">(\S+\s?\S+)\s*<\\/span>', line)

        dates = re.findall(r' text-right\\">\\n\s+(\d+年\d+月\d+日)\\n\s+<', line)
        urls = re.findall(r'data-remote=\\"true\\" href=\\"(\S+)\\">', line)
        next_url = re.search(r'href=\\"(/announcements\?\S+)\\">下一页', line)
        if next_url:
            next_url = next_url.group(1)
        for title in titles:
            url = urls.pop(0)
            url = urljoin(get_base_url(response), url)
            publish_time = dates.pop(0)
            publish_time = datetime.strptime(publish_time, '%Y年%m月%d日')
            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time
            yield item
        if next_url:
            next_url = urljoin(get_base_url(response), next_url)
            csrf_token = response.xpath('/html/head/meta[@name="csrf-token"]/@content').extract_first()
            self.ips.append({
                'url': next_url, 'ref': response.url,
                'headers': {
                    'X-CSRF-Token': csrf_token,
                    'X-Requested-With': 'XMLHttpRequest',
                    'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01'
                }})
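
Example #20 parses a JavaScript fragment in which the HTML arrives with escaped quotes, which is why every pattern matches a literal backslash (the \\ in the raw strings) before each quote. A minimal hypothetical sample:

    # Hypothetical one-line sample of such a payload.
    line = '$("#list").html("<a data-remote=\\"true\\" href=\\"/announcements?page=2\\">more<\\/a>");'
    re.findall(r'data-remote=\\"true\\" href=\\"(\S+)\\">', line)
    # -> ['/announcements?page=2']
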
Example #21
    def parse_item(self, response):
        rows = response.xpath('//div[@class="snowview_bot"]/ul/li')
        for row in rows:
            title = row.xpath('./a/text()').extract_first()
            url = row.xpath('./a/@href').extract_first()
            url = urljoin(get_base_url(response), url)
            publish_time = row.xpath('./span/text()').extract_first()

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['title'] = title
            item['url'] = url
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        tp = response.xpath(
            '//div[@class="left tx_fye"]/a[contains(text(),"末页")]').re_first(
                r'\d+')
        pg = response.meta['pg']
        if tp and pg < int(tp):
            pg = pg + 1
            next_url = 'http://www.snowlightcapital.cn/moreNews?pageNum=' + tp + '&pageNo=' + str(
                pg)
            self.ips.append({'url': next_url, 'ref': response.url, 'pg': pg})
Example #22

    def parse_item(self, response):
        rows = response.xpath('//div[@class="alist2"]/div[not(@class)]')
        for row in rows:
            url = row.xpath('./div[1]/a/@href').extract_first()
            url = urljoin(get_base_url(response), url)

            title = row.xpath('./div[1]/a/text()').extract_first()

            publish_time = row.xpath('./div[2]/text()').extract_first()
            publish_time = datetime.strptime(publish_time, '%Y-%m-%d')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time
            yield item

        next_page = response.xpath(
            '//table[@class="w-all h-30"]/tr/td[3]/image[@class]/@onclick'
        ).re_first(r'\d+')
        if next_page:
            next_url = 'http://www.zhongtaitrust.com/cn/fortune/products/info.jsp?pageIndex=' + str(
                next_page)
            self.ips.append({
                'url': next_url,
            })
Example #23
    def parse_item(self, response):
        datas = response.xpath('//ul[@id="ggul"]/li')
        for notice in datas:
            href = notice.xpath('./a/@href').extract_first()
            url = urljoin(get_base_url(response), href)
            title = notice.xpath('normalize-space(./a/text())').extract_first()
            publish_time = notice.xpath('./span/text()').extract_first()

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y/%m/%d')
            yield item

        tp = response.xpath(
            '//div[@class="page"]/table/tr/td/script').extract_first()
        tp = re.findall(r'var countPage = (\d+)', tp)[0]
        cp = response.meta['pg']
        if (cp + 1) < int(tp):
            cp = cp + 1
            next_url = urljoin(get_base_url(response),
                               'index_' + str(cp) + '.html')
            self.ips.append({'url': next_url, 'ref': response.url, 'pg': cp})
Example #24
 def parse_item(self, response):
     ext = response.meta['ext']
     report_type = int(ext['report_type'])
     if report_type == 1:
         rows = response.xpath(
             '/html/body/div[2]/div[2]//div[@class="rightdiv_n"]/ul/li')
         next_url = response.xpath(
             '//*[@id="page"]/div[2]/ul/a[last()]/@href').extract_first()
     else:
         rows = response.xpath('//li')
         next_url = response.xpath(
             '//*[@id="page"]//a[text()="下一页"]/@href').extract_first()
     for row in rows:
         url = row.xpath('./a/@href').extract_first()
         url = urljoin(get_base_url(response), url)
         title = row.xpath('./a//text()').extract_first().strip()
         publish_time = row.xpath('./span/text()').extract_first()
         publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
         item = GGFundNoticeItem()
         item['sitename'] = self.sitename
         item['channel'] = self.channel
         item['url_entry'] = self.entry
         item['url'] = url
         item['title'] = title
         item['publish_time'] = publish_time
         yield item
     if next_url and next_url != 'javascript:void(0);':
         next_url = urljoin(get_base_url(response), next_url)
         self.ips.append({
             'url': next_url,
             'ref': response.url,
             'ext': {
                 'report_type': str(report_type)
             }
         })
Example #25
    def parse_item(self, response):
        rows = response.xpath('//div[@class="news01"]/ul/li')
        for row in rows:
            url = row.xpath('./div[@class="p2"]/a/@href').extract_first()
            url = urljoin(get_base_url(response), url)
            title = row.xpath('./div[@class="p2"]/a/text()').extract_first()
            publish_time = '2018-' + row.xpath(
                './div[@class="p1"]/text()').re_first('\d+-\d+')

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        tp = response.xpath(
            '//div[@class="page"]/span/a[contains(text(),"尾页")]/@href'
        ).re_first(r'page=(\d+)')
        if tp:
            pg = response.meta['pg'] + 1
            if pg <= int(tp):
                self.ips.append({
                    'url':
                    'http://www.hexifund.com/info.php?class_id=102102&page={0}'
                    .format(pg),
                    'ref':
                    response.url,
                    'pg':
                    pg
                })
Example #26

 def parse_item(self, response):
     ext = response.meta['ext']
     page = int(ext['page'])
     total_page = int(
         response.xpath('//div[@class="digg"]/a[text()=">"]/@href').
         re_first(r'page=(\d+)'))
     notices = response.xpath('//td[@class="newslist"]')
     years = response.xpath('//td[@class="newslist2"]/text()').extract()
     for row in notices:
         url = row.xpath('./a/@href').extract_first()
         url = urljoin(get_base_url(response), url)
         title = row.xpath('./a//text()').extract_first()
         publish_time = years.pop(0)
         publish_time = datetime.strptime(publish_time, '%Y-%m-%d')
         item = GGFundNoticeItem()
         item['sitename'] = self.sitename
         item['channel'] = self.channel
         item['url_entry'] = self.entry
         item['url'] = url
         item['title'] = title
         item['publish_time'] = publish_time
         yield item
     if page < total_page:
         self.ips.append({
             'url':
             'http://www.urich.cn/news.asp?page=' + str(page + 1),
             'ref':
             response.url,
             'ext': {
                 'page': str(page + 1)
             }
         })
Example #27

    def parse_list(self, response):
        noticeList = response.xpath('//div[@class="r_news"]/ul/li')
        for notice in noticeList:
            noticeLink = notice.xpath('./a/@href').extract_first().strip()
            noticeLink = urljoin(get_base_url(response), noticeLink)
            title = notice.xpath('./a/text()').extract_first()
            publish_time = notice.xpath('./em/text()').extract_first()

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = noticeLink
            item['title'] = title
            item['publish_time'] = datetime.strptime(publish_time, '%Y-%m-%d')
            yield item

        next_url = response.xpath(
            '//div[@class="quotes"]/a[contains(text(), "下一页")]/@href'
        ).extract_first()
        if next_url is not None:
            self.lps.append({
                'url': urljoin(get_base_url(response), next_url),
                'ref': response.url
            })
Example #28

 def parse_item(self, response):
     ext = response.meta['ext']
     page = int(ext['page'])
     fund_id = ext['fund_id']
     if response.text:
         rows = json.loads(response.text)
         for row in rows:
             item = GGFundNoticeItem()
             item['sitename'] = self.sitename
             item['channel'] = self.channel
             item['url_entry'] = self.entry
             url = row['url']
             title = row['title']
             publish_time = row['displayTime']
             publish_time = datetime.strptime(publish_time, '%Y.%m.%d')
             item['url'] = url
             item['title'] = title
             item['publish_time'] = publish_time
             yield item
         url = 'http://www.cqitic.com/more/' + fund_id + '_' + str(
             page + 1) + '_20.shtml'
         self.ips.append({
             'url': url,
             'ref': response.url,
             'ext': {
                 'page': str(page + 1),
                 'fund_id': fund_id
             }
         })
Example #29

 def parse_item(self, response):
     ext = response.meta['ext']
     page = int(ext['page'])
     total_page = re.search(r'pagecount\:(\d+),', response.text)
     if total_page:
         total_page = int(total_page.group(1))
     else:
         total_page = 0
     rows = response.xpath('//ul/li/a')
     for row in rows:
         title = row.xpath('./strong/text()').extract_first()
         url = row.xpath('./@href').extract_first()
         publish_time = row.xpath('./span/text()').extract_first()
         publish_time = datetime.strptime(publish_time, '%Y.%m.%d')
         item = GGFundNoticeItem()
         item['sitename'] = self.sitename
         item['channel'] = self.channel
         item['url_entry'] = self.entry
         item['url'] = urljoin(get_base_url(response), url)
         item['title'] = title
         item['publish_time'] = publish_time
         yield item
     if page < total_page:
         form = {'curPage': str(page + 1), 'numPerPage': '10', 'type': '1'}
         self.ips.append({
             'url': response.url,
             'form': form,
             'ext': {
                 'page': str(page + 1)
             },
             'ref': response.url
         })
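
Example #29 is the only snippet that queues a 'form' payload; presumably the surrounding framework replays such entries as POST requests. A hedged sketch of that step, assuming Scrapy's FormRequest (replay_form is my name):

    from scrapy import FormRequest

    def replay_form(self, spec):
        # Hypothetical: turn a queued spec carrying 'form' into a POST.
        return FormRequest(
            spec['url'],
            formdata=spec['form'],
            meta={'ext': spec.get('ext')},
            callback=self.parse_item,
        )
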
Example #30

    def parse_item(self, response):
        rows = response.css('ul.mid_ul_span li')
        for row in rows:
            url = row.xpath('./p[1]/a/@href').extract_first()  # relative link
            url = urljoin(get_base_url(response), url)  # resolve to an absolute URL

            title = row.xpath('./p[1]/a/span/text()').extract_first().strip().replace('\t', '').replace('\r', '').replace('\n', '')  # title

            publish_time = row.xpath('./p[2]/span/text()').extract_first().strip().replace('\t', '').replace('\r', '').replace('\n', '')  # date string
            publish_time = datetime.strptime(publish_time, '%Y.%m.%d')  # parse the date

            item = GGFundNoticeItem()
            item['sitename'] = self.sitename
            item['channel'] = self.channel
            item['url_entry'] = self.entry
            item['url'] = url
            item['title'] = title
            item['publish_time'] = publish_time
            yield item

        next_url = response.xpath('//ul[@class="pagination mid_page"]/li/a[text()="下一页"]/@href').extract_first()
        if next_url:
            next_url = urljoin(get_base_url(response), next_url)
            self.ips.append({
                'url': next_url,
                'ref': response.url
            })
        yield self.request_next()
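
Across all the examples, follow-up pages are queued as dicts on self.ips (item pages) or self.lps (list pages), and several callbacks end with yield self.request_next(). None of the snippets show that machinery; a hedged sketch of what it might look like, with the field handling inferred from the dicts queued above:

    def request_next(self):
        # Hypothetical: pop the next queued page spec and turn it into a Request.
        # Scrapy ignores a None yielded from a callback, so returning None is safe.
        if not self.ips:
            return None
        spec = self.ips.pop(0)
        meta = {k: spec[k] for k in ('pg', 'ext', 'ref') if k in spec}
        return scrapy.Request(
            spec['url'],
            callback=self.parse_item,
            headers=spec.get('headers'),
            meta=meta,
        )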