Example #1
    def parse(self, response):
        links = response.xpath(
            "//div[@class='container row main in2']/div/ul/li/a/@href"
        ).extract()
        times = response.xpath(
            "//div[@class='container row main in2']/div/ul/li/span/text()"
        ).extract()

        print_new_number(self.counts, 'HFUT', self.name)
        # The listing is date-sorted, so stop at the first report older than today.
        for i in range(len(links)):
            report_time = get_localtime(times[i])

            if report_time < now_time:
                return
            report_url = self.domain + links[i][1:]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })

        # Follow the next listing page until the last page number is reached.
        number = int(response.url.split('-')[-1].split('.')[0])
        last_number = int(response.xpath(
            "//div[@id='pages']/a/text()").extract()[-2])
        if number < last_number:
            new_url = 'http://news.hfut.edu.cn/list-28-%d.html' % (number + 1)
            yield scrapy.Request(new_url, callback=self.parse)
        else:
            return
Example #2
    def parse(self, response):
        messages = response.xpath("//table[@class='winstyle54630']").xpath(
            ".//tr[@height='26']")

        for i in xrange(len(messages)):
            report_name = messages[i].xpath(".//td")[0].xpath(
                ".//a/text()").extract()[0]
            if u'学术报告' not in report_name:
                continue

            report_url = self.domain + messages[i].xpath(".//td")[0].xpath(
                ".//a/@href").extract()[0][3:]
            report_time = get_localtime(messages[i].xpath(".//td")[1].xpath(
                ".//span/text()").extract()[0].strip().replace('/', '-'))

            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })
            # return

        now_number = int(
            response.xpath("//tr[@valign='middle']/td/text()").extract()
            [0].strip().split('/')[0][-1])
        last_number = int(
            response.xpath("//tr[@valign='middle']/td/text()").extract()
            [0].strip().split('/')[-1])
Example #3
	def parse(self, response):
		messages = response.xpath("//ul[@class='list-none metlist']/li")
		print_new_number(self.counts, 'USTC', self.name)

		for i in xrange(len(messages)):
			report_url = self.domain + messages[i].xpath(".//a/@href").extract()[0][2:]
			report_time = get_localtime(messages[i].xpath(".//span/text()").extract()[0].strip())

			if report_time < now_time:
				return
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})
Example #4
	def parse(self, response):
		messages = response.xpath("//div[@class='box_list']/ul/li")


		for i in xrange(len(messages)):
			report_url = self.domain + messages[i].xpath(".//a/@href").extract()[0][1:]
			report_time = get_localtime(messages[i].xpath(".//p/text()").extract()[0].strip())

			if report_time < now_time:
				return
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})
Example #5
	def parse(self, response):
		messages = response.xpath("//td[@class='middle']").xpath(".//tr")
		print_new_number(self.counts, 'USTC', self.name)

		for i in xrange(len(messages)):
			report_title = messages[i].xpath(".//span/a/text()").extract()[0]
			report_url = self.domain + messages[i].xpath(".//span/a/@href").extract()[0]
			report_time = get_localtime(messages[i].xpath(".//span/a/text()").extract()[-1].strip('()'))
			if report_time < now_time:
				return
			if u'本周报告' in report_title:
				continue
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})
Example #6
	def parse(self, response):
		links = response.xpath("//li[@width='30%']/a/@href").extract()
		times = response.xpath("//li[@width='30%']/span/text()").extract()
		print_new_number(self.counts, 'USTC', self.name)

		l = len(links)
		for i in range(l):
			report_url = self.domain + links[i][2:]
			report_time = get_localtime(times[i])

			if report_time < now_time:
				return
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})
Example #7
	def parse(self, response):
		messages = response.xpath("//div[@class='list']/ul/li")
		print_new_number(self.counts, 'USTC', self.name)

		for i in xrange(len(messages)):
			if u'青年论坛' in messages[i].xpath(".//a/text()").extract()[0]:
				report_url = messages[i].xpath(".//a/@href").extract()[0]
			else:
				report_url = self.domain + messages[i].xpath(".//a/@href").extract()[0][9:]
			if 'Colloquium' in report_url:
				continue
			report_time = get_localtime('20' + messages[i].xpath(".//span/text()").extract()[0].strip('[]'))

			if report_time < now_time:
				return
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})
Example #8
    def parse(self, response):
        messages = response.xpath("//div[@id='container']/dl/dd")

        for i in xrange(len(messages)):
            report_url = self.domain + messages[i].xpath(
                ".//a/@href").extract()[0][1:]
            report_time = get_localtime(
                messages[i].xpath(".//i/text()").extract()[0].split(' ')[0])
            if report_time < now_time:
                return
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })
Example #9
	def parse(self, response):
		messages = response.xpath("//div[@id='container']/dl/dd")
		print_new_number(self.counts, 'WHU', self.name)

		for i in xrange(len(messages)):
			report_url = self.domain + messages[i].xpath(".//a/@href").extract()[0][1:]
			report_time = get_localtime(messages[i].xpath(".//i/text()").extract()[0].split(' ')[0])
			if report_time < now_time:
				return
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})

		now_number = int(response.xpath("//div[@class='page fn_clear']/ul/li[@class='thisclass']/text()").extract()[0])
		last_number = int(response.xpath("//span[@class='pageinfo']/strong")[0].xpath(".//text()").extract()[0])

		if now_number >= last_number:
			return
		new_url = 'http://cs.whu.edu.cn/a/xueshujiangzuofabu/list_39_{}.html'.format(now_number + 1)
		yield scrapy.Request(new_url, callback=self.parse)
Example #10
    def parse(self, response):
        messages = response.xpath(
            "//div[@class='view-content']/table/tbody/tr")
        print_new_number(self.counts, 'USTC', self.name)

        sign = 0
        for i in xrange(len(messages)):
            message = messages[i].xpath(".//td")
            report_url = self.domain + message[0].xpath(
                ".//a/@href").extract()[0][1:]
            report_class = message[1].xpath(".//text()").extract()[0].strip()
            report_time = get_localtime(
                message[2].xpath(".//text()").extract()[0].strip())
            if u'学术报告' not in report_class:
                continue
            if report_time < now_time:
                sign = 1
                continue
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })

        # The reports on this page are not sorted by time, so we only stop crawling at the end of a page.
        if sign:
            return
        now_number = response.xpath(
            "//ul[@class='pager']/li[@class='pager-current first']/text()"
        ).extract()
        if len(now_number) == 0:
            now_number = int(
                response.xpath(
                    "//ul[@class='pager']/li[@class='pager-current']/text()").
                extract()[0])
        else:
            now_number = int(now_number[0])
        next_url = 'http://ess.ustc.edu.cn/notice?page=%d' % now_number

        yield scrapy.Request(next_url, callback=self.parse)
Example #11
	def parse(self, response):
		messages = response.xpath("//td[@class='middle']").xpath(".//tr")
		print_new_number(self.counts, 'USTC', self.name)

		for i in xrange(len(messages)):
			report_title = messages[i].xpath(".//span/a/text()").extract()[0]
			report_url = self.domain + messages[i].xpath(".//span/a/@href").extract()[0]
			report_time = get_localtime(messages[i].xpath(".//span/a/text()").extract()[-1].strip('()'))
			if report_time < now_time:
				return
			if u'本周报告' in report_title:
				continue
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})

		now_number = int(response.xpath("//a[@href='#']").xpath(".//text()").extract()[0])
		last_number = int(response.xpath("//a[@href='#']").xpath(".//text()").extract()[-1][1:])
		if now_number >= last_number:
			return
		next_url = 'http://math.ustc.edu.cn/new/list.php?fid=35&page=%d' % (now_number + 1)

		yield scrapy.Request(next_url, callback=self.parse)
Example #12
    def parse_pages(self, response):
        messages = response.xpath("//div[@class='bbs-info']")

        title = self.try_get_message(messages.xpath(".//h2/text()").extract())

        time = messages.xpath(".//p")[0].xpath(".//text()").extract()[1]

        address = messages.xpath(".//p")[1].xpath(".//text()").extract()[1]

        speaker = messages.xpath(".//p")[2].xpath(".//text()").extract()[1]

        other = response.xpath("//div[@class='show-new']")

        if len(other) == 0:
            content = ''
        else:
            content = other.xpath(".//text()").extract()[0].strip()
            # The '简介'/'Abstract' marker may use a full-width or an ASCII colon.
            if u'简介：' in content or 'Abstract：' in content or u'简介:' in content or 'Abstract:' in content:
                content = self.connect_messages(
                    content, u'：'
                ) if u'简介：' in content or 'Abstract：' in content else self.connect_messages(
                    content, ':')

        report_time = get_localtime(
            response.xpath("//div[@class='wtime']/text()").extract()
            [0].strip())
        if report_time < now_time:
            title = ''
        else:
            self.counts += 1
        print_new_number(self.counts, 'THU', self.name)

        all_messages = save_messages('THU', self.name, title, time, address,
                                     speaker, '', content, '',
                                     response.meta['link'],
                                     response.meta['number'], u'清华大学')

        return all_messages
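
Example #12 also calls two spider helpers, try_get_message and connect_messages, that do not appear in any of these snippets. The sketch below is only an assumption of what they might look like, guessing that try_get_message guards against empty extract() results and that connect_messages keeps the text following the '简介'/'Abstract' separator; the real project code may differ:

    def try_get_message(self, results):
        # Hypothetical helper: return the first extracted string, or '' when the
        # XPath matched nothing, so indexing never raises IndexError.
        return results[0].strip() if len(results) > 0 else ''

    def connect_messages(self, content, separator):
        # Hypothetical helper: drop everything up to and including the
        # '简介'/'Abstract' marker and keep the abstract text that follows.
        return separator.join(content.split(separator)[1:]).strip()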
Example #13
    def parse(self, response):
        messages = response.xpath("//div[@class='full-page-list']/ul/li")
        print_new_number(self.counts, 'SYSU', self.name)

        for i in xrange(len(messages)):
            report_name = messages[i].xpath(".//a/text()").extract()[0]
            if u'学术报告：' not in report_name and u'学术报告:' not in report_name:
                continue
            report_url = self.domains + messages[i].xpath(
                ".//a/@href").extract()[0][1:]
            report_time = get_localtime(
                messages[i].xpath(".//span/text()").extract()[0].replace(
                    '/', '-'))

            if report_time < now_time:
                return
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })
Example #14
    def parse(self, response):
        messages = response.xpath(
            "//div[@class='view-content']/table/tbody/tr")
        print_new_number(self.counts, 'USTC', self.name)

        sign = 0
        for i in xrange(len(messages)):
            message = messages[i].xpath(".//td")
            report_url = self.domain + message[0].xpath(
                ".//a/@href").extract()[0][1:]
            report_class = message[1].xpath(".//text()").extract()[0].strip()
            report_time = get_localtime(
                message[2].xpath(".//text()").extract()[0].strip())
            if u'学术报告' not in report_class:
                continue
            if report_time < now_time:
                sign = 1
                continue
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })
Example #15
# -*- coding:utf-8 -*-

# Python 2 workaround: make UTF-8 the default string encoding for Chinese text.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import time
import scrapy
from Global_function import get_localtime, print_new_number, save_messages

# Reports dated before today's local date are treated as already seen and stop the crawl.
now_time = get_localtime(time.strftime("%Y-%m-%d", time.localtime()))
# now_time = 20170401

class USTC006_Spider(scrapy.Spider):
	name = 'USTC006'
	start_urls = ['http://biox.ustc.edu.cn/xsbg/']
	domain = 'http://biox.ustc.edu.cn/xsbg/'
	counts = 0

	def parse(self, response):
		messages = response.xpath("//ul[@class='list-none metlist']/li")
		print_new_number(self.counts, 'USTC', self.name)

		for i in xrange(len(messages)):
			report_url = self.domain + messages[i].xpath(".//a/@href").extract()[0][2:]
			report_time = get_localtime(messages[i].xpath(".//span/text()").extract()[0].strip())

			if report_time < now_time:
				return
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})
			# return
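
Every example above imports get_localtime, print_new_number and save_messages from Global_function, which is not shown here. The sketch below is a hypothetical reconstruction of only the behaviour the snippets rely on, assuming get_localtime turns a date string such as '2017-04-01' into the comparable integer 20170401 (the commented-out now_time = 20170401 fallback in Example #15 hints at that format); the real module may differ:

# Hypothetical sketch of Global_function.py; the names and signatures come from
# the calls in the examples above, the bodies are assumptions.
import re

def get_localtime(text):
    # Normalize '2017-04-01', '2017/4/1' and similar strings into the integer
    # 20170401 so report dates can be compared with plain '<' and '>'.
    year, month, day = (int(p) for p in re.findall(r'\d+', text)[:3])
    return int('%04d%02d%02d' % (year, month, day))

def print_new_number(counts, school, name):
    # Progress logging only: how many reports the given spider has collected so far.
    print('%s %s: %d reports collected' % (school, name, counts))

def save_messages(school, name, title, time, address, speaker, speaker_intro,
                  content, attachment, link, number, school_name):
    # Bundle one report into the dict an item pipeline can store; the two ''
    # arguments in Example #12 suggest optional speaker-intro and attachment slots.
    return {
        'school': school, 'spider': name, 'title': title, 'time': time,
        'address': address, 'speaker': speaker, 'speaker_intro': speaker_intro,
        'content': content, 'attachment': attachment, 'link': link,
        'number': number, 'school_name': school_name,
    }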