Code example #1
File: NWPU001.py Project: AnselCmy/ARPS
    def parse(self, response):
        messages = response.xpath("//table[@class='winstyle54630']").xpath(
            ".//tr[@height='26']")

        for i, message in enumerate(messages):
            report_name = message.xpath(".//td")[0].xpath(
                ".//a/text()").extract()[0]
            # Keep only academic-report (学术报告) announcements.
            if u'学术报告' not in report_name:
                continue

            report_url = self.domain + message.xpath(".//td")[0].xpath(
                ".//a/@href").extract()[0][3:]
            report_time = get_localtime(message.xpath(".//td")[1].xpath(
                ".//span/text()").extract()[0].strip().replace('/', '-'))

            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
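
Every parse method in these examples applies the same date window: report_time > end_time skips announcements beyond the crawl horizon, and report_time < now_time ends the scan outright, which only works because the listings are sorted newest-first. A minimal sketch of the values being compared (hypothetical stand-ins; the real get_localtime, now_time, and end_time come from the project's __Global_function and __Global_variable modules):

import time

def get_localtime(date_str):
    # Assumed behaviour: turn "2018-05-04" into the comparable integer
    # 20180504 (code example #19 builds now_time this way, and code
    # example #6 slices month-day digits out of str(get_localtime(...))).
    year, month, day = [int(p) for p in date_str.split('-')]
    return year * 10000 + month * 100 + day

now_time = get_localtime(time.strftime("%Y-%m-%d", time.localtime()))
end_time = get_localtime("2030-01-01")  # illustrative far-future cutoff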
Code example #2
    def parse(self, response):
        messages = response.xpath("//div[@class='new']/div")

        for i, message in enumerate(messages[:-1]):
            report_name = message.xpath(".//a/@title").extract()[0]
            # Keep only report (报告) or lecture (讲座) announcements.
            if re.search(u"(报告|讲座)", report_name) is None:
                continue

            report_time = get_localtime(
                message.xpath("div/span/text()").extract()[0].strip().strip(
                    "()"))
            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            report_url = self.domain + message.xpath(".//a/@href").extract()[0]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time,
                                     'title': report_name
                                 })
Code example #3
    def parse(self, response):
        messages = response.xpath("//div[@class='article_list']/ul/li")

        for i, message in enumerate(messages):
            report_name = message.xpath(".//a/text()").extract()[0]
            # Keep only academic-report preview (学术报告预告) announcements.
            if u"学术报告预告" not in report_name:
                continue

            report_time = get_localtime(
                message.xpath(".//div[@class='p_date']/text()").extract()
                [0].replace('/', '-'))
            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            report_url = self.domain + message.xpath(
                ".//a/@href").extract()[0][1:]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #4
File: SDU001.py Project: AnselCmy/ARPS
    def parse(self, response):
        messages = response.xpath("//div[@class='sub_text']").xpath(
            ".//div[@class='news-list']")

        for i, message in enumerate(messages):
            report_name = message.xpath(".//a/text()").extract()[0]
            # Matches either 学术报告 ("academic report") or a bare 讲座
            # ("lecture") anywhere in the title.
            if re.search(u"学术报告|讲座", report_name) is None:
                continue

            report_url = self.domain + message.xpath(
                ".//a/@href").extract()[0][1:]
            report_time = get_localtime(
                message.xpath(".//div[@class='lastTime']/text()").extract()[0])

            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #5
	def parse(self, response):
		messages = response.xpath("//div[@class='news-list']/ul/li")

		for i, message in enumerate(messages):
			# Strip the square brackets around the listed date before parsing it.
			report_time = get_localtime(re.sub(u"[\[\]]", '', message.xpath("span/text()").extract()[0].strip()))
			if report_time > end_time:
				continue
			if report_time < now_time:
				return

			report_url = self.domain + message.xpath(".//a/@href").extract()[0][1:]
			yield scrapy.Request(report_url, callback=self.parse_pages,
			                     meta={'link': report_url, 'number': i + 1, 'publication': report_time})
Code example #6
	def get_year(self, text, day, month):
		# Pull the digits (and any non-CJK punctuation) that precede 年 ("year").
		year = re.search(u"[\d]*((?![\u4e00-\u9fa5])[\W])*(?=年)", text)
		if year is not None:
			year = year.group()
			# Pad a two-digit year such as "18" out to "2018".
			if len(year.strip()) < 4:
				year = "20" + year
		elif day is not None and month is not None:
			# No explicit year in the text: assume the nearest future
			# occurrence, rolling into next year if the month-day has passed.
			now_year = int(self.now_time.split('-')[0])
			now_month_day = int(str(get_localtime(self.now_time))[4:])
			report_month_day = int(month) * 100 + int(day)
			if report_month_day < now_month_day:
				year = str(now_year + 1)
			else:
				year = str(now_year)

		return year
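
A minimal standalone sketch of the same year-inference rule (a hypothetical rewrite for illustration; the real method lives on the spider and reads self.now_time):

import datetime

def infer_year(day, month, today=None):
	# When an announcement omits the year, pick the nearest future occurrence.
	today = today or datetime.date.today()
	report_month_day = int(month) * 100 + int(day)
	now_month_day = today.month * 100 + today.day
	# A month-day that has already passed this year must mean next year.
	return str(today.year + 1) if report_month_day < now_month_day else str(today.year)

# e.g. with today == 2018-05-04: infer_year('3', '12') -> '2018',
# while infer_year('3', '1') -> '2019'.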
Code example #7
File: BNU001.py Project: AnselCmy/ARPS
	def parse(self, response):
		messages = response.xpath("//div[@class='twelve columns alpha']/ul/li")

		for i, message in enumerate(messages):
			report_name = message.xpath(".//a/@title").extract()[0]
			# Keep items marked 【预告】 ("preview"), but skip 论坛 ("forum") items.
			if u"【预告】" not in report_name or u"论坛" in report_name:
				continue

			report_time = get_localtime(message.xpath("span/text()").extract()[0])
			if report_time > end_time:
				continue
			if report_time < now_time:
				return

			report_url = self.domain + message.xpath(".//a/@href").extract()[0]
			yield scrapy.Request(report_url, callback=self.parse_pages,
			                     meta={'link': report_url, 'number': i + 1, 'publication': report_time})
Code example #8
	def parse(self, response):
		messages = response.xpath("//div[@class='uc_lanmu_content']/ul/li")

		for i, message in enumerate(messages):
			report_name = message.xpath(".//a/text()").extract()[0]
			if re.search(u"[::]", report_name, re.S) != None:
				report_name = re.split(u"[::]", report_name)[-1]

			print report_name
			report_time = get_localtime(message.xpath(".//span[@class='article_date']/text()").extract()[0])
			if report_time > end_time:
				continue
			if report_time < now_time:
				return

			report_url = self.domain + message.xpath(".//a/@href").extract()[0][1:]
			yield scrapy.Request(report_url, callback=self.parse_pages,
			                     meta={'link': report_url, 'number': i + 1, 'name': report_name, 'publication': report_time})
Code example #9
	def get_time(self, text):
		day = self.get_day(text)
		month = self.get_month(text, day)
		year = self.get_year(text, day, month)

		start_time = None
		if day is not None and month is not None and year is not None:
			start_time = year + '-' + month + '-' + day
		else:
			# Fall back to a purely numeric date such as 2018-5-4 or 5.4.
			start_time = re.search(u"([\d]*)[-~.,,]*([\d]+)[-~.,,]+([\d]+)", text)
			if start_time is not None:
				start_time = re.split(u"[-~.,,]+", start_time.group())
				if len(start_time) == 3:
					start_time = start_time[0] + '-' + start_time[1] + '-' + start_time[2]
				elif len(start_time) == 2:
					day = start_time[1]
					month = start_time[0]
					year = self.get_year('', day, month)
					start_time = year + '-' + month + '-' + day
				else:
					start_time = None
			else:
				# Last resort: a weekday mention such as 周五 or 星期三.
				weekday = re.findall(u"(?:星期|周)(一|二|三|四|五|六|七|日|天|末|[\d])", text)[0]
				if re.sub(u"\\s+", '', weekday) != '':
					if weekday in self.week2day:
						weekday = int(self.week2day[weekday])
					else:
						weekday = int(weekday)

					# Project the weekday onto its next calendar occurrence.
					now_weekday = datetime.datetime.now().weekday() + 1
					if weekday < now_weekday:
						start_time = str(datetime.datetime.now() + datetime.timedelta(days=weekday + 7 - now_weekday)).split(' ')[0]
					else:
						start_time = str(datetime.datetime.now() + datetime.timedelta(days=weekday - now_weekday)).split(' ')[0]
					print(start_time)

		if start_time is None or re.sub(u"\\s+", '', start_time) == '':
			return None
		else:
			try:
				return get_localtime(start_time)
			except Exception:
				# Whatever was assembled still failed to parse as a date.
				return None
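
The weekday branch above projects a mention such as 周五 ("Friday") onto its next calendar occurrence. A minimal standalone sketch of that projection (hypothetical helper; the real code keeps everything inline on the spider):

import datetime

def next_weekday(weekday):
	# weekday is ISO-style 1..7 (Monday..Sunday), as produced by week2day.
	today = datetime.date.today()
	delta = weekday - today.isoweekday()
	if delta < 0:
		delta += 7  # already passed this week, so take next week's occurrence
	return str(today + datetime.timedelta(days=delta))

# On a Wednesday, next_weekday(5) is this Friday; next_weekday(1) is next Monday.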
Code example #10
    def parse(self, response):
        messages = response.xpath("//div[@id='rightPageContent']/dl/dd")

        for i, message in enumerate(messages):
            report_time = get_localtime(
                message.xpath("span/text()").extract()[0])
            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            report_url = message.xpath(".//a/@href").extract()[0]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #11
    def parse(self, response):
        messages = response.xpath("//div[@id='container']/dl/dd")

        for i, message in enumerate(messages):
            report_url = self.domain + message.xpath(
                ".//a/@href").extract()[0][1:]
            report_time = get_localtime(
                message.xpath(".//i/text()").extract()[0].split(' ')[0])

            if report_time > end_time:
                continue
            if report_time < now_time:
                return
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #12
    def parse_pages(self, response):
        report_time = get_localtime(
            re.split(
                u"[::]",
                response.xpath("//td[@height='32']/div/strong")[0].xpath(
                    "text()").extract()[0])[-1])

        if report_time < now_time or report_time > end_time:
            return

        messages = response.xpath("//span[contains(@class, 'content')]/p")

        return {
            'text': messages,
            'number': response.meta['number'],
            # Department of Computer Science and Technology, East China Normal University
            'organizer': u'华东师范大学计算机科学技术系',
            'faculty': self.name,
            'link': response.meta['link'],
            'publication': report_time,
            'location': u"华东:上海市"
        }
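
Code example #12 is the consuming end of the meta dict that the other parse methods attach to their Requests: Scrapy carries request.meta over to the response object it passes into the callback. A minimal sketch of that round-trip (hypothetical URL and values):

    def parse(self, response):
        # Stash context on the request; it reappears as response.meta
        # inside the callback.
        url = 'http://example.edu/reports/1.html'  # hypothetical
        yield scrapy.Request(url,
                             callback=self.parse_pages,
                             meta={'link': url, 'number': 1})

    def parse_pages(self, response):
        # The same dict, untouched: meta['link'] is the URL stashed above.
        return {'link': response.meta['link'], 'number': response.meta['number']}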
Code example #13
File: SYSU001.py Project: AnselCmy/ARPS
    def parse(self, response):
        messages = response.xpath("//div[@class='full-page-list']/ul/li")

        for i, message in enumerate(messages):
            report_url = self.domains + message.xpath(
                ".//a/@href").extract()[0][1:]
            report_time = get_localtime(
                message.xpath(".//span/text()").extract()[0].replace(
                    '/', '-'))

            if report_time > end_time:
                continue
            if report_time < now_time:
                return
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #14
    def parse(self, response):
        messages = response.xpath(
            "//table[@width='100%']/tbody/tr/td/table").xpath(".//tr")

        for i, message in enumerate(messages[:-1]):
            report_url = self.domain + message.xpath(
                ".//a/@href").extract()[0][1:]
            report_time = get_localtime(
                message.xpath(".//font/text()").extract()[0].strip('[]'))

            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #15
File: CSU001.py Project: AnselCmy/ARPS
    def parse(self, response):
        messages = response.xpath(
            "//div[@class='article-list-right']/div[@class='article-list-right-li new-article']"
        )

        for i, message in enumerate(messages):
            report_time = get_localtime(
                message.xpath(".//div[@class='article-list-left-li-r']/text()"
                              ).extract()[0])
            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            report_url = self.domain + message.xpath(
                ".//a/@href").extract()[0][3:]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #16
File: SEU001.py Project: AnselCmy/ARPS
    def parse(self, response):
        messages = response.xpath("//table[@class='datatable']/tr")

        for i, message in enumerate(messages[:-1]):
            report_name = message.xpath(".//a/@title").extract()[0]
            if u"讲座" not in report_name:
                continue

            report_time = get_localtime(
                message.xpath("td")[-1].xpath("span/text()").extract()[0])
            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            report_url = self.domain + message.xpath(".//a/@href").extract()[0]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #17
    def parse(self, response):
        messages = response.xpath("//table[@id='dgrdNews']/tr")

        for i, message in enumerate(messages[:-1]):
            report_name = message.xpath(".//a/text()").extract()[0]
            if re.search(u"讲座|报告", report_name) is None:
                continue

            report_time = get_localtime("20" + message.xpath(".//td")[2].xpath(
                ".//text()").extract()[0].split(' ')[0])
            if report_time > end_time:
                continue
            if report_time < now_time:
                return

            report_url = self.domain + message.xpath(".//a/@href").extract()[0]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #18
File: SUDA001.py Project: AnselCmy/ARPS
    def parse(self, response):
        messages = response.xpath("//div[@id='TextList_time']/table")[0].xpath(
            "tr")

        for i, message in enumerate(messages):
            report_sign = message.xpath(".//a")
            # Skip rows that contain no link at all.
            if len(report_sign) == 0:
                continue

            report_time = get_localtime(
                message.xpath("td")[-1].xpath(".//text()").extract()[0])
            if report_time > end_time:
                continue
            elif report_time < now_time:
                return

            report_url = self.domain + message.xpath(".//a/@href").extract()[0]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1,
                                     'publication': report_time
                                 })
Code example #19
# -*- coding:utf-8 -*-
from __future__ import print_function

import os
import time
import shutil
import traceback
from report_crawler.spiders.__Global_function import get_localtime
from report_crawler.spiders.__Global_variable import REPORT_SAVEDIR

now_time = get_localtime(time.strftime("%Y-%m-%d", time.localtime()))

DATADIR = REPORT_SAVEDIR + '/' + str(now_time)


class Spider_starter(object):

	def crawl(self):
		self.X001()

	def run_spider(self, spider_name):
		# e.g. spider_name "NWPU001" -> <REPORT_SAVEDIR>/<today>/001/NWPU
		dirname = REPORT_SAVEDIR + '/' + str(now_time) + '/' + spider_name[-3:] + '/' + spider_name[:-3]
		# If today's directory for this spider already exists, clear it out.
		if os.path.exists(dirname):
			shutil.rmtree(dirname, True)
		# If one of the spiders raises, print_exc() tells us which one is the culprit.
		try:
			if not os.path.exists(DATADIR):
				os.makedirs(DATADIR)
			os.system('scrapy crawl ' + spider_name)
		except Exception:
			traceback.print_exc()
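
crawl() above dispatches to per-spider methods such as X001. A hedged sketch of what one of those methods presumably looks like, given the run_spider signature above (the method body and the spider name 'NWPU001' are assumptions, not code from the repository):

	# Hypothetical per-spider hook: crawl() invokes it, and it delegates to
	# run_spider with the spider's registered name (assumed here).
	def X001(self):
		self.run_spider('NWPU001')


if __name__ == '__main__':
	Spider_starter().crawl()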