def parse(self, response):
    """Parse the HFUT news list: yield a detail request for every upcoming
    report, then follow numbered pagination until the last page.

    Stops the whole crawl at the first stale entry (list is newest-first).
    """
    links = response.xpath(
        "//div[@class='container row main in2']/div/ul/li/a/@href").extract()
    times = response.xpath(
        "//div[@class='container row main in2']/div/ul/li/span/text()").extract()
    print_new_number(self.counts, 'HFUT', self.name)
    for i in range(len(links)):
        report_time = get_localtime(times[i])
        # First entry older than today => everything after it is older too.
        if report_time < now_time:
            return
        report_url = self.domain + links[i][1:]
        yield scrapy.Request(report_url,
                             callback=self.parse_pages,
                             meta={'link': report_url, 'number': i + 1})
    # Current page number is embedded in the URL: .../list-28-<n>.html
    number = int(response.url.split('-')[-1].split('.')[0])
    # BUG FIX: the pager label is a string; the original compared
    # `int < str`, which is always True on Python 2, so the stop
    # condition never triggered. Convert to int before comparing.
    last_number = int(
        response.xpath("//div[@id='pages']/a/text()").extract()[-2])
    if number < last_number:
        new_url = 'http://news.hfut.edu.cn/list-28-%d.html' % (number + 1)
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Scan the notice table and request every upcoming academic report.

    Rows dated past `end_time` are skipped; the first row older than today
    ends the scan (rows are newest-first).
    """
    rows = response.xpath("//table[@class='winstyle54630']").xpath(
        ".//tr[@height='26']")
    for idx in xrange(len(rows)):
        cells = rows[idx].xpath(".//td")
        title = cells[0].xpath(".//a/text()").extract()[0]
        if u'学术报告' not in title:
            continue
        link = self.domain + cells[0].xpath(".//a/@href").extract()[0][3:]
        when = get_localtime(cells[1].xpath(
            ".//span/text()").extract()[0].strip().replace('/', '-'))
        if when > end_time:
            continue
        if when < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
    # Page indicator text (".../<current>/<last>") is parsed here but not
    # used in the visible code — presumably groundwork for pagination.
    page_text = response.xpath(
        "//tr[@valign='middle']/td/text()").extract()[0].strip()
    now_number = int(page_text.split('/')[0][-1])
    last_number = int(page_text.split('/')[-1])
def parse(self, response):
    """Request the detail page of every report whose date has not passed."""
    items = response.xpath("//ul[@class='list-none metlist']/li")
    print_new_number(self.counts, 'USTC', self.name)
    for idx in xrange(len(items)):
        item = items[idx]
        link = self.domain + item.xpath(".//a/@href").extract()[0][2:]
        when = get_localtime(
            item.xpath(".//span/text()").extract()[0].strip())
        # List is newest-first: the first stale entry ends the scan.
        if when < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
def parse(self, response):
    """Yield a request for each listed report until a past date is met."""
    entries = response.xpath("//div[@class='box_list']/ul/li")
    for n, entry in enumerate(entries):
        link = self.domain + entry.xpath(".//a/@href").extract()[0][1:]
        when = get_localtime(
            entry.xpath(".//p/text()").extract()[0].strip())
        # Entries are newest-first; stop at the first stale one.
        if when < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': n + 1})
def parse(self, response):
    """Walk the report table, skipping weekly digest rows, and request
    every upcoming report's detail page.
    """
    rows = response.xpath("//td[@class='middle']").xpath(".//tr")
    print_new_number(self.counts, 'USTC', self.name)
    for idx in xrange(len(rows)):
        texts = rows[idx].xpath(".//span/a/text()").extract()
        title = texts[0]
        link = self.domain + rows[idx].xpath(".//span/a/@href").extract()[0]
        # The date is the last anchor text, wrapped in parentheses.
        when = get_localtime(texts[-1].strip('()'))
        if when < now_time:
            return
        if u'本周报告' in title:
            continue
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
def parse(self, response):
    """Request the detail page for each upcoming report link."""
    hrefs = response.xpath("//li[@width='30%']/a/@href").extract()
    stamps = response.xpath("//li[@width='30%']/span/text()").extract()
    print_new_number(self.counts, 'USTC', self.name)
    for idx, href in enumerate(hrefs):
        link = self.domain + href[2:]
        # Newest-first list: stop at the first stale entry.
        if get_localtime(stamps[idx]) < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
def parse(self, response):
    """Request each upcoming report. Forum (青年论坛) links are already
    absolute; other links are joined onto the department domain.
    Colloquium pages are skipped entirely.
    """
    items = response.xpath("//div[@class='list']/ul/li")
    print_new_number(self.counts, 'USTC', self.name)
    for idx in xrange(len(items)):
        item = items[idx]
        anchor_text = item.xpath(".//a/text()").extract()[0]
        href = item.xpath(".//a/@href").extract()[0]
        if u'青年论坛' in anchor_text:
            link = href
        else:
            link = self.domain + href[9:]
        if 'Colloquium' in link:
            continue
        # Dates are listed as two-digit years inside brackets, e.g. [17-04-01].
        when = get_localtime(
            '20' + item.xpath(".//span/text()").extract()[0].strip('[]'))
        if when < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
def parse(self, response):
    """Follow every report entry whose date has not yet passed."""
    entries = response.xpath("//div[@id='container']/dl/dd")
    for n, entry in enumerate(entries):
        link = self.domain + entry.xpath(".//a/@href").extract()[0][1:]
        # The <i> text carries "date time"; only the date part is compared.
        when = get_localtime(
            entry.xpath(".//i/text()").extract()[0].split(' ')[0])
        if when < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': n + 1})
def parse(self, response):
    """Crawl the WHU report list, then advance through numbered index
    pages while more exist.
    """
    entries = response.xpath("//div[@id='container']/dl/dd")
    print_new_number(self.counts, 'WHU', self.name)
    for n, entry in enumerate(entries):
        link = self.domain + entry.xpath(".//a/@href").extract()[0][1:]
        # The <i> text carries "date time"; only the date part is compared.
        when = get_localtime(
            entry.xpath(".//i/text()").extract()[0].split(' ')[0])
        if when < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': n + 1})
    current = int(response.xpath(
        "//div[@class='page fn_clear']/ul/li[@class='thisclass']/text()"
    ).extract()[0])
    last = int(response.xpath(
        "//span[@class='pageinfo']/strong")[0].xpath(".//text()").extract()[0])
    if current >= last:
        return
    next_url = 'http://cs.whu.edu.cn/a/xueshujiangzuofabu/list_39_{}.html'.format(current + 1)
    yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Collect academic-report rows from the notice table, then move to
    the next page.

    Rows on a page are not date-sorted, so stale rows only set a flag and
    the crawl stops after the whole page has been scanned.
    """
    rows = response.xpath("//div[@class='view-content']/table/tbody/tr")
    print_new_number(self.counts, 'USTC', self.name)
    saw_stale = 0
    for idx in xrange(len(rows)):
        cells = rows[idx].xpath(".//td")
        link = self.domain + cells[0].xpath(".//a/@href").extract()[0][1:]
        category = cells[1].xpath(".//text()").extract()[0].strip()
        when = get_localtime(
            cells[2].xpath(".//text()").extract()[0].strip())
        if u'学术报告' not in category:
            continue
        if when < now_time:
            saw_stale = 1
            continue
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
    if saw_stale:
        return
    # No +1 below: the site's 'page' query parameter appears to be
    # zero-based while the displayed pager is one-based — TODO confirm.
    current = response.xpath(
        "//ul[@class='pager']/li[@class='pager-current first']/text()"
    ).extract()
    if len(current) == 0:
        current = int(response.xpath(
            "//ul[@class='pager']/li[@class='pager-current']/text()"
        ).extract()[0])
    else:
        current = int(current[0])
    next_url = 'http://ess.ustc.edu.cn/notice?page=%d' % current
    yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Crawl the report table (skipping weekly digests), then follow the
    numbered pager until the last page.
    """
    rows = response.xpath("//td[@class='middle']").xpath(".//tr")
    print_new_number(self.counts, 'USTC', self.name)
    for idx in xrange(len(rows)):
        texts = rows[idx].xpath(".//span/a/text()").extract()
        title = texts[0]
        link = self.domain + rows[idx].xpath(".//span/a/@href").extract()[0]
        # The date is the last anchor text, wrapped in parentheses.
        when = get_localtime(texts[-1].strip('()'))
        if when < now_time:
            return
        if u'本周报告' in title:
            continue
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
    pager_texts = response.xpath("//a[@href='#']").xpath(".//text()").extract()
    current = int(pager_texts[0])
    # The last pager label carries a one-character prefix before the number.
    last = int(pager_texts[-1][1:])
    if current > last:
        return
    next_url = 'http://math.ustc.edu.cn/new/list.php?fid=35&page=%d' % (current + 1)
    yield scrapy.Request(next_url, callback=self.parse)
def parse_pages(self, response):
    """Extract one THU report's fields from its detail page and persist them.

    Reads title/time/address/speaker from the 'bbs-info' block and the body
    text from 'show-new', then hands everything to save_messages.
    Returns whatever save_messages returns.
    """
    messages = response.xpath("//div[@class='bbs-info']")
    title = self.try_get_message(messages.xpath(".//h2/text()").extract())
    # The three <p> rows are assumed to be time / address / speaker, each
    # with the value as the second text node — TODO confirm page layout.
    # NOTE(review): `time` shadows the module-level `time` import here.
    time = messages.xpath(".//p")[0].xpath(".//text()").extract()[1]
    address = messages.xpath(".//p")[1].xpath(".//text()").extract()[1]
    speaker = messages.xpath(".//p")[2].xpath(".//text()").extract()[1]
    other = response.xpath("//div[@class='show-new']")
    if len(other) == 0:
        content = ''
    else:
        content = other.xpath(".//text()").extract()[0].strip()
        # The abstract marker may use a full-width (：) or half-width (:)
        # colon; pick the matching separator for connect_messages.
        if u'简介:' in content or 'Abstract:' in content or u'简介:' in content or 'Abstract:' in content:
            content = self.connect_messages(
                content, ':'
            ) if u'简介:' in content or 'Abstract:' in content else self.connect_messages(
                content, ':')
        else:
            pass
    report_time = get_localtime(
        response.xpath("//div[@class='wtime']/text()").extract()
        [0].strip())
    # Past reports are still saved, but with an empty title; only future
    # ones bump the running counter.
    if report_time < now_time:
        title = ''
    else:
        self.counts += 1
        print_new_number(self.counts, 'THU', self.name)
    all_messages = save_messages('THU', self.name, title, time, address,
                                 speaker, '', content, '',
                                 response.meta['link'],
                                 response.meta['number'], u'清华大学')
    return all_messages
def parse(self, response):
    """Request detail pages for entries explicitly titled as academic
    reports (both full- and half-width colon spellings are accepted).
    """
    items = response.xpath("//div[@class='full-page-list']/ul/li")
    print_new_number(self.counts, 'SYSU', self.name)
    for idx in xrange(len(items)):
        entry_title = items[idx].xpath(".//a/text()").extract()[0]
        if u'学术报告:' not in entry_title and u'学术报告:' not in entry_title:
            continue
        # NOTE(review): attribute is `domains` (plural) here, unlike the
        # sibling spiders' `domain` — verify against this class definition.
        link = self.domains + items[idx].xpath(".//a/@href").extract()[0][1:]
        when = get_localtime(items[idx].xpath(
            ".//span/text()").extract()[0].replace('/', '-'))
        if when < now_time:
            return
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
def parse(self, response):
    """Yield requests for academic-report rows on this page.

    Rows are not date-sorted, so stale rows are skipped (flagged) rather
    than terminating the scan early.
    """
    rows = response.xpath("//div[@class='view-content']/table/tbody/tr")
    print_new_number(self.counts, 'USTC', self.name)
    saw_stale = 0  # set when a past-dated row is seen; unused in this block
    for idx in xrange(len(rows)):
        cells = rows[idx].xpath(".//td")
        link = self.domain + cells[0].xpath(".//a/@href").extract()[0][1:]
        category = cells[1].xpath(".//text()").extract()[0].strip()
        when = get_localtime(
            cells[2].xpath(".//text()").extract()[0].strip())
        if u'学术报告' not in category:
            continue
        if when < now_time:
            saw_stale = 1
            continue
        yield scrapy.Request(link,
                             callback=self.parse_pages,
                             meta={'link': link, 'number': idx + 1})
# -*- coding:utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time
import scrapy
from Global_function import get_localtime, print_new_number, save_messages

# Today's date converted by get_localtime into the project's comparable
# time value (presumably a yyyymmdd-style number — confirm in Global_function).
now_time = get_localtime(time.strftime("%Y-%m-%d", time.localtime()))


class USTC006_Spider(scrapy.Spider):
    """Spider for the USTC biox school's academic-report listing."""
    name = 'USTC006'
    start_urls = ['http://biox.ustc.edu.cn/xsbg/']
    domain = 'http://biox.ustc.edu.cn/xsbg/'
    counts = 0  # running count of reports handled, shared with parse_pages

    def parse(self, response):
        """Request each listed report's detail page; stop at the first
        entry dated before today (list is newest-first).
        """
        items = response.xpath("//ul[@class='list-none metlist']/li")
        print_new_number(self.counts, 'USTC', self.name)
        for idx in xrange(len(items)):
            item = items[idx]
            link = self.domain + item.xpath(".//a/@href").extract()[0][2:]
            when = get_localtime(
                item.xpath(".//span/text()").extract()[0].strip())
            if when < now_time:
                return
            yield scrapy.Request(link,
                                 callback=self.parse_pages,
                                 meta={'link': link, 'number': idx + 1})