def parse(self, response):
    """Walk the THU report list and yield one detail-page request per item.

    Skips schedule-preview entries, stops as soon as the last previously
    crawled report name is reached, and sends a notification for the first
    real (non-preview) report on the page.
    """
    items = response.xpath("//div[@class='box_info_list']/ul/li")
    real_count = 0  # counts non-preview entries so the first triggers a notice
    print_new_number(self.counts, 'THU', self.name)
    for idx, item in enumerate(items):
        real_count += 1
        parts = item.xpath(".//a/text()").extract()
        full_name = parts[0].strip() + parts[1].strip()
        detail_url = self.domain + item.xpath(".//a/@href").extract()[0][1:]
        # Schedule previews are not real reports; undo the count and skip.
        if u'安排预告' in full_name:
            real_count -= 1
            continue
        # Everything from the last crawled report onward is already known.
        if full_name == self.last_name:
            return
        if real_count == 1:
            sent_first('THU', self.name, full_name)
        yield scrapy.Request(detail_url,
                             callback=self.parse_pages,
                             meta={'link': detail_url, 'number': idx + 1})
def parse(self, response):
    """Parse one HFUT list page: yield detail requests, then follow the pager.

    Stops as soon as an entry older than ``now_time`` is seen.

    Fix: the last page index scraped from the pager is a string; on
    Python 2 ``int < str`` is always True, so ``number < last_number``
    never terminated pagination. The value is now cast to ``int``.
    """
    links = response.xpath(
        "//div[@class='container row main in2']/div/ul/li/a/@href").extract()
    times = response.xpath(
        "//div[@class='container row main in2']/div/ul/li/span/text()").extract()
    print_new_number(self.counts, 'HFUT', self.name)
    for i in range(len(links)):
        report_time = get_localtime(times[i])
        if report_time < now_time:
            return
        report_url = self.domain + links[i][1:]
        yield scrapy.Request(report_url,
                             callback=self.parse_pages,
                             meta={'link': report_url, 'number': i + 1})
    # Current page index is embedded in the URL: .../list-28-<n>.html
    number = int(response.url.split('-')[-1].split('.')[0])
    # BUG FIX: cast the scraped pager text to int before comparing.
    last_number = int(response.xpath(
        "//div[@id='pages']/a/text()").extract()[-2])
    if number < last_number:
        new_url = 'http://news.hfut.edu.cn/list-28-%d.html' % (number + 1)
        yield scrapy.Request(new_url, callback=self.parse)
def parse_pages(self, response):
    """Extract the title and poster image URL from a USTC CS report page."""
    title = response.xpath(
        "//td[@height='30' and @align='center']/h4/text()").extract()[0]
    # Relative image path; drop its first two characters ('./').
    img_src = response.xpath(
        "//td[@align='left' and @class='cc']").xpath(
        './/img/@src').extract()[0][2:]
    # The image lives alongside the detail page: rebuild its directory URL
    # from the page link (everything up to the last '/').
    base = '/'.join(response.meta['link'].split('/')[:-1]) + '/'
    img_url = base + img_src
    if title:
        self.counts += 1
        print_new_number(self.counts, 'USTC', self.name)
    return save_messages('USTC', self.name, title, '', '', '', '', '',
                         img_url, response.meta['link'],
                         response.meta['number'],
                         u'中国科学技术大学', u'计算机科学与技术学院')
def parse_pages(self, response):
    """Pull time/address/speaker/poster from a THU chemical-engineering page."""
    paragraphs = response.xpath("//td[@height='400']/p")
    title = response.xpath("//h4/text()").extract()[0].strip()
    time = address = speaker = img_url = ''
    for para in paragraphs:
        text = self.get_messages(para)
        # Each keyword may use a half-width (':') or full-width (':') colon;
        # pick the matching separator before splitting.
        if u'时间:' in text or u'时间:' in text:
            sep = ':' if u'时间:' in text else ':'
            time = self.connect_messages(text, sep)
        if u'地点:' in text or u'地点:' in text:
            sep = ':' if u'地点:' in text else ':'
            address = self.connect_messages(text, sep)
        if u'报告人:' in text or u'报告人:' in text:
            sep = ':' if u'报告人:' in text else ':'
            speaker = self.connect_messages(text, sep)
        # img_url is recomputed on every paragraph, so only the image of the
        # last paragraph (if any) survives the loop.
        img = para.xpath(".//img/@src")
        img_url = (self.domain + img.extract()[0][1:]) if len(img) > 0 else ''
    if title:
        self.counts += 1
        print_new_number(self.counts, 'THU', self.name)
    return save_messages('THU', self.name, title, time, address, speaker,
                         '', '', img_url, response.meta['link'],
                         response.meta['number'], u'清华大学', u'化学工程系')
def parse_pages(self, response):
    """Build a HFUT report record from a detail page."""
    title = response.xpath("//h2/text()").extract()[0]
    # The first three body paragraphs carry time / address / speaker.
    summary = response.xpath("//div[@id='artibody']/p")
    time = self.get_keys(summary[0])
    address = self.get_keys(summary[1])
    speaker = self.get_keys(summary[2])
    # summary[3] (organization) and summary[4] (host) are intentionally unused.
    person_introduce, content = self.get_person_and_content(response)
    if title:
        self.counts += 1
        print_new_number(self.counts, 'HFUT', self.name)
    return save_messages('HFUT', self.name, title, time, address, speaker,
                         person_introduce, content, '',
                         response.meta['link'], response.meta['number'],
                         u'合肥工业大学')
def parse_pages(self, response):
    """Parse a WHU CS report page built from loose text nodes."""
    nodes = response.xpath("//dd[@class='info']").xpath(".//text()").extract()
    in_abstract = 0  # 0 -> continuation lines extend the bio, truthy -> abstract
    title = time = address = speaker = person_introduce = content = ''
    for node in nodes:
        # Keywords may use half-width (':') or full-width (':') colons.
        if u'题目:' in node or u'题目:' in node:
            sep = ':' if u'题目:' in node else ':'
            title = self.connect_message(node, sep)
        elif u'时间:' in node or u'时间:' in node:
            sep = ':' if u'时间:' in node else ':'
            time = self.connect_message(node, sep)
        elif u'地点:' in node or u'地点:' in node:
            sep = ':' if u'地点:' in node else ':'
            address = self.connect_message(node, sep)
        elif u'报告人:' in node or u'报告人:' in node:
            sep = ':' if u'报告人:' in node else ':'
            speaker = self.connect_message(node, sep)
        elif u'简介:' in node or u'简介:' in node:
            in_abstract = 0
            sep = ':' if u'简介:' in node else ':'
            person_introduce = self.connect_message(node, sep)
        elif u'摘要:' in node or u'摘要:' in node:
            in_abstract = 1
            sep = ':' if u'摘要:' in node else ':'
            content = self.connect_message(node, sep)
        elif u'邀请人' in node:
            # Stop at the inviter line; remaining nodes are not collected.
            break
        elif not in_abstract:
            person_introduce += node.strip()
        else:
            content += node.strip()
    if title:
        self.counts += 1
        print_new_number(self.counts, 'WHU', self.name)
    return save_messages('WHU', self.name, title, time, address, speaker,
                         person_introduce, content, '',
                         response.meta['link'], response.meta['number'],
                         u'武汉大学', u'计算机学院')
def parse_pages(self, response):
    """Parse a USTC (Purple Mountain Observatory) seminar announcement page."""
    paragraphs = response.xpath("//div[@class='TRS_Editor']").xpath(".//p")
    section = 0  # 1 while collecting the bio, 2 while collecting the abstract
    title = speaker = time = address = content = person_introduce = ''
    for para in paragraphs:
        text = self.get_text(para)
        # Courtesy / contact / committee lines carry no report data.
        if u'欢迎大家' in text or u'联系人' in text or u'紫金山天文台学术委员会' in text:
            continue
        # Keywords may be Chinese or English, half-width (':') or
        # full-width (':'); pick the matching separator before splitting.
        if u'题目:' in text or 'Title:' in text or u'题目:' in text or 'Title:' in text:
            sep = ':' if u'题目:' in text or 'Title:' in text else ':'
            title = self.connect_message(text, sep)
        elif (u'报告人:' in text or 'Speaker:' in text or u'主讲人:' in text
              or u'报告人:' in text or 'Speaker:' in text or u'主讲人:' in text):
            sep = (':' if u'报告人:' in text or 'Speaker:' in text
                   or u'主讲人:' in text else ':')
            speaker = self.connect_message(text, sep)
        elif u'时间:' in text or 'Time:' in text or u'时间:' in text or 'Time:' in text:
            sep = ':' if u'时间:' in text or 'Time:' in text else ':'
            time = self.connect_message(text, sep)
        elif u'地点:' in text or 'Address:' in text or u'地点:' in text or 'Address:' in text:
            sep = ':' if u'地点:' in text or 'Address:' in text else ':'
            address = self.connect_message(text, sep)
        elif u'简介:' in text or 'Bio:' in text or u'简介:' in text or 'Bio:' in text:
            section = 1
            sep = ':' if u'简介:' in text or 'Bio:' in text else ':'
            person_introduce = self.connect_message(text, sep)
        elif u'摘要:' in text or 'Abstract:' in text or u'摘要:' in text or 'Abstract:' in text:
            section = 2
            sep = ':' if u'摘要:' in text or 'Abstract:' in text else ':'
            content = self.connect_message(text, sep)
        elif section == 1:
            person_introduce += text.strip()
        elif section == 2:
            content += text.strip()
    if title:
        self.counts += 1
        print_new_number(self.counts, 'USTC', self.name)
    return save_messages('USTC', self.name, title, time, address, speaker,
                         person_introduce, content, '',
                         response.meta['link'], response.meta['number'],
                         u'中国科学技术大学')
def parse_pages(self, response):
    """Parse a THU math department report page.

    Fixes over the previous version:
    - the leftover debug ``print`` statement is removed;
    - the keyword tests are now an ``elif`` chain, so a line that sets
      title/time/address/speaker is no longer *also* fed into the trailing
      fall-through branch and appended to the bio or abstract.
    """
    messages = response.xpath("//div[@class='box_detail']/p")
    sign = 0  # 1 -> collecting the bio, 2 -> collecting the abstract
    title, time, address, speaker, person_introduce, content = '', '', '', '', '', ''
    for message in messages:
        text = self.get_messages(message)
        # Each keyword may use a half-width (':') or full-width (':') colon.
        if u'题目:' in text or u'题目:' in text:
            title = self.connect_messages(
                text, ':') if u'题目:' in text else self.connect_messages(
                text, ':')
        elif u'时间:' in text or u'时间:' in text:
            time = self.connect_messages(
                text, ':') if u'时间:' in text else self.connect_messages(
                text, ':')
        elif u'地点:' in text or u'地点:' in text:
            address = self.connect_messages(
                text, ':') if u'地点:' in text else self.connect_messages(
                text, ':')
        elif u'报告人:' in text or u'报告人:' in text:
            speaker = self.connect_messages(
                text, ':') if u'报告人:' in text else self.connect_messages(
                text, ':')
        elif u'简介:' in text or u'简介:' in text:
            sign = 1
            person_introduce = self.connect_messages(
                text, ':') if u'简介:' in text else self.connect_messages(
                text, ':')
        elif u'摘要:' in text or u'摘要:' in text:
            sign = 2
            content = self.connect_messages(
                text, ':') if u'摘要:' in text else self.connect_messages(
                text, ':')
        elif u'联系人' in text:
            continue  # contact lines are not part of any section
        elif sign == 1:
            person_introduce += '\n' + text
        elif sign == 2:
            content += '\n' + text
    if title != '':
        self.counts += 1
        print_new_number(self.counts, 'THU', self.name)
    all_messages = save_messages('THU', self.name, title, time, address,
                                 speaker, person_introduce, content, '',
                                 response.meta['link'],
                                 response.meta['number'],
                                 u'清华大学', u'数学科学系')
    return all_messages
def parse(self, response):
    """Iterate the USTC list page, stopping at the first stale entry."""
    entries = response.xpath("//ul[@class='list-none metlist']/li")
    print_new_number(self.counts, 'USTC', self.name)
    for idx, entry in enumerate(entries):
        # Hrefs are relative ('./...'); skip the first two chars when joining.
        url = self.domain + entry.xpath(".//a/@href").extract()[0][2:]
        stamp = get_localtime(
            entry.xpath(".//span/text()").extract()[0].strip())
        # First entry older than now_time ends the crawl (list assumed
        # newest-first).
        if stamp < now_time:
            return
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse_pages(self, response):
    """Save the poster image URL of a USTC report page (no text fields)."""
    title = ''
    # Image path is keyed by the first six characters of meta['time']
    # (presumably YYYYMM -- confirm against the site layout).
    src = response.xpath(
        "//p[@align='center']/img/@src").extract()[0][1:]
    img_url = self.domain + str(response.meta['time'])[0:6] + src
    # NOTE(review): title is always '' here, so this branch never runs --
    # possibly a missing title extraction; behavior preserved as-is.
    if title != '':
        self.counts += 1
        print_new_number(self.counts, 'USTC', self.name)
    return save_messages('USTC', self.name, '', '', '', '', '', '', img_url,
                         response.meta['link'], response.meta['number'],
                         u'中国科学技术大学')
def parse_pages(self, response):
    """Parse an NPU CS report page.

    Fix: the bio (简介) and abstract (摘要) branches previously tested for
    ``u'题目:'`` (a copy-paste slip) when choosing between the half- and
    full-width colon separators, so the wrong separator was usually picked.
    They now test their own keywords, matching every sibling parser.
    """
    messages = response.xpath("//div[@id='vsb_content']/p")
    # NOTE(review): sign starts at 1 (bio) unlike sibling parsers that start
    # at 0; preserved here -- confirm leading text really belongs to the bio.
    sign = 1
    title, time, address, speaker, person_introduce, content = '', '', '', '', '', ''
    for message in messages:
        text = self.get_messages(message)
        # Each keyword may use a half-width (':') or full-width (':') colon.
        if u'题目:' in text or u'题目:' in text:
            title = self.connect_message(
                text, ':') if u'题目:' in text else self.connect_message(
                text, ':')
        elif u'主持人' in text:
            continue  # host lines are ignored
        elif u'时间:' in text or u'时间:' in text:
            time = self.connect_message(
                text, ':') if u'时间:' in text else self.connect_message(
                text, ':')
        elif u'地点:' in text or u'地点:' in text:
            address = self.connect_message(
                text, ':') if u'地点:' in text else self.connect_message(
                text, ':')
        elif u'报告人:' in text or u'报告人:' in text:
            speaker = self.connect_message(
                text, ':') if u'报告人:' in text else self.connect_message(
                text, ':')
        elif u'简介:' in text or u'简介:' in text:
            sign = 1
            # BUG FIX: test 简介, not 题目, when choosing the separator.
            person_introduce = self.connect_message(
                text, ':') if u'简介:' in text else self.connect_message(
                text, ':')
        elif u'摘要:' in text or u'摘要:' in text:
            sign = 2
            # BUG FIX: test 摘要, not 题目, when choosing the separator.
            content = self.connect_message(
                text, ':') if u'摘要:' in text else self.connect_message(
                text, ':')
        elif sign == 1:
            person_introduce += text
        elif sign == 2:
            content += text
    if title != '':
        self.counts += 1
        print_new_number(self.counts, 'NPU', self.name)
    all_messages = save_messages('NPU', self.name, title, time, address,
                                 speaker, person_introduce, content, '',
                                 response.meta['link'],
                                 response.meta['number'],
                                 u'西北工业大学', u'计算机学院')
    return all_messages
def parse(self, response):
    """Scan the USTC math seminar table and request each fresh report."""
    rows = response.xpath("//td[@class='middle']").xpath(".//tr")
    print_new_number(self.counts, 'USTC', self.name)
    for idx, row in enumerate(rows):
        anchor_texts = row.xpath(".//span/a/text()").extract()
        heading = anchor_texts[0]
        url = self.domain + row.xpath(".//span/a/@href").extract()[0]
        # The date is the last anchor text, wrapped in parentheses.
        stamp = get_localtime(anchor_texts[-1].strip('()'))
        if stamp < now_time:
            return
        # Weekly digest rows are not individual reports.
        if u'本周报告' in heading:
            continue
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse(self, response):
    """Request every USTC report newer than the last crawl time."""
    hrefs = response.xpath("//li[@width='30%']/a/@href").extract()
    dates = response.xpath("//li[@width='30%']/span/text()").extract()
    print_new_number(self.counts, 'USTC', self.name)
    for idx, href in enumerate(hrefs):
        url = self.domain + href[2:]  # hrefs are relative; drop './'
        # Stop at the first entry older than now_time.
        if get_localtime(dates[idx]) < now_time:
            return
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse(self, response):
    """Yield a detail-page request for every THU lecture list entry."""
    entries = response.xpath("//ul[@class='zjlt clearfix mt-45']/li")
    print_new_number(self.counts, 'THU', self.name)
    for idx, entry in enumerate(entries):
        href = entry.xpath(".//div[@class='info fr']").xpath(
            './/a/@href').extract()[0]
        # Skip the first character of the relative href before joining.
        url = self.domain + href[1:]
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse_pages(self, response):
    """Assemble a USTC earth-and-space report from labelled field blocks."""
    blocks = response.xpath(
        "//div[@class='inside panels-flexible-region-inside panels-flexible-region-jcjz-center-inside panels-flexible-region-inside-first']/div"
    )
    # The first three blocks are fixed: venue, time, speaker.
    address = self.get_message(blocks[0])
    time = self.get_message(blocks[1])
    speaker = self.get_message(blocks[2])
    img_url, person_introduce, title, content = '', '', '', ''
    # Remaining blocks carry a field label; route each by its label text.
    for block in blocks[3:]:
        label = block.xpath(
            ".//div[@class='field-label']/text()").extract()[0]
        if len(block.xpath(".//img")) != 0:
            img_url = self.get_img(block)
        if u'报告人简介' in label:
            person_introduce = self.get_message(block)
        elif u'题目' in label:
            title = self.get_message(block)
        else:
            content = self.get_message(block)
    if title == '':
        # Fall back to the page heading when no title block exists.
        title = response.xpath("//h1/text()").extract()[0]
    if img_url != '':
        img_url = self.domain + img_url[1:]
    if title != '':
        self.counts += 1
        print_new_number(self.counts, 'USTC', self.name)
    return save_messages('USTC', self.name, title, time, address, speaker,
                         person_introduce, content, img_url,
                         response.meta['link'], response.meta['number'],
                         u'中国科学技术大学', u'地球和空间科学学院')
def parse(self, response):
    """Crawl the USTC list; forum links are absolute, the rest site-relative."""
    entries = response.xpath("//div[@class='list']/ul/li")
    print_new_number(self.counts, 'USTC', self.name)
    for idx, entry in enumerate(entries):
        href = entry.xpath(".//a/@href").extract()[0]
        if u'青年论坛' in entry.xpath(".//a/text()").extract()[0]:
            url = href  # youth-forum entries already carry a full URL
        else:
            url = self.domain + href[9:]
        if 'Colloquium' in url:
            continue
        # Spans hold bracketed two-digit-year dates; prepend the century.
        stamp = get_localtime(
            '20' + entry.xpath(".//span/text()").extract()[0].strip('[]'))
        if stamp < now_time:
            return
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse_pages(self, response):
    """Parse a USTC math-school report page whose fields appear in free order.

    The page body is a flat run of text nodes.  Each keyword line switches
    the current section (``sign``); continuation lines are appended to
    whichever section is active.  Keywords may be English or Chinese and may
    use half-width (':') or full-width (':') colons -- the matching
    separator is forwarded to ``get_message``.
    """
    messages = response.xpath("//table[@width='96%']").xpath(".//td[@align='left' and @class='dh01']").xpath(".//text()").extract()
    # sign tracks which field continuation lines belong to:
    # -1 none seen yet, 0 title, 1 time, 2 address, 3 speaker, 4 abstract.
    sign = -1
    title = ''; time = ''; address = ''; speaker = ''; content = ''
    # the order of message is not stable, so we can only use the key words. And some messages not only have one section.
    # NOTE(review): range stops at len(messages) - 1, skipping the final text
    # node -- confirm this is intentional (e.g. trailing boilerplate).
    for i in range(len(messages) - 1):
        if 'Title:' in messages[i] or u'题目:' in messages[i] or 'Title:' in messages[i] or u'题目:' in messages[i]:
            sign = 0
            title += self.get_message(messages[i], ':') if 'Title:' in messages[i] or u'题目:' in messages[i] else self.get_message(messages[i], ':')
        elif 'Time:' in messages[i] or u'时间:' in messages[i] or 'Time:' in messages[i] or u'时间:' in messages[i]:
            sign = 1
            time += self.get_message(messages[i], ':') if 'Time:' in messages[i] or u'时间:' in messages[i] else self.get_message(messages[i], ':')
        elif 'Place:' in messages[i] or u'地点:' in messages[i] or 'Place:' in messages[i] or u'地点:' in messages[i]:
            sign = 2
            address += self.get_message(messages[i], ':') if 'Place:' in messages[i] or u'地点:' in messages[i] else self.get_message(messages[i], ':')
        elif 'Speaker:' in messages[i] or u'报告人:' in messages[i] or 'Speaker:' in messages[i] or u'报告人:' in messages[i]:
            sign = 3
            speaker += self.get_message(messages[i], ':') if 'Speaker:' in messages[i] or u'报告人:' in messages[i] else self.get_message(messages[i], ':')
        elif 'Abstract:' in messages[i] or u'摘要:' in messages[i] or 'Abstract:' in messages[i] or u'摘要:' in messages[i]:
            sign = 4
            content += self.get_message(messages[i], ':') if 'Abstract:' in messages[i] or u'摘要:' in messages[i] else self.get_message(messages[i], ':')
        elif 'Abstract.' in messages[i] or u'摘要.' in messages[i]:
            # Some pages mark the abstract with a period instead of a colon.
            sign = 4
            content += self.get_message(messages[i], '.')
        else:
            if u'欢迎' in messages[i]:
                # Courtesy/closing lines are ignored.
                pass
            elif sign == 0:
                title += messages[i]
            elif sign == 1:
                time += messages[i]
            elif sign == 2:
                address += messages[i]
            elif sign == 3:
                speaker += messages[i]
            elif sign == 4:
                content += messages[i]
    if title != '':
        self.counts += 1
        print_new_number(self.counts, 'USTC', self.name)
    all_messages = save_messages('USTC', self.name, title, time, address, speaker, '', content, '', response.meta['link'], response.meta['number'], u'中国科学技术大学', u'数学科学学院')
    return all_messages
def parse(self, response):
    """Request each fresh WHU report from the list page."""
    entries = response.xpath("//div[@id='container']/dl/dd")
    print_new_number(self.counts, 'WHU', self.name)
    for idx, entry in enumerate(entries):
        url = self.domain + entry.xpath(".//a/@href").extract()[0][1:]
        # The <i> tag text is split on a space; only the first part (the
        # date) is compared.
        posted = get_localtime(
            entry.xpath(".//i/text()").extract()[0].split(' ')[0])
        if posted < now_time:
            return
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse(self, response):
    """WHU list page: request fresh reports, then follow the pager."""
    entries = response.xpath("//div[@id='container']/dl/dd")
    print_new_number(self.counts, 'WHU', self.name)
    for idx, entry in enumerate(entries):
        url = self.domain + entry.xpath(".//a/@href").extract()[0][1:]
        # Date part only (text before the first space).
        posted = get_localtime(
            entry.xpath(".//i/text()").extract()[0].split(' ')[0])
        # A stale entry ends the whole crawl, pagination included.
        if posted < now_time:
            return
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
    current = int(response.xpath(
        "//div[@class='page fn_clear']/ul/li[@class='thisclass']/text()"
    ).extract()[0])
    last = int(response.xpath(
        "//span[@class='pageinfo']/strong")[0].xpath(
        ".//text()").extract()[0])
    if current < last:
        next_url = 'http://cs.whu.edu.cn/a/xueshujiangzuofabu/list_39_{}.html'.format(current + 1)
        yield scrapy.Request(next_url, callback=self.parse)
def parse_pages(self, response):
    """Parse a THU physics department report page."""
    paragraphs = response.xpath("//div[@class='box_detail']/p")
    in_abstract = 0  # set once the abstract keyword has been seen
    title, time, address, content = '', '', '', ''
    speaker = ''
    for para in paragraphs:
        text = get_messages(para)
        # Keywords may use half-width (':') or full-width (':') colons.
        if '题目:' in text or '题目:' in text:
            sep = ':' if '题目:' in text else ':'
            title = connect_messages(text, sep)
        elif '时间:' in text or '时间:' in text:
            sep = ':' if '时间:' in text else ':'
            time = connect_messages(text, sep)
        elif '地点:' in text or '地点:' in text:
            sep = ':' if '地点:' in text else ':'
            address = connect_messages(text, sep)
        elif '人:' in text or '人:' in text:
            # Matches any '...人:' label (报告人 / 主讲人 / ...).
            sep = ':' if '人:' in text else ':'
            speaker = connect_messages(text, sep)
        elif '摘要:' in text or '摘要:' in text:
            in_abstract = 1
            sep = ':' if '摘要:' in text else ':'
            content = connect_messages(text, sep)
        elif in_abstract == 1:
            content += text.strip()
    if title:
        self.counts += 1
        print_new_number(self.counts, 'THU', self.name)
    return save_messages('THU', self.name, title, time, address, speaker,
                         '', content, '', response.meta['link'],
                         response.meta['number'], u'清华大学', u'物理系')
def parse(self, response):
    """USTC notice table: queue report pages, then advance the pager.

    The rows are not sorted by time, so a stale row only sets a flag and
    the crawl stops after the whole page has been scanned.
    """
    rows = response.xpath("//div[@class='view-content']/table/tbody/tr")
    print_new_number(self.counts, 'USTC', self.name)
    saw_stale = 0
    for idx in xrange(len(rows)):
        cells = rows[idx].xpath(".//td")
        url = self.domain + cells[0].xpath(".//a/@href").extract()[0][1:]
        category = cells[1].xpath(".//text()").extract()[0].strip()
        stamp = get_localtime(
            cells[2].xpath(".//text()").extract()[0].strip())
        if u'学术报告' not in category:
            continue
        if stamp < now_time:
            saw_stale = 1
            continue
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
    # The report time of this page is not sorted, so we only stop the
    # procedure at the end of a page.
    if saw_stale:
        return
    # The 'first' pager class is present only on page one.
    current = response.xpath(
        "//ul[@class='pager']/li[@class='pager-current first']/text()"
    ).extract()
    if len(current) == 0:
        current = int(response.xpath(
            "//ul[@class='pager']/li[@class='pager-current']/text()"
        ).extract()[0])
    else:
        current = int(current[0])
    yield scrapy.Request('http://ess.ustc.edu.cn/notice?page=%d' % current,
                         callback=self.parse)
def parse(self, response):
    """USTC math seminar list: queue fresh reports, then turn the page."""
    rows = response.xpath("//td[@class='middle']").xpath(".//tr")
    print_new_number(self.counts, 'USTC', self.name)
    for idx, row in enumerate(rows):
        anchor_texts = row.xpath(".//span/a/text()").extract()
        url = self.domain + row.xpath(".//span/a/@href").extract()[0]
        # The date is the last anchor text, wrapped in parentheses.
        if get_localtime(anchor_texts[-1].strip('()')) < now_time:
            return
        if u'本周报告' in anchor_texts[0]:
            continue  # weekly digest rows are skipped
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
    pager_texts = response.xpath(
        "//a[@href='#']").xpath(".//text()").extract()
    current = int(pager_texts[0])
    last = int(pager_texts[-1][1:])  # last pager label carries a prefix char
    if current > last:
        return
    yield scrapy.Request(
        'http://math.ustc.edu.cn/new/list.php?fid=35&page=%d' % (current + 1),
        callback=self.parse)
def parse_pages(self, response):
    """Parse a THU bbs-style report page; stale pages are saved untitled."""
    info = response.xpath("//div[@class='bbs-info']")
    title = self.try_get_message(info.xpath(".//h2/text()").extract())
    rows = info.xpath(".//p")
    # The first three paragraphs hold time / address / speaker; the value
    # is the second text node of each.
    time = rows[0].xpath(".//text()").extract()[1]
    address = rows[1].xpath(".//text()").extract()[1]
    speaker = rows[2].xpath(".//text()").extract()[1]
    detail = response.xpath("//div[@class='show-new']")
    content = ''
    if len(detail) != 0:
        content = detail.xpath(".//text()").extract()[0].strip()
        if (u'简介:' in content or 'Abstract:' in content
                or u'简介:' in content or 'Abstract:' in content):
            sep = ':' if u'简介:' in content or 'Abstract:' in content else ':'
            content = self.connect_messages(content, sep)
    posted = get_localtime(
        response.xpath("//div[@class='wtime']/text()").extract()[0].strip())
    if posted < now_time:
        # Stale page: blank the title and skip the counter.
        title = ''
    else:
        self.counts += 1
        print_new_number(self.counts, 'THU', self.name)
    return save_messages('THU', self.name, title, time, address, speaker,
                         '', content, '', response.meta['link'],
                         response.meta['number'], u'清华大学')
def parse(self, response):
    """SYSU list page: only entries titled as academic reports are crawled."""
    entries = response.xpath("//div[@class='full-page-list']/ul/li")
    print_new_number(self.counts, 'SYSU', self.name)
    for idx, entry in enumerate(entries):
        heading = entry.xpath(".//a/text()").extract()[0]
        # Accept both half- and full-width colons in the heading keyword.
        if u'学术报告:' not in heading and u'学术报告:' not in heading:
            continue
        url = self.domains + entry.xpath(".//a/@href").extract()[0][1:]
        # Dates use '/' separators; normalise to '-' for get_localtime.
        stamp = get_localtime(
            entry.xpath(".//span/text()").extract()[0].replace('/', '-'))
        if stamp < now_time:
            return
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse_pages(self, response):
    """Parse a USTC report page made of justified text paragraphs."""
    nodes = response.xpath(
        "//div[@class='justify']").xpath(".//p").xpath(".//text()").extract()
    section = 0  # 1 -> appending to the abstract, 2 -> to the bio
    title, time, address, speaker, person_introduce, content = '', '', '', '', '', ''
    for node in nodes:
        # Keywords may use half-width (':') or full-width (':') colons.
        if u'题目:' in node or u'题目:' in node:
            sep = ':' if u'题目:' in node else ':'
            title = self.connect_messages(node, sep)
        elif u'时间:' in node or u'时间:' in node:
            sep = ':' if u'时间:' in node else ':'
            time = self.connect_messages(node, sep)
        elif u'地点:' in node or u'地点:' in node:
            sep = ':' if u'地点:' in node else ':'
            address = self.connect_messages(node, sep)
        elif u'报告人:' in node or u'报告人:' in node:
            sep = ':' if u'报告人:' in node else ':'
            speaker = self.connect_messages(node, sep)
        elif u'摘要:' in node or u'摘要:' in node:
            section = 1
            sep = ':' if u'摘要:' in node else ':'
            content = self.connect_messages(node, sep)
        elif u'简介:' in node or u'简介:' in node:
            section = 2
            sep = ':' if u'简介:' in node else ':'
            person_introduce = self.connect_messages(node, sep)
        else:
            if u'联系人' in node:
                continue  # contact lines belong to no section
            if section == 1:
                content += '\n' + node.strip()
            elif section == 2:
                person_introduce += '\n' + node.strip()
    if title:
        self.counts += 1
        print_new_number(self.counts, 'USTC', self.name)
    return save_messages('USTC', self.name, title, time, address, speaker,
                         person_introduce, content, '',
                         response.meta['link'], response.meta['number'],
                         u'中国科学技术大学')
def parse(self, response):
    """Scan the USTC notice table and queue fresh academic-report rows."""
    rows = response.xpath("//div[@class='view-content']/table/tbody/tr")
    print_new_number(self.counts, 'USTC', self.name)
    # Kept for parity with the paginated variant; unused after the loop here.
    saw_stale = 0
    for idx in xrange(len(rows)):
        cells = rows[idx].xpath(".//td")
        url = self.domain + cells[0].xpath(".//a/@href").extract()[0][1:]
        category = cells[1].xpath(".//text()").extract()[0].strip()
        stamp = get_localtime(
            cells[2].xpath(".//text()").extract()[0].strip())
        if u'学术报告' not in category:
            continue
        # Rows are not sorted by time, so stale rows are skipped, not fatal.
        if stamp < now_time:
            saw_stale = 1
            continue
        yield scrapy.Request(url, callback=self.parse_pages,
                             meta={'link': url, 'number': idx + 1})
def parse_pages(self, response):
    """Parse a THU CS report page whose field labels are in English."""
    title = ''.join(t.strip() for t in
                    response.xpath("//h2").xpath(".//text()").extract())
    nodes = response.xpath(
        "//div[@class='box_detail']/p").xpath(".//text()").extract()
    # Which section continuation lines extend:
    # 1 time, 2 address, 3 speaker, 4 bio, 5 abstract.
    section = 0
    time, address, speaker, person_introduce, content = '', '', '', '', ''
    for node in nodes:
        # Labels may use half-width (':') or full-width (':') colons.
        if 'Time:' in node or 'Time:' in node:
            section = 1
            sep = ':' if 'Time:' in node else ':'
            time = self.get_messages(node, sep)
        elif ('Venue:' in node or 'Meeting Room:' in node
              or 'Location:' in node or 'Venue:' in node
              or 'Meeting Room:' in node or 'Location:' in node):
            section = 2
            sep = (':' if 'Venue:' in node or 'Meeting Room:' in node
                   or 'Location:' in node else ':')
            address = self.get_messages(node, sep)
        elif 'Speaker:' in node or 'Speaker:' in node:
            section = 3
            sep = ':' if 'Speaker:' in node else ':'
            speaker = self.get_messages(node, sep)
        elif ('Bio:' in node or 'Biography:' in node
              or 'Bio:' in node or 'Biography:' in node):
            section = 4
            sep = ':' if 'Bio:' in node or 'Biography:' in node else ':'
            person_introduce = self.get_messages(node, sep)
        elif 'Abstract:' in node or 'Abstract:' in node:
            section = 5
            sep = ':' if 'Abstract:' in node else ':'
            content = self.get_messages(node, sep)
        elif section == 1:
            time += '\n' + node.strip()
        elif section == 2:
            address += '\n' + node.strip()
        elif section == 3:
            speaker += '\n' + node.strip()
        elif section == 4:
            person_introduce += '\n' + node.strip()
        elif section == 5:
            content += '\n' + node.strip()
    if title:
        self.counts += 1
        print_new_number(self.counts, 'THU', self.name)
    return save_messages('THU', self.name, title, time, address, speaker,
                         person_introduce, content, '',
                         response.meta['link'], response.meta['number'],
                         u'清华大学', u'计算机科学与技术系')
def parse_pages(self, response):
    """Parse a SYSU data-science report page.

    ``connect_messages`` returns both the raw text of a paragraph and a
    normalised copy (``replace``); keyword tests mostly run on ``replace``
    while values are cut from ``text``.  Keywords may be Chinese or English,
    with half-width (':') or full-width (':') colons.
    """
    messages = response.xpath("//div[@class='field-items']").xpath(".//p")
    # sign: 1 -> continuation lines extend the bio, 2 -> the abstract.
    sign = 0
    title, time, address, speaker, person_introduce, content, date = '', '', '', '', '', '', ''
    for message in messages:
        text, replace = self.connect_messages(
            message.xpath(".//text()").extract())
        if u'题目:' in replace or 'Title:' in replace or u'题目:' in replace or 'Title:' in replace:
            title = self.get_messages(
                text, ':'
            ) if u'题目:' in replace or 'Title:' in replace else self.get_messages(
                text, ':')
        # NOTE(review): the guard tests u'时间' without a colon (broader than
        # the separator choice below) -- confirm this looseness is intended.
        elif u'时间' in replace or 'Time:' in replace or u'时间:' in replace or 'Time:' in replace:
            time = self.get_messages(
                text, ':'
            ) if u'时间:' in replace or 'Time:' in replace else self.get_messages(
                text, ':')
        elif u'地点:' in replace or 'Address:' in replace or u'地点:' in replace or 'Address:' in replace:
            address = self.get_messages(
                text, ':'
            ) if u'地点:' in replace or 'Address:' in replace else self.get_messages(
                text, ':')
        elif u'主讲:' in replace or u'报告人:' in replace or 'Speaker:' in replace or u'主讲:' in replace or u'报告人:' in replace or 'Speaker:' in replace:
            speaker = self.get_messages(
                text, ':'
            ) if u'主讲:' in replace or u'报告人:' in replace or 'Speaker:' in replace else self.get_messages(
                text, ':')
        elif u'日期:' in replace or 'Date:' in replace or u'日期:' in replace or 'Date:' in replace:
            date = self.get_messages(
                text, ':'
            ) if u'日期:' in replace or 'Date:' in replace else self.get_messages(
                text, ':')
        # NOTE(review): duplicate 地点/Address branch (first term checks
        # ``text`` instead of ``replace``); likely shadowed by the branch
        # above -- preserved as-is, confirm before removing.
        elif u'地点:' in text or 'Address:' in replace or u'地点:' in replace or 'Address:' in replace:
            address = self.get_messages(
                text, ':'
            ) if u'地点:' in replace or 'Address:' in replace else self.get_messages(
                text, ':')
        # NOTE(review): first term checks ``text``, the rest ``replace`` --
        # possibly another text/replace mix-up; preserved as-is.
        elif u'简介:' in text or 'Biography:' in replace or 'Bio:' in replace or u'简介:' in replace or 'Biography:' in replace or 'Bio:' in replace:
            sign = 1
            person_introduce = self.get_messages(
                text, ':'
            ) if u'简介:' in replace or 'Biography:' in replace or 'Bio:' in replace else self.get_messages(
                text, ':')
        elif u'摘要:' in replace or 'Abstract:' in replace or u'摘要:' in replace or 'Abstract:' in replace:
            sign = 2
            content = self.get_messages(
                text, ':'
            ) if u'摘要:' in replace or 'Abstract:' in replace else self.get_messages(
                text, ':')
        else:
            # Continuation lines extend whichever section is active.
            if sign == 1:
                person_introduce += text
            elif sign == 2:
                content += text
    # The page splits date and clock time into two fields; recombine them.
    time = (date + ' ' + time).strip()
    if title != '':
        self.counts += 1
        print_new_number(self.counts, 'SYSU', self.name)
    all_messages = save_messages('SYSU', self.name, title, time, address,
                                 speaker, person_introduce, content, '',
                                 response.meta['link'],
                                 response.meta['number'], u'中山大学',
                                 u'数据科学与计算机学院')
    return all_messages