def parse(self, response): messages = response.xpath("//table[@class='winstyle54630']").xpath( ".//tr[@height='26']") for i in xrange(len(messages)): report_name = messages[i].xpath(".//td")[0].xpath( ".//a/text()").extract()[0] if u'学术报告' not in report_name: continue report_url = self.domain + messages[i].xpath(".//td")[0].xpath( ".//a/@href").extract()[0][3:] report_time = get_localtime(messages[i].xpath(".//td")[1].xpath( ".//span/text()").extract()[0].strip().replace('/', '-')) if report_time > end_time: continue if report_time < now_time: return yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath("//div[@class='new']/div") for i, message in enumerate(messages[:-1]): report_name = message.xpath(".//a/@title").extract()[0] if re.search(u"(报告|讲座)", report_name) is None: continue report_time = get_localtime( message.xpath("div/span/text()").extract()[0].strip().strip( "()")) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath(".//a/@href").extract()[0] yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time, 'title': report_name })
def parse(self, response): messages = response.xpath("//div[@class='article_list']/ul/li") for i, message in enumerate(messages): report_name = message.xpath(".//a/text()").extract()[0] if u"学术报告预告" not in report_name: continue report_time = get_localtime( message.xpath(".//div[@class='p_date']/text()").extract() [0].replace('/', '-')) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath( ".//a/@href").extract()[0][1:] yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath("//div[@class='sub_text']").xpath( ".//div[@class='news-list']") for i, message in enumerate(messages): report_name = message.xpath(".//a/text()").extract()[0] if re.search(u"学术(报告)|(讲座)", report_name) is None: continue report_url = self.domain + message.xpath( ".//a/@href").extract()[0][1:] report_time = get_localtime( message.xpath(".//div[@class='lastTime']/text()").extract()[0]) if report_time > end_time: continue if report_time < now_time: return yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath("//div[@class='news-list']/ul/li") for i, message in enumerate(messages): report_time = get_localtime(re.sub(u"[]\[]", '', message.xpath("span/text()").extract()[0].strip())) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath(".//a/@href").extract()[0][1:] yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1, 'publication': report_time})
def get_year(self, text, day, month):
    # Prefer a year written explicitly before "年" in the text.
    year = re.search(u"[\d]*((?![\u4e00-\u9fa5])[\W])*(?=年)", text)
    if year is not None:
        year = year.group()
        if len(year.strip()) < 4:
            year = "20" + year
    elif day is not None and month is not None:
        # No explicit year: infer it by comparing the report's month-day
        # digits against today's; an earlier month-day must be next year.
        now_year = int(self.now_time.split('-')[0])
        now_month_day = int(str(get_localtime(self.now_time))[4:])
        report_month_day = int(month) * 100 + int(day)
        if report_month_day < now_month_day:
            year = str(now_year + 1)
        else:
            year = str(now_year)
    return year
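# get_localtime is imported from __Global_function and its definition is not
# part of this section. Judging from how its result is used here -- compared
# numerically against now_time/end_time and sliced as
# str(get_localtime(...))[4:] to isolate the month-day digits -- it appears
# to map a 'YYYY-MM-DD' string to an integer of the form YYYYMMDD. A minimal
# sketch under that assumption (a hypothetical reconstruction, not the
# project's actual implementation):
def get_localtime(date_str):
    # e.g. '2017-3-5' -> 20170305
    year, month, day = (int(part) for part in date_str.split('-'))
    return year * 10000 + month * 100 + day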
def parse(self, response): messages = response.xpath("//div[@class='twelve columns alpha']/ul/li") for i, message in enumerate(messages): report_name = message.xpath(".//a/@title").extract()[0] if u"【预告】" not in report_name or u"论坛" in report_name: continue report_time = get_localtime(message.xpath("span/text()").extract()[0]) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath(".//a/@href").extract()[0] yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1, 'publication': report_time})
def parse(self, response): messages = response.xpath("//div[@class='uc_lanmu_content']/ul/li") for i, message in enumerate(messages): report_name = message.xpath(".//a/text()").extract()[0] if re.search(u"[::]", report_name, re.S) != None: report_name = re.split(u"[::]", report_name)[-1] print report_name report_time = get_localtime(message.xpath(".//span[@class='article_date']/text()").extract()[0]) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath(".//a/@href").extract()[0][1:] yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1, 'name': report_name, 'publication': report_time})
def get_time(self, text):
    day = self.get_day(text)
    month = self.get_month(text, day)
    year = self.get_year(text, day, month)
    start_time = None
    if day is not None and month is not None and year is not None:
        # Branch 1: day, month and year were all extracted explicitly.
        start_time = year + '-' + month + '-' + day
    else:
        # Branch 2: fall back to a numeric date such as "2017-3-5" or "3.5".
        start_time = re.search(
            u"([\d]*)[-~.,，]*([\d]{1,})[-~.,，]{1,}([\d]{1,})", text)
        if start_time is not None:
            start_time = re.split(u"[-~.,，]+", start_time.group())
            if len(start_time) == 3:
                start_time = start_time[0] + '-' + start_time[1] + '-' + start_time[2]
            elif len(start_time) == 2:
                day = start_time[1]
                month = start_time[0]
                year = self.get_year('', day, month)
                start_time = year + '-' + month + '-' + day
            else:
                start_time = None
        else:
            # Branch 3: a weekday such as "周三" or "星期五"; map it to the
            # date of the next such weekday.
            weekdays = re.findall(u"(?:星期|周)(一|二|三|四|五|六|七|日|天|末|[\d])", text)
            if weekdays and re.sub(u"\\s+", '', weekdays[0]) != '':
                weekday = weekdays[0]
                if weekday in self.week2day:
                    weekday = int(self.week2day[weekday])
                else:
                    weekday = int(weekday)
                now_weekday = datetime.datetime.now().weekday() + 1
                if weekday < now_weekday:
                    start_time = str(datetime.datetime.now() + datetime.timedelta(
                        days=weekday + 7 - now_weekday)).split(' ')[0]
                else:
                    start_time = str(datetime.datetime.now() + datetime.timedelta(
                        days=weekday - now_weekday)).split(' ')[0]
    print(start_time)
    if start_time is None or re.sub(u"\\s+", '', start_time) == '':
        return None
    try:
        return get_localtime(start_time)
    except Exception:
        return None
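# A usage sketch for get_time covering its three branches (the inputs are
# hypothetical, and it assumes a spider instance whose now_time and week2day
# mapping are initialised as elsewhere in this project):
#
#   self.get_time(u"2017年3月5日")  # explicit 年/月/日 -> '2017-3-5'
#   self.get_time(u"3.5")           # month.day only, year inferred by get_year
#   self.get_time(u"本周三下午")     # weekday -> date of the coming Wednesday
#
# All branches funnel into get_localtime, so the result is the same
# integer-comparable timestamp the parse methods use, or None when no date
# can be recovered.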
def parse(self, response): messages = response.xpath("//div[@id='rightPageContent']/dl/dd") for i, message in enumerate(messages): report_time = get_localtime( message.xpath("span/text()").extract()[0]) if report_time > end_time: continue if report_time < now_time: return report_url = message.xpath(".//a/@href").extract()[0] yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath("//div[@id='container']/dl/dd") for i in xrange(len(messages)): report_url = self.domain + messages[i].xpath( ".//a/@href").extract()[0][1:] report_time = get_localtime( messages[i].xpath(".//i/text()").extract()[0].split(' ')[0]) if report_time > end_time: continue if report_time < now_time: return yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse_pages(self, response):
    # The report time sits after the last colon in the page's header cell.
    report_time = get_localtime(re.split(
        u"[:：]",
        response.xpath("//td[@height='32']/div/strong")[0].xpath(
            "text()").extract()[0])[-1])
    if report_time < now_time or report_time > end_time:
        return
    messages = response.xpath("//span[contains(@class, 'content')]/p")
    return {
        'text': messages,
        'number': response.meta['number'],
        'organizer': u'华东师范大学计算机科学技术系',
        'faculty': self.name,
        'link': response.meta['link'],
        'publication': report_time,
        'location': u"华东:上海市"
    }
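# parse_pages returns a plain dict, which Scrapy (1.0+) accepts as a scraped
# item. The on-disk output under REPORT_SAVEDIR is presumably produced by an
# item pipeline; a minimal sketch of such a pipeline (the class name and
# storage layout are assumptions, not this project's actual code):
class ReportPipeline(object):
    def process_item(self, item, spider):
        # e.g. persist item['text'] under REPORT_SAVEDIR, keyed by
        # item['faculty'] and item['number']
        return item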
def parse(self, response): messages = response.xpath("//div[@class='full-page-list']/ul/li") for i in xrange(len(messages)): report_url = self.domains + messages[i].xpath( ".//a/@href").extract()[0][1:] report_time = get_localtime( messages[i].xpath(".//span/text()").extract()[0].replace( '/', '-')) if report_time > end_time: continue if report_time < now_time: return yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath( "//table[@width='100%']/tbody/tr/td/table").xpath(".//tr") for i, message in enumerate(messages[:-1]): report_url = self.domain + message.xpath( ".//a/@href").extract()[0][1:] report_time = get_localtime( message.xpath(".//font/text()").extract()[0].strip('[]')) if report_time > end_time: continue if report_time < now_time: return yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath( "//div[@class='article-list-right']/div[@class='article-list-right-li new-article']" ) for i, message in enumerate(messages): report_time = get_localtime( message.xpath(".//div[@class='article-list-left-li-r']/text()" ).extract()[0]) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath( ".//a/@href").extract()[0][3:] yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath("//table[@class='datatable']/tr") for i, message in enumerate(messages[:len(messages) - 1]): report_name = message.xpath(".//a/@title").extract()[0] if u"讲座" not in report_name: continue report_time = get_localtime( message.xpath("td")[-1].xpath("span/text()").extract()[0]) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath(".//a/@href").extract()[0] yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath("//table[@id='dgrdNews']/tr") for i, message in enumerate(messages[:len(messages) - 1]): report_name = message.xpath(".//a/text()").extract()[0] if re.search(u"讲座|报告", report_name) is None: continue report_time = get_localtime("20" + message.xpath(".//td")[2].xpath( ".//text()").extract()[0].split(' ')[0]) if report_time > end_time: continue if report_time < now_time: return report_url = self.domain + message.xpath(".//a/@href").extract()[0] yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
def parse(self, response): messages = response.xpath("//div[@id='TextList_time']/table")[0].xpath( "tr") for i, message in enumerate(messages): report_sign = message.xpath(".//a") if len(report_sign) == 0: continue report_time = get_localtime( message.xpath("td")[-1].xpath(".//text()").extract()[0]) if report_time > end_time: continue elif report_time < now_time: return report_url = self.domain + message.xpath(".//a/@href").extract()[0] yield scrapy.Request(report_url, callback=self.parse_pages, meta={ 'link': report_url, 'number': i + 1, 'publication': report_time })
# -*- coding:utf-8 -*-
from __future__ import print_function

import os
import time
import shutil
import traceback

from report_crawler.spiders.__Global_function import get_localtime
from report_crawler.spiders.__Global_variable import REPORT_SAVEDIR

now_time = get_localtime(time.strftime("%Y-%m-%d", time.localtime()))
DATADIR = REPORT_SAVEDIR + '/' + str(now_time)


class Spider_starter(object):
    def crawl(self):
        self.X001()

    def run_spider(self, spider_name):
        # The last three characters of spider_name are a numeric id; the
        # leading part names the school.
        dirname = (REPORT_SAVEDIR + '/' + str(now_time) + '/' +
                   spider_name[-3:] + '/' + spider_name[:-3])

        # If today's directory already exists, clear it first.
        if os.path.exists(dirname):
            shutil.rmtree(dirname, ignore_errors=True)

        # If one of the spiders fails, print_exc() tells us which one is
        # the culprit.
        try:
            if not os.path.exists(DATADIR):
                os.makedirs(DATADIR)
            os.system('scrapy crawl ' + spider_name)
        except Exception:
            traceback.print_exc()
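# crawl() dispatches to per-site methods such as X001, whose bodies this
# section truncates. Given run_spider's slicing (the last three characters
# pick the numeric id, the rest the school code), each of those methods is
# presumably a one-liner of this shape (the spider name below is a
# hypothetical illustration):
#
#     def X001(self):
#         self.run_spider('ECNU001')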