def parse_detail(self, response):
    '''
    Parse an event detail page and yield one ``EventFindaIterm``.

    Fields are read from ``response`` with XPath: title, cover image URL,
    start time, location, category, free/paid flag and description.
    ``event_website`` is always ``None`` (not extracted yet).

    :param response: the detail-page response; ``response.url`` and
        ``response.xpath`` are used.
    :returns: a generator yielding at most one item. If any extraction
        step raises, a warning with traceback is logged and nothing is
        yielded.
    '''
    self.logger.info(
        'Start to parse event with url = %s ', response.url)
    try:
        item = EventFindaIterm()
        item['title'] = response.xpath(
            '//h1[@itemprop="name"]//text()').extract_first()
        item['source'] = self.source_site
        item['source_url'] = response.url

        # A missing cover image is tolerated: the item is still emitted.
        cover = self.cover_url_extractor.extract_links(response)
        if cover:
            item['cover_url'] = cover[0].url
        else:
            self.logger.error(
                "no cover url found. event url=%s", response.url)
            item['cover_url'] = None

        # Fall back to "this time tomorrow" when no start time is found.
        # NOTE(review): _parse_start_time is defined elsewhere; it is
        # presumably a datetime (or falsy), since its result shares the
        # totimestamp() call with the datetime fallback -- confirm.
        start_time = self._parse_start_time(response) or (
            datetime.datetime.today() + datetime.timedelta(days=1))
        item['start_time'] = totimestamp(start_time)

        # extract_first() returns None when nothing matches, so .strip()
        # raises for pages without a location/category; such pages are
        # dropped by the handler below.
        item['location'] = response.xpath(
            '//div[@itemprop="location"]/text()').extract_first().strip()
        item['category'] = response.xpath(
            '//div[@id="event-header"]'
            '//div[contains(@class, "label")]/text()'
        ).extract_first().strip()

        booking_text = u''.join(response.xpath(
            '//div[@class="event-booknow"]//text()').extract())
        item['is_free'] = 1 if u'free' in booking_text.strip().lower() else 0

        item['detail'] = u'\n'.join(response.xpath(
            '//article[@itemprop="description"]//text()').extract()).strip()
        item['event_website'] = None  # todo: not found now
    except Exception:
        # Only real extraction errors are reported; GeneratorExit and
        # KeyboardInterrupt are no longer swallowed and mislabelled.
        self.logger.warning(
            'Parse event with url %s fail.', response.url, exc_info=True)
        return

    # Yield outside the try block so that errors raised by the consumer
    # of this generator are not logged as parse failures.
    self.logger.info('Parse event with url %s successfully.', response.url)
    yield item
def _process_start_time(self, dt):
    """
    Convert a free-text event date into a timestamp.

    Recognised layouts (the first date and first time found are used):
        Monday 14 March 2016 2:00pm
        Monday March 14 2016 - Sunday April 17 2016 10:00am - 7:00pm
        Tuesday 15 March 2016 10:00am - 12:30pm
        Monday March 14 2016 - Sunday March 20 2016
        Friday June 3 2016 - Sunday June 5 2016
    (ranges on the page are separated by an en dash).

    :param dt: text containing the date and, optionally, a time.
    :returns: ``totimestamp()`` of the parsed start datetime, or ``None``
        when no date is found or parsing fails (an error is logged).
        The time defaults to 12:00am when the text contains no time.
    """
    try:
        # "14 March 2016" (day first) is preferred over "March 14 2016".
        day_first = re.findall(r"\d{1,2}\s+\w+\s+\d{4}", dt)
        month_first = re.findall(r"\w+\s+\d{1,2}\s+\d{4}", dt)
        times = re.findall(r"\d{1,2}:\d{1,2}\w{2}", dt)

        if day_first:
            date_str, date_format = day_first[0], "%d %B %Y"
        elif month_first:
            date_str, date_format = month_first[0], "%B %d %Y"
        else:
            self.logger.error(
                "Error while parsing start time: %s", "no date found")
            return None

        # Use the first time whenever at least one is present. The old
        # "> 1" check ignored a lone time such as "2:00pm".
        time_str = times[0] if times else u"12:00am"

        # %B matches full month names in the current locale.
        time_raw = datetime.datetime.strptime(
            date_str + " " + time_str, date_format + " %I:%M%p")
        return totimestamp(time_raw)
    except Exception as e:
        self.logger.error(
            "Error while parsing start time: %s", e, exc_info=True)
        return None
def _process_start_time(self, dt):
    '''
    Convert an ISO-8601 style datetime string into a timestamp.

    Example input: 2016-03-15T13:00:00+08:00

    Only the ``YYYY-MM-DDTHH:MM:SS`` part is matched; a trailing UTC
    offset such as ``+08:00`` is not captured, so the value is parsed as
    a naive datetime.

    :param dt: text containing the datetime.
    :returns: ``totimestamp()`` of the first datetime found in ``dt``.
    :raises ValueError: when no datetime is found or it cannot be parsed
        (the underlying error is logged first).
    '''
    try:
        date_time = re.findall(
            r'\d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}:\d{1,2}', dt)[0]
        time_raw = datetime.datetime.strptime(date_time, '%Y-%m-%dT%H:%M:%S')
        return totimestamp(time_raw)
    except Exception as e:
        # "except ... as" works on Python 2.6+ and 3.x; the exception is
        # logged directly because BaseException.message is deprecated.
        self.logger.error(
            'Error while parsing start time: %s', e, exc_info=True)
        raise ValueError('Cannot find start time.')
def _process_start_time(self, dt):
    '''
    Convert a ``YYYY-MM-DD`` date string into a timestamp.

    Example input: 2016-04-22

    :param dt: text containing the date.
    :returns: ``totimestamp()`` of the first date found in ``dt`` (time of
        day is midnight), or ``None`` when no date is found or parsing
        fails (an error is logged).
    '''
    try:
        date_time = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', dt)
        time_raw = datetime.datetime.strptime(date_time[0], '%Y-%m-%d')
        return totimestamp(time_raw)
    except Exception as e:
        # "except ... as" works on Python 2.6+ and 3.x; the exception is
        # logged directly because BaseException.message is deprecated.
        self.logger.error(
            'Error while parsing start time %s: %s', dt, e, exc_info=True)
        return None
def parse_one_event(self, response):
    '''
    Parse an event detail page and yield one ``EventFindaIterm``.

    The category is taken from ``response.meta['category']``; all other
    fields are read from the page with XPath.

    :param response: the detail-page response; ``response.url``,
        ``response.meta`` and ``response.xpath`` are used.
    :returns: a generator yielding at most one item. If any extraction
        step raises, a warning with traceback is logged and nothing is
        yielded.
    '''
    self.logger.info('Start to parse event with url %s', response.url)
    try:
        item = EventFindaIterm()
        item['source'] = self.source
        item['source_url'] = response.url
        item['category'] = response.meta['category']
        item['title'] = response.xpath(
            '//div[@class="pane-content"]//div[@class="title"]/text()'
        ).extract_first()
        item['cover_url'] = response.xpath(
            '//div[@class="pane-content"]//ul[@class="slides"]//img/@src'
        ).extract_first()
        item['detail'] = ''.join(response.xpath(
            '//div[@class="content cpages"]//p//text()'
            ' | //span[@class="description"]//h2//text()').extract())

        # The "detail" rows hold, in order: date, location and
        # (optionally) the event website.
        more_detail = response.xpath(
            '//div[@id="NeedToKnowDetails"]'
            '/div[@class="column first"]/div[@class="detail"]')

        # Build the start time from the month and day parsed separately.
        month = more_detail[0].xpath(
            './/span[@class="value"]//span[@class="datetime_month"]/text()'
        ).extract_first()
        day_text = more_detail[0].xpath(
            './/span[@class="value"]//span[@class="datetime_date"]/text()'
        ).extract_first()
        # Keep only the digits of the two-character prefix, so that a
        # single-digit day with a suffix ("5th" -> "5t") still parses.
        day = ''.join(ch for ch in day_text.strip()[:2] if ch.isdigit())
        # NOTE(review): only month and day are read from the page, so the
        # current year is assumed; an event falling in the next calendar
        # year would be dated in this one -- confirm this is acceptable.
        year = datetime.datetime.now().year
        date_time = datetime.datetime.strptime(
            str(year) + month.strip() + day, '%Y%b%d')
        item['start_time'] = totimestamp(date_time)

        item['location'] = more_detail[1].xpath(
            './/span[@class="value"]//text()'
        ).extract_first().replace(u',', u'').strip()

        if len(more_detail) > 2:
            item['event_website'] = more_detail[2].xpath(
                './/span[@class="value"]//a/@href').extract_first()
        else:
            item['event_website'] = None

        ticket_s = ''.join(response.xpath(
            '//div[@id="NeedToKnowDetails"]'
            '/div[@class="column second"]/div[@class="detail"]//text()'
        ).extract())
        item['is_free'] = 1 if u'Free' in ticket_s else 0
    except Exception:
        # Only real extraction errors are reported; GeneratorExit and
        # KeyboardInterrupt are no longer swallowed and mislabelled.
        self.logger.warning(
            'Parse event with url %s fail.', response.url, exc_info=True)
        return

    # Yield outside the try block so that errors raised by the consumer
    # of this generator are not logged as parse failures.
    self.logger.info('Parse event with url %s successfully.', response.url)
    yield item