def parse_detail(self, response):
     '''
     Parse an event detail page and yield one populated ``EventFindaIterm``.

     Fields are read from the page with XPath. ``cover_url`` falls back to
     ``None`` when the cover link extractor finds nothing, and ``start_time``
     falls back to "now + 1 day" when ``self._parse_start_time`` returns a
     falsy value. Any other extraction failure is logged as a warning and
     no item is yielded for this page.

     :param response: the response for the event detail page.
     '''
     self.logger.info( 'Start to parse event with url = %s ', response.url)
     try:
         item = EventFindaIterm()
         item['title'] = response.xpath('//h1[@itemprop="name"]//text()').extract_first()
         item['source'] = self.source_site
         item['source_url'] = response.url

         cover = self.cover_url_extractor.extract_links(response)
         if cover:
             item['cover_url'] = cover[0].url
         else:
             # Report through the spider's own logger, like every other
             # message in this method, and let logging do the formatting.
             self.logger.error("no cover url found. event url=%s", response.url)
             item['cover_url'] = None

         # Default to tomorrow when the page gives no usable start time.
         start_time = self._parse_start_time(response) or (
             datetime.datetime.today() + datetime.timedelta(days=1))
         item['start_time'] = totimestamp(start_time)

         # extract_first() yields None when nothing matches; the .strip()
         # then raises and the event is skipped via the handler below.
         item['location'] = response.xpath('//div[@itemprop="location"]/text()').extract_first().strip()
         item['category'] = response.xpath('//div[@id="event-header"]//div[contains(@class, "label")]/text()').extract_first().strip()

         booking_text = u''.join(response.xpath('//div[@class="event-booknow"]//text()').extract())
         item['is_free'] = 1 if u'free' in booking_text.strip().lower() else 0

         item['detail'] = u'\n'.join(response.xpath('//article[@itemprop="description"]//text()').extract()).strip()
         item['event_website'] = None     # todo: not found now
     except Exception:
         # "except Exception" (not a bare except) so that GeneratorExit and
         # KeyboardInterrupt are not swallowed and misreported as failures.
         self.logger.warning('Parse event with url %s fail.', response.url, exc_info=True)
         return

     self.logger.info('Parse event with url %s successfully.', response.url)
     yield item
    def _process_start_time(self, dt):
        """
        Convert a human-readable event date string into a timestamp.

        Only the first date and the first time of day found in ``dt`` are
        used, i.e. the start of a range. A "day month year" date takes
        priority over a "month day year" date. When no time of day is
        present, "12:00am" is assumed.

        Possible datetime formats:
            Monday 14 March 2016 2:00pm
            Monday March 14 2016 – Sunday April 17 2016 10:00am – 7:00pm
            Tuesday 15 March 2016 10:00am – 12:30pm
            Monday March 14 2016 – Sunday March 20 2016
            Friday June  3 2016 \u2013 Sunday June  5 2016

        :param dt: text containing the event date(s).
        :return: ``totimestamp`` of the parsed start datetime, or ``None``
                 when no date can be found or parsed.
        """

        try:
            day_first_dates = re.findall(r"\d{1,2}\s+\w+\s+\d{4}", dt)
            month_first_dates = re.findall(r"\w+\s+\d{1,2}\s+\d{4}", dt)
            times = re.findall(r"\d{1,2}:\d{1,2}\w{2}", dt)

            if day_first_dates:
                date_str, date_format = day_first_dates[0], "%d %B %Y"
            elif month_first_dates:
                date_str, date_format = month_first_dates[0], "%B %d %Y"
            else:
                self.logger.error("Error while parsing start time: no date found in %r", dt)
                return None

            # A single time (e.g. "... 2016 2:00pm") is a valid start time;
            # fall back to midnight only when there is none at all.
            time_str = times[0] if times else u"12:00am"

            time_raw = datetime.datetime.strptime(date_str + u" " + time_str,
                                                  date_format + " %I:%M%p")
            return totimestamp(time_raw)
        except Exception as e:
            self.logger.error("Error while parsing start time: %s", e, exc_info=True)
            return None
 def _process_start_time(self, dt):
     '''
     Convert an ISO-8601-style date-time string into a timestamp.

     Expected input looks like: 2016-03-15T13:00:00+08:00

     NOTE(review): the UTC offset suffix is not captured by the pattern, so
     the wall-clock part is parsed as a naive datetime; confirm this is what
     ``totimestamp`` expects.

     :param dt: text containing the date-time.
     :return: ``totimestamp`` of the first date-time found in ``dt``.
     :raises ValueError: when no date-time can be found or converted.
     '''

     try:
         found = re.search(r'\d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{1,2}:\d{1,2}', dt)
         if found is None:
             raise ValueError('no date-time found in %r' % (dt,))
         time_raw = datetime.datetime.strptime(found.group(0), '%Y-%m-%dT%H:%M:%S')
         return totimestamp(time_raw)
     except Exception as e:
         # Log the underlying cause, then surface one uniform error type.
         self.logger.error('Error while parsing start time: %s', e, exc_info=True)
         raise ValueError('Cannot find start time.')
# Example no. 4
# 0
 def _process_start_time(self, dt):
     '''
     Convert a date string into a timestamp.

     Expected input contains a date such as: 2016-04-22

     :param dt: text containing the date.
     :return: ``totimestamp`` of the first date found in ``dt``, or ``None``
              when no date can be found or converted.
     '''

     try:
         found = re.search(r'\d{4}-\d{1,2}-\d{1,2}', dt)
         if found is None:
             raise ValueError('no date found')
         time_raw = datetime.datetime.strptime(found.group(0), '%Y-%m-%d')
         return totimestamp(time_raw)
     except Exception as e:
         self.logger.error('Error while parsing start time %s: %s', dt, e, exc_info=True)
         return None
 def parse_one_event(self, response):
     '''
     Parse an event detail page and yield one populated ``EventFindaIterm``.

     The category is taken from ``response.meta['category']``; all other
     fields are read from the page with XPath. If any step of the extraction
     raises, a warning is logged and no item is yielded for this page.

     :param response: the response for the event detail page.
     '''
     self.logger.info('Start to parse event with url %s', response.url)
     try:
         item = EventFindaIterm()

         item['source'] = self.source
         item['source_url'] = response.url
         item['category'] = response.meta['category']

         item['title'] = response.xpath('//div[@class="pane-content"]//div[@class="title"]/text()').extract_first()
         item['cover_url'] = response.xpath('//div[@class="pane-content"]//ul[@class="slides"]//img/@src').extract_first()
         item['detail'] = ''.join(response.xpath('//div[@class="content cpages"]//p//text() | //span[@class="description"]//h2//text()').extract())

         # The "need to know" rows hold, in order: date, location and
         # (optionally) the event website.
         more_detail = response.xpath('//div[@id="NeedToKnowDetails"]/div[@class="column first"]/div[@class="detail"]')

         # Build the start time from the month and day shown on the page.
         month = more_detail[0].xpath('.//span[@class="value"]//span[@class="datetime_month"]/text()').extract_first()
         day = more_detail[0].xpath('.//span[@class="value"]//span[@class="datetime_date"]/text()').extract_first()[0:2]
         # For a one-digit day the two-character slice also picks up the
         # following character (e.g. "3rd" -> "3r"); drop it so strptime
         # does not reject the value.
         if not day[-1:].isdigit():
             day = day[:-1]
         # NOTE(review): the page supplies no year here, so the current year
         # is assumed; events falling in the next year would be dated wrongly.
         year = datetime.datetime.now().year
         date_time = datetime.datetime.strptime(str(year) + month + day, '%Y%b%d')
         item['start_time'] = totimestamp(date_time)

         item['location'] = more_detail[1].xpath('.//span[@class="value"]//text()').extract_first().replace(u',', u'').strip()

         if len(more_detail) > 2:
             item['event_website'] = more_detail[2].xpath('.//span[@class="value"]//a/@href').extract_first()
         else:
             item['event_website'] = None

         ticket_s = ''.join(response.xpath('//div[@id="NeedToKnowDetails"]/div[@class="column second"]/div[@class="detail"]//text()').extract())
         item['is_free'] = 1 if u'Free' in ticket_s else 0
     except Exception:
         # "except Exception" (not a bare except) so that GeneratorExit and
         # KeyboardInterrupt are not swallowed and misreported as failures.
         self.logger.warning('Parse event with url %s fail.', response.url, exc_info=True)
         return

     self.logger.info('Parse event with url %s successfully.', response.url)
     yield item