Exemple #1
0
  def _parse_show(self, link):
    """
    Fetch the event page at ``link`` and build a Show from it.

    Performers are read from the ``.headliners`` and ``.supports`` elements
    of the page (headliners first); the show time is parsed from the
    ``.dates`` and ``.times`` text found inside ``.event-detail``.  The
    link doubles as the show's merge key and show URL.
    """
    doc = html_util.fetch_and_parse(link, parse_500 = True)

    detail_el  = html_util.get_first_element(doc, ".event-detail")
    artists_el = html_util.get_first_element(doc, ".artist-boxes")

    date_txt = html_util.get_first_element(detail_el, ".dates").text_content()

    # (css selector, headliner flag) pairs, processed in billing order
    billing = (('.headliners', True), ('.supports', False))

    acts = []

    for selector, is_headliner in billing:
      for el in html_util.get_elements(doc, selector):
        names = lang_util.parse_performers(el.text_content())
        acts.extend(Performer(name, headliner = is_headliner) for name in names)

    show = Show()

    show.merge_key  = link
    show.venue      = self.venue()
    show.performers = acts

    time_txt       = html_util.get_first_element(detail_el, ".times").text_content()
    show.show_time = date_util.parse_date_and_time(date_txt, time_txt)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(detail_el, artists_el)

    # The image is optional; leave image_url untouched when there is none
    image_el = html_util.get_first_element(detail_el, "img", optional = True)

    if image_el is not None:
      show.resources.image_url = image_el.get('src')

    return show
  def _parse_performers(self, h):
    """
    Turn a heading element into a list of Performers.

    Each name parsed out of the element's text becomes one Performer,
    flagged as a headliner when the element is an ``h1`` or ``h2`` tag.
    """
    return [
      Performer(name, headliner = h.tag in ('h1', 'h2'))
      for name in lang_util.parse_performers(h.text_content())
    ]
Exemple #3
0
  def _parse_show(self, event_detail):
    """
    Build a Show from a single event element.

    Returns None when the element has no ``h2`` heading (the heading holds
    the date text).  Performers are parsed from the ``.caption`` text, and
    the first ``img`` found in the element, if any, supplies the image URL.
    """
    # Look the heading up once and reuse it for both the guard and the text
    date_el = html_util.get_first_element(event_detail, 'h2', optional = True)

    if date_el is None:
      return None

    show = Show()

    date_txt       = date_el.text_content()
    performers_txt = html_util.get_first_element(event_detail, '.caption').text_content()

    show.venue      = self.venue()
    show.performers = [Performer(p) for p in lang_util.parse_performers(performers_txt)]

    # Headings starting with "every" are not parsed as a date
    # (presumably recurring events -- confirm against the venue's listings)
    if not date_txt.lower().startswith('every'):
      # NOTE(review): other parsers in this codebase assign show.show_time;
      # confirm that show.date is really the attribute intended here.
      show.date = date_util.parse_date_and_time(date_txt, None)

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first image in document order is used
    first_img = next(iter(event_detail.iter(tag = 'img')), None)

    if first_img is not None:
      show.resources.image_url = first_img.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
Exemple #4
0
  def _parse_show(self, el):
    """
    Build a Show from one event listing element.

    The date comes from the first ``strong`` inside ``.event-details``; the
    full text of ``.event-details`` is handed to the show-time and
    door-time parsers.  Performers are parsed from the ``.event-name`` text.
    """
    details = html_util.get_first_element(el, '.event-details')

    date_txt = html_util.get_first_element(details, 'strong').text
    time_txt = details.text_content()

    show = Show()
    show.venue = self.venue()

    name_el         = html_util.get_first_element(details, '.event-name')
    names           = lang_util.parse_performers(name_el.text_content())
    show.performers = [Performer(name) for name in names]

    show.show_time = date_util.parse_show_time(date_txt, time_txt)
    show.door_time = date_util.parse_door_time(date_txt, time_txt)

    show.resources.resource_uris = self.resource_extractor.extract_resources(details)

    # The image sits outside .event-details, so search from the listing root
    image_el = html_util.get_first_element(el, ".event-image img", optional = True)

    if image_el is not None:
      show.resources.image_url = image_el.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
Exemple #5
0
  def _trans_show(self, show_data):
    """
    Translate one event record into a Show.

    ``show_data`` is a mapping read by key (``EventName``, ``MajorGenre``,
    ``EventId``, ``EventDate``, ``AttractionImage`` and, optionally,
    ``Canceled``).  Returns None for records that are not music events, are
    flagged as canceled, or are VIP package listings.  The first performer
    parsed from the event name is marked as the headliner.
    """
    event_name = show_data['EventName']

    LOG.debug("Checking event: %s" % event_name)

    # Guard clauses: each one filters out a kind of record we do not list
    if "Music" not in show_data['MajorGenre']:
      LOG.debug("Skipping non music show")
      return None

    if show_data.get('Canceled'):
      LOG.debug("Skipping cancelled show")
      return None

    if 'VIP Packages' in event_name:
      LOG.debug("Skipping VIP package")
      return None

    show = Show()

    # Position 0 in the parsed name list is treated as the headliner
    acts = [
      Performer(name, headliner = position == 0)
      for position, name in enumerate(lang_util.parse_performers(event_name))
    ]

    show.merge_key  = show_data['EventId']
    show.venue      = self.venue()
    show.performers = acts
    show.show_time  = date_util.parse_date_time(show_data['EventDate'])

    # Timezone localization is currently disabled:
    #if show.show_time:
    #  show.show_time = timezone(show_data['Timezone']).localize(show.show_time)

    images = show_data['AttractionImage']

    if images:
      show.resources.image_url = self._image_url(show_data, images[0])

    return show
Exemple #6
0
 def _parse_performers(self, el):
   """
   Convert a billing element into Performers based on its CSS class.

   A "headliners" element yields a single headliner built from its whole
   text; a "supports" element yields one Performer per parsed name; any
   other element yields an empty list.
   """
   is_headliner = html_util.has_class(el, "headliners")
   is_support   = html_util.has_class(el, "supports")

   if is_headliner:
     # Headliner text is used verbatim, not split into several names
     return [Performer(el.text_content(), headliner = True)]

   if is_support:
     return [Performer(name) for name in lang_util.parse_performers(el.text_content())]

   return []
    def _parse_show(self, link):
        """
        Fetch the event page at ``link`` and build a Show from it.

        The title is assembled from the leading run of ``h1``/``h2`` children
        of ``#eventDetail``; date and time information comes from the
        ``#timeDetail`` element.  The link doubles as the show's merge key
        and show URL.
        """
        event_doc = html_util.fetch_and_parse(link)

        event_detail = event_doc.get_element_by_id("eventDetail")

        title_txt = []

        found_h_el = False

        # Start parsing when we find the first h* element
        # Stop parsing if we found an h* element, but then encounter anything else
        for el in event_detail.getchildren():
            if el.tag in ("h1", "h2"):
                found_h_el = True

                heading_txt = el.text_content()

                if heading_txt:
                    title_txt.append(heading_txt)
            elif found_h_el:
                break

        # Sample of the markup being parsed:
        #
        #   <span id="timeDetail">
        #     Apr 24, 2010<br />
        #     upstairs<br />
        #     Doors @ 7 PM<br/>
        #     $15.00 Adv. / $20 at the Door<br />
        #     <a href="http://www.deadcellentertainment.tix.com/Schedule.asp?OrganizationNumber=2690" target="_blank">
        #       <img src="/images/buyticket.png" alt="Purchase Tickets" />
        #     </a>
        #   </span>
        time_el = event_detail.get_element_by_id("timeDetail")

        # .text is only the text before the first child element (the date line
        # in the sample above); text_content() is the whole block
        date_txt = time_el.text
        time_txt = time_el.text_content()

        show = Show()

        show.merge_key = link
        show.venue = self.venue()
        show.performers = [Performer(p) for p in lang_util.parse_performers("/".join(title_txt))]

        # Each field gets its matching parser (these were previously crossed)
        show.show_time = date_util.parse_show_time(date_txt, time_txt)
        show.door_time = date_util.parse_door_time(date_txt, time_txt)

        show.resources.show_url = link
        show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

        # The image is optional: without optional=True the None check below
        # could never be reached for a page that has no image
        img = html_util.get_first_element(event_detail, "img", optional=True)

        if img is not None:
            show.resources.image_url = img.get("src")

        return show
Exemple #8
0
  def _parse_show(self, show_txt):
    """
    Build a Show from one line of listing text.

    The text is split on ``self.SHOW_PART_SEP``: the first field is the
    date, the second the time, and the last the performer list.
    """
    fields = show_txt.split(self.SHOW_PART_SEP)

    date_txt       = fields[0]
    time_txt       = fields[1]
    performers_txt = fields[-1]

    show = Show()

    show.show_time  = date_util.parse_date_and_time(date_txt, time_txt)
    show.venue      = self.venue()

    names           = lang_util.parse_performers(performers_txt)
    show.performers = [Performer(name) for name in names]

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
Exemple #9
0
  def _process_entry(self, entry):
    """
    Convert a calendar entry into a Show, or return None.

    Only entries whose title matches ``self.BACK_ROOM_RE`` are kept; the
    matched portion is removed from the title before performers are parsed.
    The show time is taken from the entry's first ``when`` element.
    """
    raw_title  = entry.title.text
    start_time = entry.when[0].start_time

    logger.debug("Processing entry: %s, starting on: %s" % (raw_title, start_time))

    if not self.BACK_ROOM_RE.match(raw_title):
      return None

    performers_txt = self.BACK_ROOM_RE.sub('', raw_title)

    show = Show()

    show.venue      = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(performers_txt)]
    show.show_time  = date_util.parse_date_time(start_time)

    return show
Exemple #10
0
  def _parse_show(self, link):
    """
    Fetch the event page at ``link`` and build a Show from it.

    The merge key is the text matched by ``self.EVENT_ID`` in the link.  The
    page's ``#content h1`` header is split by ``self.HEADER_PARSE`` into a
    ``date`` and a ``title`` group; a date starting with "tonight" is
    replaced by today's date.

    Raises:
      Exception: when no event id can be found in the link, or when the
        header does not match ``self.HEADER_PARSE``.
    """
    # NOTE(review): raw_url is computed but never used below -- confirm
    # whether the fetch was meant to use it instead of ``link``.
    raw_url   = self.raw_url(link)

    match     = self.EVENT_ID.search(link)

    if not match:
      raise Exception("Unable to locate event id in: %s" % link)

    event_id = match.group(0)

    logging.debug('Fetching show info: %s' % link)

    event_doc = html_util.fetch_and_parse(link)

    show_el   = html_util.get_first_element(event_doc, '#content')

    header_el  = html_util.get_first_element(show_el, 'h1')
    header_txt = header_el.text_content()

    header_match = self.HEADER_PARSE.search(header_txt)

    if not header_match:
      raise Exception("Unable to parse header: %s" % header_txt)

    date_txt = header_match.group('date').strip()
    title    = header_match.group('title').strip()

    if date_txt.lower().startswith('tonight'):
      # isoformat() yields YYYY-MM-DD on every platform; the '%F' strftime
      # code produces the same text but is not available everywhere
      date_txt = datetime.today().date().isoformat()

    img   = html_util.get_first_element(show_el, 'img', optional = True)

    show = Show()

    show.performers = [Performer(p) for p in lang_util.parse_performers(title)]
    show.show_time  = date_util.parse_date_and_time(date_txt, None)

    show.merge_key = event_id
    show.venue     = self.venue()

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(show_el)

    if img is not None:
      show.resources.image_url = img.get('src')

    return show
Exemple #11
0
    def _parse_show(self, api, event):
        """
        Build a Show from an event record.

        ``event`` is read by key (``id``, ``name``, ``start_time`` and the
        optional ``description``).  The description is escaped and wrapped
        in a minimal HTML document so the resource extractor can be run
        over it.  ``api`` is accepted but not used here.
        """
        event_id = event["id"]

        LOG.debug("Parsing event: %s" % event_id)

        show = Show()

        show.merge_key = event_id
        show.venue = self.venue()

        names = lang_util.parse_performers(event["name"])
        show.performers = [Performer(name) for name in names]
        show.show_time = date_util.parse_date_time(event["start_time"])

        # A missing description falls back to an empty body
        description = cgi.escape(event.get("description", ""))
        wrapped = u"<html><body>%s</body></html>" % description
        description_doc = lxml.html.document_fromstring(wrapped)

        resources = show.resources
        resources.show_url = self.EVENT_URL % event_id
        resources.image_url = self.PICTURE_URL % event_id
        resources.resource_uris = self.resource_extractor.extract_resources(description_doc)

        return show
Exemple #12
0
  def _parse_show(self, url, section):
    """
    Build a Show from a listing ``section`` plus its detail page at ``url``.

    The start time is the ``datetime`` attribute of ``time.dtstart`` inside
    ``#detailPage`` on the fetched page; the performers come from the
    section's ``h4`` text.  Resources are extracted from both the section
    and the detail element.
    """
    detail_doc = html_util.fetch_and_parse(url)
    detail_el  = html_util.get_first_element(detail_doc, '#detailPage')

    start_el  = html_util.get_first_element(detail_el, 'time.dtstart')
    start_txt = start_el.get('datetime')

    heading_txt = html_util.get_first_element(section, 'h4').text_content()

    show = Show()

    show.merge_key  = url
    show.venue      = self.venue()
    show.performers = [Performer(name) for name in lang_util.parse_performers(heading_txt)]
    show.show_time  = date_util.parse_date_time(start_txt)

    show.resources.show_url      = url
    show.resources.resource_uris = self.resource_extractor.extract_resources(section, detail_el)

    return show
Exemple #13
0
    def _parse_show(self, link):
        """
        Fetch the event page at ``link`` and build a Show from it.

        The numeric event id is taken from the ``event_id`` group of
        ``self.IS_EVENT`` and used to locate the ``.tfly-event-id-<id>``
        element.  Headliner text is used verbatim; supporting acts are split
        into individual names.

        Raises:
            Exception: when ``link`` does not match ``self.IS_EVENT``.
        """
        # Validate the link before spending a network round trip on it; a
        # non-matching link used to fail later with an AttributeError on None
        match = self.IS_EVENT.match(link)

        if not match:
            raise Exception("Unable to locate event id in: %s" % link)

        event_doc = html_util.fetch_and_parse(link)

        event_id = int(match.group("event_id"))
        event_detail = html_util.get_first_element(event_doc, ".tfly-event-id-%d" % event_id)

        date_txt = html_util.get_first_element(event_doc, ".dates").text_content()
        time_txt = html_util.get_first_element(event_doc, ".times").text_content()

        # The image is optional: without optional=True the None check further
        # down could never be reached for a page that has no image
        img = html_util.get_first_element(event_detail, "img", optional=True)

        performers = [
            Performer(p.text_content(), headliner=True)
            for p in html_util.get_elements(event_detail, ".headliners")
        ]

        for p in html_util.get_elements(event_detail, ".supports"):
            for pi in lang_util.parse_performers(p.text_content()):
                performers.append(Performer(pi, headliner=False))

        show = Show()

        show.merge_key = link
        show.venue = self.venue()
        show.performers = performers
        show.show_time = date_util.parse_show_time(date_txt, time_txt)
        show.door_time = date_util.parse_door_time(date_txt, time_txt)

        show.resources.show_url = link
        show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

        if img is not None:
            show.resources.image_url = img.get("src")

        return show
Exemple #14
0
  def _parse_shows(self, entry):
    """
    Extract every show listed in a single feed entry.

    The entry's HTML content is cleaned of inline formatting tags, run
    through ``self.REPLACEMENTS``, and split into per-show chunks with
    ``self.SHOW_DIVIDER_RE``.  The first line of each chunk is a header
    holding the date and time; the remaining lines are the body, from which
    performers, the first image and resources are taken.

    Returns a list of Show objects.  The list is empty when the entry is too
    old or has no HTML content; chunks whose time or date cannot be parsed
    are logged and skipped.
    """
    content = None
    shows   = []
    today   = datetime.now()

    entry_date = date_util.parse_date_time(entry.published)

    # Only parse shows for the current year, or at the tail end of last year
    is_current_year   = entry_date.year == today.year
    is_late_last_year = entry_date.year == today.year - 1 and entry_date.month > 10

    if not (is_current_year or is_late_last_year):
      return []

    # When several HTML items are present the last one wins
    for item in entry.content:
      if item.type in ('text/html',):
        content = item.value

    if not content:
      logging.error('Unable to extract content from entry: %s' % entry.id)

      return []

    # This next part is technically pretty evil
    entry_doc = lxml.html.fromstring(content)

    tags = ['span', 'b', 'i', 'strong', 'em']

    cleaner = Cleaner(remove_tags = tags, links = False)

    clean_content = cleaner.clean_html(entry_doc)

    # FIXME patch lxml to handle this while calling text_content()
    # http://codespeak.net/pipermail/lxml-dev/2008-August/004009.html
    content_str = lxml.html.tostring(clean_content)

    for regexp, replacement in self.REPLACEMENTS:
      content_str = regexp.sub(replacement, content_str)

    for part in self.SHOW_DIVIDER_RE.split(content_str):
      part = part.strip(' \t\n')

      parts = part.split('\n')

      header = parts.pop(0)
      body   = '\n'.join(parts)

      header_parts = self.HEADER_SEP_RE.split(header)

      date_txt = header_parts.pop(0)
      time_txt = None

      # The first header field that looks like a time is used
      for header_part in header_parts:
        if date_util.STRICT_TIME_RE.search(header_part):
          time_txt = date_util.sanitize_time(header_part)

          break

      if not time_txt:
        logging.error('Unable to find time in header: %s' % header)

        continue

      # For a range such as "8-10" keep only the start
      if '-' in time_txt:
        time_txt = time_txt.split('-')[0].strip()

      # A time without an am/pm suffix is taken to be in the evening
      if not time_txt.endswith(('am', 'pm')):
        time_txt = time_txt + 'pm'

      show_doc = lxml.html.fromstring(body)

      # Two candidate performer sources are collected in one pass: the text
      # of the links only, and all text seen.  use_all selects the latter as
      # soon as the body contains anything besides links joined by simple
      # separators.
      use_all         = False
      performer_parts = []
      all_parts       = []

      for el in show_doc.iter():
        # Nothing at or after the first image is considered performer text
        if self._is_img(el):
          break

        text = el.text or ''
        tail = el.tail or ''

        for regexp in self.BODY_SKIP:
          if regexp.search(text):
            text = ''

          if regexp.search(tail):
            tail = ''

        for p in (text, tail):
          if p:
            all_parts.append(p)

        if text and el.tag != 'a':
          use_all = True

        if el.tag == 'a' and tail.strip() not in(',', '&', 'w/', ''):
          use_all = True

        if el.tag == 'a':
          performer_parts.append(text)

      img_url = None

      for img in show_doc.iter(tag = 'img'):
        if img.get('src'):
          img_url = img.get('src')

          break

      show = Show()

      show.venue = self.venue()

      if use_all:
        performers_str  = ' '.join(all_parts).replace(' ,', ',').replace('  ', ' ')

        show.performers = [Performer(name) for name in lang_util.parse_performers(performers_str)]
      else:
        show.performers = [Performer(name) for name in performer_parts if name]

      try:
        show.show_time = date_util.parse_date_and_time(date_txt, time_txt)
      except Exception:
        # Best effort: one unparseable date must not discard the other shows
        # in this entry.  Exception (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        logging.exception('Unable to parse: %s - %s' % (date_txt, time_txt))
        continue

      show.resources.image_url     = img_url
      show.resources.resource_uris = self.resource_extractor.extract_resources(show_doc)

      date_util.adjust_fuzzy_years(show, entry_date)

      shows.append(show)

    return shows