Esempio n. 1
0
  def _parse_show(self, el):
    """Build a Show from a single listing element.

    Reads the '.event-details' child of `el` for the date, times, performer
    names and resource links, and takes the image from '.event-image img'
    when one is present.  Returns the populated Show.
    """
    details = html_util.get_first_element(el, '.event-details')

    # The date text is the first <strong>; the whole details text is handed
    # to the time parsers as the time text.
    day_text  = html_util.get_first_element(details, 'strong').text
    full_text = details.text_content()

    show       = Show()
    show.venue = self.venue()

    name_text = html_util.get_first_element(details, '.event-name').text_content()

    lineup = []
    for performer_name in lang_util.parse_performers(name_text):
      lineup.append(Performer(performer_name))

    show.performers = lineup
    show.show_time  = date_util.parse_show_time(day_text, full_text)
    show.door_time  = date_util.parse_door_time(day_text, full_text)

    show.resources.resource_uris = self.resource_extractor.extract_resources(details)

    image = html_util.get_first_element(el, ".event-image img", optional = True)
    if image is not None:
      show.resources.image_url = image.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
Esempio n. 2
0
  def _parse_show(self, event_detail):
    """Build a Show from one event element, or return None when it has no <h2>.

    The <h2> text is used as the date text and the '.caption' text as the
    performer list.  No date is set when the date text starts with 'every'.
    """
    heading = html_util.get_first_element(event_detail, 'h2', optional = True)
    if heading is None:
      return None

    show = Show()

    when_text   = heading.text_content()
    lineup_text = html_util.get_first_element(event_detail, '.caption').text_content()

    show.venue = self.venue()

    lineup = []
    for performer_name in lang_util.parse_performers(lineup_text):
      lineup.append(Performer(performer_name))
    show.performers = lineup

    starts_with_every = when_text.lower().startswith('every')
    if not starts_with_every:
      show.date = date_util.parse_date_and_time(when_text, None)

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> under the event element is used.
    first_image = next(iter(event_detail.iter(tag = 'img')), None)
    if first_image is not None:
      show.resources.image_url = first_image.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
Esempio n. 3
0
  def _parse_show(self, link):
    """Fetch the event page at `link` and build a Show from it.

    Performers come from the '.headliners' and '.supports' elements of the
    page, the show time from the '.dates' and '.times' text of '.event-detail',
    and resources from '.event-detail' plus '.artist-boxes'.
    """
    page = html_util.fetch_and_parse(link, parse_500 = True)

    detail  = html_util.get_first_element(page, ".event-detail")
    artists = html_util.get_first_element(page, ".artist-boxes")

    when_text = html_util.get_first_element(detail, ".dates").text_content()

    # All headliners are listed before any supporting act.
    lineup = []
    for selector, is_headliner in (('.headliners', True), ('.supports', False)):
      for el in html_util.get_elements(page, selector):
        names = lang_util.parse_performers(el.text_content())
        lineup.extend([Performer(name, headliner = is_headliner) for name in names])

    show = Show()

    show.merge_key  = link
    show.venue      = self.venue()
    show.performers = lineup

    times_text     = html_util.get_first_element(detail, ".times").text_content()
    show.show_time = date_util.parse_date_and_time(when_text, times_text)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(detail, artists)

    image = html_util.get_first_element(detail, "img", optional = True)
    if image is not None:
      show.resources.image_url = image.get('src')

    return show
Esempio n. 4
0
  def _parse_show(self, link):
    """Fetch the show page at `link` and build a Show from its '.biglisting'.

    Each '.showpage-details' section contributes one performer: the name is the
    text of the section's heading child and the start time comes from its
    '.time' element.  A performer is a headliner when its heading is an <h1>.

    Raises:
      Exception: if DATE_RE does not match the text of the '.date' element.
    """
    logging.debug('Parsing show from: %s' % link)

    event_doc = html_util.fetch_and_parse(link)

    event = html_util.get_first_element(event_doc, '.biglisting')
    img   = html_util.get_first_element(event, '.tonightinfo img', optional = True)

    date_el    = html_util.get_first_element(event, '.date')
    date_match = self.DATE_RE.search(date_el.text_content())

    if not date_match:
      raise Exception('Unable to determine show date from: %s' % date_el.text_content())

    date_txt = date_match.group(0)

    performers = []
    first_time = None

    for det in event.cssselect('.showpage-details'):
      header = None

      # When a section has several heading children, the last one wins.
      for child in det.getchildren():
        if child.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
          header = child

      if header is None:
        logger.error('Unable to determine performer')
        continue

      time_txt   = html_util.get_first_element(det, '.time').text_content()
      time_match = date_util.TIME_RE.search(time_txt)

      if time_match:
        # NOTE(review): this is overwritten on every match, so despite its name
        # it ends up holding the time of the LAST section that had one.
        # Confirm whether that is the intended show time.
        first_time = time_txt = time_match.group('time')
      else:
        time_txt = None

      # Compare with == : the original `in ('h1')` was a substring test on a
      # plain string, not a tuple membership test.
      is_headliner = header.tag == 'h1'

      performers.append(Performer(header.text_content(), start_time = time_txt, headliner = is_headliner))

    show = Show()

    show.merge_key  = link
    show.venue      = self.venue()
    show.performers = performers
    show.show_time  = date_util.parse_date_and_time(date_txt, first_time)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event)

    if img is not None:
      show.resources.image_url = img.get('src')

    return show
Esempio n. 5
0
  def _parse_show(self, link):
    """Fetch the show page at `link` and build a Show from its 'mainColumn'.

    Every non-empty <h1> becomes a performer, the merge key is the 'page_id'
    group of EVENT_URL matched against `link`, and the show time is the parsed
    '.date' text with the hour forced to 21.  The image is the first <img>
    whose src contains 'main'.
    """
    LOG.debug("Fetching show: %s" % link)

    event_doc    = html_util.fetch_and_parse(link)

    event_detail = event_doc.get_element_by_id("mainColumn")

    show = Show()

    for performer in html_util.get_elements(event_detail, 'h1'):
      name = performer.text_content().strip(' \n\r\t')
      if name:
        show.performers.append(Performer(name))

    date_txt = html_util.get_first_element(event_detail, '.date').text_content()

    # NOTE(review): a link that does not match EVENT_URL makes the .group()
    # call below fail with AttributeError -- confirm callers only pass event links.
    event_match = self.EVENT_URL.match(link)

    show.merge_key = event_match.group('page_id')
    show.venue     = self.venue()
    show.show_time = date_util.parse_date_time(date_txt).replace(hour = 21)

    LOG.debug('Date: %s' % show.date)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    for img_tag in event_detail.iter(tag = 'img'):
      # Default to '' so an <img> without a src attribute is skipped instead
      # of raising TypeError on the `in` test.
      src = img_tag.get('src', '')

      if 'main' in src:
        show.resources.image_url = src

        break

    return show
Esempio n. 6
0
  def _fetch_profile(self, profile_id):
    """Fetch a myspace profile page and determine its layout version.

    Returns a tuple (version, html, doc).  For a v1 profile (body class
    contains 'layout_0_1') the html/doc are those of the profile's "classic"
    page, located via the friend id found in the first page.  For a v2 profile
    (body class contains 'layout_0_2') they are those of the profile page.

    Raises:
      Exception: if the friend id of a v1 profile cannot be found, or the
        body class matches neither layout.
    """
    profile_link = 'http://www.myspace.com/' + profile_id

    logger.debug('Fetching profile page: %s' % profile_link)

    html, doc = self._fetch_and_parse(profile_link)

    body = parsing.get_first_element(doc, 'body')

    # A <body> without a class attribute yields None; treat that as "no layout
    # marker" so the explicit error below is raised instead of a TypeError.
    body_class = body.get('class') or ''

    if 'layout_0_1' in body_class:
      logger.debug('%s is v1 profile' % profile_id)

      friend_id = FRIEND_ID_RE.search(html)

      if not friend_id:
        raise Exception("Unable to determine friend id for v1 myspace profile: %s" % profile_id)

      classic_profile_link = 'http://www.myspace.com/%s/classic' % friend_id.group(1)

      logger.debug('Fetching classic profile page: %s' % classic_profile_link)

      new_html, new_doc = self._fetch_and_parse(classic_profile_link)

      return (1, new_html, new_doc)

    if 'layout_0_2' in body_class:
      # This log line used to sit after the final raise, where it was
      # unreachable and referenced an undefined `profile` name.
      logger.debug('%s is v2 profile' % profile_id)

      return (2, html, doc)

    raise Exception('Unable to determine myspace profile version: %s' % profile_id)
Esempio n. 7
0
  def _parse_show(self, link):
    """Fetch the event page at `link` and build a Show from its '#content'.

    The <h1> text is split by HEADER_PARSE into a date and a title; the title
    is parsed into performers and the date into the show time.  A date that
    starts with 'tonight' is replaced by today's date.

    Raises:
      Exception: if EVENT_ID does not match `link`, or HEADER_PARSE does not
        match the <h1> text.
    """
    # NOTE(review): the result of raw_url() was never used; the call is kept
    # in case it validates the link or has side effects -- confirm and remove.
    self.raw_url(link)

    match = self.EVENT_ID.search(link)

    if not match:
      raise Exception("Unable to locate event id in: %s" % link)

    event_id = match.group(0)

    logging.debug('Fetching show info: %s' % link)

    event_doc = html_util.fetch_and_parse(link)

    show_el   = html_util.get_first_element(event_doc, '#content')

    header_el = html_util.get_first_element(show_el, 'h1')

    header_match = self.HEADER_PARSE.search(header_el.text_content())

    if not header_match:
      raise Exception("Unable to parse header: %s" % header_el.text_content())

    date_txt = header_match.group('date').strip()
    title    = header_match.group('title').strip()

    if date_txt.lower().startswith('tonight'):
      # isoformat() yields YYYY-MM-DD on every platform; the '%F' strftime
      # directive is a C-library extension that is not available everywhere.
      date_txt = datetime.today().date().isoformat()

    img = html_util.get_first_element(show_el, 'img', optional = True)

    show = Show()

    show.performers = [Performer(p) for p in lang_util.parse_performers(title)]
    show.show_time  = date_util.parse_date_and_time(date_txt, None)

    show.merge_key = event_id
    show.venue     = self.venue()

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(show_el)

    if img is not None:
      show.resources.image_url = img.get('src')

    return show
Esempio n. 8
0
  def _parse_v2(self, doc):
    """Collect resources from every '.htmlBoxModule' in '.content.contentMid'.

    Returns an ArtistProfileParserResult wrapping the extracted resources.
    """
    mid_column = parsing.get_first_element(doc, '.content.contentMid')

    # Materialise the boxes so they can be passed as positional arguments.
    boxes = [box for box in parsing.get_elements(mid_column, '.htmlBoxModule')]

    return ArtistProfileParserResult(self.resource_extractor.extract_resources(*boxes))
Esempio n. 9
0
  def _month_parser(self, request_date):
    """Yield the shows listed on the calendar page for request_date's month.

    The page url is built from BASE_URL and the lower-cased '%b%y' form of
    `request_date`.  Shows without a door or show time, or without a title or
    performers, are logged and discarded.
    """
    month_url = '%scalendar/%s.html' % (self.BASE_URL, request_date.strftime('%b%y').lower())

    logger.debug('Parsing: %s' % month_url)

    doc = html_util.fetch_and_parse(month_url)

    main_table = html_util.get_first_element(doc, 'body > table')

    # Drop the "header" (first) and "footer" (last) rows of the table
    trs = main_table.getchildren()[1:-1]

    # The remaining rows come in pairs that look as follows.
    # The Show Details/Time pairing might be repeated
    #
    #   <tr><td>Monday 4th</td></tr>
    #   <tr><td>
    #     <center>
    #       <p>Show Details</p>
    #       <p>Show Time</p>
    #     </center>
    #   </td></tr>

    if len(trs) % 2 != 0:
      # Previously an unpaired trailing row aborted the generator with an
      # IndexError after the shows before it had been yielded.
      logger.warning('Calendar at %s has an unpaired trailing row, ignoring it' % month_url)

    for date_tr, show_tr in zip(trs[0::2], trs[1::2]):
      date_row, show_row = date_tr.getchildren(), show_tr.getchildren()

      # At the end of the month, the html doesn't always contain a corresponding table
      # cell in the show_row for each date in the date_row; zip stops at the shorter row
      for date_td, show_td in zip(date_row, show_row):
        date_match = self.DATE_RE.match(date_td.text_content().strip())

        if not date_match:
          continue

        show_date = request_date.replace(day = int(date_match.group('day')))

        p_list = show_td.cssselect('center > p')

        # Cells whose paragraphs do not pair up as details/time are skipped
        if len(p_list) % 2 != 0:
          continue

        for show_detail, show_time in zip(p_list[0::2], p_list[1::2]):
          show = self._parse_show(show_date, show_detail, show_time)

          if not (show.door_time or show.show_time):
            logger.warning('Unable to determine door or show time for show on %s, discarding' % show_date)
          elif not (show.title or len(show.performers) > 0):
            logger.warning('Unable to determine title or performers for show on %s, discarding' % show_date)
          else:
            yield show
Esempio n. 10
0
  def _parse_show(self, url, section):
    """Fetch the detail page at `url` and build a Show from it and `section`.

    The show time is the 'datetime' attribute of 'time.dtstart' inside
    '#detailPage'; the performers are parsed from the <h4> text of `section`.
    """
    page   = html_util.fetch_and_parse(url)
    detail = html_util.get_first_element(page, '#detailPage')

    start_attr = html_util.get_first_element(detail, 'time.dtstart').get('datetime')
    heading    = html_util.get_first_element(section, 'h4').text_content()

    lineup = []
    for performer_name in lang_util.parse_performers(heading):
      lineup.append(Performer(performer_name))

    show = Show()

    show.merge_key  = url
    show.venue      = self.venue()
    show.performers = lineup
    show.show_time  = date_util.parse_date_time(start_attr)

    resources = show.resources
    resources.show_url      = url
    resources.resource_uris = self.resource_extractor.extract_resources(section, detail)

    return show
Esempio n. 11
0
    def _parse_show(self, link):
        """Fetch the event page at `link` and build a Show from 'eventDetail'.

        The performer text is the '/'-joined text of the first run of <h1>/<h2>
        children of 'eventDetail'.  Door and show times are parsed from the
        'timeDetail' element.
        """
        event_doc = html_util.fetch_and_parse(link)

        event_detail = event_doc.get_element_by_id("eventDetail")

        title_txt = []

        found_h_el = False

        # Start parsing when we find the first h* element
        # Stop parsing if we found an h* element, but then encounter anything else
        for el in event_detail.getchildren():
            if el.tag in ("h1", "h2"):
                found_h_el = True

                if el.text_content():
                    title_txt.append(el.text_content())
            elif found_h_el:
                break

        # Sample of the element parsed below:
        #
        #   <span id="timeDetail">
        #     Apr 24, 2010<br />
        #     upstairs<br />
        #     Doors @ 7 PM<br/>
        #     $15.00 Adv. / $20 at the Door<br />
        #     <a href="http://www.deadcellentertainment.tix.com/Schedule.asp?OrganizationNumber=2690" target="_blank">
        #       <img src="/images/buyticket.png" alt="Purchase Tickets" />
        #     </a>
        #   </span>
        time_el = event_detail.get_element_by_id("timeDetail")
        # .text is only the text before the first child (the date line);
        # text_content() is the full text, which includes the door time.
        date_txt = time_el.text
        time_txt = time_el.text_content()

        show = Show()

        show.merge_key = link
        show.venue = self.venue()
        show.performers = [Performer(p) for p in lang_util.parse_performers("/".join(title_txt))]
        # These two assignments were swapped: door_time received the result of
        # parse_show_time and show_time the result of parse_door_time.
        show.door_time = date_util.parse_door_time(date_txt, time_txt)
        show.show_time = date_util.parse_show_time(date_txt, time_txt)

        show.resources.show_url = link
        show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

        # optional = True so the None check below is meaningful when the page
        # has no image, as the other venue parsers do.
        img = html_util.get_first_element(event_detail, "img", optional = True)

        if img is not None:
            show.resources.image_url = img.get("src")

        return show
Esempio n. 12
0
  def _month_parser(self, request_date):
    """Yield every show found on the calendar page for request_date's month.

    Each 'td.has-events' cell of the '.month-view table' is handed to
    _parse_shows together with `request_date`.
    """
    year, month = request_date.year, request_date.month
    month_url   = '%scalendar/%d-%d' % (self.BASE_URL, year, month)

    logger.debug('Parsing: %s' % month_url)

    page     = html_util.fetch_and_parse(month_url)
    calendar = html_util.get_first_element(page, '.month-view table')

    for day_cell in html_util.get_elements(calendar, 'td.has-events'):
      for parsed_show in self._parse_shows(request_date, day_cell):
        yield parsed_show
Esempio n. 13
0
  def _get_parser(self):
    """Yield a show for each listing line that matches SHOW_START_RE.

    The listing is the displayed text of the '.defaultText' element on the
    BASE_URL page, split on newlines.  Lines for which _parse_show returns a
    falsy value are skipped.
    """
    page    = html_util.fetch_and_parse(self.BASE_URL)
    listing = html_util.get_first_element(page, '.defaultText')
    text    = html_util.get_displayed_text_content(listing).strip()

    for line in text.split('\n'):
      if not self.SHOW_START_RE.match(line):
        continue

      parsed = self._parse_show(line)

      if parsed:
        yield parsed
Esempio n. 14
0
  def _parse_show(self, link):
    """Fetch the event page at `link` and build a Show from its '#details'.

    Date and time text come from the '.info' element; performers come from
    the h1-h4 headings of '#details' via _parse_performers, grouped by
    heading level (all h1 first, then h2, ...).
    """
    event_doc = html_util.fetch_and_parse(link)

    event        = html_util.get_first_element(event_doc, '#tfly-center-column-wide')
    event_detail = html_util.get_first_element(event,     '#details')

    # Sample of the '.info' element:
    #
    #   <div class="info">
    #     Sat, May 22, 2010<br />
    #     Doors: 6:00 PM / Show: 7:00 PM&nbsp;<br />
    #     $5.00<br />
    #   </div>
    event_info   = html_util.get_first_element(event_detail, ".info")

    # The date is the text before the first child; the times are the tail of
    # that first child (the text right after the first <br /> in the sample).
    date_txt     = event_info.text
    time_txt     = event_info.getchildren()[0].tail

    # optional = True so the None check below is meaningful when the page has
    # no image, as the other venue parsers do.
    img          = html_util.get_first_element(event_detail, "img", optional = True)

    performers = []

    for tag in ('h1', 'h2', 'h3', 'h4'):
      for h in event_detail.iter(tag = tag):
        performers.extend(self._parse_performers(h))

    show = Show()

    show.merge_key  = link
    show.venue      = self.venue()
    show.performers = performers
    show.show_time  = date_util.parse_show_time(date_txt, time_txt)
    show.door_time  = date_util.parse_door_time(date_txt, time_txt)

    show.resources.show_url      = link
    show.resources.resource_uris = self.resource_extractor.extract_resources(event)

    if img is not None:
      show.resources.image_url  = img.get('src')

    return show
Esempio n. 15
0
  def _parse_show(self, el):
    """Build a Show from one calendar element, or return None if it recurs.

    The '.calendardates' text (with its 'small' spans removed) is split on
    ',' into date and time; the '.calendar' text is split on '/' into
    performer names.  Events whose date text contains 'every' are skipped.

    Raises:
      ValueError: if the date text does not contain exactly one ','.
    """
    date_el = html_util.get_first_element(el, '.calendardates')

    # Snapshot the spans first: removing elements from the tree while
    # iterating over it can make the iterator skip elements.
    for span in list(date_el.iter(tag = 'span')):
      if span.get('class') == 'small':
        span.getparent().remove(span)

    date_txt = date_el.text_content().lower()

    # Skip recurring events
    if 'every' in date_txt:
      return None

    date_txt, time_txt = date_txt.split(',')

    title_el = html_util.get_first_element(el, '.calendar')

    performers = [Performer(name) for name in title_el.text_content().split('/')]

    show = Show()

    show.venue      = self.venue()
    show.performers = performers
    show.show_time  = date_util.parse_date_and_time(date_txt, time_txt)

    show.resources.resource_uris = self.resource_extractor.extract_resources(el)

    # No break: when several images match IMAGE_RE the last one is kept.
    for img in el.iter(tag = 'img'):
      image_match = self.IMAGE_RE.search(img.get('src', ''))

      logging.debug('image: %s - %s' % (img.get('src'), image_match))

      if image_match:
        show.resources.image_url = img.get('src')

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
Esempio n. 16
0
  def _parse_show(self, link, show_section):
    """Fetch the show page at `link` and build a Show from it and `show_section`.

    The date comes from the '.date' element of `show_section`; the image and
    performers come from the fetched page's 'content' element.  Resources are
    extracted from `show_section`, the page content and whatever
    fetch_performer_content returns for each performer link that matches
    IS_ARTIST_URL_RE.
    """
    show_doc    = html_util.fetch_and_parse(link)

    show_detail = show_doc.get_element_by_id("content")

    # NOTE(review): title is not used below; the lookup is kept because it may
    # be what rejects pages without a '.title' element -- confirm and remove.
    title       = html_util.get_first_element(show_detail, '.title').text

    date_txt    = html_util.get_first_element(show_section, '.date').text
    image_url   = html_util.get_first_element(show_detail,  '.left-view-header img').get('src')

    performers = []

    performer_detail = html_util.get_first_element(show_detail, '.performers')
    performer_urls   = []

    for anchor in performer_detail.iter(tag = 'a'):
      performers.extend(self._parse_performers(anchor))

      # An anchor without an href yields None, which re.match() rejects with
      # a TypeError; skip such anchors instead.
      href = anchor.get('href')

      if href and self.IS_ARTIST_URL_RE.match(href):
        performer_urls.append(href)

    resource_sections = [show_section, show_detail]

    for url in performer_urls:
      resource_sections.extend(self.fetch_performer_content(url))

    show = Show()

    show.merge_key               = link
    show.venue                   = self.venue()
    show.performers              = performers
    show.show_time               = date_util.parse_date_time(date_txt)
    show.resources.show_url      = link
    show.resources.image_url     = image_url
    show.resources.resource_uris = self.resource_extractor.extract_resources(*resource_sections)

    return show
Esempio n. 17
0
  def _parse_show(self, link, show_section):
    """Fetch the show page at `link` and build a Show from '#content .event-detail'.

    Door and show times are parsed from the '.dates' and '.times' text, the
    show is flagged sold out when a '.sold-out' element exists, and performers
    come from the non-empty h1-h3 headings via _parse_performers.
    `show_section` is not used here.
    """
    page   = html_util.fetch_and_parse(link)
    detail = html_util.get_first_element(page, "#content .event-detail")

    dates_text  = html_util.get_first_element(detail, ".dates").text_content()
    times_text  = html_util.get_first_element(detail, ".times").text_content()
    sold_out_el = html_util.get_first_element(detail, '.sold-out', optional = True)
    first_img   = html_util.get_first_element(detail, 'img',       optional = True)

    # The image we want is generally the first one, but if the layout changes this may break
    picture_url = first_img.get('src') if first_img is not None else None

    lineup = []

    for heading_tag in ('h1', 'h2', 'h3'):
      for heading in detail.iter(tag = heading_tag):
        if not heading.text_content():
          continue

        lineup.extend(self._parse_performers(heading))

    show = Show()

    show.merge_key  = link
    show.venue      = self.venue()
    show.performers = lineup
    show.door_time  = date_util.parse_door_time(dates_text, times_text)
    show.show_time  = date_util.parse_show_time(dates_text, times_text)
    show.soldout    = sold_out_el is not None

    resources = show.resources
    resources.show_url      = link
    resources.image_url     = picture_url
    resources.resource_uris = self.resource_extractor.extract_resources(detail)

    return show
Esempio n. 18
0
    def _parse_show(self, link):
        """Fetch the event page at `link` and build a Show from it.

        The event element is '.tfly-event-id-<id>', where <id> is the
        'event_id' group of IS_EVENT matched against `link`.  Each
        '.headliners' element becomes one headlining performer; the text of
        each '.supports' element is split into supporting performers.
        """
        event_doc = html_util.fetch_and_parse(link)

        # NOTE(review): a link that does not match IS_EVENT makes the .group()
        # call below fail with AttributeError -- confirm callers only pass
        # event links.
        match = self.IS_EVENT.match(link)

        event_id = int(match.group("event_id"))
        event_detail = html_util.get_first_element(event_doc, ".tfly-event-id-%d" % event_id)

        date_txt = html_util.get_first_element(event_doc, ".dates").text_content()
        time_txt = html_util.get_first_element(event_doc, ".times").text_content()

        # optional = True so the None check below is meaningful when the page
        # has no image, as the other venue parsers do.
        img = html_util.get_first_element(event_detail, "img", optional = True)

        performers = []

        # Headliner text is used as-is, without lang_util.parse_performers.
        for p in html_util.get_elements(event_detail, ".headliners"):
            performers.append(Performer(p.text_content(), headliner=True))

        for p in html_util.get_elements(event_detail, ".supports"):
            for pi in lang_util.parse_performers(p.text_content()):
                performers.append(Performer(pi, headliner=False))

        show = Show()

        show.merge_key = link
        show.venue = self.venue()
        show.performers = performers
        show.show_time = date_util.parse_show_time(date_txt, time_txt)
        show.door_time = date_util.parse_door_time(date_txt, time_txt)

        show.resources.show_url = link
        show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

        if img is not None:
            show.resources.image_url = img.get("src")

        return show
Esempio n. 19
0
  def parse(self, artist, profile):
    """Parse the myspace profile identified by profile.profile_id.

    Dispatches to _parse_v1 or _parse_v2 according to the profile version
    reported by the MySpaceProfile api object.  `artist` is not used here.

    Raises:
      Exception: if the reported profile version is neither 1 nor 2.
    """
    msp_profile = api.MySpaceProfile(profile.profile_id)
    doc         = msp_profile.get_profile_doc()

    # The looked-up <body> element itself is not used below.
    body = parsing.get_first_element(doc, 'body')

    self._resolve_offsite_links(doc)

    if msp_profile.get_profile_version() == 1:
      logger.debug('%s is v1 profile' % profile.profile_id)

      return self._parse_v1(doc)

    if msp_profile.get_profile_version() != 2:
      raise Exception('Unable to determine myspace profile version')

    logger.debug('%s is v2 profile' % profile.profile_id)

    return self._parse_v2(doc)
Esempio n. 20
0
 def _parse_shows(self, base_date, td):
   """Return the shows found in one calendar cell.

   The day number is read from the '.day' element of `td` and applied to
   `base_date`.  At most two shows are returned: one built from the
   '.lr_color a' links and one from the '.googie_color a' links.
   """
   day = int(html_util.get_first_element(td, '.day').text_content())

   # Named show_date so it cannot shadow an imported `date` type.
   show_date = base_date.replace(day = day)

   # '%Y-%m-%d' is the portable spelling of '%F', which is a C-library
   # extension that is not available on every platform.
   logger.debug('Parsing shows on %s' % show_date.strftime('%Y-%m-%d'))

   lr_shows     = html_util.get_elements(td, '.lr_color a')
   googie_shows = html_util.get_elements(td, '.googie_color a')

   shows = []

   # NOTE(review): these truthiness checks assume get_elements returns a
   # sequence; a generator would always be truthy -- confirm.
   if lr_shows:
     shows.append(self._parse_show(show_date, lr_shows))

   if googie_shows:
     shows.append(self._parse_show(show_date, googie_shows))

   return shows
Esempio n. 21
0
  def _parse_show(self, show_url, event_detail, today):
    """Build a Show from one event element.

    Performers are the non-anchor '#unionhall_performer' elements, the first
    of which is marked as headliner.  The merge key is the href of
    '#ticket_link a' when present.  The show time is set only when both
    DATE_RE and TIME_RE match the 'unionhall_date' text.

    NOTE(review): `show_url` and `today` are not used; the year of the show
    date comes from datetime.now() -- confirm whether `today` was meant here.
    """
    show = Show()

    # Union hall will have duplicate instances of #unionhall_performer
    # some may or may not have links, but those that do have links are tagged
    # with the same id again ie: <div id="unionhall_performer"><a href="#" id="#unionhall_performer"> ...
    performers = [Performer(p.text_content()) for p in event_detail.cssselect("#unionhall_performer") if p.tag != 'a']

    # Guard against an event with no performer elements, which previously
    # raised IndexError here.
    if performers:
      performers[0].headliner = True

    ticket_link = html_util.get_first_element(event_detail, '#ticket_link a', optional = True)

    show.venue      = self.venue()
    show.performers = performers

    if ticket_link is not None:
      show.merge_key = ticket_link.get('href')

    # Format: THU 3/25: 6pm / $15
    date_tag   = event_detail.get_element_by_id("unionhall_date")

    date_match = self.DATE_RE.match(date_tag.text_content())
    time_match = self.TIME_RE.search(date_tag.text_content())

    if date_match and time_match:
      month, day = (int(d) for d in (date_match.group('month'), date_match.group('day')))

      show_date = datetime.now().replace(month = month, day = day)

      # '%Y-%m-%d' is the portable spelling of '%F', which is a C-library
      # extension that is not available on every platform.
      show.show_time = date_util.parse_date_and_time(show_date.strftime('%Y-%m-%d'), time_match.group('time'))

    show.resources.resource_uris = self.resource_extractor.extract_resources(event_detail)

    # Only the first <img> under the event element is used.
    for img_tag in event_detail.iter(tag = 'img'):
      show.resources.image_url = img_tag.get('src')

      break

    date_util.adjust_fuzzy_years(show, self._parse_started)

    return show
Esempio n. 22
0
  def _parse_artist(self, a):
    """Split an artist anchor into a (name, start_time, artist_el) tuple.

    When TIME_RE matches at the start of the anchor text, start_time is its
    'time' group and the name is the text with every TIME_RE match removed;
    otherwise start_time is None and the name is the full text.  artist_el is
    the '#bleh' element of the page the anchor links to, or None when the
    anchor has no href.
    """
    label = a.text_content()

    start_time, name = None, label

    leading_time = self.TIME_RE.match(label)
    if leading_time:
      start_time = leading_time.group('time')
      name       = self.TIME_RE.sub('', label)

    href      = a.get('href')
    artist_el = None

    if href:
      artist_page = html_util.fetch_and_parse(href)
      artist_el   = html_util.get_first_element(artist_page, '#bleh')

    logging.debug('Artist (%s) name: %s from (%s)' % (start_time, name, label))

    return (name, start_time, artist_el)