# pollstar -- module-level deps: import_util; get_url_base() and parse_page()
# are defined in this module (parse_page appears below).
def events():
    events = {}
    soup = import_util.url_to_soup(get_url_base())
    body = soup.find('body', dict(bgcolor='#FFFFFF'))
    contents = body.findAll('b')
    # find the "1" page link, then count its siblings up to the "next" marker
    for content in contents:
        if content.string == "1":
            page_count = 1
            break
    while True:
        next = content.findNextSibling()
        if next.string in ("&gt;", ">"):  # "next page" marker, entity-encoded or not
            break
        page_count += 1
        content = next
    # merge each page's {(venue, date): [artists]} mapping
    for page in xrange(1, page_count + 1):
        partial_events = parse_page(page)
        for key, artists in partial_events.iteritems():
            event_artists = events.get(key, list())
            event_artists.extend(artists)
            events[key] = event_artists
    # convert to std struct for return to import module
    for key, artists in events.iteritems():
        venue, date = key
        event = dict(source="pollstar")
        event['artists'] = artists
        event['date'] = date
        event['name'] = ", ".join(artists)
        event['venue'] = dict(name=venue)
        # see importers.py import_to_db() for expected layout
        yield event
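# Hedged smoke test, not part of the original module: print the first few
# events to eyeball the struct that importers.py expects.
if __name__ == "__main__":
    import itertools
    for event in itertools.islice(events(), 5):
        print "%s | %s @ %s" % (event['date'], event['name'], event['venue']['name'])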
# ticketswest -- module-level deps: re, time, datetime, import_util;
# globals: base_url, page_url, venue_list (pairs of (venue name, site code)).
def events(venues=venue_list):
    for venue_name, code in venues:
        soup = import_util.url_to_soup(base_url + page_url + code)
        try:
            table = soup("table", attrs={"class": "eventDataGrid"})[0]
        except IndexError:
            continue  # no events table for this venue
        trs = table("tr")[1:-1]  # skip header and footer rows
        venue = dict(name=venue_name)
        for tr in trs:
            event = dict(source="ticketswest")
            name_td = tr("td", attrs={"class": "borderTopRight paddedLeft"})[0]
            name = name_td.a.string
            # strip a trailing " at <venue>" from the event name
            match = re.compile(r' [Aa][Tt] ').search(name)
            if match:
                name = name[:match.start()]
            event['artists'] = name.split("*")[0].split(",")
            # drop empty entries, then trim whitespace
            event['artists'] = [a for a in event['artists'] if len(a)]
            event['artists'] = [a.strip() for a in event['artists']]
            event['name'] = ", ".join(event['artists'])
            date_td = tr("td", attrs={"class": "borderTopRight"})[0]
            if not date_td.string:
                continue
            weekday, date_str, time_str, ampm = date_td.string.strip().split()
            event['date'] = datetime.date(*time.strptime(date_str, "%m/%d/%y")[:3])
            if time_str.endswith(":00"):
                time_str = time_str.split(":")[0]  # "8:00" -> "8"
            event['time'] = time_str + " " + ampm
            link_td = tr("td")[3]
            # BeautifulSoup 3 attrs is a list of (name, value) pairs; the
            # second one here is the href
            event['ticket_url'] = base_url + link_td.div.a.attrs[1][1]
            event['venue'] = venue
            yield event
# mercury -- module-level deps: import_util; date_to_url() is defined in
# this module.
def event_urls_for_date(date):
    page = 1
    while True:
        soup = import_util.url_to_soup(date_to_url(date) + "&page=%s" % page)
        events = soup.findAll("div", "EventListing clearfix")
        if not events:
            break
        for event in events:
            yield event.div.h3.find("a", recursive=False)['href']
        page += 1
# lastfm -- module-level deps: re, import_util, and `from datetime import
# date`; global: lastfm_event_url.
def events():
    # page count comes from the "last page" link on page 1
    soup = import_util.url_to_soup(lastfm_event_url + "1")
    pages = int(soup("a", attrs={"class": "lastpage"})[0].string)
    for page in range(1, pages + 1):
        soup = import_util.url_to_soup(lastfm_event_url + str(page))
        for tr in soup("tr", attrs={"class": re.compile("vevent.*")}):
            event_dict = dict(source="lastfm")
            venue = dict()
            venue["name"] = tr("td", attrs={"class": "location"})[0].a.strong.string
            # the row's first attribute value ends in dd-mm-yyyy
            day, month, year = tr.attrs[0][1].split("-")[-3:]
            event_dict["date"] = date(int(year), int(month), int(day))
            # headliner is in <strong>; other artists follow as plain text
            artists = [tr("td", attrs={"class": "lineup"})[0].a.strong.string]
            try:
                other_artists = tr("td", attrs={"class": "lineup"})[0].a.contents[1].string
                artists.extend(other_artists.split(","))
            except IndexError:
                pass
            event_dict["name"] = ", ".join(artists)
            event_dict["artists"] = artists
            event_dict["venue"] = venue
            yield event_dict
# lastfm -- module-level deps: urllib, import_util; global: base_url.
def artist_info(artist_name, count=3):
    info = dict(img_url=None, similars=[], tags=None)
    if artist_name.find("/") != -1:
        # can't handle / in name, punt
        return info
    artist_name = urllib.quote_plus(artist_name.encode("utf8"))
    try:
        # get img_url and similars
        soup = import_util.url_to_soup(base_url + artist_name + "/similar.xml")
        if str(soup).startswith("No artist"):
            return info
        # lastfm includes a "noimage" link if no img found. don't want!
        if soup.similarartists["picture"].find("noimage") == -1:
            info["img_url"] = urllib.unquote(soup.similarartists["picture"])
        info["similars"] = [x.find("name").string.strip()
                            for x in soup.findAll("artist")[:count]]
        # get tags
        soup = import_util.url_to_soup(base_url + artist_name + "/toptags.xml")
        tags = [x.find("name").string for x in soup.findAll("tag")[:count]]
        tags = [x for x in tags if x.lower().find("seen") == -1]
        if len(tags):
            info["tags"] = " / ".join(tags)[:100]
    finally:
        # the return in finally deliberately swallows any exception, so
        # callers always get a (possibly partial) info dict
        return info
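# Hedged usage sketch (the artist name is illustrative): artist_info()
# always returns a dict, even on network errors, per the return in finally.
if __name__ == "__main__":
    info = artist_info("Radiohead")
    print info["img_url"]    # image URL string, or None
    print info["similars"]   # up to `count` similar-artist names
    print info["tags"]       # e.g. "rock / indie / folk", or None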
# google geocoder -- module-level deps: urllib, import_util, and `from
# decimal import Decimal`; global: key (Google Maps API key).
def get_geocode(address):
    base_req = "http://maps.google.com/maps/geo?"
    address_enc = urllib.quote_plus(address)
    full_req = base_req + "key=%s&" % key + "output=xml&" + "q=%s" % address_enc
    soup = import_util.url_to_soup(full_req)
    if soup.response.code.string != "200":
        raise IOError
    # return reversed for [lat, lon] instead of [x, y] (which is lon, lat)
    l = list(reversed(soup.response.placemark.point.coordinates.string.split(",")[:2]))
    l = [Decimal(c) for c in l]
    try:
        zip_code = (soup.response.placemark.addressdetails.country
                    .administrativearea.subadministrativearea.locality
                    .postalcode.postalcodenumber.string)
    except AttributeError:
        zip_code = None
    l.append(zip_code)
    return l
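# Hedged usage sketch (assumes the module-level Google Maps API `key` is
# valid; the address is illustrative):
if __name__ == "__main__":
    lat, lon, zip_code = get_geocode("1221 SW 4th Ave, Portland, OR")
    print lat, lon, zip_code  # two Decimals plus a zip string (or None)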
# wweek -- module-level deps: re, import_util; date_to_url() and
# parse_event() are defined in this module.
def day_events(date):
    soup = import_util.url_to_soup(date_to_url(date))
    # find all anchors with e.g. name="42820"
    anchors = soup('a', {'name': re.compile(r"\d+")})
    for anchor in anchors:
        small = anchor.findNextSibling("small")
        event = dict(date=date, source="wweek")
        # venue line looks like "| <address>, <phone>"
        venue_stuff = small.contents[1].strip("| ")
        address, phone = venue_stuff.rsplit(",", 1)
        event['venue'] = dict(name=small.b.string, address=address, phone=phone)
        event_name_span = small.findNextSibling("span", "headout_event")
        event_name = import_util.stringify(event_name_span)
        if len(event_name) < 2:
            continue
        event['name'], event['artists'] = parse_event(event_name)
        yield event
# amazon -- module-level deps: urllib, import_util; globals: baseurl,
# access_key, affiliate_tag.
def recordings(name, count=5):
    """Return names of albums by a given artist on Amazon.

    Amazon only returns 10 results, so setting count higher will have no
    effect."""
    req = {}
    req['SearchIndex'] = "Music"
    req['Service'] = "AWSECommerceService"
    req['AWSAccessKeyId'] = access_key
    req['AssociateTag'] = affiliate_tag
    req['Operation'] = "ItemSearch"
    req['Version'] = "2007-07-16"
    req['ResponseGroup'] = "Images,Medium"
    req['Sort'] = 'salesrank'
    req['Artist'] = urllib.quote(name.encode('utf8'))
    params = "&".join([key + "=" + value for key, value in req.items()])
    url = baseurl + "?" + params
    soup = import_util.url_to_soup(url)
    if soup.itemsearchresponse.items.request.isvalid.string != "True":
        return
    # do something if it took too long?
    processing_time = float(
        soup.itemsearchresponse.operationrequest.requestprocessingtime.string)
    items = soup.itemsearchresponse.items.findAll('item')
    for item in items[:count]:
        # skip results whose artist doesn't match (or that have no artist)
        try:
            if item.artist.string.lower() != name.lower():
                continue
        except AttributeError:
            continue
        a = {}
        a['name'] = item.title.string
        a['url'] = item.detailpageurl.string
        try:
            info = item.itemattributes.format.string
            a['name'] += " (%s)" % info
        except AttributeError:
            pass
        try:
            a['img_url'] = item.smallimage.url.string
        except AttributeError:
            a['img_url'] = None
        yield a
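# Hedged usage sketch (the artist name is illustrative): recordings() yields
# one small dict per album, already filtered to exact artist matches.
if __name__ == "__main__":
    for album in recordings("Neko Case"):
        print album['name'], album['url'], album['img_url']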
# pollstar -- module-level deps: time, datetime, import_util; get_url_base()
# is defined in this module.
def parse_page(num):
    soup = import_util.url_to_soup(get_url_base() + str(num))
    body = soup.find('body', dict(bgcolor='#FFFFFF'))
    content = body.find('table', {'class': "content"})
    # event rows start after the dark-blue header row
    prev_tr = content.find("tr", height="20", bgcolor="00036E")
    start_tr = prev_tr.findNextSibling()
    events = {}
    for tr in start_tr.findNextSiblings():
        tds = tr.findAll("td")
        try:
            date = datetime.date(*time.strptime(tds[1].string.strip(), "%m/%d/%y")[:3])
            artist = tds[3].a.string
            venue = tds[5].a.string
            # group artists playing the same venue on the same date
            event_artists = events.get((venue, date), list())
            event_artists.append(artist)
            events[(venue, date)] = event_artists
        except IndexError:
            pass  # not an event row
    return events
# cdbaby -- module-level deps: urllib, import_util; globals: baseurl,
# affiliate_extension.
def recordings(name):
    """Return names of albums by a given artist on CD Baby."""
    quoted_name = urllib.quote_plus(name.encode('utf8'))
    url = baseurl + "/found?artist=" + quoted_name
    try:
        soup = import_util.url_to_soup(url)
    except Exception:
        return
    albumlist = soup.find('ul', "albumlist")
    # result pages precede the album list with a "Partial matches..." header
    if not albumlist or not albumlist.previousSibling.string.startswith("Partial"):
        return
    albums = albumlist.findAll('div', "albumbox")
    for album in albums:
        a = {}
        # link title is "<artist>: <album>"
        artist_name, album_name = album.a['title'].split(": ", 1)
        artist_name = import_util.unescape(artist_name)
        a['name'] = import_util.unescape(album_name)
        a['url'] = baseurl + album.a['href'] + affiliate_extension
        a['img_url'] = album.a.img['src']
        if artist_name.lower() == name.lower():
            yield a
# mercury -- module-level deps: import_util; event_urls_for_date() is
# defined in this module (above).
def day_events(date):
    for url in event_urls_for_date(date):
        soup = import_util.url_to_soup(url)
        event = dict(source="mercury")
        event['date'] = date
        lt = soup.find("h1", "listingTitle")
        # the features-icons div pollutes the title text; drop it
        crap = lt.find("div", "FeaturesIcons")
        if crap:
            crap.extract()
        event['name'] = import_util.stringify(lt).strip()
        el = soup.find("div", id="EventLocation")
        event['venue'] = dict(name=el.ul.li.h4.find("a", recursive=False).string)
        event['artists'] = []
        # titles look like "Event name: Artist, Artist, ..."
        artists_str = event['name']
        if artists_str.find(":") != -1:
            event['name'], artists_str = artists_str.split(":", 1)
            event['artists'] = artists_str.split(",")
            event['artists'] = [a.strip() for a in event['artists']]
        yield event
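# Hedged smoke test, not part of the original module: list today's events.
if __name__ == "__main__":
    import datetime
    for event in day_events(datetime.date.today()):
        print event['name'], "@", event['venue']['name'], event['artists']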
# hostip.info -- module-level deps: import_util; global: base_url.
def info(ip):
    soup = import_util.url_to_soup(base_url + ip)
    # hostip returns GML "lon,lat" order; swap into lat/lon for the caller
    lon, lat = soup.hostiplookupresultset.find("gml:coordinates").string.split(",")
    city = soup.find("gml:featuremember").find("gml:name").string
    return dict(lat=lat, lon=lon, city=city)
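# Hedged usage sketch; the IP is illustrative.
if __name__ == "__main__":
    print info("4.2.2.2")  # -> {'lat': ..., 'lon': ..., 'city': ...}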