def release_date(self): try: if self.data: string = self.data['release_date'] return parseDateString(string) except KeyError: pass return None
def release_date(self):
    """Return the parsed release date stored under ``self.data['releaseDate']``.

    Returns whatever ``parseDateString`` produces for that value, or ``None``
    when a ``KeyError`` is raised (for example because the key is absent).
    """
    try:
        raw = self.data['releaseDate']
        parsed = parseDateString(raw)
    except KeyError:
        parsed = None
    return parsed
def upgradeEntityData(entityData):
    """Convert an old-format entity document into a new-style entity object.

    The old document is consumed destructively: values are popped out of
    ``entityData`` (and its nested ``sources`` / ``details`` dicts) as they
    are copied over, and a leading ``'_id'`` key is replaced by a string
    ``'entity_id'``.

    Args:
        entityData: dict holding the old entity representation. It must
            contain a ``'subcategory'`` key.

    Returns:
        The object created by ``getEntityObjectFromKind(kind)()``, populated
        from the old data.

    Raises:
        KeyError: if ``'subcategory'`` is missing (logged, then re-raised).

    NOTE(review): this body was re-indented from whitespace-flattened source.
    The nesting of the user-generated ``subtitle`` / ``', US'`` address
    cleanup and of the ``'tombstone'`` guard in ``setBasicGroup`` should be
    confirmed against the original file.
    """
    # Just to be explicit..
    old = entityData

    if '_id' in old:
        old['entity_id'] = str(old['_id'])
        del(old['_id'])

    try:
        kind = deriveKindFromOldSubcategory(old['subcategory'])
        types = deriveTypesFromOldSubcategories([old['subcategory']])
    except KeyError as e:
        logs.warning("Malformed entity data: missing '%s'" % e)
        raise

    # An 'other' entity that carries lat/lng coordinates is treated as a place.
    if kind == 'other' and 'coordinates' in old and 'lat' in old['coordinates'] and 'lng' in old['coordinates']:
        kind = 'place'
        types = [ 'establishment' ]

    new = getEntityObjectFromKind(kind)()

    # Default timestamp for migrated fields: derived from the ObjectId when
    # possible, otherwise "now".
    try:
        seedTimestamp = ObjectId(old['entity_id']).generation_time.replace(tzinfo=None)
    except Exception as e:
        logs.warning("Unable to convert ObjectId to timestamp: %s" % e)
        seedTimestamp = datetime.utcnow()

    def setBasicGroup(source, target, oldName, newName=None, oldSuffix=None, newSuffix=None, additionalSuffixes=None, seed=True):
        """Move one value (plus its _source/_timestamp companions) from source to target.

        The value is popped from ``source`` under ``oldName`` (or
        ``oldName_oldSuffix``) and set on ``target`` as ``newName`` (or
        ``newName_newSuffix``). Nothing is set when the value is absent/None.
        """
        if newName is None:
            newName = oldName

        if oldSuffix is None:
            item = source.pop(oldName, None)
        else:
            item = source.pop('%s_%s' % (oldName, oldSuffix), None)

        if item is None:
            return

        # Manual conversions...
        if oldName == 'track_length':
            # Keep only the integer part of values such as "123.0".
            try:
                item = int(str(item).split('.')[0])
            except Exception as e:
                logs.warning("Unable to set length (%s) as integer: %s" % (item, e))

        if newSuffix is None:
            setattr(target, newName, item)
        else:
            setattr(target, '%s_%s' % (newName, newSuffix), item)

        sourceName = 'seed' if seed else 'format'

        # NOTE(review): only the _source attribute is skipped for 'tombstone';
        # confirm the timestamp was not meant to be guarded as well.
        if newName != 'tombstone':
            setattr(target, '%s_source' % newName, source.pop('%s_source' % oldName, sourceName))
        setattr(target, '%s_timestamp' % newName, source.pop('%s_timestamp' % oldName, seedTimestamp))

        if additionalSuffixes is not None:
            for s in additionalSuffixes:
                t = source.pop('%s_%s' % (oldName, s), None)
                if t is not None:
                    setattr(target, '%s_%s' % (newName, s), t)

    def setListGroup(source, target, oldName, newName=None, delimiter=',', wrapper=None, seed=True):
        """Split a delimited string from source into a list attribute on target.

        Each stripped piece is stored as-is, or as the ``title`` of a fresh
        ``wrapper()`` object when ``wrapper`` is given.
        """
        if newName is None:
            newName = oldName

        item = source.pop(oldName, None)
        if item is None:
            return

        items = []
        for i in item.split(delimiter):
            if wrapper is not None:
                entityMini = wrapper()
                entityMini.title = i.strip()
                items.append(entityMini)
            else:
                items.append(i.strip())
        setattr(target, newName, items)

        sourceName = 'seed' if seed else 'format'

        setattr(target, '%s_source' % newName, source.pop('%s_source' % oldName, sourceName))
        setattr(target, '%s_timestamp' % newName, source.pop('%s_timestamp' % oldName, seedTimestamp))

    sources = old.pop('sources', {})
    details = old.pop('details', {})
    timestamp = old.pop('timestamp', {'created' : seedTimestamp})
    place = details.pop('place', {})
    contact = details.pop('contact', {})
    restaurant = details.pop('restaurant', {})
    media = details.pop('media', {})
    video = details.pop('video', {})
    artist = details.pop('artist', {})
    album = details.pop('album', {})
    song = details.pop('song', {})
    book = details.pop('book', {})
    netflix = sources.pop('netflix', {})
    thetvdb = sources.pop('thetvdb', {})

    # General
    new.schema_version = 0
    new.entity_id = old.pop('entity_id', None)
    new.title = old.pop('title', None)
    new.timestamp = BasicTimestamp().dataImport(timestamp)

    # Images: use the first available image, in this priority order.
    netflixImages = netflix.pop('images', {})
    oldImages = [
        old.pop('image', None),
        media.pop('artwork_url', None),
        netflixImages.pop('hd', None),
        netflixImages.pop('large', None),
    ]
    for oldImage in oldImages:
        if oldImage is not None:
            image = ImageSchema()
            size = ImageSizeSchema()
            size.url = oldImage
            image.sizes = [ size ]
            new.images = [ image ]
            break

    if new.images is not None and len(new.images) > 0:
        new.images_source = 'seed'
        new.images_timestamp = seedTimestamp

    setBasicGroup(old, new, 'desc')

    subcategory = old['subcategory']
    if subcategory == 'song':
        subcategory = 'track'
    new.types += (subcategory,)

    # Sources
    setBasicGroup(sources, new.sources, 'spotify', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources, new.sources, 'rdio', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources, new.sources, 'fandango', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources, new.sources, 'stamped', 'tombstone', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources.pop('tmdb', {}), new.sources, 'tmdb', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources.pop('factual', {}), new.sources, 'factual', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    # TODO: Add factual_crosswalk
    setBasicGroup(sources.pop('singleplatform', {}), new.sources, 'singleplatform', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])

    # Apple / iTunes
    setBasicGroup(sources, new.sources, 'itunes', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    if new.sources.itunes_id is None:
        apple = sources.pop('apple', {})
        setBasicGroup(apple, new.sources, 'aid', 'itunes', newSuffix='id')
        setBasicGroup(apple, new.sources, 'view_url', 'itunes', newSuffix='url')

    # Amazon
    setBasicGroup(sources, new.sources, 'amazon', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    if new.sources.amazon_id is None:
        amazon = sources.pop('amazon', {})
        setBasicGroup(amazon, new.sources, 'asin', 'amazon', newSuffix='id')
        setBasicGroup(amazon, new.sources, 'amazon_link', 'amazon', newSuffix='url')

    # Netflix
    if netflix:
        setBasicGroup(netflix, new.sources, 'nid', 'netflix', newSuffix='id')
        setBasicGroup(netflix, new.sources, 'nurl', 'netflix', newSuffix='url')

    # TheTVDB
    if 'thetvdb_id' in thetvdb:
        setBasicGroup(thetvdb, new.sources, 'thetvdb_id', 'thetvdb', newSuffix='id')

    # OpenTable
    setBasicGroup(sources, new.sources, 'opentable', oldSuffix='id', newSuffix='id', additionalSuffixes=['nickname', 'url'])
    if new.sources.opentable_id is None:
        setBasicGroup(sources.pop('openTable', {}), new.sources, 'rid', 'opentable', newSuffix='id', additionalSuffixes=['url'])

    # Google Places
    googleplaces = sources.pop('googlePlaces', {})
    setBasicGroup(googleplaces, new.sources, 'googleplaces', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(googleplaces, new.sources, 'gid', 'googleplaces', newSuffix='id')
    setBasicGroup(googleplaces, new.sources, 'reference', 'googleplaces', newSuffix='reference')

    # User Generated
    userGenerated = sources.pop('userGenerated', {}).pop('generated_by', None)
    if userGenerated is not None:
        new.sources.user_generated_id = userGenerated
        if 'created' in timestamp:
            new.sources.user_generated_timestamp = timestamp['created']
        else:
            new.sources.user_generated_timestamp = seedTimestamp

        subtitle = old.pop('subtitle', None)
        if subtitle is not None:
            new.sources.user_generated_subtitle = subtitle

        # Bug fix: Some custom entities had country passed from client w/out intentional user input. Delete!
        if 'address' in place and place['address'] == ', US':
            del(place['address'])
            if 'coordinates' in old:
                del(old['coordinates'])

    # Contacts
    setBasicGroup(contact, new, 'phone')
    setBasicGroup(contact, new, 'site')
    setBasicGroup(contact, new, 'email')
    setBasicGroup(contact, new, 'fax')

    # Places
    if kind == 'place':
        coordinates = old.pop('coordinates', None)
        if coordinates is not None:
            new.coordinates = Coordinates().dataImport(coordinates)

        addressComponents = ['locality', 'postcode', 'region', 'street', 'street_ext']
        setBasicGroup(place, new, 'address', 'address', oldSuffix='country', newSuffix='country', additionalSuffixes=addressComponents, seed=False)
        setBasicGroup(place, new, 'address', 'formatted_address')

        if 'hours' in place:
            place['hours'] = TimesSchema().dataImport(place['hours'], overflow=True)
        setBasicGroup(place, new, 'hours', seed=False)
        setBasicGroup(restaurant, new, 'menu', seed=False)
        setBasicGroup(restaurant, new, 'price_range', seed=False)
        setBasicGroup(restaurant, new, 'alcohol_flag', seed=False)
        setListGroup(restaurant, new, 'cuisine', seed=False)

    # Artist
    if kind == 'person':
        # The loop variables below are deliberately not called `song`, so the
        # `song` details dict read by the Track section is never clobbered.
        itunesSource = False
        newSongs = []
        for songData in artist.pop('songs', []):
            entityMini = MediaItemEntityMini()
            entityMini.title = songData['song_name']
            entityMini.kind = 'media_item'
            entityMini.types = [ 'track' ]
            if 'id' in songData and 'source' in songData and songData['source'] == 'itunes':
                itunesSource = True
                entityMini.sources.itunes_id = songData['id']
                entityMini.sources.itunes_source = 'itunes'
                entityMini.sources.itunes_timestamp = songData.pop('timestamp', seedTimestamp)
            newSongs.append(entityMini)
        if newSongs:
            new.tracks = newSongs
            sourceName = 'itunes' if itunesSource else 'format'
            new.tracks_source = artist.pop('songs_source', sourceName)
            new.tracks_timestamp = artist.pop('songs_timestamp', seedTimestamp)

        itunesSource = False
        newAlbums = []
        for albumData in artist.pop('albums', []):
            entityMini = MediaCollectionEntityMini()
            entityMini.title = albumData['album_name']
            if 'id' in albumData and 'source' in albumData and albumData['source'] == 'itunes':
                # Mirror the songs loop: remember that iTunes supplied an
                # album so the default albums_source below can be 'itunes'.
                itunesSource = True
                entityMini.sources.itunes_id = albumData['id']
                entityMini.sources.itunes_source = 'itunes'
                entityMini.sources.itunes_timestamp = albumData.pop('timestamp', seedTimestamp)
            newAlbums.append(entityMini)
        if newAlbums:
            new.albums = newAlbums
            sourceName = 'itunes' if itunesSource else 'format'
            new.albums_source = artist.pop('albums_source', sourceName)
            new.albums_timestamp = artist.pop('albums_timestamp', seedTimestamp)

        setListGroup(media, new, 'genre', 'genres', seed=False)

    # General Media
    if kind in ['media_collection', 'media_item']:
        setBasicGroup(media, new, 'track_length', 'length')
        setBasicGroup(media, new, 'mpaa_rating', seed=False)
        setBasicGroup(media, new, 'release_date')
        setListGroup(media, new, 'genre', 'genres', seed=False)
        setListGroup(media, new, 'artist_display_name', 'artists', wrapper=PersonEntityMini, seed=False)
        setListGroup(video, new, 'cast', 'cast', wrapper=PersonEntityMini, seed=False)
        setListGroup(video, new, 'director', 'directors', wrapper=PersonEntityMini, seed=False)
        setListGroup(video, new, 'network_name', 'networks', wrapper=PersonEntityMini, seed=False)

        # Fall back to the original release date when no release date was set.
        originalReleaseDate = parseDateString(media.pop('original_release_date', None))
        if new.release_date is None and originalReleaseDate is not None:
            new.release_date = originalReleaseDate
            new.release_date_source = 'seed'
            new.release_date_timestamp = seedTimestamp

    # Book
    if 'book' in types:
        setBasicGroup(book, new, 'isbn')
        setBasicGroup(book, new, 'sku_number')
        setBasicGroup(book, new, 'num_pages', 'length', seed=False)
        setListGroup(book, new, 'author', 'authors', wrapper=PersonEntityMini, seed=False)
        # NOTE(review): old key 'publishers' -> new attribute 'publisher' looks
        # swapped relative to 'author' -> 'authors'; confirm against the schema.
        setListGroup(book, new, 'publishers', 'publisher', wrapper=PersonEntityMini, seed=False)

    # Album
    if 'album' in types:
        newSongs = []
        for trackTitle in album.pop('tracks', []):
            entityMini = MediaItemEntityMini()
            entityMini.title = trackTitle
            newSongs.append(entityMini)
        if newSongs:
            new.tracks = newSongs
            new.tracks_source = album.pop('songs_source', 'format')
            new.tracks_timestamp = album.pop('songs_timestamp', seedTimestamp)

    # Track
    if 'track' in types:
        # `song` here is the details['song'] dict popped near the top.
        albumName = song.pop('album_name', media.pop('album_name', None))
        if albumName is not None:
            entityMini = MediaCollectionEntityMini()
            entityMini.title = albumName
            albumId = song.pop('song_album_id', None)
            if albumId is not None:
                entityMini.sources.itunes_id = albumId
                entityMini.sources.itunes_source = 'seed'
                entityMini.sources.itunes_timestamp = seedTimestamp
            new.albums = [ entityMini ]
            new.albums_source = song.pop('album_name_source', 'format')
            new.albums_timestamp = song.pop('album_name_timestamp', seedTimestamp)

    # Apps
    if 'app' in types:
        setBasicGroup(media, new, 'release_date', seed=False)
        setListGroup(media, new, 'artist_display_name', 'authors', wrapper=PersonEntityMini, seed=False)

        newScreenshots = []
        for screenshot in media.pop('screenshots', []):
            imageSchema = ImageSchema()
            imageSizeSchema = ImageSizeSchema()
            imageSizeSchema.url = screenshot
            imageSchema.sizes = [ imageSizeSchema ]
            newScreenshots.append(imageSchema)
        if newScreenshots:
            new.screenshots = newScreenshots
            new.screenshots_source = media.pop('screenshots_source', 'format')
            new.screenshots_timestamp = media.pop('screenshots_timestamp', seedTimestamp)

    return new
def _parse_feed(self, feed_url):
    """Parse a Fandango feed and return one MediaItemEntity per movie entry.

    For every feed entry (except the 'More Movies' placeholder) an entity is
    built from the feed data, then the movie's overview page on fandango.com
    is fetched to scrape release date, runtime, MPAA rating, cast, directors
    and genres on a best-effort basis: scraping failures are printed via
    ``utils.printException`` and the entity is still returned.

    Args:
        feed_url: URL handed to ``feedparser.parse``.

    Returns:
        list of MediaItemEntity objects, in feed order.

    Raises:
        AssertionError: if an entry id does not match the expected
            ``.../<digits>`` pattern.
    """
    if self._verbose:
        utils.log("[%s] parsing feed %s" % (self, feed_url))

    data = feedparser.parse(feed_url)

    # Raw strings keep the patterns unchanged while avoiding invalid
    # string-escape warnings for '\/', '\$' and '\|'.
    id_r = re.compile(r'.*\/([0-9]*)$')
    title_r = re.compile(r'^([0-9][0-9]?). (.*) \$[0-9.M]*')
    genre_re = re.compile(r'Genres:(.*)$')
    length_re = re.compile(r'([0-9]+) *hr. *([0-9]+) min.')
    info_res = [
        re.compile(r'[A-Za-z]+ ([^|]+) \| Runtime:(.+)$'),
        re.compile(r'Opens [A-Za-z]+ ([^|]+) \| Runtime:(.+)$'),
        re.compile(r'Opens [A-Za-z]+ ([^|]+) *$'),
    ]

    output = []
    source = "fandango"
    ts = datetime.utcnow()

    def _set_entity(entity, key, value):
        """Set key on entity, plus key_source / key_timestamp where they exist."""
        setattr(entity, key, value)
        try:
            setattr(entity, "%s_source" % key, source)
        except AttributeError:
            pass
        try:
            setattr(entity, "%s_timestamp" % key, ts)
        except AttributeError:
            pass

    def _people(anchors):
        """Wrap the text of each anchor tag in a PersonEntityMini."""
        return [PersonEntityMini().dataImport({ 'title' : a.getText(), }) for a in anchors]

    for entry in data.entries:
        if entry.title == 'More Movies':
            continue

        fid_match = id_r.match(entry.id)
        assert fid_match is not None
        fid = fid_match.groups()[0]

        # Titles may look like "<rank>. <title> $<gross>"; strip the decoration.
        title = entry.title
        title_match = title_r.match(title)
        if title_match:
            title = title_match.groups()[1]

        entity = MediaItemEntity()
        setattr(entity.sources, "fandango_id", fid)
        setattr(entity.sources, "fandango_source", source)
        setattr(entity.sources, "fandango_timestamp", ts)
        setattr(entity, "title", title)
        _set_entity(entity, "types", [ "movie", ])
        _set_entity(entity, "desc", entry.summary)

        for link in entry.links:
            if 'image' in link.type:
                # fandango gives us low resolution 69x103 versions of the image, so hackily up the
                # resolution before saving the entity :)
                url = link.href.replace('69/103', '375/375').replace('69x103', '375x375')
                size = ImageSizeSchema()
                size.url = url
                image = ImageSchema()
                image.sizes = [ size ]
                _set_entity(entity, "images", [ image ])
                break

        f_url = "%s" % entry.link
        f_url = f_url.replace('%26m%3d', '%3fpid=5348839%26m%3d')
        setattr(entity.sources, "fandango_url", f_url)

        # attempt to scrape some extra details from fandango's movie page
        # (join instead of filter() so the slug stays a string on Python 3 too)
        slug = ''.join(ch for ch in entity.title if ch.isalnum())
        url = "http://www.fandango.com/%s_%s/movieoverview" % (slug, fid)

        try:
            if self._verbose:
                utils.log(url)

            response, content = service_request('fandango', 'GET', url)
            soup = BeautifulSoup(content)
            info = soup.find('div', {'id' : 'info'}).findAll('li')[1].getText()

            try:
                release_date, runtime = None, None
                for info_re in info_res:
                    match = info_re.match(info)
                    if match is not None:
                        groups = match.groups()
                        release_date = groups[0]
                        if len(groups) == 2:
                            runtime = groups[1]
                        break

                if release_date is not None:
                    release_date = parseDateString(release_date)
                    _set_entity(entity, "release_date", release_date)

                if runtime is not None:
                    # strip() so whitespace after "Runtime:" does not defeat
                    # the start-anchored match.
                    match = length_re.match(runtime.strip())
                    if match is not None:
                        hours, minutes = match.groups()
                        seconds = 60 * (int(minutes) + 60 * int(hours))
                        # Fix: store the computed runtime, not the release date.
                        _set_entity(entity, "length", seconds)
            except Exception:
                utils.printException()

            try:
                mpaa_rating = soup.find('div', {'class' : re.compile('rating_icn')}).getText()
                # Fix: the original passed an undefined name here, so the
                # rating was never stored.
                _set_entity(entity, 'mpaa_rating', mpaa_rating)
            except Exception:
                # Best effort: pages without a rating icon are skipped silently.
                pass

            details = soup.findAll('li', {'class' : 'detail_list'})

            cast = [d for d in details if 'Cast:' in d.getText()]
            if 1 == len(cast):
                _set_entity(entity, "cast", _people(cast[0].findAll('a')))

            director = [d for d in details if 'Director:' in d.getText()]
            if 1 == len(director):
                _set_entity(entity, "directors", _people(director[0].findAll('a')))

            genres = [d for d in details if 'Genres:' in d.getText()]
            if 1 == len(genres):
                match = genre_re.match(genres[0].getText())
                if match is not None:
                    genre = match.groups()[0].strip()
                    _set_entity(entity, "genres", [ genre ])
        except Exception:
            utils.printException()

        output.append(entity)

    if self._verbose:
        # Fix: log the feed URL; `url` is unbound when no entry was processed
        # and otherwise holds the last movie page URL.
        utils.log("[%s] done parsing feed '%s' (%s)" % (self, data.feed.title, feed_url))

    return output
def release_date(self):
    """Return the parsed ``PublicationDate`` value from ``self.attributes``.

    The ``'v'`` entry of ``xp(self.attributes, 'PublicationDate')`` is handed
    to ``parseDateString``. Any exception raised along the way results in
    ``None`` being returned.
    """
    try:
        node = xp(self.attributes, 'PublicationDate')
        return parseDateString(node['v'])
    except Exception:
        return None