Beispiel #1
0
 def release_date(self):
     """Return the parsed 'release_date' entry of ``self.data``.

     Returns None when ``self.data`` is falsy or when a KeyError is raised
     while looking up or parsing the value.
     """
     try:
         # Guard clause: nothing to parse without data.
         if not self.data:
             return None
         raw_value = self.data['release_date']
         return parseDateString(raw_value)
     except KeyError:
         return None
Beispiel #2
0
 def release_date(self):
     """Return the parsed 'releaseDate' entry of ``self.data``.

     Returns None when a KeyError is raised while looking up or parsing
     the value.
     """
     try:
         raw_value = self.data['releaseDate']
         parsed = parseDateString(raw_value)
     except KeyError:
         parsed = None
     return parsed
Beispiel #3
0
def upgradeEntityData(entityData):
    """Convert a legacy entity dict into a new-schema entity object.

    The input dict is consumed in place: '_id' is renamed to 'entity_id',
    and migrated keys are popped from it and from its nested 'sources' and
    'details' dicts as they are copied onto the new entity.

    Args:
        entityData: Legacy entity dict. Must contain 'subcategory', which is
            used to derive the kind and types of the new entity.

    Returns:
        The entity object created by ``getEntityObjectFromKind(kind)()``,
        populated from the legacy data.

    Raises:
        KeyError: Logged and re-raised when a KeyError occurs while deriving
            the kind/types (e.g. 'subcategory' is missing).
    """
    # Alias only: `old` is the caller's dict and is mutated below.
    old = entityData

    if '_id' in old:
        old['entity_id'] = str(old['_id'])
        del old['_id']

    try:
        kind    = deriveKindFromOldSubcategory(old['subcategory'])
        types   = deriveTypesFromOldSubcategories([old['subcategory']])
    except KeyError as e:
        logs.warning("Malformed entity data: missing '%s'" % e)
        raise

    # An 'other' entity that carries lat/lng coordinates is upgraded as a place.
    if kind == 'other' and 'coordinates' in old and 'lat' in old['coordinates'] and 'lng' in old['coordinates']:
        kind = 'place'
        types = [ 'establishment' ]

    new     = getEntityObjectFromKind(kind)()

    # Default timestamp for migrated fields: taken from the ObjectId when
    # possible, otherwise the current UTC time.
    try:
        seedTimestamp = ObjectId(old['entity_id']).generation_time.replace(tzinfo=None)
    except Exception as e:
        logs.warning("Unable to convert ObjectId to timestamp: %s" % e)
        seedTimestamp = datetime.utcnow()

    def setBasicGroup(source, target, oldName, newName=None, oldSuffix=None, newSuffix=None, additionalSuffixes=None, seed=True):
        """Move one value, plus its source/timestamp, from `source` to `target`.

        The value is popped from `source` under `oldName` (or
        `oldName_oldSuffix`) and set on `target` as `newName` (or
        `newName_newSuffix`). Nothing is written when the value is absent.
        `newName_source` defaults to 'seed' when `seed` is true, otherwise
        'format'; `newName_timestamp` defaults to `seedTimestamp`. Each name
        in `additionalSuffixes` is moved from `oldName_<s>` to `newName_<s>`
        when present.
        """
        if newName is None:
            newName = oldName
        if oldSuffix is None:
            item = source.pop(oldName, None)
        else:
            item = source.pop('%s_%s' % (oldName, oldSuffix), None)

        if item is not None:
            # Manual conversions...
            if oldName == 'track_length':
                # Keep only the part before the first '.'; on failure the
                # original value is stored unchanged.
                try:
                    item = int(str(item).split('.')[0])
                except Exception as e:
                    logs.warning("Unable to set length (%s) as integer: %s" % (item, e))

            if newSuffix is None:
                setattr(target, newName, item)
            else:
                setattr(target, '%s_%s' % (newName, newSuffix), item)

            sourceName = 'format'
            if seed:
                sourceName = 'seed'

            # 'tombstone' gets a timestamp but no *_source attribute.
            if newName != 'tombstone':
                setattr(target, '%s_source' % newName, source.pop('%s_source' % oldName, sourceName))
            setattr(target, '%s_timestamp' % newName, source.pop('%s_timestamp' % oldName, seedTimestamp))

            if additionalSuffixes is not None:
                for s in additionalSuffixes:
                    t = source.pop('%s_%s' % (oldName, s), None)
                    if t is not None:
                        setattr(target, '%s_%s' % (newName, s), t)

    def setListGroup(source, target, oldName, newName=None, delimiter=',', wrapper=None, seed=True):
        """Move a delimited string from `source` to `target` as a list.

        The value popped from `source[oldName]` is split on `delimiter` and
        each piece is stripped. When `wrapper` is given, each piece becomes
        the `title` of a new `wrapper()` instance instead of a plain string.
        Source/timestamp attributes are set as in setBasicGroup.
        """
        if newName is None:
            newName = oldName

        item = source.pop(oldName, None)

        if item is not None:
            items = []
            for i in item.split(delimiter):
                if wrapper is not None:
                    entityMini = wrapper()
                    entityMini.title = i.strip()
                    items.append(entityMini)
                else:
                    items.append(i.strip())
            setattr(target, newName, items)

            sourceName = 'format'
            if seed:
                sourceName = 'seed'

            setattr(target, '%s_source' % newName, source.pop('%s_source' % oldName, sourceName))
            setattr(target, '%s_timestamp' % newName, source.pop('%s_timestamp' % oldName, seedTimestamp))

    # Detach the nested legacy sub-dicts so their fields can be popped one by one.
    sources                 = old.pop('sources', {})
    details                 = old.pop('details', {})
    timestamp               = old.pop('timestamp', {'created' : seedTimestamp})
    place                   = details.pop('place', {})
    contact                 = details.pop('contact', {})
    restaurant              = details.pop('restaurant', {})
    media                   = details.pop('media', {})
    video                   = details.pop('video', {})
    artist                  = details.pop('artist', {})
    album                   = details.pop('album', {})
    song                    = details.pop('song', {})
    book                    = details.pop('book', {})
    netflix                 = sources.pop('netflix', {})
    thetvdb                 = sources.pop('thetvdb', {})

    # General
    new.schema_version      = 0
    new.entity_id           = old.pop('entity_id', None)
    new.title               = old.pop('title', None)
    new.timestamp           = BasicTimestamp().dataImport(timestamp)

    # Images: every candidate is popped, but only the first non-None one is used.
    netflixImages = netflix.pop('images', {})
    oldImages = [
        old.pop('image', None),
        media.pop('artwork_url', None),
        netflixImages.pop('hd', None),
        netflixImages.pop('large', None),
    ]
    for oldImage in oldImages:
        if oldImage is not None:
            image = ImageSchema()
            size  = ImageSizeSchema()
            size.url = oldImage
            image.sizes = [ size ]
            new.images = [ image ]
            break
    if new.images is not None and len(new.images) > 0:
        new.images_source = 'seed'
        new.images_timestamp = seedTimestamp

    setBasicGroup(old, new, 'desc')
    # The legacy 'song' subcategory is recorded as the 'track' type.
    subcategory = old['subcategory']
    if subcategory == 'song':
        subcategory = 'track'
    new.types += (subcategory,)

    # Sources
    setBasicGroup(sources, new.sources, 'spotify', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources, new.sources, 'rdio', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources, new.sources, 'fandango', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources, new.sources, 'stamped', 'tombstone', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources.pop('tmdb', {}), new.sources, 'tmdb', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(sources.pop('factual', {}), new.sources, 'factual', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    # TODO: Add factual_crosswalk
    setBasicGroup(sources.pop('singleplatform', {}), new.sources, 'singleplatform', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])

    # Apple / iTunes: prefer the flat 'itunes_id', fall back to the nested 'apple' dict.
    setBasicGroup(sources, new.sources, 'itunes', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    if new.sources.itunes_id is None:
        apple = sources.pop('apple', {})
        setBasicGroup(apple, new.sources, 'aid', 'itunes', newSuffix='id')
        setBasicGroup(apple, new.sources, 'view_url', 'itunes', newSuffix='url')

    # Amazon: prefer the flat 'amazon_id', fall back to the nested 'amazon' dict.
    setBasicGroup(sources, new.sources, 'amazon', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    if new.sources.amazon_id is None:
        amazon = sources.pop('amazon', {})
        setBasicGroup(amazon, new.sources, 'asin', 'amazon', newSuffix='id')
        setBasicGroup(amazon, new.sources, 'amazon_link', 'amazon', newSuffix='url')

    # Netflix
    if netflix:
        setBasicGroup(netflix, new.sources, 'nid', 'netflix', newSuffix='id')
        setBasicGroup(netflix, new.sources, 'nurl', 'netflix', newSuffix='url')

    # TheTVDB
    if 'thetvdb_id' in thetvdb:
        setBasicGroup(thetvdb, new.sources, 'thetvdb_id', 'thetvdb', newSuffix='id')

    # OpenTable: prefer the flat 'opentable_id', fall back to the nested 'openTable' dict.
    setBasicGroup(sources, new.sources, 'opentable', oldSuffix='id', newSuffix='id', additionalSuffixes=['nickname', 'url'])
    if new.sources.opentable_id is None:
        setBasicGroup(sources.pop('openTable', {}), new.sources, 'rid', 'opentable', newSuffix='id', additionalSuffixes=['url'])

    # Google Places
    googleplaces = sources.pop('googlePlaces', {})
    setBasicGroup(googleplaces, new.sources, 'googleplaces', oldSuffix='id', newSuffix='id', additionalSuffixes=['url'])
    setBasicGroup(googleplaces, new.sources, 'gid', 'googleplaces', newSuffix='id')
    setBasicGroup(googleplaces, new.sources, 'reference', 'googleplaces', newSuffix='reference')

    # User Generated
    userGenerated = sources.pop('userGenerated', {}).pop('generated_by', None)
    if userGenerated is not None:
        new.sources.user_generated_id = userGenerated
        if 'created' in timestamp:
            new.sources.user_generated_timestamp = timestamp['created']
        else:
            new.sources.user_generated_timestamp = seedTimestamp
        subtitle = old.pop('subtitle', None)
        if subtitle is not None:
            new.sources.user_generated_subtitle = subtitle

        # Bug fix: Some custom entities had country passed from client w/out intentional user input. Delete!
        if 'address' in place and place['address'] == ', US':
            del place['address']
            if 'coordinates' in old:
                del old['coordinates']

    # Contacts
    setBasicGroup(contact, new, 'phone')
    setBasicGroup(contact, new, 'site')
    setBasicGroup(contact, new, 'email')
    setBasicGroup(contact, new, 'fax')

    # Places
    if kind == 'place':
        coordinates = old.pop('coordinates', None)
        if coordinates is not None:
            new.coordinates = Coordinates().dataImport(coordinates)

        addressComponents = ['locality', 'postcode', 'region', 'street', 'street_ext']
        setBasicGroup(place, new, 'address', 'address', oldSuffix='country', newSuffix='country', additionalSuffixes=addressComponents, seed=False)

        setBasicGroup(place, new, 'address', 'formatted_address')
        if 'hours' in place:
            place['hours'] = TimesSchema().dataImport(place['hours'], overflow=True)
        setBasicGroup(place, new, 'hours', seed=False)
        setBasicGroup(restaurant, new, 'menu', seed=False)
        setBasicGroup(restaurant, new, 'price_range', seed=False)
        setBasicGroup(restaurant, new, 'alcohol_flag', seed=False)

        setListGroup(restaurant, new, 'cuisine', seed=False)

    # Artist
    if kind == 'person':
        songs = artist.pop('songs', [])
        itunesSource = False
        newSongs = []
        # The loop variable must not be called `song`: that name holds the
        # 'song' details dict used by the Track section below.
        for oldSong in songs:
            entityMini = MediaItemEntityMini()
            entityMini.title = oldSong['song_name']
            entityMini.kind = 'media_item'
            entityMini.types = [ 'track' ]
            if 'id' in oldSong and 'source' in oldSong and oldSong['source'] == 'itunes':
                itunesSource = True
                entityMini.sources.itunes_id = oldSong['id']
                entityMini.sources.itunes_source = 'itunes'
                entityMini.sources.itunes_timestamp = oldSong.pop('timestamp', seedTimestamp)
            newSongs.append(entityMini)
        if len(newSongs) > 0:
            new.tracks = newSongs
            sourceName = 'itunes' if itunesSource else 'format'
            new.tracks_source = artist.pop('songs_source', sourceName)
            new.tracks_timestamp = artist.pop('songs_timestamp', seedTimestamp)

        albums = artist.pop('albums', [])
        itunesSource = False
        newAlbums = []
        for item in albums:
            entityMini = MediaCollectionEntityMini()
            entityMini.title = item['album_name']
            if 'id' in item and 'source' in item and item['source'] == 'itunes':
                # Record the iTunes origin so the default albums_source below
                # is 'itunes', mirroring the songs loop above.
                itunesSource = True
                entityMini.sources.itunes_id = item['id']
                entityMini.sources.itunes_source = 'itunes'
                entityMini.sources.itunes_timestamp = item.pop('timestamp', seedTimestamp)
            newAlbums.append(entityMini)
        if len(newAlbums) > 0:
            new.albums = newAlbums
            sourceName = 'itunes' if itunesSource else 'format'
            new.albums_source = artist.pop('albums_source', sourceName)
            new.albums_timestamp = artist.pop('albums_timestamp', seedTimestamp)

        setListGroup(media, new, 'genre', 'genres', seed=False)

    # General Media
    if kind in ['media_collection', 'media_item']:

        setBasicGroup(media, new, 'track_length', 'length')
        setBasicGroup(media, new, 'mpaa_rating', seed=False)
        setBasicGroup(media, new, 'release_date')

        setListGroup(media, new, 'genre', 'genres', seed=False)
        setListGroup(media, new, 'artist_display_name', 'artists', wrapper=PersonEntityMini, seed=False)
        setListGroup(video, new, 'cast', 'cast', wrapper=PersonEntityMini, seed=False)
        setListGroup(video, new, 'director', 'directors', wrapper=PersonEntityMini, seed=False)
        setListGroup(video, new, 'network_name', 'networks', wrapper=PersonEntityMini, seed=False)

        # Fall back to the original release date only when no release date was migrated.
        originalReleaseDate = parseDateString(media.pop('original_release_date', None))
        if new.release_date is None and originalReleaseDate is not None:
            new.release_date = originalReleaseDate
            new.release_date_source = 'seed'
            new.release_date_timestamp = seedTimestamp

    # Book
    if 'book' in types:
        setBasicGroup(book, new, 'isbn')
        setBasicGroup(book, new, 'sku_number')
        setBasicGroup(book, new, 'num_pages', 'length', seed=False)

        setListGroup(book, new, 'author', 'authors', wrapper=PersonEntityMini, seed=False)
        # NOTE(review): the other list groups map a singular old name to a
        # plural new name; here it is 'publishers' -> 'publisher'. Possibly
        # swapped -- confirm against the old and new schemas.
        setListGroup(book, new, 'publishers', 'publisher', wrapper=PersonEntityMini, seed=False)

    # Album
    if 'album' in types:
        songs = album.pop('tracks', [])
        newSongs = []
        # As above, avoid rebinding `song`, which the Track section reads.
        for trackTitle in songs:
            entityMini = MediaItemEntityMini()
            entityMini.title = trackTitle
            newSongs.append(entityMini)
        if len(newSongs) > 0:
            new.tracks = newSongs
            new.tracks_source = album.pop('songs_source', 'format')
            new.tracks_timestamp = album.pop('songs_timestamp', seedTimestamp)

    # Track
    if 'track' in types:
        # media's 'album_name' is always popped; it is used only when the
        # song dict has no 'album_name' of its own.
        albumName = song.pop('album_name', media.pop('album_name', None))
        if albumName is not None:
            entityMini = MediaCollectionEntityMini()
            entityMini.title = albumName
            albumId = song.pop('song_album_id', None)
            if albumId is not None:
                entityMini.sources.itunes_id = albumId
                entityMini.sources.itunes_source = 'seed'
                entityMini.sources.itunes_timestamp = seedTimestamp
            new.albums = [ entityMini ]
            new.albums_source = song.pop('album_name_source', 'format')
            new.albums_timestamp = song.pop('album_name_timestamp', seedTimestamp)

    # Apps
    if 'app' in types:
        setBasicGroup(media, new, 'release_date', seed=False)
        setListGroup(media, new, 'artist_display_name', 'authors', wrapper=PersonEntityMini, seed=False)

        screenshots = media.pop('screenshots', [])
        newScreenshots = []
        for screenshot in screenshots:
            imageSchema = ImageSchema()
            imageSizeSchema = ImageSizeSchema()
            imageSizeSchema.url = screenshot
            imageSchema.sizes = [ imageSizeSchema ]
            newScreenshots.append(imageSchema)
        if len(newScreenshots) > 0:
            new.screenshots = newScreenshots
            new.screenshots_source = media.pop('screenshots_source', 'format')
            new.screenshots_timestamp = media.pop('screenshots_timestamp', seedTimestamp)

    return new
Beispiel #4
0
    def _parse_feed(self, feed_url):
        """Parse a Fandango feed into a list of MediaItemEntity objects.

        Each feed entry (except the 'More Movies' entry) becomes one entity
        populated from the feed itself. The movie's overview page is then
        fetched to add release date, runtime, MPAA rating, cast, directors
        and genre; any failure while scraping that page is logged and the
        entity is still returned with whatever was set up to that point.

        Args:
            feed_url: URL of the feed, passed to ``feedparser.parse``.

        Returns:
            List of MediaItemEntity objects, one per processed entry.

        Raises:
            AssertionError: If an entry id does not match the id pattern.
        """
        if self._verbose:
            utils.log("[%s] parsing feed %s" % (self, feed_url))

        data = feedparser.parse(feed_url)

        # Raw strings keep the regex backslashes literal (same patterns as before).
        id_r      = re.compile(r'.*\/([0-9]*)$')
        title_r   = re.compile(r'^([0-9][0-9]?). (.*) \$[0-9.M]*')
        genre_re  = re.compile(r'Genres:(.*)$')
        length_re = re.compile(r'([0-9]+) *hr. *([0-9]+) min.')

        # Tried in order; the first pattern that matches wins.
        info_res  = [
            re.compile(r'[A-Za-z]+ ([^|]+) \| Runtime:(.+)$'),
            re.compile(r'Opens [A-Za-z]+ ([^|]+) \| Runtime:(.+)$'),
            re.compile(r'Opens [A-Za-z]+ ([^|]+) *$'),
        ]

        output    = []
        source    = "fandango"
        ts        = datetime.utcnow()

        def _set_entity(entity, key, value):
            """Set `key` on `entity` plus its `_source`/`_timestamp` companions.

            The companion attributes are skipped when setting them raises
            AttributeError.
            """
            setattr(entity, key, value)
            try:
                setattr(entity, "%s_source" % key, source)
            except AttributeError:
                pass

            try:
                setattr(entity, "%s_timestamp" % key, ts)
            except AttributeError:
                pass

        for entry in data.entries:
            if entry.title == 'More Movies':
                continue

            fid_match = id_r.match(entry.id)
            assert fid_match is not None
            fid = fid_match.groups()[0]

            title = entry.title

            # Titles matching title_r carry a leading rank and a trailing
            # '$...' part; keep the rank aside and use the bare title.
            title_match = title_r.match(title)
            fandango_rank = None

            if title_match:
                title_match_groups = title_match.groups()
                fandango_rank = title_match_groups[0]
                title = title_match_groups[1]

            entity = MediaItemEntity()

            setattr(entity.sources, "fandango_id", fid)
            setattr(entity.sources, "fandango_source", source)
            setattr(entity.sources, "fandango_timestamp", ts)

            setattr(entity, "title", title)

            _set_entity(entity, "types", [ "movie", ])
            _set_entity(entity, "desc", entry.summary)

            for link in entry.links:
                if 'image' in link.type:
                    # fandango gives us low resolution 69x103 versions of the image, so hackily up the
                    # resolution before saving the entity :)
                    url  = link.href.replace('69/103', '375/375').replace('69x103', '375x375')
                    size = ImageSizeSchema()
                    size.url = url
                    image = ImageSchema()
                    image.sizes = [ size ]
                    images = [ image ]

                    _set_entity(entity, "images", images)
                    break

            f_url = "%s" % entry.link
            f_url = f_url.replace('%26m%3d', '%3fpid=5348839%26m%3d')
            setattr(entity.sources, "fandango_url", f_url)

            # attempt to scrape some extra details from fandango's movie page
            # Build the slug explicitly: filter() returns a lazy iterator on
            # Python 3 and would not format as the intended string.
            slug = ''.join(c for c in entity.title if c.isalnum())
            url = "http://www.fandango.com/%s_%s/movieoverview" % (slug, fid)

            try:
                if self._verbose:
                    utils.log(url)

                response, content = service_request('fandango', 'GET', url)
                soup = BeautifulSoup(content)
                info = soup.find('div', {'id' : 'info'}).findAll('li')[1].getText()

                try:
                    release_date, runtime = None, None

                    for info_re in info_res:
                        match = info_re.match(info)

                        if match is not None:
                            groups = match.groups()
                            release_date = groups[0]
                            if len(groups) == 2:
                                runtime = groups[1]
                            break

                    if release_date is not None:
                        release_date = parseDateString(release_date)
                        _set_entity(entity, "release_date", release_date)

                    if runtime is not None:
                        # Tolerate whitespace between 'Runtime:' and the value.
                        match = length_re.match(runtime.strip())
                        if match is not None:
                            hours, minutes = match.groups()
                            hours, minutes = int(hours), int(minutes)
                            seconds = 60 * (minutes + 60 * hours)

                            # Store the computed duration (previously the
                            # release date was stored here by mistake).
                            _set_entity(entity, "length", seconds)
                except Exception:
                    utils.printException()

                try:
                    mpaa_rating = soup.find('div', {'class' : re.compile('rating_icn')}).getText()
                    _set_entity(entity, 'mpaa_rating', mpaa_rating)
                except Exception:
                    # Best effort: the rating element may be absent from the page.
                    pass

                details = soup.findAll('li', {'class' : 'detail_list'})

                # List comprehensions (not filter/map) so len() and indexing
                # work regardless of Python version.
                cast = [d for d in details if 'Cast:' in d.getText()]
                if 1 == len(cast):
                    cast_names = [a.getText() for a in cast[0].findAll('a')]
                    cast = [PersonEntityMini().dataImport({ 'title' : p, }) for p in cast_names]

                    _set_entity(entity, "cast", cast)

                director = [d for d in details if 'Director:' in d.getText()]
                if 1 == len(director):
                    director_names = [a.getText() for a in director[0].findAll('a')]
                    directors = [PersonEntityMini().dataImport({ 'title' : p, }) for p in director_names]

                    _set_entity(entity, "directors", directors)

                genres = [d for d in details if 'Genres:' in d.getText()]
                if 1 == len(genres):
                    genres = genres[0].getText()
                    match  = genre_re.match(genres)

                    if match is not None:
                        genre = match.groups()[0].strip()

                        _set_entity(entity, "genres", [ genre ])
            except Exception:
                utils.printException()

            output.append(entity)

        if self._verbose:
            # Log the feed URL; the loop-local `url` is unbound when the feed
            # has no usable entries and is a movie-page URL otherwise.
            utils.log("[%s] done parsing feed '%s' (%s)" % (self, data.feed.title, feed_url))

        return output
Beispiel #5
0
 def release_date(self):
     """Return the parsed 'PublicationDate' value from ``self.attributes``.

     Reads the 'v' entry of ``xp(self.attributes, 'PublicationDate')`` and
     parses it; any exception along the way yields None.
     """
     try:
         node = xp(self.attributes, 'PublicationDate')
         parsed = parseDateString(node['v'])
     except Exception:
         parsed = None
     return parsed