def parse_playlist_metadata(item):
    '''
    Parses and processes raw output and returns playlist_name, playlist_id,
    playlist_publish_date, playlist_n_videos, channel_id, channel_name,
    collection_date.

    :params item: json document
    :type item: dict
    :returns: parsed dictionary
    :rtype: dict
    '''
    if not isinstance(item, dict):
        return dict()
    snippet = item['snippet']
    return {
        "playlist_name": snippet.get('title'),
        "playlist_id": item['id'],
        "playlist_publish_date": parse_yt_datetime(snippet.get('publishedAt')),
        "playlist_n_videos": item['contentDetails'].get('itemCount'),
        "channel_id": snippet.get('channelId'),
        "channel_name": snippet.get('channelTitle'),
        "collection_date": datetime.datetime.now()
    }
def parse_rec_video_metadata(item):
    '''
    Parses and processes raw output and returns video_id, channel_title,
    channel_id, video_publish_date, video_title, video_description,
    video_category, video_thumbnail, collection_date.

    :params item: json document
    :type item: dict
    :returns: parsed dictionary
    :rtype: dict
    '''
    if not isinstance(item, dict):
        return dict()
    snippet = item["snippet"]
    return {
        "video_id": item['id'].get('videoId'),
        "channel_title": snippet.get("channelTitle"),
        "channel_id": snippet.get("channelId"),
        "video_publish_date": parse_yt_datetime(snippet.get("publishedAt")),
        "video_title": snippet.get("title"),
        "video_description": snippet.get("description"),
        "video_category": snippet.get("categoryId"),
        "video_thumbnail": snippet["thumbnails"]["high"]["url"],
        "collection_date": datetime.datetime.now()
    }
def parse_subscription_descriptive(item):
    '''
    Parses and processes raw output and returns subscription_title,
    subscription_channel_id, subscription_kind, subscription_publish_date,
    collection_date.

    :params item: json document
    :type item: dict
    :returns: parsed dictionary
    :rtype: dict
    '''
    if not isinstance(item, dict):
        return dict()
    snippet = item['snippet']
    resource = snippet['resourceId']
    return {
        "subscription_title": snippet['title'],
        "subscription_channel_id": resource.get('channelId'),
        "subscription_kind": resource.get('kind'),
        "subscription_publish_date": parse_yt_datetime(snippet.get('publishedAt')),
        "collection_date": datetime.datetime.now()
    }
def parse_channel_metadata(item):
    '''
    Parses and processes raw output and returns channel_id, title,
    account_creation_date, keywords, description, view_count, video_count,
    subscription_count, playlist_id_likes, playlist_id_uploads, topic_ids,
    country, collection_date.

    :params item: json document
    :type item: dict
    :returns: parsed dictionary
    :rtype: dict
    '''
    if not isinstance(item, dict):
        return dict()
    topic = item.get('topicDetails')
    if topic:
        # `or []` guards against a topicDetails block that lacks
        # topicCategories: the original '|'.join(None) raised TypeError.
        topic = '|'.join(topic.get('topicCategories') or [])
    channel_meta = {
        "channel_id" : item['id'],
        "title" : item["snippet"].get("title"),
        "account_creation_date" : parse_yt_datetime(item["snippet"].get("publishedAt")),
        "keywords" : item['brandingSettings']['channel'].get('keywords'),
        "description" : item["snippet"].get("description"),
        "view_count" : item["statistics"].get("viewCount"),
        "video_count" : item["statistics"].get("videoCount"),
        "subscription_count" : item["statistics"].get("subscriberCount"),
        "playlist_id_likes" : item['contentDetails']['relatedPlaylists'].get('likes'),
        "playlist_id_uploads" : item['contentDetails']['relatedPlaylists'].get('uploads'),
        "topic_ids" : topic,
        "country" : item['snippet'].get('country'),
        "collection_date" : datetime.datetime.now()
    }
    return channel_meta
def parse_video_metadata(item):
    '''
    :params item: json document
    :returns: parsed dictionary
    '''
    snippet = item["snippet"]
    stats = item["statistics"]
    tags = snippet.get('tags')
    # Join tags with '|' when present; fall back to the empty string.
    video_tags = '|'.join(tags) if isinstance(tags, Iterable) else ''
    video_meta = OrderedDict(
        video_id=item['id'],
        channel_title=snippet.get("channelTitle"),
        channel_id=snippet.get("channelId"),
        video_publish_date=parse_yt_datetime(snippet.get("publishedAt")),
        video_title=snippet.get("title"),
        video_description=snippet.get("description"),
        video_category=snippet.get("categoryId"),
        video_view_count=stats.get("viewCount"),
        video_comment_count=stats.get("commentCount"),
        video_like_count=stats.get("likeCount"),
        video_dislike_count=stats.get("dislikeCount"),
        video_thumbnail=snippet["thumbnails"]["high"]["url"],
        video_tags=video_tags,
        collection_date=datetime.datetime.now())
    return video_meta
def parse_channel_metadata(item):
    '''
    :params item: json document
    :returns: parsed dictionary
    '''
    snippet = item["snippet"]
    stats = item["statistics"]
    playlists = item['contentDetails']['relatedPlaylists']
    topic = item.get('topicDetails')
    if topic:
        topic = item.get('topicIds')
    channel_meta = OrderedDict(
        id=item['id'],
        title=snippet.get("title"),
        publish_date=parse_yt_datetime(snippet.get("publishedAt")),
        keywords=item['brandingSettings']['channel'].get('keywords'),
        description=snippet.get("description"),
        view_count=stats.get("viewCount"),
        video_count=stats.get("videoCount"),
        subscription_count=stats.get("subscriberCount"),
        playlist_id_likes=playlists.get('likes'),
        playlist_id_uploads=playlists.get('uploads'),
        topic_ids=json.dumps(topic),
        collection_date=datetime.datetime.now())
    return channel_meta
def parse_comment_metadata(item):
    '''
    :params item: json document
    :returns: parsed dictionary
    '''
    # commentThread resources wrap the actual comment in
    # snippet.topLevelComment; keep the thread snippet around (it carries
    # totalReplyCount) before descending into the comment itself.
    thread_snippet = None
    if item['snippet'].get('topLevelComment'):
        thread_snippet = item['snippet']
        item = item['snippet']['topLevelComment']
    comment_meta = OrderedDict(
        commenter_channel_url=item["snippet"].get("authorChannelUrl"),
        commenter_channel_display_name=item['snippet'].get(
            'authorDisplayName'),
        comment_id=item.get("id"),
        comment_like_count=item["snippet"].get("likeCount"),
        comment_publish_date=parse_yt_datetime(
            item["snippet"].get("publishedAt")),
        text=item["snippet"].get("textDisplay"),
        video_id=item["snippet"].get("videoId"),
        commenter_rating=item["snippet"].get("viewerRating"),
        comment_parent_id=item["snippet"].get("parentId"),
        collection_date=datetime.datetime.now())
    # The original used a bare `except:` to catch the NameError raised when
    # `save` was never bound; test the condition explicitly instead.
    if thread_snippet is not None:
        comment_meta['reply_count'] = thread_snippet.get('totalReplyCount')
    else:
        comment_meta['reply_count'] = item.get('totalReplyCount')
    return comment_meta
def get_videos_from_playlist_id(self, playlist_id, next_page_token=None,
                                published_after=datetime.datetime(1990,1,1),
                                parser=P.parse_video_url,
                                # NOTE(review): mutable default argument; it is
                                # never mutated below (only joined), so benign.
                                part=['snippet'], **kwargs):
    '''
    Given a `playlist_id`, returns `video_ids` associated with that playlist.

    Note that user uploads for any given channel are from a playlist named
    "upload playlist id". You can get this value using
    :meth:`youtube_api.youtube_api.get_channel_metadata` or
    :meth:`youtube_api.youtube_api_utils.get_upload_playlist_id`. The
    playlist ID for uploads is always the channel_id with "UU" subbed for
    "UC".

    Read the docs: https://developers.google.com/youtube/v3/docs/playlistItems

    :param playlist_id: the playlist_id IE: "UUaLfMkkHhSA_LaCta0BzyhQ"
    :type playlist_id: str
    :param next_page_token: a token to continue from a previously stopped
        query IE: "CDIQAA"
    :type next_page_token: str
    :param published_after: the minimum publish date for videos from a
        playlist_id; iteration stops at the first video at or before it.
    :type published_after: datetime
    :param parser: the function to parse the json document
    :type parser: :mod:`youtube_api.parsers module`
    :param part: The part parameter specifies a comma-separated list of one
        or more resource properties that the API response will include.
        Different parameters cost different quota costs from the API.
    :type part: list
    :returns: video ids associated with ``playlist_id``.
    :rtype: list of dict
    '''
    # Fall back to the raw-JSON parser when a falsy parser is supplied.
    parser=parser if parser else P.raw_json
    # The API expects `part` as a comma-separated string, not a list.
    part = ','.join(part)
    videos = []
    run = True
    while run:
        http_endpoint = ("https://www.googleapis.com/youtube/v{}/playlistItems"
                         "?part={}&playlistId={}&maxResults=50&key={}".format(
                             self.api_version, part, playlist_id, self.key))
        # Forward any extra query parameters verbatim onto the URL.
        for k,v in kwargs.items():
            http_endpoint += '&{}={}'.format(k, v)
        if next_page_token:
            http_endpoint += "&pageToken={}".format(next_page_token)
        response_json = self._http_request(http_endpoint,
                                           timeout_in_n_seconds=20)
        if response_json.get('items'):
            for item in response_json.get('items'):
                publish_date = parse_yt_datetime(item['snippet'].get('publishedAt'))
                # Stop paging once a video at or before the cutoff appears.
                # NOTE(review): this assumes the playlist is ordered
                # newest-first and that parse_yt_datetime returns a
                # comparable datetime (never None) — confirm.
                if publish_date <= published_after:
                    run=False
                    break
                videos.append(parser(item))
            # Continue with the next page if the API offers one.
            if response_json.get('nextPageToken'):
                next_page_token = response_json.get('nextPageToken')
            else:
                run=False
                break
        else:
            run=False
            break
    return videos
def handle(self, *args, **options):
    """Sync recent YouTube uploads for every 'tvshow' content item.

    For each ImagoInfoContent of type 'tvshow' that has a source playlist
    id, fetch its playlist items, and for every video published after
    yesterday create an ImagoInfoVideo row, filling in the duration via a
    second API call.
    """
    api_key = settings.YOUTUBE_API_KEY
    youtube = googleapiclient.discovery.build('youtube', 'v3',
                                              developerKey=api_key)
    # Lookup all contents
    for content in ImagoInfoContent.objects.filter(
            type='tvshow', source_id__isnull=False):
        # Restrict the response to the fields actually used below.
        request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            fields="items(id,snippet(title,publishedAt),contentDetails)",
            playlistId=content.source_id)
        response = request.execute()
        # Only consider content from yesterday on
        published_after = datetime.now() - timedelta(days=1)
        # Create new video for each result
        for item in response.get('items'):
            published_at = parse_yt_datetime(
                item['snippet'].get('publishedAt'))
            # NOTE(review): breaking (rather than continuing) on the first
            # old item assumes the playlist is returned newest-first —
            # confirm against the API ordering.
            if published_at <= published_after:
                break
            youtube_video_id = item['contentDetails']['videoId']
            video = ImagoInfoVideo(publication_date=published_at,
                                   title=item['snippet']['title'],
                                   content_id=content.content_id,
                                   thumbnail='youtube',
                                   hosting='youtube',
                                   youtube_id=youtube_video_id,
                                   start_time=0,
                                   end_time=0,
                                   type='tvshow')
            # Grab video duration through another API call
            # NOTE: rebinding `response` here is safe — the outer loop's
            # iterator was created from the playlist response already.
            request = youtube.videos().list(
                part="contentDetails",
                id=youtube_video_id,
                fields="items(contentDetails(duration))")
            response = request.execute()
            duration = response.get(
                'items')[0]['contentDetails']['duration']
            # ISO-8601 duration string (e.g. "PT1H2M3S") -> seconds.
            duration_in_seconds = isodate.parse_duration(
                duration).total_seconds()
            video.duration = duration_in_seconds
            video.save()
def parse_video_url(item):
    '''
    :params item: json document
    :returns: parsed dictionary
    '''
    snippet = item['snippet']
    return OrderedDict(
        publish_date=parse_yt_datetime(snippet.get('publishedAt')),
        video_id=snippet['resourceId'].get('videoId'),
        channel_id=snippet.get('channelId'),
        collection_date=datetime.datetime.now())
def parse_subscription_descriptive(item):
    '''
    :params item: json document
    :returns: parsed dictionary
    '''
    snippet = item['snippet']
    resource = snippet['resourceId']
    sub_meta = OrderedDict(
        subscription_title=snippet['title'],
        subscription_channel_id=resource.get('channelId'),
        subscription_kind=resource.get('kind'),
        subscription_publish_date=parse_yt_datetime(snippet.get('publishedAt')),
        collection_date=datetime.datetime.now())
    return sub_meta
def parse_comment_metadata(item):
    '''
    Parses and processes raw output and returns video_id, commenter_channel_url,
    commenter_channel_display_name, comment_id, comment_like_count,
    comment_publish_date, text, commenter_rating, comment_parent_id,
    collection_date.

    :params item: json document
    :type item: dict
    :returns: parsed dictionary
    :rtype: dict
    '''
    if not isinstance(item, dict):
        return dict()
    # commentThread resources wrap the actual comment in
    # snippet.topLevelComment; keep the thread snippet around (it carries
    # totalReplyCount) before descending into the comment itself.
    thread_snippet = None
    if item['snippet'].get('topLevelComment'):
        thread_snippet = item['snippet']
        item = item['snippet']['topLevelComment']
    comment_meta = {
        "video_id": item["snippet"].get("videoId"),
        "commenter_channel_url": item["snippet"].get("authorChannelUrl"),
        # `or {}` guards against a missing authorChannelId: the original
        # .get('authorChannelId').get('value', None) raised AttributeError
        # whenever the key was absent (first .get returned None).
        "commenter_channel_id": (item['snippet'].get('authorChannelId') or {}).get('value'),
        "commenter_channel_display_name": item['snippet'].get('authorDisplayName'),
        "comment_id": item.get("id"),
        "comment_like_count": item["snippet"].get("likeCount"),
        "comment_publish_date": parse_yt_datetime(item["snippet"].get("publishedAt")),
        "text": item["snippet"].get("textDisplay"),
        "commenter_rating": item["snippet"].get("viewerRating"),
        "comment_parent_id": item["snippet"].get("parentId"),
        "collection_date": datetime.datetime.now()
    }
    # The original used a bare `except:` to catch the NameError raised when
    # `save` was never bound; test the condition explicitly instead.
    if thread_snippet is not None:
        comment_meta['reply_count'] = thread_snippet.get('totalReplyCount')
    else:
        comment_meta['reply_count'] = item.get('totalReplyCount')
    return comment_meta
def parse_playlist_metadata(item):
    '''
    :params item: json document
    :returns: parsed dictionary
    '''
    snippet = item['snippet']
    playlist_meta = OrderedDict(
        playlist_name=snippet.get('title'),
        playlist_id=item['id'],
        playlist_publish_date=parse_yt_datetime(snippet.get('publishedAt')),
        playlist_n_videos=item['contentDetails'].get('itemCount'),
        channel_id=snippet.get('channelId'),
        channel_name=snippet.get('channelTitle'),
        collection_date=datetime.datetime.now())
    return playlist_meta
def parse_rec_video_metadata(item):
    '''
    :params item: json document
    :returns: parsed dictionary
    '''
    snippet = item["snippet"]
    video_meta = OrderedDict(
        video_id=item['id'].get('videoId'),
        channel_title=snippet.get("channelTitle"),
        channel_id=snippet.get("channelId"),
        video_publish_date=parse_yt_datetime(snippet.get("publishedAt")),
        video_title=snippet.get("title"),
        video_description=snippet.get("description"),
        video_category=snippet.get("categoryId"),
        video_thumbnail=snippet["thumbnails"]["high"]["url"],
        collection_date=datetime.datetime.now())
    return video_meta
def parse_video_metadata(item):
    '''
    Parses and processes raw output and returns video_id, channel_title,
    channel_id, video_publish_date, video_title, video_description,
    video_category, video_view_count, video_comment_count, video_like_count,
    video_dislike_count, duration, video_thumbnail, video_tags,
    collection_date.

    :params item: json document
    :type item: dict
    :returns: parsed dictionary
    :rtype: dict
    '''
    if not isinstance(item, dict):
        return dict()
    tags = item["snippet"].get('tags')
    if isinstance(tags, Iterable):
        video_tags = '|'.join(tags)
    else:
        video_tags = ''
    video_meta = {
        "video_id": item['id'],
        "channel_title": item["snippet"].get("channelTitle"),
        "channel_id": item["snippet"].get("channelId"),
        "video_publish_date": parse_yt_datetime(item["snippet"].get("publishedAt")),
        "video_title": item["snippet"].get("title"),
        "video_description": item["snippet"].get("description"),
        "video_category": item["snippet"].get("categoryId"),
        "video_view_count": item["statistics"].get("viewCount"),
        "video_comment_count": item["statistics"].get("commentCount"),
        "video_like_count": item["statistics"].get("likeCount"),
        "video_dislike_count": item["statistics"].get("dislikeCount"),
        # .get matches the defensive style of every other field here; the
        # original hard-indexed ["duration"] and raised KeyError when the
        # response omitted it.
        "duration": item["contentDetails"].get("duration"),
        "video_thumbnail": item["snippet"]["thumbnails"]["high"]["url"],
        "video_tags": video_tags,
        "collection_date": datetime.datetime.now()
    }
    return video_meta
def parse_video_url(item):
    '''
    Parses and processes raw output and returns publish_date, video_id,
    channel_id, collection_date

    :params item: json document
    :type item: dict
    :returns: parsed dictionary
    :rtype: dict
    '''
    if not isinstance(item, dict):
        return dict()
    snippet = item['snippet']
    return {
        "video_id": snippet['resourceId'].get('videoId'),
        "channel_id": snippet.get('channelId'),
        "publish_date": parse_yt_datetime(snippet.get('publishedAt')),
        "collection_date": datetime.datetime.now()
    }
def test_parse_yt_datetime(self):
    """parse_yt_datetime should turn the fixture string into its datetime.

    Verified by Megan Brown on 11/30/2018.
    """
    parsed = utils.parse_yt_datetime(self.date)
    self.assertEqual(parsed, self.datetime_date)
def test_parse_yt_datetime(self):
    """Check the fixture date string parses to the expected datetime."""
    self.assertEqual(utils.parse_yt_datetime(self.date), self.datetime_date)