def get_amount_of_videos_uploaded():
    """
    Look up the channel's upload counter through the YouTube Data API.

    The channel is the one identified by the ``channel_id`` name defined
    outside this function; the API key comes from ``get_youtube_api_key()``.

    :return: The ``videoCount`` statistic of the first channel in the
        response, exactly as the API client reports it.
    """
    client = Api(api_key=get_youtube_api_key())
    response = client.get_channel_info(channel_id=channel_id)
    first_channel = response.items[0].to_dict()
    statistics = first_channel["statistics"]
    return statistics["videoCount"]
def get_channel_info(username):
    """Collect public profile data for the channel named *username*.

    :param username: Channel name passed to the API as ``channel_name``.
    :return: A dict with the keys ``name``, ``created_at`` (POSIX
        timestamp), ``pfp_url``, ``total_views``, ``subsribers``,
        ``video_count``, ``userid``, ``emails`` and ``url``; or the string
        ``'User doesnt exist'`` when the response has no usable first item.
    """
    api = Api(api_key='redacted')
    channel_by_name = api.get_channel_info(channel_name=username)
    try:
        response = channel_by_name.items[0].to_dict()
    except (IndexError, TypeError, AttributeError):
        # Empty or missing ``items`` means no channel matched. Only lookup
        # failures are caught so KeyboardInterrupt/SystemExit still propagate.
        return 'User doesnt exist'

    snippet = response["snippet"]
    statistics = response["statistics"]
    userid = response["id"]
    description = response["brandingSettings"]["channel"]["description"]

    # str() keeps the regex working when the description is not a string
    # (for example None).
    match = re.findall(r'[\w\.-]+@[\w\.-]+', str(description))
    emails = ','.join(match) if match else "None found"

    return {
        "name": snippet["localized"]["title"],
        "created_at": parser.parse(snippet["publishedAt"]).timestamp(),
        "pfp_url": snippet["thumbnails"]["default"]["url"],
        "total_views": statistics["viewCount"],
        # The misspelled key is part of the returned schema; kept so that
        # existing callers keep working.
        "subsribers": statistics["subscriberCount"],
        "video_count": statistics["videoCount"],
        "userid": userid,
        "emails": emails,
        "url": f'https://www.youtube.com/channel/{userid}',
    }
def get_number_subscribers_youtube_channel(self, youtubeChannelName):
    """Search for a channel by name and report its subscriber count.

    :param youtubeChannelName: Free-text query used for a channel search;
        the first hit is the channel that gets looked up.
    :return: On the normal path, ``jsonify(res)`` where ``res['result']``
        holds either ``subscriberCount`` and ``channel`` (the channel
        snippet) or a "Channel not found." error entry. If any exception is
        raised, the plain ``res`` dict with ``error`` and ``message`` keys.
    """
    res = {}
    # NOTE(review): the API key is hard-coded in source; consider moving it
    # to configuration.
    apiKey = 'AIzaSyBOCLFDDz4wHFmatH-fPxsjjRnBfPzcOFQ'
    try:
        api = YoutubeApi(api_key=apiKey)
        r = api.search_by_keywords(q=youtubeChannelName,
                                   search_type=["channel"],
                                   count=2,
                                   limit=2)
        channel = None
        # An empty search result used to raise IndexError below and was then
        # reported as an API quota problem; treat it as "not found" instead.
        if r.items:
            idChannel = r.items[0].snippet.channelId
            channel = api.get_channel_info(channel_id=idChannel)

        if channel is not None and channel.items:
            res['result'] = {
                'subscriberCount': channel.items[0].statistics.subscriberCount,
                'channel': channel.items[0].snippet
            }
        else:
            res['result'] = {
                'error': 'Youtube url not updated',
                'msg': 'Channel not found.'
            }
        return jsonify(res)
    except Exception as e:
        print("error : get_number_subscribers_youtube_channel\n",
              str(e),
              flush=True)
        res['error'] = "internal error"
        res['message'] = 'Youtube Data API exceded'
        # NOTE(review): this path returns the bare dict while the success
        # path returns jsonify(res); kept as-is for backward compatibility.
        return res
def fetchYoutubeData():
    """Fetch channel and upload data from YouTube and store it via MongoDB.

    Reads the API key from ``YOUTUBE_DATA_API_KEY`` and the channel from
    ``YOUTUBE_CHANNEL_ID``, requests up to 30 items of the channel's uploads
    playlist, then calls ``saveDataToMongoDB`` twice: once with the channel
    document ("youtubeChannelData") and once with the video document
    ("youtubeVideoData"), which reuses the value returned by the first call
    as its ``_id``.

    Failures are reported through ``errorMessage`` and are not re-raised.

    :return: The value returned by the first ``saveDataToMongoDB`` call, or
        ``None`` if an error occurred before that call completed.
    """
    processedChannelDataID = None
    try:
        successMessage('- Gathering youtube channel & video data...')
        api = Api(api_key=os.getenv('YOUTUBE_DATA_API_KEY'))
        channelById = api.get_channel_info(
            channel_id=os.getenv('YOUTUBE_CHANNEL_ID'))
        successMessage('- Fetched youtube channel & video data...')

        channel = channelById.items[0]
        uploadsPlaylistId = channel.contentDetails.relatedPlaylists.uploads
        allChannelVideos = api.get_playlist_items(
            playlist_id=uploadsPlaylistId, count=30, limit=30)

        successMessage('- Constructing youtube channel & video data...')
        processedData = [
            {
                "videoUrl": video.contentDetails.videoId,
                # Use the item's own title. The previous code stored
                # snippet.channelTitle, i.e. the channel name, for every
                # video.
                "videoTitle": video.snippet.title,
                "videoDescription": video.snippet.description,
            }
            for video in allChannelVideos.items
        ]

        successMessage('- Storing youtube video & channel data...')
        processedChannelDataID = saveDataToMongoDB(
            {
                "thumbnail": channel.snippet.thumbnails.high.url,
                "channelName": channel.snippet.title,
                "channelDescription": channel.snippet.description,
                "keywords": channel.brandingSettings.channel.keywords.split(),
                "resetAt": round(time.time())
            }, "youtubeChannelData")
        saveDataToMongoDB(
            {
                "_id": processedChannelDataID,
                "channelName": channel.snippet.title,
                "videos": processedData,
                "resetAt": round(time.time()),
                "hasBeenProcessed": False
            }, "youtubeVideoData")
        successMessage('- Completed storing youtube video & channel data...')
    except Exception:
        # Best-effort step: report and continue, but let KeyboardInterrupt
        # and SystemExit propagate (a bare ``except:`` swallowed them).
        errorMessage('- An exception occurred')
    else:
        successMessage('- Completed youtube data step... ')
    return processedChannelDataID
def main(channel_name="YaleCourses", load_from_file=False):
    """Download the manually written English transcripts of a channel.

    The video IDs are either collected from every playlist of the channel
    and written to ``<raw_dir>/video_ids/<channel_name>.txt``, or, when
    ``load_from_file`` is not ``False``, read back from that file. For each
    ID the available transcripts are listed, and a non-generated transcript
    with language code ``en`` is saved as
    ``<raw_dir>/transcripts/<channel_name>/<video_id>.json``.

    The function prints a notice and returns immediately if the channel's
    transcript folder already exists. Errors for a single video are printed
    and do not stop the run.

    :param channel_name: Channel whose playlists are scanned.
    :param load_from_file: ``False`` to query the API for the video IDs;
        anything else to reuse the previously saved ID file.
    """
    # NOTE(review): the API key is hard-coded in source.
    api = Api(api_key="AIzaSyCw0j0aCe0y_T42q3RLBoVtGXlTOMGGaSM")

    print("Setup dir to save the transcripts of %s channel" % (channel_name))
    transcripts_dir = os.path.join(raw_dir, "transcripts", channel_name)
    ids_path = os.path.join(raw_dir, "video_ids", channel_name + ".txt")

    # Refuse to touch an existing folder so earlier downloads are never
    # overwritten.
    if os.path.exists(transcripts_dir):
        print("\tThe folder of the channel %s is already exist\n"
              "\tdelete it before executing this script -"
              "we don't want to override your data" % (channel_name))
        return
    os.mkdir(transcripts_dir)

    # Google starts blocking ID retrieval after a while, so the IDs are
    # written to a file that serves as a safety buffer.
    if load_from_file is not False:
        with open(ids_path, 'r') as ids_file:
            videos_ids = json.load(ids_file)
    else:
        print("Retriving %s channel information" % (channel_name))
        channel_response = api.get_channel_info(channel_name=channel_name)

        print("\tFetch all the playlists")
        channel_playlists = api.get_playlists(
            channel_id=channel_response.items[0].id, count=None)

        print("\tFetch all the videos of the playlist")
        playlist_responses = []
        for playlist in channel_playlists.items:
            print("\t\tFetching videos IDs of playlist %s" % (playlist.id))
            playlist_responses.append(
                api.get_playlist_items(playlist_id=playlist.id, count=None))

        videos_ids = [
            entry.snippet.resourceId.videoId
            for playlist_response in playlist_responses
            for entry in playlist_response.items
        ]

        print("We gathered now %s videos, saving save to file" %
              (len(videos_ids)))
        with open(ids_path, 'w') as ids_file:
            json.dump(videos_ids, ids_file)

    print("Save %s channel videos transcripts" % (channel_name))
    for video_id in videos_ids:
        print("The video ID is %s" % (video_id))
        try:
            available = YouTubeTranscriptApi.list_transcripts(video_id)
            chosen = None
            for transcript in available:
                # The Transcript object exposes metadata properties.
                print("Video id : ", transcript.video_id)
                print("\tlanguage : %s , language code : %s" %
                      (transcript.language, transcript.language_code))
                print("\tis_generated: %s, is_translatable: %s" %
                      (transcript.is_generated, transcript.is_translatable))
                if (transcript.language_code != 'en'
                        or transcript.is_generated is not False):
                    continue
                # If several transcripts qualify, the last one wins.
                chosen = transcript.fetch()

            if chosen is not None:
                target = os.path.join(transcripts_dir, video_id + ".json")
                with open(target, 'w') as outfile:
                    json.dump(chosen, outfile)
        except Exception as e:
            print(e)
    print("Finish main")
class YTParser:
    """Fetch information from YouTube and transcribe video audio.

    Wraps a YouTube Data API client for listing a channel's uploads, and a
    youtube_dl + ffmpeg + Kaldi pipeline that turns a video URL into text.
    """

    def __init__(self, api, audio_path, kaldi_path):
        """Store the working paths and build the API client.

        :param api: YouTube Data API key.
        :param audio_path: Directory used for downloads and metadata files.
        :param kaldi_path: Path handed to ``Model`` when transcribing.
        """
        self.api = Api(api_key=api)
        # False until a download finishes; then the expected .mp3 file name.
        self.filename = False
        self.path = audio_path
        self.kaldi_path = kaldi_path

    def url2id(self, url):
        """Return everything after ``watch?v=`` in *url*."""
        return url.split('watch?v=')[1]

    def id2url(self, id):
        """Return the watch URL for the video *id*."""
        return 'https://www.youtube.com/watch?v=' + id

    def get_latest_videos_by_channel_link(self, url):
        """Get links to the latest videos from a named channel link.

        :param url: Channel URL containing ``/user/<name>``.
        :return: A list of ``{'id': ..., 'url': ...}`` dicts for the video
            entries among the items requested from the channel's uploads
            playlist (``count=100``).
        """
        channel_name = url.split('/user/')[1]
        channel_by_id = self.api.get_channel_info(channel_name=channel_name)
        channel_info = channel_by_id.items[0].to_dict()
        uploads_plst = channel_info['contentDetails']['relatedPlaylists'][
            'uploads']
        items = self.api.get_playlist_items(playlist_id=uploads_plst,
                                            count=100)
        return [
            {
                'id': item.snippet.resourceId.videoId,
                'url': self.id2url(item.snippet.resourceId.videoId)
            }
            for item in items.items
            if item.snippet.resourceId.kind == 'youtube#video'
        ]

    def _catch_filename(self, d):
        """Progress hook: remember the .mp3 name once a download finishes."""
        if d['status'] == 'finished':
            self.filename = os.path.splitext(d['filename'])[0] + '.mp3'

    def _downloaded_data(self):
        """Return metadata of the downloaded video.

        Reads the ``.info.json`` file that sits next to the downloaded audio
        and also records its name in ``self.description_file``.

        :return: A dict of selected metadata fields, or ``False`` when no
            download has been recorded yet.
        """
        if not self.filename:
            return False
        self.description_file = os.path.splitext(
            self.filename)[0] + '.info.json'
        with open(os.path.join(self.path, self.description_file)) as fp:
            description = json.load(fp)
        return {
            'id': description['id'],
            'uploader_url': description['uploader_url'],
            'channel_id': description['channel_id'],
            'channel_url': description['channel_url'],
            'upload_date': datetime.datetime.strptime(
                description['upload_date'], "%Y%m%d"),
            'title': description['title'],
            'description': description['description'],
            'webpage_url': description['webpage_url'],
            'view_count': description['view_count'],
            'like_count': description['like_count'],
            'dislike_count': description['dislike_count'],
            'average_rating': description['average_rating'],
        }

    def video2data(self, url):
        """Get the recognized text of a video by its url.

        Downloads the audio into ``self.path``, decodes it with ffmpeg to
        16 kHz mono 16-bit PCM, feeds it to a Kaldi recognizer and finally
        removes the downloaded audio and metadata files.

        :param url: URL of the video to download.
        :return: ``(full_text, video_description)`` where
            ``video_description`` is the result of ``_downloaded_data()``.
        """
        current_dir = os.getcwd()
        # NOTE(review): self.path is joined again below after the chdir, so
        # it is presumably an absolute path; confirm with callers.
        os.chdir(self.path)
        try:
            ydl_opts = {
                'format': 'bestaudio/best',
                'writeinfojson': 'info',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'progress_hooks': [self._catch_filename],
            }
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])
            # NOTE(review): presumably leaves time for post-processing to
            # finish writing files; confirm this wait is still needed.
            time.sleep(20)
            video_description = self._downloaded_data()

            model = Model(self.kaldi_path)
            rec = KaldiRecognizer(model, 16000)
            full_text = ''
            # The context manager closes the pipe and waits for ffmpeg, so
            # no child process or file descriptor is left behind.
            with subprocess.Popen([
                    'ffmpeg', '-loglevel', 'quiet', '-i',
                    os.path.join(self.path, self.filename), '-ar',
                    str(16_000), '-ac', '1', '-f', 's16le', '-'
            ], stdout=subprocess.PIPE) as process:
                while True:
                    data = process.stdout.read(4000)
                    if not data:
                        break
                    if rec.AcceptWaveform(data):
                        res = json.loads(rec.Result())
                        full_text += ' ' + res['text']
            full_text += ' ' + json.loads(rec.FinalResult())['text']

            os.remove(os.path.join(self.path, self.description_file))
            os.remove(os.path.join(self.path, self.filename))
        finally:
            # Always restore the caller's working directory, even when the
            # download or recognition fails.
            os.chdir(current_dir)
        return full_text, video_description