def trans_to_file(hlink, fi):
    length_of_line = 0
    ids = id_from_url(hlink)
    q = YouTubeTranscriptApi.get_transcript(ids)
    results = youtube.videos().list(id=ids, part='snippet').execute()
    title = ""
    for result in results.get('items', []):
        title = "Title: " + result['snippet']['title']
    # use the extracted id, not the raw link, in the canonical watch URL
    line_string = "Transcript for YouTube video at https://www.youtube.com/watch?v={:s}\n\n{:s}\n\n".format(ids, title)
    for q0 in q:
        a = q0['text']
        st = q0['start']
        en = q0['start'] + q0['duration']
        stm = int(st) // 60
        sts = st % 60
        enm = int(en) // 60
        ens = en % 60
        if flag_start and flag_end:
            a = "({:02d}:{:05.2f}-{:02d}:{:05.2f}) {:s}".format(stm, sts, enm, ens, a)
        elif flag_start:
            a = "({:05.2f}) {:s}".format(st, a)
        elif flag_end:
            a = "(-{:05.2f}) {:s}".format(en, a)
        if not a.strip():
            line_string += "\n"
            length_of_line = 0
        if len(a) + length_of_line >= max_line and length_of_line > 0:
            line_string += "\n" + a
            length_of_line = len(a)
        else:
            if length_of_line:
                line_string += " "
            line_string += a
            length_of_line += len(a) + 1
    print("Writing to", fi)
    with open(fi, "w") as f:
        f.write(line_string)
n = n + 1
print(id_list)

#############
# index for iterating through the list of videos
n = 0
# match list stores the video id and start time of each transcript block that contains the word
match = []
# use the transcript api to get the transcript for each video
for _ in id_list:
    video_id = id_list[n]
    try:
        video_transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['pt'])
        # search the video transcript for the word and store the video id and
        # timestamp in a dictionary in the match list
        # (update later to ignore upper/lower case)
        for block in video_transcript:
            if word in block['text']:
                match.append({'match_id': video_id, 'timestamp': block['start']})
    except:
        # no transcript available in the selected language; the try/except keeps
        # the loop going when a video (e.g. the 3rd one) has no transcript
        pass
    n = n + 1
sys.exit("{:d} youtube video{:s} to files.".format(valid, '' if valid == 1 else 's'))

if not video_id:
    sys.exit("Specify video id or use -c for clipboard.")
if not (print_output or write_output):
    sys.exit("Need to specify print or write output on. To launch, just use -jl.")
video_id = re.sub(".*=", "", video_id)
ids = video_id
results = youtube.videos().list(id=ids, part='snippet').execute()
title = ""
for result in results.get('items', []):
    title = "Title: " + result['snippet']['title']
q = YouTubeTranscriptApi.get_transcript(video_id)
line_string = "Transcript for YouTube video at https://www.youtube.com/watch?v={:s}\n\n{:s}\n\n".format(video_id, title)
for q0 in q:
    a = q0['text']
    st = q0['start']
    en = q0['start'] + q0['duration']
    stm = int(st) // 60
    sts = st % 60
    enm = int(en) // 60
    ens = en % 60
    if flag_start and flag_end:
        a = "({:02d}:{:05.2f}-{:02d}:{:05.2f}) {:s}".format(stm, sts, enm, ens, a)
    elif flag_start:
        a = "({:05.2f}) {:s}".format(st, a)
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 23 11:30:43 2019

@author: NDH00360
"""
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

dfVlogsFinal = pd.DataFrame()
for videoId in videoId_list:  # from CaptionSentiments.py
    try:
        text = YouTubeTranscriptApi.get_transcript(videoId)
        print(videoId)
        dfVlogs = pd.DataFrame(text)
        dfVlogs1 = pd.DataFrame(dfVlogs['text'])
        dfVlogsFinal = dfVlogsFinal.append(dfVlogs1)
    except Exception as e:
        print(e)
# write the accumulated transcripts
dfVlogsFinal.to_csv('C:\\Users\\NDH00360\\Desktop\\VloggersAltimaComments.csv')

dfVlogs = pd.read_csv(
    r"C:\Users\NDH00360\Desktop\YoutubeSentimets Data\videoCommentDataAltima.csv")
my_lst_str = ' '.join(map(str, dfVlogs['text']))
df1 = dfVlogs

from nltk.tokenize import sent_tokenize

tokenized_text = sent_tokenize(my_lst_str)
print(tokenized_text)
dfVlogs = pd.DataFrame(tokenized_text)
from urllib.parse import urlparse, parse_qs
import streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi


def extract_video_id(url):
    # Examples:
    # - http://youtu.be/SA2iWivDJiE
    # - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    # - http://www.youtube.com/embed/SA2iWivDJiE
    # - http://www.youtube.com/v/SA2iWivDJiE?version=3&hl=en_US
    query = urlparse(url)
    if query.hostname == 'youtu.be':
        return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com'):
        if query.path == '/watch':
            return parse_qs(query.query)['v'][0]
        if query.path[:7] == '/embed/':
            return query.path.split('/')[2]
        if query.path[:3] == '/v/':
            return query.path.split('/')[2]
    # fail?
    return None


video_id = extract_video_id(video_url)
transcript_list = YouTubeTranscriptApi.get_transcript(video_id)

ratio = st.sidebar.slider("Sentence Keep Ratio", 0.1, 0.9, 0.3)
min_length = st.sidebar.slider("Minimum Length", 50, 99, 50)
max_length = st.sidebar.slider("Maximum Length", 100, 500, 200)

#with open('your_file.txt', 'w') as f:
#    for item in transcript_list:
#        f.write("%s\n" % item)


@st.cache
def gather_text(transcript_list):
    # join the caption lines into one string
    full_text = []
    length = len(transcript_list)
    for i in range(length):
        full_text.append(transcript_list[i]['text'])
    return ' '.join(full_text)
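# A small sketch (not from the original source) exercising extract_video_id
# against the URL forms listed in its comment block; each should print the
# embedded video id.
for test_url in ('http://youtu.be/SA2iWivDJiE',
                 'http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu',
                 'http://www.youtube.com/embed/SA2iWivDJiE',
                 'http://www.youtube.com/v/SA2iWivDJiE?version=3&hl=en_US'):
    print(test_url, '->', extract_video_id(test_url))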
from youtube_transcript_api import YouTubeTranscriptApi

video_id = 'Z6IBu6h7bQc'
result = YouTubeTranscriptApi.get_transcript(video_id)
print(result)
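# For reference, get_transcript returns a list of dicts shaped like
# {'text': ..., 'start': ..., 'duration': ...}; a minimal sketch of pulling
# out just the caption text from the result above:
caption_text = ' '.join(entry['text'] for entry in result)
print(caption_text)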
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
from collections import OrderedDict

verb_db = {}
NUM_VIDEOS_PLAYLIST = 96

# Read from videos.txt
videos = open("videos.txt", "r")
lines = videos.readlines()
for line in lines:
    print(line)
    # Basics with Babish video on burgers: https://www.youtube.com/watch?v=iC1rvXPt_rE
    video_url = line.strip()
    # get_transcript expects a video id, not a full watch URL
    video_id = video_url.split("v=")[-1]
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    # Separate into sections
    for subtitle in transcript:
        text = subtitle['text']
        # Parse sentence into tokens
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)
        # Count verbs
        for tag in tagged:
            if tag[1] == "VB":
                if tag[0].lower() not in verb_db:
                    verb_db[tag[0].lower()] = 1
                else:
                    verb_db[tag[0].lower()] += 1
from youtube_transcript_api import YouTubeTranscriptApi

print(YouTubeTranscriptApi.get_transcript("aGGBGcjdjXA", languages=["en-US"]))
def get_transc():
    text = YouTubeTranscriptApi.get_transcript(video_id)
    transc = []
    for x in range(len(text)):
        transc.append(text[x]['text'])
    return transc
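# Hypothetical usage sketch: get_transc reads the module-level video_id, so
# one must be set first; the id below is a placeholder borrowed from an
# earlier snippet.
video_id = 'Z6IBu6h7bQc'
lines = get_transc()
print(lines[:5])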
import sys
from youtube_transcript_api import YouTubeTranscriptApi

if len(sys.argv) < 2:
    print('Wrong usage!!!')
    print('Correct usage: python gettranscript.py url zh-CN')
    sys.exit(1)
video_id = sys.argv[1]
transcript = ''

if len(sys.argv) > 2:
    language = []
    language.append(sys.argv[2])

if video_id[:32] == 'https://www.youtube.com/watch?v=':
    video_id = video_id[32:]
else:
    print(video_id, ' is incorrect. Exit now!')
    sys.exit(1)

#words = YouTubeTranscriptApi.get_transcript(video_id, languages=list(language))
words = YouTubeTranscriptApi.get_transcript(video_id)
for line in words:
    transcript += line['text'] + ' '
print(words[:4])

transcript2 = [x['text'] for x in words]
print(transcript2)

# Translate
# translator = Translator(service_urls=[
#     'translate.google.com',
#     'translate.google.co.kr',
#     'translate.google.cn'
# ])
        'preferredquality': '192',
    }],
}

dir_path = SAVE_PATH + speaker_name  # '/%(title)s.%(ext)s'
count = 1
for link in links:
    print("Downloading sound clip ", count)
    temp = links[0].split("&")[0]
    try:
        vid = pafy.new(link)
        bestaudio = vid.getbestaudio()
        if not os.path.exists(dir_path + "/" + str(count) + "/"):
            os.makedirs(dir_path + "/" + str(count) + "/")
        bestaudio.download(filepath=dir_path + "/" + str(count) + "/" +
                           str(count) + "." + bestaudio.extension)
    except:
        print("Unresolvable conn error for video", count)
        continue
    vid_id = link.split("?")[1].split("=")[1].split("&")[0]
    try:
        trans = YouTubeTranscriptApi.get_transcript(vid_id)
        with open(dir_path + "/" + str(count) + "/" + str(count) + ".json", "w+") as f:
            json.dump(trans, f)
    except:
        print("Could not generate Transcript here")
        print("ID: ", vid_id)
    count += 1
def download_youtube(youtube_id, target_saying):
    folder = './data/{}'.format(target_saying)
    if not os.path.exists(folder):
        os.makedirs(folder)
    if os.path.exists('./data/{}/{}.wav'.format(target_saying, youtube_id)):
        print("{} already exists...skipping".format(youtube_id))
        return False
    f = open("./data/{}/transcript.txt".format(target_saying), "a+")
    #print("youtube id: {}".format(youtube_id))
    ydl_opts = {
        'outtmpl': youtube_id,
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192'
        }],
        'postprocessor_args': ['-ar', '16000'],
        'prefer_ffmpeg': True,
        'keepvideo': False
    }
    try:
        transcript = YouTubeTranscriptApi.get_transcript(youtube_id)
    except Exception as e:
        print("Transcript doesn't exist for video: {}".format(e))
        f.close()
        return False
    for x in range(0, len(transcript)):
        text = transcript[x]['text']
        if target_saying.lower() in text.lower():
            print("{} was detected in transcript. Downloading Video".format(target_saying))
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download(['http://www.youtube.com/watch?v={}'.format(youtube_id)])
            print("download complete")
            start = float(transcript[x]['start'])
            duration = float(transcript[x]['duration'])
            t1 = start * 1000  # pydub works in milliseconds
            if x + 1 < len(transcript):
                next_start = transcript[x + 1]['start'] * 1000
                t2 = next_start  # the start of the next caption
            else:
                t2 = t1 + (duration * 1000)
            print("{} - start: {}({}) duration: {} end time: {}".format(text, start, t1, duration, t2))
            clean_line = re.sub(r'([^a-zA-Z ]+?)', '', text)
            clean_line = clean_line.lower()  # lowercase
            #clean_line = clean_line.encode('utf-8')
            # the postprocessor writes the extracted audio as "<youtube_id>.wav"
            newAudio = AudioSegment.from_wav("{}.wav".format(youtube_id))
            newAudio = newAudio[t1:t2]
            newAudio.export('./data/{}/{}.wav'.format(target_saying, youtube_id), format="wav")
            annotation_text = "{} {}\n".format(youtube_id, clean_line.upper())
            f.write(annotation_text)
            f.close()
            return True
    f.close()
    return False
def search_youtube_videos(params):
    query = params["query"]
    order = dict_tools.dict_get_existent(params, "order", None)
    max_results = dict_tools.dict_get_existent(params, "results_max_count", None)
    page_token = dict_tools.dict_get_existent(params, "page_token", None)
    want_descriptions = dict_tools.dict_get_existent(params, "want_descriptions", True)
    want_en_transcripts = dict_tools.dict_get_existent(params, "want_en_transcripts", True)
    want_comments = dict_tools.dict_get_existent(params, "want_comments", True)

    # You can enable page_token by simply removing the following line.
    page_token = None

    search_max_results = 5
    if max_results is not None:
        search_max_results = max_results

    basic_search_result = youtube_basic.youtube_video_basic_search(
        query, order=order, max_results=search_max_results, page_token=page_token)
    video_ids = basic_search_result["results_video_id"]

    results_list = []
    for this_video_id in video_ids:
        this_video_isContinue = True

        # get basic information
        try:
            this_video_basic_info = youtube_basic.youtube_video_get_basic_info(this_video_id)
        except Exception:
            this_video_isContinue = False

        # get English transcript
        if this_video_isContinue and want_en_transcripts:
            try:
                this_video_en_transcript = YouTubeTranscriptApi.get_transcript(
                    this_video_id, languages=["en"])
            except Exception:
                this_video_isContinue = False

        # get comments
        if this_video_isContinue and want_comments:
            try:
                this_video_comments = youtube_basic.youtube_video_get_comments(
                    this_video_id, max_results=100)
            except Exception:
                this_video_comments = youtube_basic.youtube_video_create_empty_comments()

        # put them together
        if this_video_isContinue:
            this_video_data_dict = {
                "title": this_video_basic_info["title"],
                "likeCount": this_video_basic_info["likeCount"],
                "dislikeCount": this_video_basic_info["dislikeCount"],
                "viewCount": this_video_basic_info["viewCount"],
            }
            if want_descriptions:
                this_video_data_dict["description"] = this_video_basic_info["description"]
            if want_en_transcripts:
                this_video_data_dict["en_transcript"] = this_video_en_transcript
            if want_comments:
                this_video_data_dict["some_comments"] = this_video_comments
            this_video_dict = {
                "data": this_video_data_dict,
                "video_id": this_video_id
            }
            results_list.append(this_video_dict)

    # build the final return
    ret_dict = {
        "video_results_count": len(results_list),
        "video_results": results_list
    }
    # ret_dict["pageInfo"] = basic_search_result["pageInfo"]
    # ret_dict["nextPageToken"] = basic_search_result["nextPageToken"]
    # ret_dict["prevPageToken"] = basic_search_result["prevPageToken"]
    return ret_dict
from youtube_transcript_api import YouTubeTranscriptApi

try:
    transcript = YouTubeTranscriptApi.get_transcript('OKe7q1nUFgE', languages=['en', 'hi'])
    print(transcript)
except Exception as e:
    print("Error in retrieving transcript:", e)
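# The library exposes specific exception classes, so the broad except above
# can be narrowed; a minimal sketch using the same video id:
from youtube_transcript_api import TranscriptsDisabled, NoTranscriptFound

try:
    print(YouTubeTranscriptApi.get_transcript('OKe7q1nUFgE', languages=['en', 'hi']))
except (TranscriptsDisabled, NoTranscriptFound) as e:
    print("Error in retrieving transcript:", e)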
def gen_transcripts(video_IDs_list):
    transcripts_list = []
    for video_id in video_IDs_list:
        transcripts_list.append(YouTubeTranscriptApi.get_transcript(video_id))
    return transcripts_list
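# Hedged usage sketch for gen_transcripts; the ids are placeholders borrowed
# from other snippets here, and any video without a transcript would raise,
# so a real caller may want a try/except per id.
sample_ids = ['Z6IBu6h7bQc', '6bnaBnd4kyU']
transcripts = gen_transcripts(sample_ids)
print(len(transcripts), 'transcripts fetched')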
def download_info(self, channel_id):
    ## Obtain video-ids
    res = self.youtube.channels().list(
        id=channel_id, part='contentDetails').execute()
    playlist_id = res['items'][0]['contentDetails']['relatedPlaylists']['uploads']
    store_all_info = []
    next_page_token = None
    ## To display how many were successfully downloaded
    num_videos_seen = 0
    num_videos_stored = 0
    ## Retrieve all video subtitles
    while True:
        res = self.youtube.playlistItems().list(
            playlistId=playlist_id,
            part='snippet',
            maxResults=50,
            pageToken=next_page_token).execute()
        information_list = res['items']
        for video_info in information_list:
            ## Does the video have subtitles?
            time.sleep(0.05)  ## Do not access the site too quickly
            try:
                video_id = video_info['snippet']['resourceId']['videoId']
                video_capt = YouTubeTranscriptApi.get_transcript(video_id)
                dict_store = {
                    'title': '',
                    'description': '',
                    'video_id': '',
                    'subs': ''
                }
                dict_store['title'] = video_info['snippet']['title'].replace('\n', ' ')
                dict_store['description'] = video_info['snippet']['description'].replace('\n', ' ')
                dict_store['video_id'] = video_id
                store_text = ''
                for subs in video_capt:
                    clean_text = subs['text'].replace('\n', ' ')
                    store_text = store_text + clean_text
                dict_store['subs'] = store_text
                store_all_info.append(dict_store)
                num_videos_seen = num_videos_seen + 1
                num_videos_stored = num_videos_stored + 1
            ## If not then ignore it
            except:
                num_videos_seen = num_videos_seen + 1
            self.print_progress()
        next_page_token = res.get('nextPageToken')
        if next_page_token is None:
            break
    if len(store_all_info) == 0:
        no_information_stored = ('No information could be retrieved. Most likely '
                                 'due to youtube blocking repeated access attempts.')
        raise Exception(no_information_stored)
    else:
        print('\n Information for {}/{} videos could be retrieved'.format(
            num_videos_stored, num_videos_seen))
    return store_all_info
def extract_caption(video_id, language):
    # get transcripts
    return YouTubeTranscriptApi.get_transcript(video_id=video_id, languages=[language])
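# Hedged usage sketch for extract_caption; the id and language code are
# placeholders borrowed from earlier snippets.
caption = extract_caption('Z6IBu6h7bQc', 'en')
print(caption[0]['text'])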
def get_transcript(self, youtube_video_id) -> List[Dict]:
    transcript = YouTubeTranscriptApi.get_transcript(youtube_video_id)
    if not self._has_manually_created_transcript(youtube_video_id):
        return self._punctuate(transcript)
    return transcript
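# A hedged sketch (not the original author's code) of how the
# _has_manually_created_transcript helper could be written with the library's
# list_transcripts API: a transcript with is_generated == False was created
# manually rather than auto-generated.
def _has_manually_created_transcript(self, youtube_video_id) -> bool:
    transcript_list = YouTubeTranscriptApi.list_transcripts(youtube_video_id)
    return any(not t.is_generated for t in transcript_list)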
def get_recipe(url, use_filter):
    # target Binging with Babish video; get_transcript expects the video id
    transcript = YouTubeTranscriptApi.get_transcript(url)

    # Get verbs from db (strip newlines so each verb can match caption text)
    verbs = []
    with open("../../datasets/cooking_verbs.txt", 'r') as verb_file:
        verbs = [v.strip() for v in verb_file.readlines()]

    # ingredient extraction ============================
    db = Ingredients()
    ingreds = set([])
    actual_ingredients = []
    res = {}
    i = 0
    print('>>> Getting ingredients')
    for subtitle in transcript:
        i += 1
        text = subtitle['text']
        cur_ingredients = db.parse_ingredients(text)
        measurements = db.parse_measurements(text)
        if len(cur_ingredients) > 0:
            print('>>> Ingredients from line ' + str(i) + ': ')
            print(cur_ingredients)
            ingreds |= set(cur_ingredients)
            actual_ingredients += get_actual_ingredients(cur_ingredients, measurements)
    print('>>> Ingredients detected: ')
    print(actual_ingredients)
    res['ingredients'] = actual_ingredients
    # ==================================================

    video_file = download_video(url)

    # steps extraction =================================
    times = ['seconds', 'minutes', 'hours', 'second', 'minute', 'hour']
    instructions = []
    pictures = []
    if use_filter:
        print('>>> Generating steps')
        i = 0
        for subtitle in transcript:
            text = subtitle['text']
            # Write all lines
            # with open('before_filter.txt', 'a') as before_file:
            #     before_file.write(text + '\n')
            # Remove lines without an ingredient, cooking verb, or time measurement
            for target in (list(ingreds) + verbs):
                if (target in text or len([t for t in times if (t in text)])):
                    instructions.append({'step': text})
                    if i % PICTURE_FREQUENCY == 0:
                        pictures.append(subtitle['start'])
                    print('>>> KEEPING LINE: ' + text)
                    # with open('after_filter.txt', 'a') as after_file:
                    #     after_file.write(text + '\n')
                    i += 1
                    break
    res['instructions'] = instructions
    print('>>> Steps Generation Complete')
    # ==================================================

    # frames extraction ================================
    print('>>> Extracting Frames...')
    file_names = extract_frames(video_file, pictures)
    i = 0
    for file_name in file_names:
        res['instructions'][i]['image'] = file_name
        i += PICTURE_FREQUENCY
    if os.path.exists(video_file):
        os.remove(video_file)
    # ==================================================

    print('>>> finished. result: ')
    print(res)
    return res
from youtube_transcript_api import YouTubeTranscriptApi

videoID = "6bnaBnd4kyU"
dd = YouTubeTranscriptApi.get_transcript(videoID, languages=["en"])
print(dd)
def get_transcripts():
    with open(TIMESTAMPS, 'r') as f:
        json_str = json.loads(f.read())
    df_ts = pd.read_json(json_str)
    if os.path.isfile(RAW_DATA):
        with open(RAW_DATA, 'r') as f:
            raw = json.loads(f.read())
    else:
        raw = dict()
    if os.path.isfile(CHECKPOINT):
        logger.info('Resume scraping from last check point.')
        with open(CHECKPOINT, 'r') as f:
            i = int(f.read())
    else:
        logger.info('Start transcript scraping')
        i = 0
    retries = 0
    while True:
        i += 1
        if retries >= 10:  # most likely hitting the YouTube Transcript API's rate limit
            logger.info('Reached retry limit. Stopping...')
            break
        vid = df_ts.videoID[i]
        ts_ranges = df_ts.time[i]
        if vid in raw.keys():
            continue
        elif i == df_ts.index[-1]:
            logger.info('No more video to scrape. Stopping...')
            break
        else:
            try:
                cap = pd.DataFrame(
                    yti.get_transcript(vid, languages=['en', 'en-US', 'en-GB']))
                retries = 0
                logger.debug(f'Scraped video id {vid}')
            except:
                retries += 1
                logger.debug(f'Cannot scrape video id {vid}')
                continue
            cap['end'] = cap['start'] + cap['duration']
            cap['label'] = 0
            cap.drop(['duration'], axis=1, inplace=True)
            for ts in eval(str(ts_ranges)):
                mask = (cap.start >= ts[0]) & (cap.end <= ts[1])
                cap.loc[mask, 'label'] = 1
            raw[vid] = cap.to_json()
    logger.info('Dumping data...')
    with open(RAW_DATA, 'w') as f:
        json.dump(raw, f)
    with open(CHECKPOINT, 'w') as f:
        f.write(str(i - MAX_RETRIES))
    logger.info(f'Done. Raw data has {len(raw)} lines')
    logger.info('EXIT 0')
# run the word_count function on the 2020 video descriptions and print the sorted counts
word_count_description_2020 = word_count(video_description_list_2020)
sorted_word_count_description_2020 = sorted(
    word_count_description_2020.items(), key=lambda x: x[1])
df_sorted_word_count = pd.DataFrame(sorted_word_count_description_2020)
pd.options.display.max_rows = 5100
df_sorted_word_count.head(5100)

# gathering subtitles of the videos
transcript_list = []
# get_transcript throws an error when a video does not have subtitles
for i in range(0, 100):
    print(i)
    transcript_2020 = YouTubeTranscriptApi.get_transcript(video_id_2020[i])
    transcript_list.append(transcript_2020)
print(transcript_list)

transcript_list_2019 = []
# get_transcript throws an error when a video does not have subtitles
for i in range(0, 100):
    print(i)
    transcript_2019 = YouTubeTranscriptApi.get_transcript(video_id_2019[i])
    # searches_2020_more_info_singluar['number'] = i
    transcript_list_2019.append(transcript_2019)
print(transcript_list_2019)
def download_captions(self) -> None:
    base_dir = "./datasets/"
    c = self.output_dir
    lang = self.lang
    video_id = []
    text = []
    start = []
    duration = []
    names = []
    full_names = []
    file_list = os.listdir(base_dir + c + "/wavs/")
    file_list_wav = [file for file in file_list if file.endswith(".wav")]
    for f in tqdm.tqdm(file_list_wav):
        try:
            video = f.split(".wav")[0]
            subtitle = YouTubeTranscriptApi.get_transcript(video, languages=[lang])
            for s in range(len(subtitle) - 1):
                video_id.append(video)
                full_name = base_dir + c + '/wavs/' + video + '.' + str(s).zfill(4) + '.wav'
                full_names.append(full_name)
                name = video + '.' + str(s).zfill(4) + '.wav'
                names.append(name)
                subtitle[s]['text'] = ''.join([
                    ch for ch in subtitle[s]['text']
                    if ch not in ('!', '?', ',', '.', '\n', '~', '"', "'")
                ])
                text.append(subtitle[s]['text'])
                start.append(subtitle[s]['start'])
                if subtitle[s]['duration'] >= (subtitle[s + 1]['start'] - subtitle[s]['start']):
                    duration.append(subtitle[s + 1]['start'] - subtitle[s]['start'])
                else:
                    duration.append(subtitle[s]['duration'])
        except:
            pass
    df = pd.DataFrame({
        "id": video_id,
        "text": text,
        "start": start,
        "duration": duration,
        "name": full_names
    })
    makedirs(base_dir + c + '/text')
    df.to_csv(base_dir + c + '/text/subtitle.csv', encoding='utf-8')
    res = [i + '|' + j for i, j in zip(names, text)]
    df2 = pd.DataFrame({"name": res})
    df2.to_csv(base_dir + c + '/metadata.csv', encoding='utf-8', header=False, index=False)
    file_data = OrderedDict()
    for i in range(df.shape[0]):
        file_data[df['name'][i]] = df['text'][i]
    with open(base_dir + c + '/alignment.json', 'w', encoding="utf-8") as make_file:
        json.dump(file_data, make_file, ensure_ascii=False, indent="\n")
    print(c + ' channel was finished')
def insight(vidId):
    text_data = []
    keyWordList = YouTubeTranscriptApi.get_transcript(vidId)
    # print(keyWordList)
    for line in keyWordList:
        # print(line['text'])
        tokens = prepare_text_for_lda(line['text'])
        if random.random() > .85:  # keep a roughly 15% sample of caption lines
            # print(tokens)
            text_data.append(tokens)

    # gensim
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')

    # topics
    NUM_TOPICS = 5
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=NUM_TOPICS,
                                               id2word=dictionary,
                                               passes=55)
    ldamodel.save('model5.gensim')
    topics = ldamodel.print_topics(num_words=4)
    # for topic in topics:
    #     print(topic)

    # displaying
    dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
    corpus = pickle.load(open('corpus.pkl', 'rb'))
    lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
    lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
    pyLDAvis.display(lda_display)
    pyLDAvis.save_html(lda_display, './frontend/lda.html')

    with open("./frontend/lda.html", encoding="utf8") as f:
        soup = bs4.BeautifulSoup(f, 'html.parser')
    scr = soup.select("script")
    with open("./frontend/new.js", 'w') as b:
        src = scr[0].getText().replace(
            'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min',
            '/js/d3.min.js', 1)
        src1 = src.replace(
            'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js',
            '/js/d3.min.js', 1)
        src2 = src1.replace(
            'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js',
            '/js/ldavis.js', 2)
        b.write(src2)
    div = soup.select('div')
    link = soup.select('link')
    print(div[0], link[0])
    with open("./frontend/lda.html", 'w') as b:
        b.write(str(link[0]))
        print('\n')
        b.write(str(div[0]))
        b.write('<script src="new.js"></script>')
    return ""
def downloadTranscript(videoID):
    '''Takes in a YT VideoID and returns the transcript object'''
    transcript_object = YouTubeTranscriptApi.get_transcript(videoID)
    return transcript_object
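# Hedged usage sketch; the id is a placeholder borrowed from an earlier snippet.
transcript_object = downloadTranscript('6bnaBnd4kyU')
print(len(transcript_object), 'caption entries')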
def get_captions(video_id):
    res = YouTubeTranscriptApi.get_transcript(video_id)
    captions = ""
    for r in res:
        captions = captions + " " + r.get("text")
    return captions
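# Hedged usage sketch for get_captions; the id is a placeholder. Note the
# result carries a leading space from the first loop iteration.
print(get_captions('Z6IBu6h7bQc'))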
videos_list = list({v['id']: v for v in videos_list}.values())
os.system('osascript -e beep')

no_transcript = [
    'pKYeAN-_wFI', 'kpIIBH5jEGs', 'mGLMi9kXTRI', 'aaLiLRVeaZA', 'waXb8QGdEYQ',
    '9g3CjQv5yec', 'NZ83rfAqWMw', 'PttKq0GcnoQ', 'GGEGF7cHmMU', 'ms5a_C7EeNk'
]

i = 0
for video in videos_list:
    if video['id'] in no_transcript:
        continue
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video['id'])
    except:
        continue
    divide = "=" * 11  # building this in a loop previously clobbered the counter i
    print(divide)
    print("<|startoftext|>")
    print(video['title'])
    for line in transcript:
        print(line['text'])
    print("\n<|endoftext|>")
    i += 1

os.system('osascript -e beep')
def transcript(movie_id: str):
    data = YouTubeTranscriptApi.get_transcript(movie_id)
    return jsonify(data)
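# Hedged wiring sketch (assumed, not shown in the original): the snippet above
# reads like a Flask view, so a hypothetical registration could look like this;
# the route path and app object are placeholders.
from flask import Flask, jsonify
from youtube_transcript_api import YouTubeTranscriptApi

app = Flask(__name__)
app.add_url_rule('/transcript/<movie_id>', view_func=transcript)

if __name__ == '__main__':
    app.run(debug=True)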
import csv
import json
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

data = pd.read_csv('data_with_index.csv')
transcript = []
with open('Youtube_Transcripts1.csv', 'w', newline='', encoding="utf-8") as csvfile:
    youtube = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    for i in data.video_id:
        # a = ["LsCUCElZli0", "3AWDKiPGsdQ"]
        # for i in a:
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(i)
            sentences = []
            for entry in transcript_list:  # avoid shadowing the dict builtin
                sentences.append(entry['text'])
            # print('Sentences:', sentences)
            transcript.append(' '.join(sentences))
        except:
            transcript.append('No transcript')
    for i in range(len(transcript)):
        youtube.writerow([transcript[i]])
    # print('Transcript: ', transcript[1])