Example #1
def trans_to_file(hlink, fi):
    length_of_line = 0
    ids = id_from_url(hlink)
    q = YouTubeTranscriptApi.get_transcript(ids)
    results = youtube.videos().list(id=ids, part='snippet').execute()
    title = ""
    for result in results.get('items', []):
        title = "Title: " + result['snippet']['title']
    # use the extracted video id (not the full link) after v= in the URL
    line_string = "Transcript for YouTube video at https://www.youtube.com/watch?v={:s}\n\n{:s}\n\n".format(ids, title)
    for q0 in q:
        a = q0['text']
        st = q0['start']
        en = q0['start'] + q0['duration']
        stm = int(st) // 60
        sts = st % 60
        enm = int(en) // 60
        ens = en % 60
        if flag_start and flag_end:
            a = "({:02d}:{:05.2f}-{:02d}:{:05.2f}) {:s}".format(stm, sts, enm, ens, a)
        elif flag_start:
            a = "({:05.2f}) {:s}".format(st, a)
        elif flag_end:
            a = "(-{:05.2f}) {:s}".format(en, a)
        if not a.strip():
            line_string += "\n"
            length_of_line = 0
        if len(a) + length_of_line >= max_line and length_of_line > 0:
            line_string += "\n" + a
            length_of_line = len(a)
        else:
            if length_of_line: line_string += " "
            line_string += a
            length_of_line += len(a) + 1
    print("Writing to", fi)
    f = open(fi, "w")
    f.write(line_string)
    f.close()
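The helper above relies on several names defined elsewhere in its project (id_from_url, youtube, flag_start, flag_end, max_line). For comparison, here is a minimal self-contained sketch of the same idea, assuming only youtube_transcript_api is installed; write_transcript and the output path are hypothetical names:

from youtube_transcript_api import YouTubeTranscriptApi

def write_transcript(video_id, path):
    # get_transcript returns a list of {'text', 'start', 'duration'} dicts
    entries = YouTubeTranscriptApi.get_transcript(video_id)
    lines = []
    for e in entries:
        minutes = int(e['start']) // 60
        seconds = e['start'] % 60
        lines.append("({:02d}:{:05.2f}) {}".format(minutes, seconds, e['text']))
    with open(path, "w", encoding="utf-8") as f:
        f.write("Transcript for https://www.youtube.com/watch?v={}\n\n".format(video_id))
        f.write("\n".join(lines))

# Hypothetical usage:
# write_transcript("Z6IBu6h7bQc", "transcript.txt")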
Example #2
    n = n + 1

print(id_list)
#############

#variable for iterating through list of videos
n = 0

#create match list for storing videoid and start time of transcript block that contains the word
match = []

#uses transcript api to get transcript for each video
for i in id_list:
    video_id = id_list[n]
    try:
        video_transcript = YouTubeTranscriptApi.get_transcript(
            video_id, languages=['pt'])

        #searches video transcript for word and stores video id and timestamp in a dictionary in the match list, update later to ignore upper/lower case
        for block in video_transcript:
            if word in block['text']:
                #print("yes")
                match.append({'match_id': video_id, 'timestamp': block['start']})
            #else:
            #print("no")

    except Exception:
        print("no transcript available in selected language")

    n = n + 1

#added try/except because the language wasn't available for the 3rd transcript, which raised an error and stopped the loop; everything in the loop had to be indented under the try
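The comment above explains that a bare try/except was needed because one video had no Portuguese transcript. Below is a sketch of the same loop that catches the library's specific exceptions instead; in recent versions of youtube-transcript-api these classes are importable from the top-level package (older versions keep them in youtube_transcript_api._errors), and word and id_list are placeholders:

from youtube_transcript_api import (YouTubeTranscriptApi, NoTranscriptFound,
                                    TranscriptsDisabled)

word = "exemplo"                 # placeholder search term
id_list = ["dQw4w9WgXcQ"]        # placeholder video ids
match = []

for video_id in id_list:
    try:
        blocks = YouTubeTranscriptApi.get_transcript(video_id, languages=['pt'])
    except (NoTranscriptFound, TranscriptsDisabled):
        print("no transcript available in selected language for", video_id)
        continue
    for block in blocks:
        # case-insensitive match, addressing the "ignore upper/lower case" note above
        if word.lower() in block['text'].lower():
            match.append({'match_id': video_id, 'timestamp': block['start']})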
Example #3
        sys.exit("{:d} youtube video{:s} to files.".format(valid, 's' if valid == 1 else ''))

video_id = re.sub(".*=", "", video_id)

ids = video_id
results = youtube.videos().list(id=ids, part='snippet').execute()

title = ""

for result in results.get('items', []):
    title = "Title: " + result['snippet']['title']

if not video_id: sys.exit("Specify video id or use -c for clipboard.")
if not (print_output or write_output): sys.exit("Need to specify print or write output on. To launch, just use -jl.")

q = YouTubeTranscriptApi.get_transcript(video_id)

line_string = "Transcript for YouTube video at https://www.youtube.com/watch?v={:s}\n\n{:s}\n\n".format(video_id, title)

for q0 in q:
    a = q0['text']
    st = q0['start']
    en = q0['start'] + q0['duration']
    stm = int(st) // 60
    sts = st % 60
    enm = int(en) // 60
    ens = en % 60
    if flag_start and flag_end:
        a = "({:02d}:{:05.2f}-{:02d}:{:05.2f}) {:s}".format(stm, sts, enm, ens, a)
    elif flag_start:
        a = "({:05.2f}) {:s}".format(st, a)
Example #4
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 23 11:30:43 2019

@author: NDH00360
"""
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
dfVlogsFinal = pd.DataFrame()
for videoId in videoId_list:  #from CaptionSentiments.py
    try:
        text = YouTubeTranscriptApi.get_transcript(videoId)
        print(videoId)
        dfVlogs = pd.DataFrame(text)
        dfVlogs1 = pd.DataFrame(dfVlogs['text'])
        dfVlogsFinal = dfVlogsFinal.append(dfVlogs1)
    except Exception as e:
        print("could not fetch transcript for", videoId, ":", e)

dfVlogs.to_csv('C:\\Users\\NDH00360\\Desktop\\VloggersAltimaComments.csv')
dfVlogs = pd.read_csv(
    r"C:\Users\NDH00360\Desktop\YoutubeSentimets Data\videoCommentDataAltima.csv"
)
my_lst_str = ' '.join(map(str, dfVlogs['text']))

df1 = dfVlogs

from nltk.tokenize import sent_tokenize
tokenized_text = sent_tokenize(my_lst_str)
print(tokenized_text)
dfVlogs = pd.DataFrame(tokenized_text)
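DataFrame.append, used above, was deprecated in pandas 1.4 and removed in 2.0. Below is a sketch of the same accumulation that collects the per-video frames in a list and concatenates once at the end (generally faster as well); videoId_list is the same external list the example iterates over:

import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

frames = []
for videoId in videoId_list:  # videoId_list comes from CaptionSentiments.py, as above
    try:
        frames.append(pd.DataFrame(YouTubeTranscriptApi.get_transcript(videoId))[['text']])
    except Exception as e:
        print("could not fetch transcript for", videoId, ":", e)

dfVlogsFinal = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()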
Example #5
def extract_video_id(url):
    # Examples:
    # - http://youtu.be/SA2iWivDJiE
    # - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    # - http://www.youtube.com/embed/SA2iWivDJiE
    # - http://www.youtube.com/v/SA2iWivDJiE?version=3&hl=en_US
    query = urlparse(url)
    if query.hostname == 'youtu.be': return query.path[1:]
    if query.hostname in ('www.youtube.com', 'youtube.com'):
        if query.path == '/watch': return parse_qs(query.query)['v'][0]
        if query.path[:7] == '/embed/': return query.path.split('/')[2]
        if query.path[:3] == '/v/': return query.path.split('/')[2]
    # fail?
    return None

video_id = extract_video_id(video_url)
transcript_list = YouTubeTranscriptApi.get_transcript(video_id)


ratio = st.sidebar.slider("Sentence Keep Ratio", 0.1, 0.9, 0.3)
min_length = st.sidebar.slider("Minimum Length", 50, 99, 50)
max_length = st.sidebar.slider("Maximum Length", 100, 500, 200)
 
#with open('your_file.txt', 'w') as f:
#    for item in transcript_list:
#        f.write("%s\n" % item)

@st.cache
def gather_text(transcript_list):
    full_text = []
    length = len(transcript_list)
    for i in range(length):
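The excerpt stops inside gather_text. The original body is not shown, but a plausible completion, sketched here purely as an assumption (and without the @st.cache decorator), would join the caption text into a single string for the summarizer:

# Hypothetical completion; the real gather_text may differ.
def gather_text(transcript_list):
    full_text = []
    length = len(transcript_list)
    for i in range(length):
        full_text.append(transcript_list[i]['text'])
    return " ".join(full_text)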
Example #6
from youtube_transcript_api import YouTubeTranscriptApi
video_id = 'Z6IBu6h7bQc'
result = YouTubeTranscriptApi.get_transcript(video_id)
print(result)
Example #7
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
from collections import OrderedDict

verb_db = {}
NUM_VIDEOS_PLAYLIST = 96

# Read from videos.txt
with open("videos.txt", "r") as videos:
    lines = videos.readlines()

for line in lines:
    print(line)
    # Basics with Babish video on burgers: https://www.youtube.com/watch?v=iC1rvXPt_rE
    # get_transcript expects a bare video id, so strip the newline and the URL prefix
    video_id = line.strip().split("v=")[-1]
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    # Separate into sections
    for subtitle in transcript:
        text = subtitle['text']
        # Parse sentence into tokens
        tokens = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokens)

        # Count verbs
        for tag in tagged:
            if tag[1] == "VB":
                if tag[0].lower() not in verb_db:
                    verb_db[tag[0].lower()] = 1
                else:
                    verb_db[tag[0].lower()] += 1
Example #8
from youtube_transcript_api import YouTubeTranscriptApi

print(YouTubeTranscriptApi.get_transcript("aGGBGcjdjXA", languages=["en-US"]))
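The languages argument is a priority list, so a fallback can be expressed in a single call; the library also provides list_transcripts for inspecting what is available before fetching (API as in youtube-transcript-api 0.x, worth verifying against the installed version):

from youtube_transcript_api import YouTubeTranscriptApi

video_id = "aGGBGcjdjXA"

# Try US English first, then fall back to generic English.
print(YouTubeTranscriptApi.get_transcript(video_id, languages=["en-US", "en"]))

# List the available transcripts (manual and auto-generated) for the video.
for t in YouTubeTranscriptApi.list_transcripts(video_id):
    print(t.language_code, "(generated)" if t.is_generated else "(manual)")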
Example #9
def get_transc():
    # video_id is defined elsewhere in the original script
    text = YouTubeTranscriptApi.get_transcript(video_id)
    transc = []
    for entry in text:
        transc.append(entry['text'])
    return transc
Example #10
    print('Wrong usage!!!')
    print('Correct usage: python gettranscript.py url zh-CN')
    sys.exit(1)

if len(sys.argv) > 2:
    language = []
    language.append(sys.argv[2])

if video_id[:32] == 'https://www.youtube.com/watch?v=':
    video_id = video_id[32:]
else:
    print(video_id, ' is incorrect. Exit now!')
    sys.exit(1)

#words = YouTubeTranscriptApi.get_transcript(video_id, languages=list(language))
words = YouTubeTranscriptApi.get_transcript(video_id)

for line in words:
    transcript += line['text'] + ' '
print(words[:4])

transcript2 = [x['text'] for x in words]
print(transcript2)

# Translate
# translator = Translator(service_urls=[
#       'translate.google.com',
#       'translate.google.co.kr',
#       'translate.google.cn'
#     ])
Example #11
        'preferredquality': '192',
    }],
}
dir_path = SAVE_PATH + speaker_name
#'/%(title)s.%(ext)s'
count = 1
for link in links:
    print("Downloading sound clip ", count)
    temp = links[0].split("&")[0]

    try:
        vid = pafy.new(link)
        bestaudio = vid.getbestaudio()
        if not os.path.exists(dir_path + "/" + str(count) + "/"):
            os.makedirs(dir_path + "/" + str(count) + "/")
        bestaudio.download(filepath=dir_path + "/" + str(count) + "/" +
                           str(count) + "." + bestaudio.extension)
    except:
        print("Unresolvable conn error for video", count)
        continue
    vid_id = link.split("?")[1].split("=")[1].split("&")[0]
    try:
        trans = YouTubeTranscriptApi.get_transcript(vid_id)
        with open(dir_path + "/" + str(count) + "/" + str(count) + ".json",
                  "w+") as f:
            json.dump(trans, f)
    except:
        print("Could not generate Transcript here")
        print("ID: ", vid_id)
    count += 1
Example #12
def download_youtube(youtube_id, target_saying):
	folder = './data/{}'.format(target_saying)
	if not os.path.exists(folder):
		os.makedirs(folder)

	if os.path.exists('./data/{}/{}.wav'.format(target_saying, youtube_id)):
		print("{} already exsists...skipping".format(youtube_id))
		return False

	f=open("./data/{}/transcript.txt".format(target_saying), "a+")
	#print("youtube id: {}".format(youtube_id))
	ydl_opts = {
	    'outtmpl': youtube_id,
	    'format': 'bestaudio/best',
	    'postprocessors': [{
	        'key': 'FFmpegExtractAudio',
	        'preferredcodec': 'wav',
	        'preferredquality': '192'
	    }],
	    'postprocessor_args': [
	        '-ar', '16000'
	    ],
	    'prefer_ffmpeg': True,
	    'keepvideo': False
	}

	try:
		transcript = YouTubeTranscriptApi.get_transcript(youtube_id)
	except Exception as e:
		print("Transcript doesn't exist for video: {}".format(e))
		return False

	for x in range(0, len(transcript)):
		text = transcript[x]['text']
		if target_saying.lower() in text.lower():
			print("{} was detected in transcript. Downloading Video".format(target_saying))
			with youtube_dl.YoutubeDL(ydl_opts) as ydl:
				ydl.download(['http://www.youtube.com/watch?v={}'.format(youtube_id)])
			print("download complete")
			
			start = float(transcript[x]['start'])
			duration = float(transcript[x]['duration'])

			
			
			t1 = start * 1000 #Works in milliseconds
			if x+1 < len(transcript):
				next_start = transcript[x+1]['start'] * 1000
				t2 = next_start #the start of the next one
			else:
				t2 = t1 + (duration * 1000)

			print("{} - start: {}({}) duration: {} end time:{} ".format(text, start, t1, duration, t2))
			
			clean_line = re.sub(r'([^a-zA-Z ]+?)', '', text)
			clean_line = clean_line.lower() #lowercase 
			#clean_line = clean_line.encode('utf-8')

			

			newAudio = AudioSegment.from_wav("{}.wav".format(youtube_id))  # the youtube_dl options above name the extracted audio <youtube_id>.wav
			newAudio = newAudio[t1:t2]
			newAudio.export('./data/{}/{}.wav'.format(target_saying, youtube_id), format="wav")

			
			annotation_text = "{} {}\n".format(youtube_id, clean_line.upper())
			f.write(annotation_text)
			f.close()
			return True

	return False
Example #13
def search_youtube_videos(params):

    query = params["query"]
    order = dict_tools.dict_get_existent(params, "order", None)
    max_results = dict_tools.dict_get_existent(params, "results_max_count",
                                               None)
    page_token = dict_tools.dict_get_existent(params, "page_token", None)
    want_descriptions = dict_tools.dict_get_existent(params,
                                                     "want_descriptions", True)
    want_en_transcripts = dict_tools.dict_get_existent(params,
                                                       "want_en_transcripts",
                                                       True)
    want_comments = dict_tools.dict_get_existent(params, "want_comments", True)

    # You can enable page_token by simply removing the following line.
    page_token = None

    search_max_results = 5
    if max_results is not None:
        search_max_results = max_results

    basic_search_result = youtube_basic.youtube_video_basic_search(
        query,
        order=order,
        max_results=search_max_results,
        page_token=page_token)

    video_ids = basic_search_result["results_video_id"]

    count = len(video_ids)
    results_list = []
    i = 0
    while i < count:
        this_video_id = video_ids[i]
        this_video_isContinue = True

        # get basic information
        try:
            this_video_basic_info = youtube_basic.youtube_video_get_basic_info(
                this_video_id)
        except Exception:
            this_video_isContinue = False

        # get English transcript
        if (this_video_isContinue is True) and (want_en_transcripts is True):
            try:
                this_video_en_transcript = YouTubeTranscriptApi.get_transcript(
                    this_video_id, languages=["en"])
            except Exception:
                this_video_isContinue = False

        # get comments
        if (this_video_isContinue is True) and (want_comments is True):
            try:
                this_video_comments = youtube_basic.youtube_video_get_comments(
                    this_video_id, max_results=100)
            except Exception:
                this_video_comments = youtube_basic.youtube_video_create_empty_comments(
                )

        # put them together
        if this_video_isContinue is True:
            this_video_data_dict = {
                "title": this_video_basic_info["title"],
                "likeCount": this_video_basic_info["likeCount"],
                "dislikeCount": this_video_basic_info["dislikeCount"],
                "viewCount": this_video_basic_info["viewCount"],
            }

            if want_descriptions is True:
                this_video_data_dict["description"] = this_video_basic_info[
                    "description"]

            if want_en_transcripts is True:
                this_video_data_dict[
                    "en_transcript"] = this_video_en_transcript

            if want_comments is True:
                this_video_data_dict["some_comments"] = this_video_comments

            this_video_dict = {
                "data": this_video_data_dict,
                "video_id": this_video_id
            }

            results_list.append(this_video_dict)

        i = i + 1

    # build the final return
    ret_dict = {
        "video_results_count": len(results_list),
        "video_results": results_list
    }
    # ret_dict["pageInfo"] = basic_search_result["pageInfo"]
    # ret_dict["nextPageToken"] = basic_search_result["nextPageToken"]
    # ret_dict["prevPageToken"] = basic_search_result["prevPageToken"]

    return ret_dict
Example #14
from youtube_transcript_api import YouTubeTranscriptApi

try:
    transcript = YouTubeTranscriptApi.get_transcript('OKe7q1nUFgE',
                                                     languages=['en', 'hi'])
    print(transcript)

except Exception as e:
    print("Error in retrieving transcript:", e)
Example #15
def gen_transcripts(video_IDs_list):
    transcripts_list = []
    for video_id in video_IDs_list:
        transcripts_list.append(YouTubeTranscriptApi.get_transcript(video_id))
    return transcripts_list
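A single missing transcript will raise inside the loop above and abort the whole batch. Here is a sketch of an error-tolerant variant that simply skips failures (the name gen_transcripts_safe is an assumption, not from the original):

from youtube_transcript_api import YouTubeTranscriptApi

def gen_transcripts_safe(video_IDs_list):
    transcripts_list = []
    for video_id in video_IDs_list:
        try:
            transcripts_list.append(YouTubeTranscriptApi.get_transcript(video_id))
        except Exception as exc:  # e.g. transcripts disabled or not found
            print("skipping", video_id, ":", exc)
    return transcripts_list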
Example #16
        def download_info(channel_id):

            ## Obtain video-ids
            res = self.youtube.channels().list(
                id=channel_id, part='contentDetails').execute()
            playlist_id = res['items'][0]['contentDetails'][
                'relatedPlaylists']['uploads']

            store_all_info = []
            next_page_token = None

            ## To display how many were successfully downloaded
            num_videos_seen = 0
            num_videos_stored = 0

            ## Retrieve all video subtitles
            while True:

                res = self.youtube.playlistItems().list(
                    playlistId=playlist_id,
                    part='snippet',
                    maxResults=50,
                    pageToken=next_page_token).execute()

                information_list = res['items']

                for video_info in information_list:

                    ## Does the video have subtitles
                    time.sleep(0.05)  ## Do not access the site too quickly
                    try:

                        video_id = video_info['snippet']['resourceId'][
                            'videoId']

                        video_capt = YouTubeTranscriptApi.get_transcript(
                            video_id)

                        dict_store = {
                            'title': '',
                            'description': '',
                            'video_id': '',
                            'subs': ''
                        }

                        dict_store['title'] = video_info['snippet'][
                            'title'].replace('\n', ' ')
                        dict_store['description'] = video_info['snippet'][
                            'description'].replace('\n', ' ')

                        dict_store['video_id'] = video_id

                        store_text = ''
                        for subs in video_capt:
                            clean_text = subs['text'].replace('\n', ' ')
                            store_text = store_text + clean_text
                        dict_store['subs'] = store_text
                        store_all_info.append(dict_store)
                        num_videos_seen = num_videos_seen + 1
                        num_videos_stored = num_videos_stored + 1

                    ## If not then ignore it
                    except:
                        num_videos_seen = num_videos_seen + 1
                    self.print_progress()

                next_page_token = res.get('nextPageToken')

                if next_page_token is None:
                    break

            if len(store_all_info) == 0:
                no_information_stored = 'No information could be retrieved. Most likely due to youtube blocking repeated access attempts.'
                raise Exception(no_information_stored)
            else:
                print('\n Information for {}/{} videos could be retrieved'.
                      format(num_videos_stored, num_videos_seen))
                return store_all_info
Example #17
def extract_caption(video_id, language):
    # get transcripts
    return YouTubeTranscriptApi.get_transcript(video_id=video_id,
                                               languages=[language])
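Usage is a single call; the video id and language code below are placeholders:

# Hypothetical call with placeholder arguments.
caption = extract_caption("Z6IBu6h7bQc", "en")
print(caption[0]['text'], caption[0]['start'])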
Example #18
    def get_transcript(self, youtube_video_id) -> List[Dict]:
        transcript = YouTubeTranscriptApi.get_transcript(youtube_video_id)
        if not self._has_manually_created_transcript(youtube_video_id):
            return self._punctuate(transcript)
        return transcript
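The helpers _has_manually_created_transcript and _punctuate belong to the surrounding class and are not shown. One plausible way to implement the first check, sketched here as an assumption, is via list_transcripts and find_manually_created_transcript, which raises NoTranscriptFound when only auto-generated captions exist:

from typing import Sequence
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound

def has_manually_created_transcript(youtube_video_id: str,
                                    language_codes: Sequence[str] = ("en",)) -> bool:
    # Sketch only: the original class method is not shown in the excerpt.
    transcripts = YouTubeTranscriptApi.list_transcripts(youtube_video_id)
    try:
        transcripts.find_manually_created_transcript(list(language_codes))
        return True
    except NoTranscriptFound:
        return False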
Example #19
def get_recipe(url, use_filter):
    # target Binging with Babish video
    transcript = YouTubeTranscriptApi.get_transcript(url)

    # Get verbs from db
    verbs = []
    with open("../../datasets/cooking_verbs.txt", 'r') as verb_file:
        verbs = [v.strip() for v in verb_file.readlines()]  # strip newlines so verbs can match inside caption text

    # ingredient extraction ============================
    db = Ingredients()
    ingreds = set([])
    actual_ingredients = []
    res = {}
    i = 0
    print('>>> Getting ingredients')
    for subtitle in transcript:
        i += 1
        text = subtitle['text']
        cur_ingredients = db.parse_ingredients(text)
        measurements = db.parse_measurements(text)
        if len(cur_ingredients) > 0:
            print('>>> Ingredients from line ' + str(i) + ': ')
            print(cur_ingredients)
        ingreds |= set(cur_ingredients)
        actual_ingredients += get_actual_ingredients(cur_ingredients,
                                                     measurements)

    print('>>> Ingredients detected: ')
    print(actual_ingredients)
    res['ingredients'] = actual_ingredients
    # ==================================================
    video_file = download_video(url)

    # steps extraction =================================
    times = ['seconds', 'minutes', 'hours', 'second', 'minute', 'hour']
    instructions = []
    pictures = []
    if use_filter:
        print('>>> Generating steps')
        i = 0
        for subtitle in transcript:
            text = subtitle['text']

            # Write all lines
            # with open('before_filter.txt', 'a') as before_file:
            #     before_file.write(text + '\n')

            # Remove lines without an ingredient, cooking verb, or time measurement
            for target in (list(ingreds) + verbs):
                if (target in text or len([t for t in times if (t in text)])):
                    instructions.append({'step': text})
                    if i % PICTURE_FREQUENCY == 0:
                        pictures.append(subtitle['start'])
                    print('>>> KEEPING LINE: ' + text)
                    # with open('after_filter.txt', 'a') as after_file:
                    #     after_file.write(text + '\n')
                    #     break
                    i += 1
                    break
    res['instructions'] = instructions
    print('>>> Steps Generation Complete')
    # ==================================================

    # frames extraction ================================
    print('>>> Extracting Frames...')
    file_names = extract_frames(video_file, pictures)
    i = 0
    for file_name in file_names:
        res['instructions'][i]['image'] = file_name
        i += PICTURE_FREQUENCY
    if os.path.exists(video_file):
        os.remove(video_file)
    # ==================================================
    print('>>> finished. result: ')
    print(res)
    return res
Example #20
from youtube_transcript_api import YouTubeTranscriptApi

videoID = "6bnaBnd4kyU"

dd = YouTubeTranscriptApi.get_transcript(videoID, languages=["en"])

print(dd)
Example #21
def get_transcripts():
    with open(TIMESTAMPS, 'r') as f:
        json_str = json.loads(f.read())
        df_ts = pd.read_json(json_str)
    if os.path.isfile(RAW_DATA):
        with open(RAW_DATA, 'r') as f:
            raw = json.loads(f.read())
    else:
        raw = dict()

    if os.path.isfile(CHECKPOINT):
        logger.info('Resume scraping from last check point.')
        with open(CHECKPOINT, 'r') as f:
            i = int(f.read())
    else:
        logger.info('Start transcript scraping')
        i = 0
    retries = 0
    while True:
        i += 1
        if retries >= 10:
            # most possibly hitting YouTube Transcript API's limit
            logger.info('Reached retry limit. Stopping...')
            break

        vid = df_ts.videoID[i]
        ts_ranges = df_ts.time[i]

        if vid in raw.keys():
            continue
        elif i == df_ts.index[-1]:
            logger.info('No more video to scrape. Stopping...')
            break
        else:
            try:
                cap = pd.DataFrame(
                    yti.get_transcript(vid, languages=['en', 'en-US',
                                                       'en-GB']))
                retries = 0
                logger.debug(f'Scraped video id {vid}')
            except:
                retries += 1
                logger.debug(f'Cannot scrape video id {vid}')
                continue

            cap['end'] = cap['start'] + cap['duration']
            cap['label'] = 0
            cap.drop(['duration'], axis=1, inplace=True)

            for ts in eval(str(ts_ranges)):
                mask = (cap.start >= ts[0]) & (cap.end <= ts[1])
                cap.loc[mask, 'label'] = 1

            raw[vid] = cap.to_json()

    logger.info('Dumping data...')
    with open(RAW_DATA, 'w') as f:
        json.dump(raw, f)
    with open(CHECKPOINT, 'w') as f:
        f.write(str(i - MAX_RETRIES))
    logger.info(f'Done. Raw data has {len(raw)} lines')
    logger.info('EXIT 0')
Example #22
#run the word_count function on the 2020 video descriptions and display the sorted counts
word_count_description_2020 = word_count(video_description_list_2020)
sorted_word_count_description_2020 = sorted(
    word_count_description_2020.items(), key=lambda x: x[1])
df_sorted_word_count = pd.DataFrame(sorted_word_count_description_2020)
pd.options.display.max_rows = 5100
df_sorted_word_count.head(5100)

#gathering subtitles of the videos
transcript_list = []

#checks if videos have subtitles, throws error when video does not have subtitles
for i in range(0, 100):
    print(i)
    transcript_2020 = YouTubeTranscriptApi.get_transcript(video_id_2020[i])
    transcript_list.append(transcript_2020)

print(transcript_list)

transcript_list_2019 = []
#checks if videos have subtitles, throws error when video does not have subtitles
for i in range(0, 100):
    print(i)
    transcript_2019 = YouTubeTranscriptApi.get_transcript(video_id_2019[i])
    #searches_2020_more_info_singluar['number'] = i
    transcript_list_2019.append(transcript_2019)
    #transcript_list.append("AKSDJFLKJSDFLKJASD;FKJAS;LDKFJALS;DKJF;LAKSDJF;LAKSDJF;LKASDJF")

print(transcript_list_2019)
Example #23
    def download_captions(self) -> None:
        base_dir = "./datasets/"
        c = self.output_dir
        lang = self.lang
        video_id = []
        text = []
        start = []
        duration = []
        names = []
        full_names = []
        file_list = os.listdir(base_dir + c + "/wavs/")
        file_list_wav = [file for file in file_list if file.endswith(".wav")]
        for f in tqdm.tqdm(file_list_wav):
            try:
                video = f.split(".wav")[0]
                subtitle = YouTubeTranscriptApi.get_transcript(
                    video, languages=[lang])
                for s in range(len(subtitle) - 1):
                    video_id.append(video)
                    full_name = base_dir + c + '/wavs/' + video + '.' + str(
                        s).zfill(4) + '.wav'
                    full_names.append(full_name)
                    name = video + '.' + str(s).zfill(4) + '.wav'
                    names.append(name)
                    subtitle[s]['text'] = ''.join([
                        c for c in subtitle[s]['text']
                        if c not in ('!', '?', ',', '.', '\n', '~', '"', "'")
                    ])
                    text.append(subtitle[s]['text'])
                    start.append(subtitle[s]['start'])
                    if subtitle[s]['duration'] >= (subtitle[s + 1]['start'] -
                                                   subtitle[s]['start']):
                        duration.append(subtitle[s + 1]['start'] -
                                        subtitle[s]['start'])
                    else:
                        duration.append(subtitle[s]['duration'])

            except:
                pass

        df = pd.DataFrame({
            "id": video_id,
            "text": text,
            "start": start,
            "duration": duration,
            "name": full_names
        })
        makedirs(base_dir + c + '/text')
        df.to_csv(base_dir + c + '/text/subtitle.csv', encoding='utf-8')
        res = [i + '|' + j for i, j in zip(names, text)]
        df2 = pd.DataFrame({"name": res})
        df2.to_csv(base_dir + c + '/metadata.csv',
                   encoding='utf-8',
                   header=False,
                   index=False)
        file_data = OrderedDict()
        for i in range(df.shape[0]):
            file_data[df['name'][i]] = df['text'][i]
        with open(base_dir + c + '/alignment.json', 'w',
                  encoding="utf-8") as make_file:
            json.dump(file_data, make_file, ensure_ascii=False, indent="\n")
        print(c + ' channel was finished')
Example #24
def insight(vidId):
    text_data = []
    keyWordList = YouTubeTranscriptApi.get_transcript(vidId)
    # print(keyWordList)
    for line in keyWordList:
        # print(line['text'])

        tokens = prepare_text_for_lda(line['text'])
        if random.random() > .85:
            # print(tokens)
            text_data.append(tokens)
    #genism

    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')

    #topics

    NUM_TOPICS = 5
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                               num_topics=NUM_TOPICS,
                                               id2word=dictionary,
                                               passes=55)
    ldamodel.save('model5.gensim')
    topics = ldamodel.print_topics(num_words=4)
    # for topic in topics:
    #     print(topic)
    #Displaying
    dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
    corpus = pickle.load(open('corpus.pkl', 'rb'))
    lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

    lda_display = pyLDAvis.gensim.prepare(lda,
                                          corpus,
                                          dictionary,
                                          sort_topics=False)
    pyLDAvis.display(lda_display)
    pyLDAvis.save_html(lda_display, './frontend/lda.html')
    with open("./frontend/lda.html", encoding="utf8") as f:
        soup = bs4.BeautifulSoup(f, "html.parser")
    scr = soup.select("script")
    with open("./frontend/new.js", 'w') as b:
        src = scr[0].getText().replace(
            'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min',
            '/js/d3.min.js', 1)
        src1 = src.replace(
            'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js',
            '/js/d3.min.js', 1)
        src2 = src1.replace(
            'https://cdn.rawgit.com/bmabey/pyLDAvis/files/ldavis.v1.0.0.js',
            '/js/ldavis.js', 2)
        b.write(src2)
        b.close()
    div = soup.select('div')
    link = soup.select('link')
    print(div[0], link[0])
    with open("./frontend/lda.html", 'w') as b:
        b.write(str(link[0]))
        print('\n')
        b.write(str(div[0]))
        b.write('<script src="new.js"></script>')

    return ""
Example #25
def downloadTranscript(videoID):
    '''Takes in a YT VideoID and returns the transcript object'''
    transcript_object = YouTubeTranscriptApi.get_transcript(videoID)
    return transcript_object
Example #26
def get_captions(video_id):
    res = YouTubeTranscriptApi.get_transcript(video_id)
    captions = ""
    for r in res:
        captions = captions + " " + r.get("text")
    return captions
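Building the string by repeated concatenation grows quadratically on long transcripts; an equivalent join-based sketch of the same helper (output differs only by the leading space):

def get_captions(video_id):
    res = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(r.get("text", "") for r in res)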
Example #27
videos_list = list({v['id']: v for v in videos_list}.values())

os.system('osascript -e beep')

no_transcript = [
    'pKYeAN-_wFI', 'kpIIBH5jEGs', 'mGLMi9kXTRI', 'aaLiLRVeaZA', 'waXb8QGdEYQ',
    '9g3CjQv5yec', 'NZ83rfAqWMw', 'PttKq0GcnoQ', 'GGEGF7cHmMU', 'ms5a_C7EeNk'
]

i = 0

for video in videos_list:
    if video['id'] in no_transcript:
        continue
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video['id'])
    except:
        continue
    divide = "=" * 11  # build the divider without reusing i, which would clobber the counter above
    print(divide)
    print("<|startoftext|>")
    print(video['title'])
    for line in transcript:
        print(line['text'])

    print("\n<|endoftext|>")
    i += 1
    os.system('osascript -e beep')
Example #28
def transcript(movie_id: str):
    data = YouTubeTranscriptApi.get_transcript(movie_id)
    return jsonify(data)
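jsonify suggests this handler lives in a Flask application; a minimal self-contained sketch of such a route is shown below (the app name and URL rule are assumptions, not taken from the original):

from flask import Flask, jsonify
from youtube_transcript_api import YouTubeTranscriptApi

app = Flask(__name__)

@app.route("/transcript/<movie_id>")
def transcript(movie_id: str):
    data = YouTubeTranscriptApi.get_transcript(movie_id)
    return jsonify(data)

if __name__ == "__main__":
    app.run(debug=True)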
Example #29
import csv
import json
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

data = pd.read_csv('data_with_index.csv')
transcript = []

with open('Youtube_Transcripts1.csv', 'w', newline='',
          encoding="utf-8") as csvfile:
    youtube = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
    for i in data.video_id:
        # a= ["LsCUCElZli0", "3AWDKiPGsdQ"]
        # for i in a:
        try:
            transcript_list = YouTubeTranscriptApi.get_transcript(i)
            sentences = []

            for entry in transcript_list:
                sentences.append(entry['text'])
            # print('Sentences:', sentences)
            transcript.append(' '.join(sentences))

        except Exception:
            transcript.append('No transcript')

    for i in range(len(transcript)):
        youtube.writerow([transcript[i]])
    # print('Transcript: ', transcript[1])