Esempio n. 1
0
    def test_get_transcripts__continue_on_error(self, mock_get_transcript):
        """A failing video must not stop the remaining ids from being fetched."""
        video_ids = ['video_id_1', 'video_id_2']

        YouTubeTranscriptApi.get_transcripts(video_ids, continue_after_error=True)

        # Every id must still have been attempted with the default language.
        for video_id in video_ids:
            mock_get_transcript.assert_any_call(video_id, ('en',), None, None)
Esempio n. 2
0
    def test_get_transcripts(self, mock_get_transcript):
        """get_transcripts delegates once per id, forwarding the languages list."""
        requested_ids = ['video_id_1', 'video_id_2']
        requested_languages = ['de', 'en']

        YouTubeTranscriptApi.get_transcripts(requested_ids, languages=requested_languages)

        for requested_id in requested_ids:
            mock_get_transcript.assert_any_call(requested_id, requested_languages, None, None)
        # Exactly one underlying call per video id.
        self.assertEqual(mock_get_transcript.call_count, 2)
Esempio n. 3
0
    def test_get_transcripts__continue_on_error(self):
        """With continue_after_error, every id is attempted even when each call raises."""
        video_ids = ['video_id_1', 'video_id_2']
        YouTubeTranscriptApi.get_transcript = MagicMock(
            side_effect=Exception('Error'))

        YouTubeTranscriptApi.get_transcripts(video_ids, continue_after_error=True)

        # Both ids must have been tried despite the stubbed failures.
        for video_id in video_ids:
            YouTubeTranscriptApi.get_transcript.assert_any_call(
                video_id, ('en', ), None)
Esempio n. 4
0
def obtener_transcripciones_videos(ids_videos):
    """Fetch the transcripts of several videos, given their ids.

    Returns a dict mapping each video id to its transcript as a single
    string: the concatenation of all subtitle snippets, each followed by
    a trailing space (preserving the original output format).
    """
    # Spanish subtitle lists per video id. get_transcripts returns a
    # (successes_dict, failed_ids_list) tuple; only the dict is used here.
    diccionario_subtitulos = YouTubeTranscriptApi.get_transcripts(
        ids_videos, languages=['es'])[0]

    diccionario_transcripciones = {}

    # `video_id` instead of `id`, which would shadow the builtin.
    for video_id, subtitulos in diccionario_subtitulos.items():
        # str.join is linear, whereas repeated `+=` can be quadratic.
        # The per-snippet trailing space keeps the output byte-identical
        # to the previous implementation.
        diccionario_transcripciones[video_id] = ''.join(
            s['text'] + ' ' for s in subtitulos)

    return diccionario_transcripciones
Esempio n. 5
0
def get_youtube_cc(url):
    """Return (captions, True) for the video at *url*, or an error tuple.

    The captions string is the German/English transcript snippets joined
    with a leading space each (preserving the original formatting). On
    any failure the sentinel ("Can't fetch from youtube captions", False)
    is returned instead of raising.
    """
    try:
        # `video_id` rather than `id`, which would shadow the builtin.
        video_ids = [url.split('?v=')[1]]
        video_id = video_ids[0]
        transcripts = YouTubeTranscriptApi.get_transcripts(
            video_ids, languages=['de', 'en'])
        # str.join is linear; repeated `+=` can be quadratic.
        captions = ''.join(' ' + line['text']
                           for line in transcripts[0][video_id])
        return (captions, True)
    except Exception:
        # Deliberate best-effort: any failure maps to the sentinel tuple.
        return ("Can't fetch from youtube captions", False)
Esempio n. 6
0
    def test_get_transcript__with_proxies(self):
        """A proxied fetch returns the stubbed transcript; proxies are forwarded."""
        proxies = {'http': '', 'https:': ''}
        expected = [
            {
                'text': 'Hey, this is just a test',
                'start': 0.0,
                'duration': 1.54
            },
            {
                'text': 'this is not the original transcript',
                'start': 1.54,
                'duration': 4.16
            },
            {
                'text': 'just something shorter, I made up for testing',
                'start': 5.7,
                'duration': 3.239
            },
        ]

        transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8',
                                                         proxies=proxies)
        self.assertEqual(transcript, expected)

        # get_transcripts must pass the proxies through to get_transcript.
        YouTubeTranscriptApi.get_transcript = MagicMock()
        YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
        YouTubeTranscriptApi.get_transcript.assert_any_call(
            'GJLlxj_dtq8', ('en', ), proxies)
Esempio n. 7
0
 def download(self):
     """Download the video and its English transcript (if not cached), then load the subtitles.

     Side effects: writes the video to self.dl_path, pickles the transcript
     to self.sub_path, and sets self.title and self.subtitles.
     """
     print("Downloading...")
     try:
         self.title = YouTube(self.url).title
     except Exception:
         # Best-effort: the title is optional. Narrowed from a bare
         # `except:` so KeyboardInterrupt/SystemExit are not swallowed.
         pass
     if not Path.exists(self.vid_path):
         print("Downloading Video...")
         while True:
             try:
                 YouTube(self.url).streams.first().download(self.dl_path, "vid")
                 break
             except KeyError:
                 # pytube intermittently raises KeyError; retry until it works.
                 print("Download failed. Retry...")
     if not Path.exists(self.sub_path):
         print("Downloading Transcript...")
         sub = YouTubeTranscriptApi.get_transcripts([self.video_id], languages=['en'])
         # Context managers guarantee the handles are closed (the original
         # leaked both the write and read handles).
         with open(self.sub_path, "wb") as out_file:
             pickle.dump(sub, out_file)
     with open(self.sub_path, "rb") as in_file:
         self.subtitles = pickle.load(in_file)
Esempio n. 8
0
def caption_data(video_ids):
    """Fetch transcripts for *video_ids* and persist one Caption row per video."""
    transcript_data = YouTubeTranscriptApi.get_transcripts(
        video_ids=video_ids, continue_after_error=True)
    for vid, snippets in transcript_data[0].items():
        pieces = []
        shown = 0
        for snippet in snippets:
            # Echo the first two snippets of each video for debugging.
            if shown < 2:
                print(snippet['text'])
                shown += 1
            # Interleave a space after every snippet before joining.
            pieces.append(snippet['text'])
            pieces.append(' ')

        caption = Caption(body="".join(pieces), video_id=vid)
        db.session.add(caption)
        db.session.commit()

    return {"status": 'success'}
Esempio n. 9
0
def download_transcripts_of_playlist(playlist_id, transcripts_dir):
    """Retrieves transcripts of individual youtube videos from playlist
    and writes to json files.

    params:
        playlist_id: youtube videos playlist id
        transcripts_dir: path to write transcripts json
    """
    # Download only when the directory does not exist yet; an existing
    # directory is treated as "already downloaded".
    if not os.path.exists(transcripts_dir):
        os.makedirs(transcripts_dir, exist_ok=True)

        # get video ids from playlist id
        video_ids = get_video_ids(playlist_id)

        # get transcripts; continue_after_error collects failed ids in
        # unretrieved_videos instead of aborting on the first failure
        transcripts, unretrieved_videos = YouTubeTranscriptApi.get_transcripts(
            video_ids, languages=['en'], continue_after_error=True)
        # enumerate replaces the hand-rolled counter; start=1 keeps the
        # original 1-based file names
        for count, (key, transcript) in enumerate(transcripts.items(), 1):
            # the json file where the output must be stored
            file_path = os.path.join(transcripts_dir,
                                     str(count) + key + ".json")
            print(file_path)
            # the context manager closes the file even if json.dump raises
            with open(file_path, "w") as out_file:
                json.dump(transcript, out_file, indent=3)

        print("unretrieved_videos: ", unretrieved_videos)

    else:
        # skip downloading transcripts if already exists
        print(
            "playlist with id {0} transcripts already downloaded, skipping...".
            format(playlist_id))
Esempio n. 10
0
 def test_get_transcripts__stop_on_error(self, mock_get_transcript):
     """Without continue_after_error the first failure must propagate."""
     self.assertRaises(Exception, YouTubeTranscriptApi.get_transcripts,
                       ['video_id_1', 'video_id_2'])
Esempio n. 11
0
from youtube_transcript_api import YouTubeTranscriptApi

video_ids = ['5-yxXzLX2QY', '9UuFTwUxkLw']

# First element of the returned tuple is the id -> snippets dict.
out, _ = YouTubeTranscriptApi.get_transcripts(video_ids)

# Flatten each transcript into one space-separated string per video.
text_out = {
    video_id: ' '.join(snippet.get('text', '') for snippet in snippets)
    for video_id, snippets in out.items()
}

print(text_out)
from lxml.etree import tostring
from youtube_transcript_api import YouTubeTranscriptApi


# NOTE(review): the function below references `requests` and `html`
# (presumably lxml.html), but neither is imported in this chunk —
# confirm they are imported elsewhere, otherwise calling it raises
# NameError.
def get_video_captions(video_id):
    """Scrape the auto-generated captions for *video_id* from diycaptions.com.

    Returns str(list-of-text-nodes) — i.e. the Python repr of the XPath
    result list, not a cleanly joined caption string.
    """
    CAPTION_URL = 'https://www.diycaptions.com/php/get-automatic-captions-as-txt.php?id=' + video_id + '&language=asr'

    captionPage = requests.get(CAPTION_URL)
    captionTree = html.fromstring(captionPage.content)
    caption = captionTree.xpath('//div[@contenteditable="true"]/text()')

    return (str(caption))


if (__name__ == '__main__'):
    video_id = 'Cjim2F5Kk38'
    transcript_data = YouTubeTranscriptApi.get_transcripts(
        ['Cjim2F5Kk38', 'DtdRCCMvllo'])
    for vid in transcript_data[0]:
        text_list = []
        counter = 0
        for trans_dict in transcript_data[0][vid]:
            # Bug fix: `counter` was never incremented, so the guard was
            # always true and *every* snippet got printed instead of only
            # the first three.
            if counter < 3:
                print(trans_dict['text'])
                counter += 1
            text_list.append(trans_dict['text'])

        # NOTE: overwritten on each outer iteration; only the last
        # video's caption text survives the loop.
        caption_text = "".join(text_list)
    # print(caption_text)
Esempio n. 13
0
'load a df with video ids (which will be used for the youtube api to download the transcripts: and later on for extracting the labels                          '


# Read the video ids, one per line.
with open('IdList_selfWachtedYoutubeVids.txt',encoding="utf-8") as f:
    idList = f.readlines() #txt file with the ids:
# NOTE(review): readlines() keeps trailing newlines, so the ids passed
# to the API below end in '\n' — confirm the file is pre-stripped or
# that the API tolerates this.

#alternatively:
#dic = dict_oldDf # a dictionary where the keys correspond to the youtube ids

#======================================================================== #
' downloading the transcripts by their ids                           '
#======================================================================== #
from youtube_transcript_api import YouTubeTranscriptApi
import time # just to record how long it takes to download the transcripts
STARTTIME = time.time() #plus counting the time
Transcripts_w_timestamps =YouTubeTranscriptApi.get_transcripts(video_ids=idList,continue_after_error=True)

# Keep only the dict of successful transcripts (element 0); element 1
# is the list of ids that could not be retrieved.
Transcripts_w_timestamps = Transcripts_w_timestamps[0]

print('time it took:', time.time() - STARTTIME)

print( 'len trans', len(Transcripts_w_timestamps)) # see how many could be downloaded

# =============================================================================
# transcripts that were unable to be extracted:
# =============================================================================
# NOTE(review): set_originalId and set_downloadedtransIds are not
# defined anywhere in this chunk — they must come from elsewhere in the
# file, otherwise this line raises NameError.
ids_thatcouldnotbedownloaded = list( set_originalId - set_downloadedtransIds )
print( 'len downloaded trans:',ids_thatcouldnotbedownloaded)

# =============================================================================
# # creating a dict with transcripts, ψ Writing to string files to (re)create the transcripts
#This is to overcome redundant information.
# Rank videos by engagement so that duplicates resolve to the most
# popular entry after the reset below.
df1.sort_values(['ViewCount', 'likeCount', 'CommentCount'],
                ascending=[False, False, False],
                inplace=True)
df1.reset_index(drop=True, inplace=True)
df_red = df1

#Step 6:

# "emv" = weighted engagement score per video.
# NOTE(review): the weights 0.14 / 8.20 / 0.72 are unexplained magic
# numbers — document their source.
df_red.loc[:, "emv_video"] = df_red.apply(
    lambda row: int(row.ViewCount) * 0.14 + int(row.CommentCount) * 8.20 + int(
        row.likeCount) * 0.72,
    axis=1)

videoid = list(df_red['VideoId'])
# get_transcripts returns (successes_dict, failed_id_list).
x = YouTubeTranscriptApi.get_transcripts(videoid, continue_after_error=True)
vids_with_sub = x[0]
vids_without_sub = x[1]
df_trans = pd.DataFrame(list(vids_with_sub.keys()), columns=['VideoId'])

# In[117]:

result2 = []
for i in range(0, len(vids_with_sub)):
    print(i)
    result1 = []
    list_con = list(vids_with_sub.values())[i]
    for j in list_con:
        text_proc = j['text']
        # Keep snippets containing no ASCII letters unchanged.
        # NOTE(review): this chunk appears truncated — result1/result2
        # are never appended to within the visible lines, and the else
        # branch of this condition is missing.
        if (re.findall('[a-zA-Z]', text_proc) == []):
            text_proc_fin = text_proc
Esempio n. 15
0
 def test_get_transcripts__with_cookies(self, mock_get_transcript):
     """The cookies path must be forwarded to the underlying get_transcript call."""
     cookie_path = '/example_cookies.txt'
     YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookie_path)
     mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en', ), None,
                                         cookie_path)
Esempio n. 16
0
from youtube_transcript_api import YouTubeTranscriptApi
import json

# Fetch and show the default transcript for one video.
print(YouTubeTranscriptApi.get_transcript("P_6vDLq64gE"))
transcript_list = YouTubeTranscriptApi.list_transcripts("P_6vDLq64gE")
print(transcript_list)
# Dump the Ukrainian transcript (the whole (successes, failures) tuple)
# to a JSON file, keeping non-ASCII characters readable.
with open('examples/file1.json', 'w', encoding='utf-8') as f:
    json.dump(YouTubeTranscriptApi.get_transcripts(["iCvmsMzlF7o"],
                                                   languages=['uk']),
              f,
              ensure_ascii=False)

# Pick a transcript by language preference order.
transcript = transcript_list.find_transcript(['de', 'en'])
print(transcript)
# Restrict the lookup to manually created (non-auto-generated) transcripts.
transcript = transcript_list.find_manually_created_transcript(['uk'])
print(transcript.is_generated)

print(
    transcript.video_id,
    transcript.language,
    transcript.language_code,
    # whether it has been manually created or generated by YouTube
    transcript.is_generated,
    # whether this transcript can be translated or not
    transcript.is_translatable,
    # a list of languages the transcript can be translated to
    transcript.translation_languages,
)
transcript.fetch()
Esempio n. 17
0
    def test_get_transcripts__stop_on_error(self):
        """Without continue_after_error, the stubbed failure must propagate."""
        YouTubeTranscriptApi.get_transcript = MagicMock(
            side_effect=Exception('Error'))

        self.assertRaises(Exception, YouTubeTranscriptApi.get_transcripts,
                          ['video_id_1', 'video_id_2'])
Esempio n. 18
0
 def test_get_transcripts__with_proxies(self, mock_get_transcript):
     """The proxies dict must be forwarded to the underlying get_transcript call."""
     proxy_config = {'http': '', 'https:': ''}
     YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxy_config)
     mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en', ), proxy_config,
                                         None)
Esempio n. 19
0
    }

    # Keep only the videos whose metadata says captions are available.
    video_detail_df = pd.read_csv(video_details_file, index_col=0)
    video_ids = list(
        video_detail_df[video_detail_df['caption'] == True]['video'])

    ### extract the video ids appear as the file name in a folder
    video_set_with_transcripts = get_all_video_ids_with_transcripts(
        video_caption_pickle_folder)
    unfound_video = set(
        pd.read_csv(os.path.join(CUR_FILE_DIR, '../Data', 'unfound.csv'),
                    header=None)[0])

    # Drop ids already downloaded or known to be unavailable.
    video_ids = list(
        set(video_ids) - video_set_with_transcripts - unfound_video)

    # video_ids = ['-TIkkGSHWeM']
    print(len(video_ids))
    # Fetch in batches of 50, persisting after each batch so a failure
    # does not lose earlier results.
    for i in range(0, len(video_ids), 50):
        print('start to extract videos {0!r}'.format(str(i)))
        video_ids_sub = video_ids[i:i + 50]
        print(video_ids_sub)
        transcripts = YouTubeTranscriptApi.get_transcripts(
            video_ids_sub,
            languages=['en'],
            continue_after_error=True,
            # `proxies` presumably comes from the dict closed at the top
            # of this chunk — its definition is outside the visible lines.
            proxies=proxies)
        persist_transcript(transcripts)
        print(str(i))
        # time.sleep(600)