def main():
    """Command-line entry point for ytgrep.

    Parses the search pattern, the video URL(s) and the ``-e`` / ``-v`` /
    ``-links`` flags, passes them to ``Download`` as a plain dict and prints
    whatever ``get_captions()`` returns.

    Exit status:
        1 -- ``get_captions()`` returned an empty result ("No matches found.").
        2 -- ``get_captions()`` raised; the error is reported on stdout.
    On success the function returns normally.
    """
    parser = argparse.ArgumentParser(description='ytgrep')
    parser.add_argument(
        '-e',
        action="store_true",
        help='Interpret PATTERN as an extended regular expression')
    parser.add_argument(
        '-v',
        action="store_true",
        help='Print debug information while searching')
    parser.add_argument(
        '-links',
        action="store_true",
        help='include shortcut links to video at matched time i.e. ?t=<time>')
    parser.add_argument('pattern', type=str, help='term to search for')
    parser.add_argument('urls', nargs='+', help='video URL(s)')
    args = parser.parse_args()

    download = Download(vars(args))

    # Only the retrieval itself is guarded, so that unrelated failures are not
    # misreported as "Unable to retrieve captions".
    try:
        captions = download.get_captions()
    except Exception as err:
        print("Unable to retrieve captions, {}".format(err))
        # Previously this path fell through with exit status 0, which made a
        # failed run indistinguishable from a successful one for callers.
        sys.exit(2)

    if not captions:
        print("No matches found.")
        sys.exit(1)
    print(captions)
def main(args):
    """Download the captions for ``args.videoUrl`` and write them to a text file.

    Reads three attributes from ``args``:
        videoUrl  -- URL of the video; when it is None nothing is done.
        outFile   -- output path; when None, ``youtube-<video_id>.txt`` is used.
        lineLimit -- number of words written before a newline is inserted.

    Words are separated by a single space (each word is followed by one).
    If the captions cannot be downloaded, an error is printed and the
    process exits via ``exit()``.
    """
    # Guard clause: without a URL there is no video id and no captions, so the
    # code below would otherwise touch unbound names.
    if args.videoUrl is None:
        return

    video_id = getVideoId(args.videoUrl)
    print(bcolors.OKGREEN
          + "Scraping Captions from url with id: {}...".format(video_id)
          + bcolors.ENDC)

    captions = Download().get_captions(video_id)
    if captions is None:
        print(bcolors.FAIL
              + "\tError Occured: Unable to download captions"
              + bcolors.ENDC)
        exit()
    print(bcolors.OKBLUE + "Done!" + bcolors.ENDC)

    if args.outFile is not None:
        out_path = args.outFile
    else:
        out_path = "youtube-{}.txt".format(video_id)

    # The context manager guarantees the file is closed even if a write fails.
    with open(out_path, "w") as out_file:
        word_num = 0
        for word in re.split(r'\s+', captions):
            # Start a new line once lineLimit words have been written.
            if word_num == args.lineLimit:
                out_file.write('\n')
                word_num = 0
            out_file.write("{} ".format(word))
            word_num += 1
def results():
    """Render the results page for the video link given in the request.

    Steps, in order:
      1. Download the first mp4 stream of the video into SAVE_PATH.
      2. Fetch the captions for the video id (the text after the first "="
         in the link) and compute their TextBlob sentiment polarity.
      3. Send the downloaded file to the general prediction model, sampling
         every 10000 ms, and sum each concept's value over all frames.
      4. Render ``results.html`` with the cleaned captions, the polarity
         score and the names of the five highest-scoring concepts.

    On an ``ApiError`` from the prediction call the error is printed and the
    process exits with status 1.
    """
    link = request.args['inputname']
    SAVE_PATH = "C:/Users/Stanley/Desktop/SBHacks-2019/videos"

    yt = YouTube(link)
    yt.streams.filter(file_extension='mp4').first().download(SAVE_PATH)

    # File name on disk: the title with "|" and ":" stripped.
    title = (yt.title + ".mp4").replace("|", "").replace(":", "")

    video_id = link.split("=")[1]
    captions = Download().get_captions(video_id)

    # Sentiment is computed on the raw captions, before any clean-up below.
    sentiment = TextBlob(captions)
    captions = captions.replace("[Music]", "").replace(" ", ". ")
    score = str(sentiment.sentiment.polarity)

    model = clarifail.public_models.general_model
    try:
        response = model.predict_by_filename(
            SAVE_PATH + "/" + title, is_video=True, sample_ms=10000)
    except ApiError as e:
        print('Error status code: %d' % e.error_code)
        print('Error description: %s' % e.error_desc)
        if e.error_details:
            print('Error details: %s' % e.error_details)
        exit(1)

    # Accumulate each concept's value across every sampled frame.
    total_concepts = {}
    for frame in response['outputs'][0]['data']['frames']:
        print('Concepts in frame at time: %d ms' % frame['frame_info']['time'])
        for concept in frame['data']['concepts']:
            name = concept['name']
            value = concept['value']
            print(' %s %f' % (name, value))
            if name in total_concepts:
                total_concepts[name] += value
            else:
                total_concepts[name] = value

    final = sorted(
        total_concepts.items(), key=operator.itemgetter(1), reverse=True)
    for rank in range(5):
        print(final[rank])

    # content3..content7 carry the names of the top five concepts.
    top_five = {
        'content%d' % (rank + 3): str(final[rank][0]) for rank in range(5)
    }
    return render_template(
        'results.html', content1=captions, content2=score, **top_five)
def test_video_does_not_exist(self):
    # get_captions() is expected to raise DownloadException for the
    # URL '12323123123'.
    options = dict(
        urls=['12323123123'],
        pattern='elephants',
        e=False,
        v=False,
    )
    self.download = Download(options)
    self.assertRaises(DownloadException, self.download.get_captions)
def test_result_is_zero(self):
    # With YoutubeDL's context manager patched to yield a mock whose
    # download() returns 0, get_result() should return 0 and call
    # download() with the watch URL built from the video id.
    video_id = 'we342oij234'
    expected_urls = ['http://www.youtube.com/watch?v=we342oij234']

    fake_ydl = Mock()
    fake_ydl.download = Mock(return_value=0)

    with patch('youtube_dl.YoutubeDL.__enter__', return_value=fake_ydl):
        result = Download().get_result(video_id)
        self.assertEqual(0, result)
        fake_ydl.download.assert_called_with(expected_urls)
def test_video_no_captions(self):
    # get_captions() is expected to raise NoCaptionsException for this URL.
    options = dict(
        urls=['https://www.youtube.com/watch?v=jyoTZ69mWZE'],
        pattern='elephants',
        e=False,
        v=False,
    )
    self.download = Download(options)
    self.assertRaises(NoCaptionsException, self.download.get_captions)
def test_video(self):
    # Searching for 'elephants' should return the one matching caption line,
    # with the matched word wrapped by red().
    options = dict(
        urls=['https://www.youtube.com/watch?v=jNQXAC9IVRw'],
        pattern='elephants',
        e=False,
        v=False,
    )
    self.download = Download(options)

    highlighted = red('elephants')
    expected = (
        '[00:00:01.300 --> 00:00:04.400] all right, so here we are in front of the '
        + highlighted
        + ','
    )
    actual = self.download.get_captions()
    self.assertEqual(expected, actual)
def test_caption(self):
    # Table-driven check of get_captions() against the FIXTURE_WEBVTT data.
    # File access and Storage.remove_file are patched out, and get_result is
    # replaced with a mock returning 0.
    url = 'https://www.swag.com/'
    stamp = '[00:00:33.666 --> 00:00:38.138] '
    beam = ' would i see if i rode on a beam of '
    light = red('light')
    regex_line = (
        stamp + red('actor') + ' as einstein: what ' + light + beam + light + '?'
    )

    # Each case: (name, pattern, regex, links, expected)
    cases = [
        ('1 video, caption found', 'vision', False, False,
         '[00:00:17.350 --> 00:00:18.752] we have this '
         + red('vision') + ' of einstein'),
        ('1 video, caption not found', 'iwontbefound', False, False, ''),
        ('1 video, caption found more than once', 'light', False, False,
         stamp + 'actor as einstein: what ' + light + beam + light + '?'),
        ('1 video, regular expression', 'actor|light', True, False,
         regex_line),
        ('1 video, 1 link', 'actor|light', True, True,
         regex_line + ' (https://www.swag.com/&t=33s)'),
    ]

    for _name, pattern, regex, links, expected in cases:
        download = Download({
            'urls': [url],
            'pattern': pattern,
            'e': regex,
            'v': False,
            'links': links,
        })
        opener = mock_open(read_data=FIXTURE_WEBVTT)
        with patch('ytcc.download.open', opener, create=True), \
                patch('ytcc.storage.Storage.remove_file', Mock()):
            download.get_result = Mock(return_value=0)
            actual = download.get_captions()
            self.assertEqual(actual, expected)
class TestRealVideo(unittest.TestCase):
    """Checks Download.get_captions() against concrete video ids."""

    def setUp(self):
        # Every test gets its own Download instance.
        self.download = Download()

    def test_video(self):
        # The captions for this id should begin with the known excerpt.
        video_id = 'jNQXAC9IVRw'
        excerpt = 'All right, so here we are in front of the elephants, the cool thing'
        captions = self.download.get_captions(video_id)
        prefix = captions[:len(excerpt)]
        self.assertEqual(excerpt, prefix)

    def test_failed(self):
        # This id is expected to make get_captions() raise DownloadException.
        video_id = '12323123123'
        self.assertRaises(
            DownloadException, self.download.get_captions, video_id)
def main():
    """Download Italian captions for a hard-coded id and save one file per video.

    ``Download`` is created in playlist mode when the id is longer than 12
    characters. ``get_captions`` is expected to return a mapping of video id
    to caption text; each entry is written to ``../Outputs/<video_id>.txt``,
    and the output directory is created if it does not exist.
    """
    # Renamed from ``id`` so the builtin of the same name is not shadowed.
    playlist_id = 'PL4cUxeGkcC9jticTs2l6Nt2lsybNW0-4O'
    download = Download(playlist=len(playlist_id) > 12)
    captions_files = download.get_captions(playlist_id, 'it')

    Path("../Outputs").mkdir(parents=True, exist_ok=True)
    for video_id in captions_files:
        # The context manager closes each file even if the write fails.
        with open("../Outputs/" + video_id + ".txt", "w") as text_file:
            text_file.write(captions_files[video_id])
def test_video_multiline_match_with_regexp(self):
    # With the 'e' (regex) option set, both caption lines containing 'cool'
    # should be returned, separated by a newline, with each match wrapped
    # by red().
    options = dict(
        urls=['https://www.youtube.com/watch?v=jNQXAC9IVRw'],
        pattern='cool',
        e=True,
        v=False,
    )
    self.download = Download(options)

    first_line = (
        '[00:00:04.400 --> 00:00:09.166] the '
        + red('cool')
        + ' thing about these guys is that they have really,'
    )
    second_line = (
        '[00:00:12.700 --> 00:00:17.000] and thats, thats '
        + red('cool')
        + '."'
    )
    expected = first_line + '\n' + second_line
    self.assertEqual(expected, self.download.get_captions())
class TestDownloadGetUrlFromVideoId(unittest.TestCase):
    """Checks the watch URL that Download builds from a video id."""

    URL_PREFIX = 'http://www.youtube.com/watch?v='

    def setUp(self):
        self.download = Download()

    def test_set(self):
        # A plain id is appended to the watch URL unchanged.
        video_id = 'vDOIDJdds'
        expected = self.URL_PREFIX + video_id
        actual = self.download.get_url_from_video_id(video_id)
        self.assertEqual(expected, actual)

    def test_encoding(self):
        # ':' and '/' in the id are expected to be percent-encoded.
        video_id = 'vDDD://'
        expected = self.URL_PREFIX + 'vDDD%3A%2F%2F'
        actual = self.download.get_url_from_video_id(video_id)
        self.assertEqual(expected, actual)
class TestGetFilePathFromVideoId(unittest.TestCase):
    """Checks Download.get_captions_from_output() against the WebVTT fixture."""

    def setUp(self):
        self.download = Download()

    def test_valid(self):
        # Feeding the raw fixture through get_captions_from_output() should
        # yield the stripped fixture.
        stripped = self.download.get_captions_from_output(FIXTURE_WEBVTT)
        self.assertEqual(FIXTURE_WEBVTT_STRIPPED, stripped)
def test_video_one_missing_captions(self):
    # Two URLs are searched; the expected output contains only the first
    # URL followed by its matching caption line.
    options = dict(
        urls=[
            'https://www.youtube.com/watch?v=jNQXAC9IVRw',
            'https://www.youtube.com/watch?v=jyoTZ69mWZE',
        ],
        pattern='elephants',
        e=False,
        v=False,
    )
    self.download = Download(options)

    # NOTE(review): the separator between the URL and the timestamp is
    # reproduced exactly as it appears in this file - confirm it matches the
    # real output format.
    template = (
        "https://www.youtube.com/watch?v=jNQXAC9IVRw "
        "[00:00:01.300 --> 00:00:04.400] all right, so here we are in front of the elephants,"
    )
    expected = template.replace('elephants', red('elephants'))
    self.assertEqual(expected, self.download.get_captions())
class TestRemoveTimeFromCaption(unittest.TestCase):
    """Checks that Download.remove_time_from_caption() drops the timing line.

    Each fixture pairs a quoted caption (timing line plus text) with the text
    expected after the timing line and surrounding quotes are removed.
    """

    # Double-quoted caption with a two-line body.
    caption = '"00:00:49.860 --> 00:00:50.179\nand cook to taste like a hot\ndogs"'
    stripped_caption = "\nand cook to taste like a hot\ndogs"

    # Single-quoted caption with a three-line body.
    caption_two = "'00:00:14.848 --> 00:00:17.350\nMAN:\nWhen we think\nof E equals m c-squared,'"
    stripped_caption_two = "\nMAN:\nWhen we think\nof E equals m c-squared,"

    def setUp(self):
        self.download = Download()

    def test_removal(self):
        actual = self.download.remove_time_from_caption(self.caption)
        self.assertEqual(self.stripped_caption, actual)

    def test_greedy_newline(self):
        actual = self.download.remove_time_from_caption(self.caption_two)
        self.assertEqual(self.stripped_caption_two, actual)
class TestUpdateOpts(unittest.TestCase):
    """Checks Download.opts defaults and the behaviour of update_opts()."""

    # Reference values compared against Download's own defaults.
    opts = {'skip_download': True, 'writeautomaticsub': True}

    def setUp(self):
        self.download = Download()

    def test_defaults(self):
        # A fresh Download should carry the reference 'skip_download' value.
        expected = self.opts['skip_download']
        actual = self.download.opts['skip_download']
        self.assertEqual(expected, actual)

    def test_update_default(self):
        # update_opts() overrides an existing option.
        self.download.update_opts(dict(skip_download=False))
        self.assertFalse(self.download.opts['skip_download'])

    def test_add_new(self):
        # update_opts() adds a new option and leaves existing ones untouched.
        self.download.update_opts(dict(new_option='test'))
        current = self.download.opts
        self.assertEqual('test', current['new_option'])
        self.assertEqual(True, current['skip_download'])
class TestRealVideo(unittest.TestCase):
    """Checks Download.get_captions() against concrete video ids and languages."""

    def setUp(self):
        # Every test gets its own Download instance.
        self.download = Download()

    def test_video(self):
        # Default language: captions should begin with the known excerpt.
        video_id = 'jNQXAC9IVRw'
        excerpt = 'All right, so here we are in front of the elephants, the cool thing'
        captions = self.download.get_captions(video_id)
        self.assertEqual(excerpt, captions[:len(excerpt)])

    def test_video_french(self):
        # Language 'fr': captions should begin with the known French excerpt.
        video_id = 'VLAMC3NJsP4'
        excerpt = "J'ai fini Jouons à LoL (League of Legends). Oh mon Dieu,"
        captions = self.download.get_captions(video_id, 'fr')
        self.assertEqual(excerpt, captions[:len(excerpt)])

    def test_failed(self):
        # This id is expected to make get_captions() raise DownloadException.
        video_id = '12323123123'
        self.assertRaises(
            DownloadException, self.download.get_captions, video_id)
def test_caption_captions_do_not_exist(self):
    # When opening the captions file raises FileNotFoundError,
    # get_captions() is expected to raise NoCaptionsException.
    download = Download({
        'urls': ['https://www.swag.com/'],
        'pattern': 'my pattern',
        'e': False,
        'v': False,
        'links': False,
    })

    failing_open = mock_open(read_data=FIXTURE_WEBVTT)
    failing_open.side_effect = FileNotFoundError

    with patch('ytcc.download.open', failing_open, create=True), \
            patch('ytcc.storage.Storage.remove_file', Mock()):
        download.get_result = Mock(return_value=0)
        self.assertRaises(NoCaptionsException, download.get_captions)
class TestRealVideo(unittest.TestCase):
    """Checks pattern search through Download.get_captions() for concrete URLs."""

    def test_video(self):
        # A plain-text search returns the matching line with the match
        # wrapped by red().
        self.download = Download(dict(
            urls=['https://www.youtube.com/watch?v=jNQXAC9IVRw'],
            pattern='elephants',
            e=False,
            v=False,
        ))
        expected = (
            '[00:00:01.300 --> 00:00:04.400] all right, so here we are in front of the '
            + red('elephants')
            + ','
        )
        self.assertEqual(expected, self.download.get_captions())

    def test_video_multiline_match_with_regexp(self):
        # A regex search returns every matching line, newline-separated.
        self.download = Download(dict(
            urls=['https://www.youtube.com/watch?v=jNQXAC9IVRw'],
            pattern='cool',
            e=True,
            v=False,
        ))
        first_line = (
            '[00:00:04.400 --> 00:00:09.166] the '
            + red('cool')
            + ' thing about these guys is that they have really,'
        )
        second_line = (
            '[00:00:12.700 --> 00:00:17.000] and thats, thats '
            + red('cool')
            + '."'
        )
        expected = first_line + '\n' + second_line
        self.assertEqual(expected, self.download.get_captions())

    def test_video_does_not_exist(self):
        # get_captions() is expected to raise DownloadException for this URL.
        self.download = Download(dict(
            urls=['12323123123'],
            pattern='elephants',
            e=False,
            v=False,
        ))
        self.assertRaises(DownloadException, self.download.get_captions)

    def test_video_no_captions(self):
        # get_captions() is expected to raise NoCaptionsException for this URL.
        self.download = Download(dict(
            urls=['https://www.youtube.com/watch?v=jyoTZ69mWZE'],
            pattern='elephants',
            e=False,
            v=False,
        ))
        self.assertRaises(NoCaptionsException, self.download.get_captions)
def initialize_worker():
    """Create a fresh ``Download`` and publish it as the module-level ``download_global``."""
    global download_global
    session = Download()
    download_global = session
def download_script(self):
    """Fetch the English captions for this object's URL and save them to disk.

    The last 11 characters of ``self.url`` are passed to
    ``Download.get_captions`` as the video id (presumably the YouTube id at
    the end of the URL - confirm against callers). The captions are written
    to ``SCRIPT_PATH + self.name + '.txt'``.
    """
    video_id = self.url[-11:]
    script = Download().get_captions(video_id, 'en')

    # The file was previously never closed; the context manager guarantees
    # the captions are flushed and the handle released.
    with open(SCRIPT_PATH + self.name + '.txt', 'w') as script_file:
        script_file.write(script)
def get_download(download):
    """Return ``download`` unchanged, or a new ``Download`` when it is None.

    A message is printed whenever a new session has to be created.
    """
    if download is not None:
        return download

    print("new Download session")
    return Download()
from ytcc.download import Download

# Script: fetch the English captions of one video and save them to review.txt.
video_id = '_un8mHYFJp0'
download = Download()

# Language is optional and defaults to "en".
# YouTube uses "en", "fr" - not "en-US", "fr-FR".
captions = download.get_captions(video_id, 'en')

# The file was previously left open; the context manager makes sure the
# captions are flushed to disk and the handle is closed.
with open("review.txt", "w+") as review_file:
    review_file.write(captions)
# print(captions)
def setUp(self):
    # Give every test its own freshly constructed Download instance.
    fresh_download = Download()
    self.download = fresh_download
# Download the best audio stream of the video, convert it to mp3, fetch the
# English captions and store the metadata as a JSON record.
video = pafy.new(url_full)

# Strip characters listed in INVALID_CHARS from the title before it is used
# in file names.
video_title = video.title
for invalid_char in INVALID_CHARS:
    video_title = video_title.replace(invalid_char, '')
print(video_title)

timestamp = datetime.today().strftime('%y%m%d')
base_name = timestamp + '_' + video_title

# Download in the stream's native format, re-export as 128k mp3, then drop
# the original download.
best_audio_stream = video.getbestaudio()
audio_path = AUDIO_PATH + base_name + '.' + best_audio_stream.extension
best_audio_stream.download(audio_path)
sound = AudioSegment.from_file(audio_path)
mp3_path = AUDIO_PATH + base_name + '.mp3'
sound.export(mp3_path, format="mp3", bitrate="128k")
os.remove(audio_path)

download = Download()
caption = download.get_captions(url, 'en')

json_data = OrderedDict()
json_data['id'] = file_cnt
json_data['url'] = url
json_data['title'] = base_name
json_data['caption'] = caption

# file_cnt is incremented as a number below, so it must be converted before
# being concatenated into the path (str + int raises TypeError).
json_path = DATABASE_PATH + str(json_data['id']) + '.json'
with open(json_path, 'w') as json_file:
    json.dump(json_data, json_file)
file_cnt += 1