Exemple #1
0
def main():
    """Run the ytgrep command line: search video captions for a pattern.

    Command-line arguments are parsed with argparse and handed to
    ``Download`` as a plain dict. Matching captions are printed to stdout.

    Exit status:
        0 -- captions were found and printed.
        1 -- the search succeeded but produced no matches.
        2 -- the captions could not be retrieved (mirrors grep's convention
             of a distinct status for errors, so scripts can tell "no match"
             apart from "failed").
    """
    parser = argparse.ArgumentParser(description='ytgrep')
    parser.add_argument(
        '-e',
        action="store_true",
        help='Interpret PATTERN as an extended regular expression')
    parser.add_argument('-v',
                        action="store_true",
                        help='Print debug information while searching')
    parser.add_argument(
        '-links',
        action="store_true",
        help='include shortcut links to video at matched time i.e. ?t=<time>')
    parser.add_argument('pattern', type=str, help='term to search for')
    parser.add_argument('urls', nargs='+', help='video URL(s)')

    args = parser.parse_args()
    download = Download(vars(args))

    # Keep the try body to the one call that is expected to fail so that
    # unrelated errors are not reported as a retrieval problem.
    try:
        captions = download.get_captions()
    except Exception as err:
        print("Unable to retrieve captions, {}".format(err), file=sys.stderr)
        # Previously the process fell through and exited with status 0.
        sys.exit(2)

    if not captions:
        print("No matches found.")
        sys.exit(1)
    print(captions)
Exemple #2
0
def main(args):
    """Download a video's captions and write them to a word-wrapped text file.

    Args:
        args: Parsed options exposing ``videoUrl`` (URL to scrape, or None),
            ``outFile`` (output path, or None to use
            ``youtube-<video_id>.txt``) and ``lineLimit`` (number of words
            written before a line break is inserted).

    Returns:
        None. When ``args.videoUrl`` is None there is nothing to scrape and
        the function returns without writing anything.

    Raises:
        SystemExit: with status 1 when the captions cannot be downloaded.
    """
    if args.videoUrl is None:
        # Without a URL neither `video_id` nor `captions` exists; the old
        # code fell through and crashed with UnboundLocalError.
        return

    video_id = getVideoId(args.videoUrl)
    print(bcolors.OKGREEN +
          "Scraping Captions from url with id: {}...".format(video_id) +
          bcolors.ENDC)
    captions = Download().get_captions(video_id)
    if captions is None:
        print(bcolors.FAIL +
              "\tError Occured: Unable to download captions" +
              bcolors.ENDC)
        # Signal the failure to the caller's shell with a non-zero status.
        raise SystemExit(1)
    print(bcolors.OKBLUE + "Done!" + bcolors.ENDC)

    if args.outFile is not None:
        out_path = args.outFile
    else:
        out_path = "youtube-{}.txt".format(video_id)

    # The context manager closes the file even if a write fails.
    with open(out_path, "w") as out_file:
        word_num = 0
        for word in re.split(r'\s+', captions):
            if word_num == args.lineLimit:
                out_file.write('\n')
                word_num = 0
            out_file.write("{} ".format(word))
            word_num += 1
Exemple #3
0
def results():
    """Analyse the video named by the ``inputname`` query argument.

    The video is downloaded as mp4, its captions are fetched and scored for
    sentiment polarity, and the downloaded file is sent to the general
    prediction model. Concept scores are summed over all sampled frames and
    the five highest-scoring concept names are rendered, together with the
    cleaned captions and the polarity score, into ``results.html``.

    Returns:
        The rendered ``results.html`` template.
    """
    link = request.args['inputname']
    SAVE_PATH = "C:/Users/Stanley/Desktop/SBHacks-2019/videos"
    yt = YouTube(link)
    stream = yt.streams.filter(file_extension='mp4').first()
    stream.download(SAVE_PATH)
    # NOTE(review): presumably mirrors the characters the downloader strips
    # from the file name -- confirm against the download library.
    title = (yt.title + ".mp4").replace("|", "").replace(":", "")

    if "v=" in link:
        # Take only the value of the `v` parameter so that trailing
        # parameters such as `&t=10` do not end up in the id.
        video_id = link.split("v=", 1)[1].split("&", 1)[0]
    else:
        video_id = link.split("=")[1]
    download = Download()
    captions = download.get_captions(video_id)
    # Sentiment is computed on the captions as downloaded, before cleanup.
    sentiment = TextBlob(captions)
    captions = captions.replace("[Music]", "")
    captions = captions.replace("  ", ". ")
    score = str(sentiment.sentiment.polarity)

    m = clarifail.public_models.general_model
    try:
        response = m.predict_by_filename(
            SAVE_PATH + "/" + title,
            is_video=True,
            sample_ms=10000)
    except ApiError as e:
        print('Error status code: %d' % e.error_code)
        print('Error description: %s' % e.error_desc)
        if e.error_details:
            print('Error details: %s' % e.error_details)
        # NOTE(review): exiting inside a request handler takes down the
        # worker; kept for compatibility, consider returning an error page.
        exit(1)

    total_concepts = {}
    frames = response['outputs'][0]['data']['frames']
    for frame in frames:
        print('Concepts in frame at time: %d ms' % frame['frame_info']['time'])
        for concept in frame['data']['concepts']:
            print(' %s %f' % (concept['name'], concept['value']))
            total_concepts[concept['name']] = (
                total_concepts.get(concept['name'], 0) + concept['value'])

    final = sorted(total_concepts.items(),
                   key=operator.itemgetter(1),
                   reverse=True)

    for entry in final[:5]:
        print(entry)

    # Pad to five names so a video with fewer distinct concepts renders
    # blanks instead of raising IndexError.
    top_names = [str(name) for name, _ in final[:5]]
    top_names += [""] * (5 - len(top_names))

    return render_template('results.html',
                           content1=captions,
                           content2=score,
                           content3=top_names[0],
                           content4=top_names[1],
                           content5=top_names[2],
                           content6=top_names[3],
                           content7=top_names[4])
Exemple #4
0
 def test_video_does_not_exist(self):
     """get_captions raises DownloadException for a URL that cannot be fetched."""
     options = dict(urls=['12323123123'], pattern='elephants', e=False, v=False)
     self.download = Download(options)
     self.assertRaises(DownloadException, self.download.get_captions)
    def test_result_is_zero(self):
        """get_result passes back the patched downloader's 0 and asks for the watch URL."""
        fake_ydl = Mock()
        fake_ydl.download = Mock(return_value=0)
        enter_target = 'youtube_dl.YoutubeDL.__enter__'

        with patch(enter_target, return_value=fake_ydl):
            outcome = Download().get_result('we342oij234')
            self.assertEqual(0, outcome)
            # The downloader must have been given exactly one watch URL.
            fake_ydl.download.assert_called_with(
                ['http://www.youtube.com/watch?v=we342oij234'])
Exemple #6
0
 def test_video_no_captions(self):
     """get_captions raises NoCaptionsException for this caption-less video URL."""
     options = dict(
         urls=['https://www.youtube.com/watch?v=jyoTZ69mWZE'],
         pattern='elephants',
         e=False,
         v=False,
     )
     self.download = Download(options)
     self.assertRaises(NoCaptionsException, self.download.get_captions)
Exemple #7
0
 def test_video(self):
     """A plain-text search returns the matching caption line with the hit highlighted."""
     options = dict(
         urls=['https://www.youtube.com/watch?v=jNQXAC9IVRw'],
         pattern='elephants',
         e=False,
         v=False,
     )
     self.download = Download(options)
     line_start = '[00:00:01.300 --> 00:00:04.400] all right, so here we are in front of the '
     expected = line_start + red('elephants') + ','
     actual = self.download.get_captions()
     self.assertEqual(expected, actual)
Exemple #8
0
    def test_caption(self):
        """Table-driven check of get_captions against the WebVTT fixture.

        Each case supplies the search options and the exact expected output.
        File access and the download step are mocked, so only the matching
        and formatting logic runs. Every case runs inside ``subTest`` so a
        failure is reported under the case's name and does not prevent the
        remaining cases from running.
        """
        tests = [{'name': '1 video, caption found',
                  'urls': ['https://www.swag.com/'],
                  'pattern': 'vision',
                  'regex': False,
                  'links': False,
                  'expected': '[00:00:17.350 --> 00:00:18.752] we have this ' + red('vision') + ' of einstein'},
                 {'name': '1 video, caption not found',
                  'urls': ['https://www.swag.com/'],
                  'pattern': 'iwontbefound',
                  'regex': False,
                  'links': False,
                  'expected': '',
                  },
                 {'name': '1 video, caption found more than once',
                  'urls': ['https://www.swag.com/'],
                  'pattern': 'light',
                  'regex': False,
                  'links': False,
                  'expected': '[00:00:33.666 --> 00:00:38.138] actor as einstein: what ' + red('light') + ' would i see if i rode on a beam of ' + red('light') + '?',
                  },
                 {'name': '1 video, regular expression',
                  'urls': ['https://www.swag.com/'],
                  'pattern': 'actor|light',
                  'regex': True,
                  'links': False,
                  'expected': '[00:00:33.666 --> 00:00:38.138] ' + red('actor') + ' as einstein: what ' + red('light') + ' would i see if i rode on a beam of ' + red('light') + '?',
                  },
                 {'name': '1 video, 1 link',
                  'urls': ['https://www.swag.com/'],
                  'pattern': 'actor|light',
                  'regex': True,
                  'links': True,
                  'expected': '[00:00:33.666 --> 00:00:38.138] ' + red('actor') + ' as einstein: what ' + red('light') + ' would i see if i rode on a beam of ' + red('light') + '? (https://www.swag.com/&t=33s)',
                  },
                 ]
        for test in tests:
            # The case name was previously unused; it now labels the result.
            with self.subTest(name=test['name']):
                download = Download({'urls': test['urls'],
                                     'pattern': test['pattern'],
                                     'e': test['regex'],
                                     'v': False,
                                     'links': test['links']})
                m = mock_open(read_data=FIXTURE_WEBVTT)

                with patch('ytcc.download.open', m, create=True):
                    with patch('ytcc.storage.Storage.remove_file', Mock()):
                        # Pretend the download step succeeded.
                        download.get_result = Mock(return_value=0)
                        actual = download.get_captions()
                        expected = test['expected']
                        self.assertEqual(actual, expected)
class TestRealVideo(unittest.TestCase):
    """Exercise Download.get_captions with fixed video ids."""

    def setUp(self):
        """Give each test its own Download instance."""
        self.download = Download()

    def test_video(self):
        """The returned captions begin with the expected excerpt."""
        excerpt = 'All right, so here we are in front of the elephants, the cool thing'
        captions = self.download.get_captions('jNQXAC9IVRw')
        self.assertEqual(excerpt, captions[:len(excerpt)])

    def test_failed(self):
        """An id that cannot be downloaded raises DownloadException."""
        self.assertRaises(DownloadException,
                          self.download.get_captions,
                          '12323123123')
Exemple #10
0
def main():
    """Fetch captions in language 'it' for a hard-coded id and save them.

    One text file per video id is written under ``../Outputs`` (created if
    missing), named ``<video_id>.txt``.
    """
    # Renamed from `id` so the builtin of the same name is not shadowed.
    playlist_id = 'PL4cUxeGkcC9jticTs2l6Nt2lsybNW0-4O'

    # Ids longer than 12 characters are treated as playlists.
    download = Download(playlist=len(playlist_id) > 12)

    captions_files = download.get_captions(playlist_id, 'it')

    Path("../Outputs").mkdir(parents=True, exist_ok=True)

    for video_id in captions_files:
        # The context manager closes each file even if the write fails.
        with open("../Outputs/" + video_id + ".txt", "w") as text_file:
            text_file.write(captions_files[video_id])
Exemple #11
0
 def test_video_multiline_match_with_regexp(self):
     """A regex search returns every matching caption line, one per line, with hits highlighted."""
     options = dict(
         urls=['https://www.youtube.com/watch?v=jNQXAC9IVRw'],
         pattern='cool',
         e=True,
         v=False,
     )
     self.download = Download(options)
     opening = '[00:00:04.400 --> 00:00:09.166] the '
     between = ' thing about these guys is that they have really,\n[00:00:12.700 --> 00:00:17.000] and thats, thats '
     expected = opening + red('cool') + between + red('cool') + '."'
     actual = self.download.get_captions()
     self.assertEqual(expected, actual)
class TestDownloadGetUrlFromVideoId(unittest.TestCase):
    """Check the watch URL that Download.get_url_from_video_id builds from an id."""

    def setUp(self):
        """Give each test its own Download instance."""
        self.download = Download()

    def test_set(self):
        """A plain id is appended to the watch URL unchanged."""
        video_id = 'vDOIDJdds'
        expected = 'http://www.youtube.com/watch?v={0}'.format(video_id)
        actual = self.download.get_url_from_video_id(video_id)
        self.assertEqual(expected, actual)

    def test_encoding(self):
        """Reserved characters in the id are percent-encoded in the URL."""
        expected = 'http://www.youtube.com/watch?v={0}'.format('vDDD%3A%2F%2F')
        actual = self.download.get_url_from_video_id('vDDD://')
        self.assertEqual(expected, actual)
class TestGetFilePathFromVideoId(unittest.TestCase):
    """Check Download.get_captions_from_output against the WebVTT fixtures."""

    def setUp(self):
        """Give each test its own Download instance."""
        self.download = Download()

    def test_valid(self):
        """The raw WebVTT fixture is reduced to the stripped fixture text."""
        stripped = self.download.get_captions_from_output(FIXTURE_WEBVTT)
        self.assertEqual(FIXTURE_WEBVTT_STRIPPED, stripped)
Exemple #14
0
    def test_video_one_missing_captions(self):
        """With two URLs where only the first yields a match, the output is that URL plus its hit."""
        video_urls = [
            'https://www.youtube.com/watch?v=jNQXAC9IVRw',
            'https://www.youtube.com/watch?v=jyoTZ69mWZE',
        ]
        self.download = Download(
            dict(urls=video_urls, pattern='elephants', e=False, v=False))
        plain_lines = (
            'https://www.youtube.com/watch?v=jNQXAC9IVRw',
            '[00:00:01.300 --> 00:00:04.400] all right, so here we are in front of the elephants,',
        )
        # Highlight the match the same way the search output does.
        expected = '\n'.join(plain_lines).replace('elephants', red('elephants'))
        actual = self.download.get_captions()
        self.assertEqual(expected, actual)
Exemple #15
0
class TestRemoveTimeFromCaption(unittest.TestCase):
    """Check that Download.remove_time_from_caption drops the timing line."""

    # Quoted caption cues and the text expected once the timing is removed.
    caption = '"00:00:49.860 --> 00:00:50.179\nand cook to taste like a hot\ndogs"'
    stripped_caption = "\nand cook to taste like a hot\ndogs"

    caption_two = "'00:00:14.848 --> 00:00:17.350\nMAN:\nWhen we think\nof E equals m c-squared,'"
    stripped_caption_two = "\nMAN:\nWhen we think\nof E equals m c-squared,"

    def setUp(self):
        """Give each test its own Download instance."""
        self.download = Download()

    def test_removal(self):
        """The timing and surrounding double quotes are removed."""
        actual = self.download.remove_time_from_caption(self.caption)
        self.assertEqual(self.stripped_caption, actual)

    def test_greedy_newline(self):
        """Every newline after the timing line is kept in a multi-line cue."""
        actual = self.download.remove_time_from_caption(self.caption_two)
        self.assertEqual(self.stripped_caption_two, actual)
Exemple #16
0
class TestUpdateOpts(unittest.TestCase):
    """Check the default options of Download and how update_opts changes them."""

    # Option values the defaults are compared against.
    opts = {'skip_download': True, 'writeautomaticsub': True}

    def setUp(self):
        """Give each test its own Download instance."""
        self.download = Download()

    def test_defaults(self):
        """A fresh Download has the expected skip_download value."""
        current = self.download.opts['skip_download']
        self.assertEqual(self.opts['skip_download'], current)

    def test_update_default(self):
        """update_opts overrides an option that already exists."""
        self.download.update_opts({'skip_download': False})
        self.assertFalse(self.download.opts['skip_download'])

    def test_add_new(self):
        """update_opts adds a new option and leaves the others alone."""
        self.download.update_opts({'new_option': 'test'})
        merged = self.download.opts
        self.assertEqual('test', merged['new_option'])
        self.assertEqual(True, merged['skip_download'])
class TestRealVideo(unittest.TestCase):
    """Exercise Download.get_captions with fixed video ids and languages."""

    def setUp(self):
        """Give each test its own Download instance."""
        self.download = Download()

    def test_video(self):
        """With no language given, the captions begin with the expected excerpt."""
        excerpt = 'All right, so here we are in front of the elephants, the cool thing'
        captions = self.download.get_captions('jNQXAC9IVRw')
        self.assertEqual(excerpt, captions[:len(excerpt)])

    def test_video_french(self):
        """Requesting 'fr' returns captions that begin with the French excerpt."""
        excerpt = 'J\'ai fini Jouons à LoL (League of Legends). Oh mon Dieu,'
        captions = self.download.get_captions('VLAMC3NJsP4', 'fr')
        self.assertEqual(excerpt, captions[:len(excerpt)])

    def test_failed(self):
        """An id that cannot be downloaded raises DownloadException."""
        self.assertRaises(DownloadException,
                          self.download.get_captions,
                          '12323123123')
Exemple #18
0
    def test_caption_captions_do_not_exist(self):
        """get_captions raises NoCaptionsException when opening the caption file fails."""
        options = {
            'urls': ['https://www.swag.com/'],
            'pattern': 'my pattern',
            'e': False,
            'v': False,
            'links': False,
        }
        download = Download(options)

        # Every attempt to open a file raises FileNotFoundError.
        failing_open = mock_open(read_data=FIXTURE_WEBVTT)
        failing_open.side_effect = FileNotFoundError

        with patch('ytcc.download.open', failing_open, create=True), \
                patch('ytcc.storage.Storage.remove_file', Mock()):
            # Pretend the download step itself succeeded.
            download.get_result = Mock(return_value=0)
            self.assertRaises(NoCaptionsException, download.get_captions)
Exemple #19
0
class TestRealVideo(unittest.TestCase):
    """Run caption searches through Download using fixed video URLs."""

    def _start(self, url, pattern, regex=False):
        """Store and return a Download configured to search `url` for `pattern`."""
        self.download = Download({
            'urls': [url],
            'pattern': pattern,
            'e': regex,
            'v': False,
        })
        return self.download

    def test_video(self):
        """A plain-text search returns the matching line with the hit highlighted."""
        download = self._start('https://www.youtube.com/watch?v=jNQXAC9IVRw',
                               'elephants')
        line_start = '[00:00:01.300 --> 00:00:04.400] all right, so here we are in front of the '
        expected = line_start + red('elephants') + ','
        self.assertEqual(expected, download.get_captions())

    def test_video_multiline_match_with_regexp(self):
        """A regex search returns each matching caption line, separated by newlines."""
        download = self._start('https://www.youtube.com/watch?v=jNQXAC9IVRw',
                               'cool',
                               regex=True)
        opening = '[00:00:04.400 --> 00:00:09.166] the '
        between = ' thing about these guys is that they have really,\n[00:00:12.700 --> 00:00:17.000] and thats, thats '
        expected = opening + red('cool') + between + red('cool') + '."'
        self.assertEqual(expected, download.get_captions())

    def test_video_does_not_exist(self):
        """A URL that cannot be fetched raises DownloadException."""
        download = self._start('12323123123', 'elephants')
        self.assertRaises(DownloadException, download.get_captions)

    def test_video_no_captions(self):
        """This caption-less video URL raises NoCaptionsException."""
        download = self._start('https://www.youtube.com/watch?v=jyoTZ69mWZE',
                               'elephants')
        self.assertRaises(NoCaptionsException, download.get_captions)
Exemple #20
0
def initialize_worker():
    """Create a Download and publish it as the module-level ``download_global``."""
    global download_global
    session = Download()
    download_global = session
    def download_script(self):
        """Fetch the 'en' captions for this item's video and save them as text.

        The last 11 characters of ``self.url`` are passed to
        ``get_captions`` as the video id, and the result is written to
        ``SCRIPT_PATH + self.name + '.txt'``.
        """
        download = Download()
        script = download.get_captions(self.url[-11:], 'en')

        # The file was previously never closed, so buffered text could be
        # lost; the context manager flushes and closes it even on error.
        with open(SCRIPT_PATH + self.name + '.txt', 'w') as script_file:
            script_file.write(script)
def get_download(download):
    """Return `download` as-is, or a freshly created Download when it is None.

    A message is printed whenever a new instance has to be created.
    """
    if download is not None:
        return download
    print("new Download session")
    return Download()
Exemple #23
0
from ytcc.download import Download

video_id = '_un8mHYFJp0'
download = Download()
# Language is optional and defaults to "en".
# YouTube uses "en", "fr" -- not "en-US", "fr-FR".
captions = download.get_captions(video_id, 'en')

# The file was previously left open; the context manager flushes and closes
# review.txt even if the write fails.
with open("review.txt", "w+") as review_file:
    review_file.write(captions)
# print(captions)
 def setUp(self):
     """Give each test its own Download instance."""
     session = Download()
     self.download = session
    video = pafy.new(url_full)
    video_title = video.title
    for invalid_char in INVALID_CHARS:
        video_title = video_title.replace(invalid_char, '')
    print(video_title)

    timestamp = datetime.today().strftime('%y%m%d')

    best_audio_stream = video.getbestaudio()
    audio_path = AUDIO_PATH + timestamp + '_' + video_title + '.' + best_audio_stream.extension
    best_audio_stream.download(audio_path)

    sound = AudioSegment.from_file(audio_path)
    mp3_path = AUDIO_PATH + timestamp + '_' + video_title + '.mp3'
    sound.export(mp3_path, format="mp3", bitrate="128k")
    os.remove(audio_path)

    download = Download()
    caption = download.get_captions(url, 'en')

    json_data = OrderedDict()
    json_data['id'] = file_cnt
    json_data['url'] = url
    json_data['title'] = timestamp + '_' + video_title
    json_data['caption'] = caption
    json_path = DATABASE_PATH + json_data['id'] + '.json'
    with open(json_path, 'w') as json_file:
        json.dump(json_data, json_file)

    file_cnt += 1