Example #1
def write_to_subtitles(data: Union[List[dict], List[List[dict]]]):
    """Returns WebVTT object from data.

    Args:
        data (Union[List[dict], List[List[dict]]]):
            Either a 'word'-type tier (a list of dicts with
            'start', 'end' and 'text' keys) or a 'sentence'-type
            tier (a list of lists of such dicts).

    Returns:
        WebVTT: WebVTT subtitles
    """
    vtt = WebVTT()
    for caption in data:
        if isinstance(caption, list):
            formatted = Caption(
                float_to_timedelta(caption[0]["start"]),
                float_to_timedelta(caption[-1]["end"]),
                " ".join([w["text"] for w in caption]),
            )
        else:
            formatted = Caption(
                float_to_timedelta(caption["start"]),
                float_to_timedelta(caption["end"]),
                caption["text"],
            )
        vtt.captions.append(formatted)
    return vtt
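The helper float_to_timedelta is not shown above. A minimal sketch, assuming it converts a float number of seconds into the 'HH:MM:SS.mmm' timestamp string that Caption accepts (the name and exact behavior are assumptions):

def float_to_timedelta(seconds):
    # Float seconds -> zero-padded HH:MM:SS.mmm WebVTT timestamp string.
    total_millis = round(seconds * 1000)
    secs, millis = divmod(total_millis, 1000)
    hours, remainder = divmod(secs, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, secs, millis)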
Example #2
def enrichment_to_vtt(list_enrichment, video):
    webvtt = WebVTT()
    for enrich in list_enrichment:
        start = datetime.datetime.utcfromtimestamp(
            enrich.start).strftime("%H:%M:%S.%f")[:-3]
        end = datetime.datetime.utcfromtimestamp(
            enrich.end).strftime("%H:%M:%S.%f")[:-3]
        url = enrichment_to_vtt_type(enrich)
        caption = Caption(
            "{0}".format(start),
            "{0}".format(end),
            [
                "{",
                '"title": "{0}",'.format(enrich.title),
                '"type": "{0}",'.format(enrich.type),
                '"stop_video": "{0}",'.format("%s" %
                                              1 if enrich.stop_video else 0),
                '"url": "{0}"'.format(url),
                "}",
            ],
        )
        caption.identifier = enrich.slug
        webvtt.captions.append(caption)
    temp_vtt_file = NamedTemporaryFile(suffix=".vtt")
    with open(temp_vtt_file.name, "w") as f:
        webvtt.write(f)
    if FILEPICKER:
        videodir, created = UserFolder.objects.get_or_create(name="%s" %
                                                             video.slug,
                                                             owner=video.owner)
        previousEnrichmentFile = CustomFileModel.objects.filter(
            name__startswith="enrichment",
            folder=videodir,
            created_by=video.owner,
        )
        for enr in previousEnrichmentFile:
            enr.delete()  # do it like this to delete file
        enrichmentFile, created = CustomFileModel.objects.get_or_create(
            name="enrichment", folder=videodir, created_by=video.owner)

        if enrichmentFile.file and os.path.isfile(enrichmentFile.file.path):
            os.remove(enrichmentFile.file.path)
    else:
        enrichmentFile, created = CustomFileModel.objects.get_or_create()
    enrichmentFile.file.save("enrichment.vtt", File(temp_vtt_file))
    enrichmentVtt, created = EnrichmentVtt.objects.get_or_create(video=video)
    enrichmentVtt.src = enrichmentFile
    enrichmentVtt.save()
    return enrichmentFile.file.path
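Building the JSON payload by hand, as above, breaks as soon as a title contains quotes. A hedged alternative for the caption body inside the loop, using json.dumps to handle escaping (a sketch, not the project's code; enrich, start, end and url come from the loop above):

import json

payload = json.dumps({
    "title": enrich.title,
    "type": enrich.type,
    "stop_video": "1" if enrich.stop_video else "0",
    "url": url,
})
caption = Caption(start, end, [payload])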
Example #3
def transcribe():
    command = [
        'ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ]
    process = subprocess.Popen(command, stdout=subprocess.PIPE)

    results = []
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    results.append(rec.FinalResult())

    vtt = WebVTT()
    for i, res in enumerate(results):
        words = json.loads(res).get('result')
        if not words:
            continue

        start = timeString(words[0]['start'])
        end = timeString(words[-1]['end'])
        content = ' '.join([w['word'] for w in words])

        caption = Caption(start, end, textwrap.fill(content))
        vtt.captions.append(caption)

    # save or return webvtt
    if len(sys.argv) > 2:
        vtt.save(sys.argv[2])
    else:
        print(vtt.content)
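Example #3 relies on module-level names that are not shown: rec, sample_rate and timeString. A minimal setup sketch, assuming the Vosk speech-recognition library (whose KaldiRecognizer exposes AcceptWaveform/Result/FinalResult) and a timestamp helper (both assumptions):

from vosk import Model, KaldiRecognizer

sample_rate = 16000
model = Model("model")       # path to a downloaded Vosk model (assumption)
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)           # include per-word timestamps in results

def timeString(seconds):
    # Float seconds -> HH:MM:SS.mmm timestamp string (assumed behavior).
    minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(int(minutes), 60)
    return "%02d:%02d:%06.3f" % (hours, minutes, secs)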
Example #4
def create_overview_vtt(video_id, nb_img, image, duration, overviewfilename):
    msg = "\ncreate overview vtt file"
    image_width = image["image_width"]
    image_height = image["image_height"]
    image_url = image["image_url"]
    # creating webvtt file
    webvtt = WebVTT()
    for i in range(0, nb_img):
        if nb_img == 99:
            start = format(float(duration * i / 100), '.3f')
            end = format(float(duration * (i + 1) / 100), '.3f')
        else:
            start = format(float(i), '.3f')
            end = format(float(i + 1), '.3f')

        start_time = time.strftime('%H:%M:%S',
                                   time.gmtime(int(str(start).split('.')[0])))
        start_time += ".%s" % (str(start).split('.')[1])
        end_time = time.strftime(
            '%H:%M:%S', time.gmtime(int(
                str(end).split('.')[0]))) + ".%s" % (str(end).split('.')[1])
        caption = Caption(
            '%s' % start_time, '%s' % end_time, '%s#xywh=%d,%d,%d,%d' %
            (image_url, image_width * i, 0, image_width, image_height))
        webvtt.captions.append(caption)
    webvtt.save(overviewfilename)
    if check_file(overviewfilename):
        msg += "\n- overviewfilename :\n%s" % overviewfilename
    else:
        msg = "overviewfilename Wrong file or path : "\
            + "\n%s" % overviewfilename
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, -1, msg)
        send_email(msg, video_id)
    return msg
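For reference, each cue above maps a time range onto a region of the sprite image using the Media Fragments '#xywh=x,y,w,h' convention; a tiny illustration of the cue text produced for the second thumbnail (values are made up):

# e.g. image_url='overview.png', image_width=180, image_height=100, i=1
print('%s#xywh=%d,%d,%d,%d' % ('overview.png', 180 * 1, 0, 180, 100))
# -> overview.png#xywh=180,0,180,100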
Example #5
def file_writing(path):
    vtt = WebVTT()
    caption = Caption()
    emotion = ""

    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))

        if emotion is "joy":
            caption = Caption(
                line.start, line.end,
                "<c.green> " + emotion + ": " + line.text + "</c>")
        elif emotion is "fear":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")

        elif emotion is "anger":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")
        elif emotion is "sadness":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")
        elif emotion is "neutral":
            caption = Caption(
                line.start, line.end,
                "<c.blue> " + emotion + ": " + line.text + "</c>")
        else:
            caption = Caption(
                line.start, line.end,
                "<c.blue> " + emotion + ": " + line.text + "</c>")
        vtt.captions.append(caption)
    vtt.save('static/my_captions.vtt')
Example #6
def generate_vtt_file(all_preds, logits, save_path):
    vtt = WebVTT()
    predictions = all_preds

    labels, starts, ends = get_labels_start_end_time(predictions, [1])

    # smaller boundaries
    for ix in range(len(labels)):
        if ix == len(labels)-1:
            break
        diff = starts[ix+1]-ends[ix]
        starts[ix+1] -= floor(diff/2)
        ends[ix] += floor(diff/2)

    # load i3d classes
    i3d_scores = logits
    with open('data/info/bslcp/info.pkl', 'rb') as f:
        info_data = pickle.load(f)

    for start, end in zip(starts, ends):

        if logits is not None:
            i3d_score = np.sum(np.asarray(i3d_scores)[start:end], axis=0)
            ind = np.argpartition(i3d_score, -10)[-10:]       
            ind = ind[np.argsort(-i3d_score[ind])]
            classes = [info_data['words'][ix] for ix in ind]

            class_str = ','.join(classes)
        else:
            class_str = ''

        start = (start + 8) / 25
        end = (end + 8) / 25

        start_dt = datetime.timedelta(seconds=start)
        start_str = str(start_dt)
        if '.' not in start_str:
            start_str = f'{start_str}.000000'

        end_dt = datetime.timedelta(seconds=end)
        end_str = str(end_dt)
        if '.' not in end_str:
            end_str = f'{end_str}.000000'
        # creating a caption with a list of lines
        caption = Caption(
            start_str,
            end_str,
            [class_str]
        )

        # adding a caption
        vtt.captions.append(caption)


    # save to a different file
    vtt.save(f'{save_path}/demo.vtt')
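get_labels_start_end_time is not shown; a minimal sketch, assuming it scans frame-wise predictions and returns the label, start frame and end frame of each run whose label is not in bg_class (name and signature taken from the call above, behavior assumed):

def get_labels_start_end_time(frame_wise_labels, bg_class):
    # Collapse a frame-wise label sequence into (labels, starts, ends)
    # runs, skipping runs whose label is in bg_class.
    labels, starts, ends = [], [], []
    last = None
    for ix, label in enumerate(frame_wise_labels):
        if label != last:
            if last is not None and last not in bg_class:
                ends.append(ix)
            if label not in bg_class:
                labels.append(label)
                starts.append(ix)
            last = label
    if last is not None and last not in bg_class:
        ends.append(len(frame_wise_labels))
    return labels, starts, ends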
Example #7
    def annotations_to_webvtt(self, annotations):
        webvtt = WebVTT()

        last_index = len(annotations) - 1
        index = 0
        while index <= last_index:
            focus_annotation = annotations[index]
            # print("Focus:: BT: %d | ET: %d | Value: %s" % focus_annotation[:3])
            if index == last_index:
                caption = Caption(
                    self.time_to_webvtt_time(focus_annotation[0]),
                    self.time_to_webvtt_time(focus_annotation[1]),
                    [focus_annotation[2]])
                # print("%s %s %s" % (caption.start, caption.end, caption.text))
                webvtt.captions.append(caption)
                index += 1
            else:
                for index_next in range(index + 1, last_index + 1):
                    index = index_next
                    next_annotation = annotations[index_next]
                    # print("Next :: BT: %d | ET: %d | Value: %s" % next_annotation[:3])
                    overlap = self.check_overlap(focus_annotation,
                                                 next_annotation)
                    if overlap:
                        # print("#%s#%s#" % (focus_annotation[2], next_annotation[2]))
                        if not (focus_annotation[2] == next_annotation[2]):
                            caption = Caption(
                                self.time_to_webvtt_time(focus_annotation[0]),
                                self.time_to_webvtt_time(next_annotation[0]),
                                [focus_annotation[2]])
                            # print("%s %s %s" % (caption.start, caption.end, caption.text))
                            webvtt.captions.append(caption)
                            break
                    else:
                        caption = Caption(
                            self.time_to_webvtt_time(focus_annotation[0]),
                            self.time_to_webvtt_time(
                                min(focus_annotation[1], next_annotation[0])),
                            [focus_annotation[2]])
                        # print("%s %s %s" % (caption.start, caption.end, caption.text))
                        webvtt.captions.append(caption)
                        break
        return webvtt
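check_overlap and time_to_webvtt_time are not shown. A minimal sketch of the overlap test, assuming annotations are (begin_time, end_time, value) tuples as the commented print statements suggest (name from the call above, logic assumed):

def check_overlap(self, focus_annotation, next_annotation):
    # (begin, end, value) tuples overlap when the next annotation
    # starts before the focus annotation ends.
    return next_annotation[0] < focus_annotation[1]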
Example #8
def store_remote_transcripting_video(video_id):
    #
    msg = ""
    video_to_encode = Video.objects.get(id=video_id)
    output_dir = create_outputdir(video_id, video_to_encode.video.path)
    info_video = {}

    if check_file(output_dir + "/transcript.json"):
        with open(output_dir + "/transcript.json") as json_file:
            info_video = json.load(json_file)

        print_if_debug(output_dir)
        print_if_debug(json.dumps(info_video, indent=2))

        webvtt = WebVTT()
        # They're sorted by confidence. First one is highest confidence result.
        words = info_video["transcripts"][0]["words"]
        """
        for transcript in info_video["transcripts"]:
            for word in transcript["words"]:
                words.append(word)
        """
        text_caption = []
        start_caption = None
        duration = 0
        for word in words:
            text_caption.append(word['word'])
            if start_caption is None:
                start_caption = word['start_time']
            if duration + word['duration'] > SENTENCE_MAX_LENGTH:
                caption = Caption(
                    format_time_caption(start_caption),
                    format_time_caption(start_caption + duration +
                                        word['duration']),
                    " ".join(text_caption))
                webvtt.captions.append(caption)
                text_caption = []
                start_caption = None
                duration = 0
            else:
                duration += word['duration']
        print_if_debug(webvtt)
        msg += saveVTT(video_to_encode, webvtt)
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, 0, "done")
        # send email when transcription completes
        if EMAIL_ON_TRANSCRIPTING_COMPLETION:
            send_email_transcript(video_to_encode)

    else:
        msg += "Wrong file or path : "\
            + "\n%s" % video_to_encode.video.path
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, -1, msg)
        send_email(msg, video_id)
Example #9
    def translate(self):
        newVTT = WebVTT()
        fileName = self.fileNameWOType + '.vtt'
        for caption in webvtt.read(fileName):
            #            print(caption.start)
            #            print(caption.end)
            #            print(caption.text)
            translation = Translate.AWSTranslate.translate_text(
                Text=caption.text,
                SourceLanguageCode=self.sourceLanguage,
                TargetLanguageCode=self.targetLanguage)

            newCaption = Caption(caption.start, caption.end,
                                 translation.get('TranslatedText'))
            newCaption.identifier = caption.identifier
            newVTT.captions.append(newCaption)

        translatedFileName = self.fileNameWOType + '_' + self.targetLanguage + '.vtt'
        newVTT.save(translatedFileName)
        return 1
Example #10
def createCaption(arg, rate):
    start, end, text = arg
    start = format(start / rate, '.3f')
    end = format(end / rate, '.3f')
    start_time = time.strftime('%H:%M:%S',
                               time.gmtime(int(str(start).split('.')[0])))
    start_time += ".%s" % (str(start).split('.')[1])
    end_time = time.strftime('%H:%M:%S',
                             time.gmtime(int(str(end).split('.')[0])))
    end_time += ".%s" % (str(end).split('.')[1])
    caption = Caption('%s' % start_time, '%s' % end_time, '%s' % text)
    return caption
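A usage sketch for createCaption, assuming sample-indexed boundaries at a 16 kHz rate (illustrative values):

# 8000 samples / 16 kHz = 0.5 s; 24000 samples / 16 kHz = 1.5 s
caption = createCaption((8000, 24000, 'hello world'), 16000)
print(caption.start, caption.end)  # 00:00:00.500 00:00:01.500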
Example #11
def text_extract():
    try:
        # (re)create the output folder for the per-emotion subtitle files
        if os.path.exists('static/Text'):
            shutil.rmtree('static/Text')

        os.makedirs('static/Text')

    # report the failure if the folder could not be created
    except OSError:
        print('Error: Creating directory of data')
    vtt_pos = WebVTT()
    vtt_neg = WebVTT()
    vtt_neu = WebVTT()
    caption = Caption()
    emotion = ""

    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))

        if emotion == "joy":
            caption = Caption(
                line.start, line.end,
                "<c.green> " + emotion + ": " + line.text + "</c>")
            vtt_pos.captions.append(caption)
        elif emotion == "anger" or emotion == "sadness" or emotion == "fear":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")
            vtt_neg.captions.append(caption)
        elif emotion == "neutral":
            caption = Caption(
                line.start, line.end,
                "<c.blue> " + emotion + ": " + line.text + "</c>")
            vtt_neu.captions.append(caption)

    vtt_pos.save('static/Text/positive.vtt')
    vtt_neg.save('static/Text/negative.vtt')
    vtt_neu.save('static/Text/neutral.vtt')
Example #12
    def get(self, request, *args, **kwargs):
        feed = self.get_object()

        try:
            stream = get_object_or_404(feed.streams.all(), uuid=request.GET["stream"])
        except KeyError:
            return HttpResponseBadRequest(_("Bad request"))

        webvtt = WebVTT()
        resp = HttpResponse(content_type="text/vtt; charset=utf-8")

        try:
            start = parse_datetime(request.GET["start"])
            end = parse_datetime(request.GET["end"])
            epoch = parse_datetime(request.GET["epoch"])
        except KeyError:
            return HttpResponseBadRequest(_("Bad request"))

        if stream.program_date_time:
            start_diff = start - stream.started_at
            end_diff = end - stream.started_at
            start = stream.program_date_time + start_diff
            end = stream.program_date_time + end_diff
            epoch = stream.program_date_time

        start = start - timedelta(seconds=5)
        end = end + timedelta(seconds=5)

        items = feed.items.filter(starts_at__gte=start, ends_at__lt=end).order_by(
            "starts_at"
        )
        for item in items:
            start_timecode = self.get_vtt_timecode(epoch, item.starts_at)
            end_timecode = self.get_vtt_timecode(epoch, item.ends_at)
            data = {
                "uuid": item.uuid,
                "starts_at": item.starts_at.isoformat(),
                "ends_at": item.ends_at.isoformat(),
                "start_timecode": start_timecode,
                "end_timecode": end_timecode,
                "payload": item.payload,
            }
            cap = Caption(
                start_timecode, end_timecode, [json.dumps(data, cls=DjangoJSONEncoder)]
            )
            webvtt.captions.append(cap)

        webvtt.write(resp)
        return resp
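Since every cue payload above is a JSON document, a consumer can parse it straight back out of the cue text; a minimal sketch, assuming the response was saved to feed.vtt:

import json
import webvtt

for caption in webvtt.read("feed.vtt"):
    data = json.loads(caption.text)
    print(data["uuid"], data["start_timecode"], data["end_timecode"])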
Example #13
def read_from_existing_vtt(bucket_name, file_name):
    vtt = WebVTT()
    blob = read_data_from_storage(bucket_name, file_name)
    blob = [
        string for string in blob.decode("utf-8").split('\n')[2:] if string
    ]
    start, end = '', ''
    for string in blob:
        if '-->' in string:
            start, end = string.split(' --> ')
        else:
            caption = Caption(start, end, string)
            vtt.captions.append(caption)

    return vtt
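The hand parser above assumes a two-line header and single-line cue payloads. If the installed webvtt-py version provides read_buffer (an assumption about the version), letting the library parse the blob also handles multi-line cues; a sketch:

import io
import webvtt

def read_from_existing_vtt(bucket_name, file_name):
    blob = read_data_from_storage(bucket_name, file_name)
    # Parse the whole document, multi-line cues included.
    return webvtt.read_buffer(io.StringIO(blob.decode("utf-8")))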
Example #14
def generate_subtitles(input_file_name, output_file_name):
    
    if not os.path.isfile(input_file_name):
        print("Invalid file name")
        return

    f_content = open_file(input_file_name)
    
    vtt = WebVTT()
   
    count = 0
    prev_start = ""
    prev_end = ""
    prev_str = ""
    for line in f_content:
        if line.startswith('Word'):
            # print(line, prev_str)
            m = re.search('Word: (.+?), start_time: (.+?), end_time: (.+?)\n', line)
            
            # microseconds=1 forces str(timedelta) to emit a fractional part
            current_start = str(timedelta(seconds=float(m.group(2)), microseconds=1))
            current_end = str(timedelta(seconds=float(m.group(3)), microseconds=1))

            if(count < 6):
                if(count == 0):
                    prev_start = current_start
                count += 1
                prev_end = current_end
                prev_str += " " + m.group(1)

            if(count == 6): 
                #caption = Caption(
                #        str(timedelta(seconds = float(m.group(2)), microseconds = 1)),
                #        str(timedelta(seconds = float(m.group(3)), microseconds = 1)),
                #        str(m.group(1)))

                caption = Caption(
                        prev_start,
                        prev_end,
                        prev_str)
                vtt.captions.append(caption)
                count = 0
                prev_start = ""
                prev_end = ""
                prev_str = ""

    write_vtt(vtt, output_file_name)
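write_vtt is not shown; a minimal sketch, assuming it simply serializes the WebVTT object to the named file (name and behavior are assumptions):

def write_vtt(vtt, output_file_name):
    # webvtt-py emits the WEBVTT header and cue blocks itself.
    with open(output_file_name, "w") as f:
        vtt.write(f)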
Example #15
def process_video_url(url, pk):
    vid_id = get_youtube_vid_id(url)
    captions = YouTubeTranscriptApi.get_transcript(video_id=vid_id)

    vtt = WebVTT()

    for t in captions:
        start = datetime.timedelta(milliseconds=t["start"] * 1000)
        end = datetime.timedelta(milliseconds=t["duration"] * 1000) + start

        if "." not in str(start):
            start = str(start) + ".000"

        if "." not in str(end):
            end = str(end) + ".000"

        caption = Caption(
            start=str(start),
            end=str(end),
            text=t["text"]
        )

        vtt.captions.append(caption)

    if not os.path.isdir(CACHE):
        os.mkdir(CACHE)

    path = os.path.join(CACHE, f"{vid_id}.vtt")
    vtt.save(path)

    transcript = File(open(path, "rb"))
    os.remove(path)

    obj = VidSpark.management.models.Video.objects.get(pk=pk)
    obj.transcript = transcript
    obj.save()
Example #16
# webvtt.read returns a populated WebVTT object
vtt = webvtt.read(filename)

stmp = StringIO()
print("<div>", file=stmp)
for caption in vtt:
    print('<span data-start="{}" data-end="{}">{}</span>'.format(
        caption.start, caption.end, caption.text),
          file=stmp)
print("</div>", file=stmp)

# Translate
driver = TranslationDriver(args.lang)
strans = driver.translate(stmp.getvalue())

# Convert translated HTML back to VTT
vtt = WebVTT()

soup = BeautifulSoup(strans, "lxml")
for span in soup.find_all("span"):
    start = span["data-start"]
    end = span["data-end"]
    caption = Caption(start, end, span.text)
    vtt.captions.append(caption)

# Remove the English source file
os.remove(filename)

outfile = filename.replace(".en.", ".{}.".format(args.lang))
vtt.save(outfile)
print(green(outfile, bold=True))
Example #17
    def test_single_invalid_caption(self):
        self.assertRaises(
            InvalidCaptionsError,
            self.segmenter.segment,
            [Caption(), Caption(), 'text', Caption()]
        )
Example #18
    def write_caption(self, start, end, line):
        caption = Caption(start, end, line)
        self.vtt.captions.append(caption)
Example #19
def genarateSUB(url, lang):
    language = lang
    # find any English captions available for the YouTube video URL
    captionTitle = Extract_Caption.extractTitle(url)

    videoName = Extract_Caption.download_video(url)
    wavFilePath = extractWavAudio.extractWAV(url)

    # split the audio file into multiple chunks
    AudioSplit.split(wavFilePath, captionTitle)

    # initialize the subtitle container
    vtt = WebVTT()

    # count the split wav files
    num_files = len(os.listdir('../Datas/Splits/' + captionTitle + '/'))

    cnt = 0
    start = 0
    end = 5
    for i in range(1, num_files + 1):

        flag = 0
        text, confidence = ms_asr.transcribe('../Datas/Splits/' +
                                             captionTitle + '/' + str(i) +
                                             '.wav')
        print("Text: ", text)
        print("Confidence: ", confidence)
        if text == " ":
            translated_text = " "
        else:
            translated_text = TRANSLATR_TO_TEXT.translateFromTXT(
                text, language)
            flag = 1
            cnt += 1
        print("Translated Text: ", translated_text)
        if flag == 1:
            start_hours = start // 3600
            temp = start % 3600
            start_min = temp // 60
            start_sec = temp % 60
            end_hours = end // 3600
            temp = end % 3600
            end_min = temp // 60
            end_sec = temp % 60

            if (start_hours <= 9):
                start_hours = '0' + str(start_hours)
            else:
                start_hours = str(start_hours)
            if (start_min <= 9):
                start_min = '0' + str(start_min)
            else:
                start_min = str(start_min)
            if (start_sec <= 9):
                start_sec = '0' + str(start_sec)
            else:
                start_sec = str(start_sec)

            if (end_hours <= 9):
                end_hours = '0' + str(end_hours)
            else:
                end_hours = str(end_hours)
            if (end_min <= 9):
                end_min = '0' + str(end_min)
            else:
                end_min = str(end_min)
            if (end_sec <= 9):
                end_sec = '0' + str(end_sec)
            else:
                end_sec = str(end_sec)

            caption = Caption(
                start_hours + ':' + start_min + ':' + start_sec + '.001 ',
                end_hours + ':' + end_min + ':' + end_sec + '.000\n',
                str(translated_text) + '\n')

            vtt.captions.append(caption)
        start += 5
        end += 5

    vttFilePath = "../webApp/static/SubtitleFile/" + captionTitle + "_" + language + ".vtt"
    vtt.save(vttFilePath)
    vttName = captionTitle + "_" + language + ".vtt"
    files = glob.glob('../Datas/Splits/' + captionTitle + '/*')
    for f in files:
        os.remove(f)
    os.rmdir('../Datas/Splits/' + captionTitle)
    os.remove(wavFilePath)
    if language == 'ta':
        retlan = 'Tamil'
    if language == 'si':
        retlan = 'Sinhala'
    return videoName, vttName
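The zero-padding ladder above can be collapsed with divmod and format specifiers; a hedged drop-in sketch that produces the same 'HH:MM:SS' fields from an integer second count:

def clock(total_seconds):
    # Integer seconds -> zero-padded HH:MM:SS string.
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '%02d:%02d:%02d' % (hours, minutes, seconds)

# equivalent caption boundaries for the loop above:
# Caption(clock(start) + '.001', clock(end) + '.000', str(translated_text) + '\n')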
Example #20
def main_transcript(video_to_encode, ds_model):
    msg = ""
    inference_start = timer()
    msg += '\nInference start %0.3fs.' % inference_start

    mp3file = (video_to_encode.get_video_mp3().source_file
               if video_to_encode.get_video_mp3() else None)
    if mp3file is None:
        msg += "\n no mp3 file found for video :%s." % video_to_encode.id
        change_encoding_step(video_to_encode.id, -1, msg)
        send_email(msg, video_to_encode.id)
        return msg

    # NORMALIZE mp3file
    norm_mp3_file = normalize_mp3(mp3file.path)

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()

    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):

        end_trim = video_to_encode.duration if start_trim + \
            AUDIO_SPLIT_TIME > video_to_encode.duration else (
                start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)

        duration = (AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) if start_trim + \
            AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < video_to_encode.duration \
            else (video_to_encode.duration - start_trim)

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim,
                                                    duration)

        audio = convert_samplerate(norm_mp3_file, desired_sample_rate,
                                   start_trim, duration)
        msg += '\nRunning inference.'

        metadata = ds_model.sttWithMetadata(audio)

        msg += '\nConfidence : %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list

        if len(metadata.items) > 0:
            refItem = metadata.items[0]
            index = get_index(metadata, last_item,
                              start_trim) if last_item else 0
            # nb of character in AUDIO_SPLIT_TIME
            msg += "METADATA ITEMS : %d " % len(metadata.items)
            sentences = get_sentences(metadata, refItem, index)
            last_item = (
                sentences[-1][-1].character,
                sentences[-1][-1].start_time) if len(sentences) > 0 else ()
            for sent in sentences:
                if len(sent) > 0:
                    start_time = sent[0].start_time + start_trim
                    end_time = sent[-1].start_time + start_trim
                    str_sentence = ''.join(item.character for item in sent)
                    # print(start_time, end_time, str_sentence)
                    caption = Caption(
                        '%s.%s' %
                        (timedelta(seconds=int(str(start_time).split('.')[0])),
                         str('%.3f' % start_time).split('.')[1]), '%s.%s' %
                        (timedelta(seconds=int(str(end_time).split('.')[0])),
                         str('%.3f' % end_time).split('.')[1]),
                        ['%s' % str_sentence])
                    webvtt.captions.append(caption)
    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)
    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    change_encoding_step(video_to_encode.id, 0, "done")
    # send email when transcription completes
    if EMAIL_ON_TRANSCRIPTING_COMPLETION:
        send_email_transcript(video_to_encode)
    return msg
Example #21
def main_transcript(video_to_encode):
    msg = ""

    mp3file = (video_to_encode.get_video_mp3().source_file
               if video_to_encode.get_video_mp3() else None)

    lang = video_to_encode.main_lang

    # check if DS_PARAM [lang] exist
    if not DS_PARAM.get(lang):
        msg += "\n no deepspeech model found for lang:%s." % lang
        msg += "Please add it in DS_PARAM."
        return msg

    ds_model = Model(DS_PARAM[lang]['model'], DS_PARAM[lang]['beam_width'])

    if all([
            cond in DS_PARAM[lang]
            for cond in ['alphabet', 'lm', 'trie', 'lm_alpha', 'lm_beta']
    ]):
        ds_model.enableDecoderWithLM(DS_PARAM[lang]['lm'],
                                     DS_PARAM[lang]['trie'],
                                     DS_PARAM[lang]['lm_alpha'],
                                     DS_PARAM[lang]['lm_beta'])

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()
    inference_start = timer()
    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):

        end_trim = video_to_encode.duration if start_trim + \
            AUDIO_SPLIT_TIME > video_to_encode.duration else (
                start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)

        duration = (AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) if start_trim + \
            AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < video_to_encode.duration \
            else (video_to_encode.duration - start_trim)

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim,
                                                    duration)

        audio = convert_samplerate(mp3file.path, desired_sample_rate,
                                   start_trim, duration)
        msg += '\nRunning inference.'

        metadata = ds_model.sttWithMetadata(audio)

        msg += '\nConfidence : %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list

        refItem = metadata.items[0]

        index = get_index(metadata, last_item, start_trim) if last_item else 0

        # nb of character in AUDIO_SPLIT_TIME
        msg += "METADATA ITEMS : %d " % len(metadata.items)

        sentences = get_sentences(metadata, refItem, index)

        last_item = (
            sentences[-1][-1].character,
            sentences[-1][-1].start_time) if len(sentences) > 0 else ()

        for sent in sentences:
            if len(sent) > 0:
                start_time = sent[0].start_time + start_trim
                end_time = sent[-1].start_time + start_trim
                str_sentence = ''.join(item.character for item in sent)
                # print(start_time, end_time, str_sentence)
                caption = Caption(
                    '%s.%s' %
                    (timedelta(seconds=int(str(start_time).split('.')[0])),
                     str('%.3f' % start_time).split('.')[1]), '%s.%s' %
                    (timedelta(seconds=int(str(end_time).split('.')[0])),
                     str('%.3f' % end_time).split('.')[1]),
                    ['%s' % str_sentence])

                webvtt.captions.append(caption)
    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)
    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    return msg
Example #22
def main_transcript(norm_mp3_file, duration, ds_model):
    msg = ""
    inference_start = timer()
    msg += "\nInference start %0.3fs." % inference_start

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()

    last_word_added = ""
    metadata = None

    all_text = ""

    for start_trim in range(0, duration, AUDIO_SPLIT_TIME):

        end_trim = (duration if start_trim + AUDIO_SPLIT_TIME > duration else
                    (start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH))

        dur = ((AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) if start_trim +
               AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < duration else
               (duration - start_trim))

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim, dur)

        audio = convert_samplerate(norm_mp3_file, desired_sample_rate,
                                   start_trim, dur)
        msg += "\nRunning inference."

        metadata = ds_model.sttWithMetadata(audio)

        for transcript in metadata.transcripts:
            msg += "\nConfidence : %s" % transcript.confidence
            words = words_from_candidate_transcript(transcript)
            start_caption = start_trim + words[0]["start_time"]
            text_caption = []
            is_first_caption = True
            for word in words:
                all_text += word["word"] + " "
                # word : <class 'dict'> {'word': 'bonjour',
                # 'start_time': 0.58, 'duration': 7.34}
                text_caption.append(word["word"])
                if not (((word["start_time"] + start_trim) - start_caption) <
                        SENTENCE_MAX_LENGTH):
                    # create the caption
                    if is_first_caption:
                        # to review: merging of the new line with
                        # the previous one...
                        is_first_caption = False
                        text_caption = get_text_caption(
                            text_caption, last_word_added)

                    stop_caption = start_trim + word["start_time"] + word[
                        "duration"]

                    # avoid overlapping with the previous caption
                    change_previous_end_caption(webvtt, start_caption)

                    caption = Caption(
                        format_time_caption(start_caption),
                        format_time_caption(stop_caption),
                        " ".join(text_caption),
                    )

                    webvtt.captions.append(caption)
                    # reset everything for the next sentence
                    start_caption = start_trim + word["start_time"]
                    text_caption = []
                    last_word_added = word["word"]
            if start_trim + AUDIO_SPLIT_TIME > duration:
                # append the video's final sentence here
                stop_caption = (start_trim + words[-1]["start_time"] +
                                words[-1]["duration"])
                caption = Caption(
                    format_time_caption(start_caption),
                    format_time_caption(stop_caption),
                    " ".join(text_caption),
                )
                webvtt.captions.append(caption)
    inference_end = timer() - inference_start

    msg += "\nInference took %0.3fs." % inference_end
    return msg, webvtt, all_text
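format_time_caption is used here and in Example #8 but never shown; a minimal sketch, assuming it converts float seconds into an 'HH:MM:SS.mmm' WebVTT timestamp (the project's actual helper may differ):

def format_time_caption(time_caption):
    # Float seconds -> zero-padded HH:MM:SS.mmm WebVTT timestamp.
    total_millis = round(float(time_caption) * 1000)
    secs, millis = divmod(total_millis, 1000)
    hours, remainder = divmod(secs, 3600)
    minutes, secs = divmod(remainder, 60)
    return '%02d:%02d:%02d.%03d' % (hours, minutes, secs, millis)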
Example #23
def subtitle_generation(response, vtt, bin_size=3):
    """We define a bin of time period to display the words in sync with audio. 
    Here, bin_size = 3 means each bin is of 3 secs. 
    All the words in the interval of 3 secs in result will be grouped togather."""
    # response = "videoIntelligence-response-to-API"

    transcribed_text = ""
    index = 0
    flag = None

    for speech_transcription in response.annotation_results[
            0].speech_transcriptions:
        # The number of alternatives for each transcription is limited by
        # SpeechTranscriptionConfig.max_alternatives.
        # Each alternative is a different possible transcription
        # and has its own confidence score.
        for alternative in speech_transcription.alternatives:
            try:
                if alternative.words[0].start_time.seconds:
                    # bin start -> for first word of result
                    start_sec = alternative.words[0].start_time.seconds
                    start_microsec = alternative.words[
                        0].start_time.nanos * 0.001
                else:
                    # bin start -> For First word of response
                    start_sec = 0
                    start_microsec = 0
                end_sec = start_sec + bin_size  # bin end sec

                # for last word of result
                last_word_end_sec = alternative.words[-1].end_time.seconds
                last_word_end_microsec = alternative.words[
                    -1].end_time.nanos * 0.001

                # bin transcript
                transcript = alternative.words[0].word

                index += 1  # subtitle index

                for i in range(len(alternative.words) - 1):
                    try:
                        word = alternative.words[i + 1].word
                        word_start_sec = alternative.words[
                            i + 1].start_time.seconds
                        word_start_microsec = alternative.words[
                            i +
                            1].start_time.nanos * 0.001  # 0.001 converts nanos -> micros
                        word_end_sec = alternative.words[i +
                                                         1].end_time.seconds
                        word_end_microsec = alternative.words[
                            i + 1].end_time.nanos * 0.001

                        if word_end_sec < end_sec and not (
                                '!' in alternative.words[i].word
                                or '?' in alternative.words[i].word
                                or '.' in alternative.words[i].word):
                            transcript = transcript + " " + word
                        else:
                            previous_word_end_sec = alternative.words[
                                i].end_time.seconds
                            previous_word_end_microsec = alternative.words[
                                i].end_time.nanos * 0.001

                            # append bin transcript
                            start = str(
                                datetime.timedelta(0, start_sec,
                                                   start_microsec))[:12]
                            end = str(
                                datetime.timedelta(
                                    0, previous_word_end_sec,
                                    previous_word_end_microsec))[:12]
                            if len(start) <= 8: start += ".000"
                            if len(end) <= 8: end += ".000"
                            if flag and flag == start: break
                            if not (flag): flag = start
                            caption = Caption(start, end, transcript)
                            transcribed_text += transcript + " "
                            vtt.captions.append(caption)

                            # reset bin parameters
                            start_sec = word_start_sec
                            start_microsec = word_start_microsec
                            end_sec = start_sec + bin_size
                            transcript = alternative.words[i + 1].word
                            index += 1
                    except IndexError:
                        pass
                # append transcript of last transcript in bin
                start = str(datetime.timedelta(0, start_sec,
                                               start_microsec))[:12]
                end = str(
                    datetime.timedelta(0, last_word_end_sec,
                                       last_word_end_microsec))[:12]
                if len(start) <= 8: start += ".000"
                if len(end) <= 8: end += ".000"
                if flag and flag == start: break
                if not (flag): flag = start
                caption = Caption(start, end, transcript)
                vtt.captions.append(caption)
                index += 1
            except IndexError:
                pass

    # turn transcription list into subtitles
    return (transcribed_text, vtt)
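A usage sketch, assuming response is the result of a google.cloud.videointelligence annotate_video call with the SPEECH_TRANSCRIPTION feature enabled (names are illustrative):

vtt = WebVTT()
transcribed_text, vtt = subtitle_generation(response, vtt, bin_size=3)
vtt.save('subtitles.vtt')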
Example #24
# -*- coding: utf-8 -*-
import pysrt
import webvtt
from webvtt import WebVTT, Caption
subs = pysrt.open(
    'Tanmay Bakshi - New Google Employee Indian Boy Going To Ninth Grade.srt',
    encoding='utf-8')
vtt = WebVTT()
for ligne in subs:
    # pysrt renders timestamps as HH:MM:SS,mmm; WebVTT requires a dot
    # before the milliseconds, so swap the comma for a period.
    start = str(ligne.start).replace(',', '.')
    end = str(ligne.end).replace(',', '.')
    print(start)
    print(end)
    print(str(ligne.text))
    caption = Caption(start, end, str(ligne.text))
    vtt.captions.append(caption)
vtt.save('_fr.vtt')