def convert_sup_to_srt(filename, file_info):
    """Write one ``.vtt`` file per language column of a subtitle sheet.

    The sheet is loaded with ``get_gformat_subs``; for every entry of the
    returned headers the matching column (starting at index 3) of each row
    is turned into a subtitle item.  The items are saved to ``temp.vtt`` and
    then copied, prefixed with a ``WEBVTT`` line, into
    ``live/subtitles/<basename><tag>.vtt``.

    Side effects:
        * ``prog_dict[<basename>]`` is raised to the largest subtitle count
          seen for that base name.
        * ``[basename, count, output path, language]`` is appended to
          ``file_info`` for every language.
        * The output path is printed.

    An ``AttributeError`` raised anywhere in the conversion is swallowed.
    """
    column = 2
    try:
        (headers, group) = get_gformat_subs(filename)
        # Base name (no directory, no extension) shared by all output files.
        shortname = os.path.splitext(os.path.basename(filename))[0]
        for language in headers:
            subs = pysrt.SubRipFile()
            column = column + 1
            tag = language.replace(" ", "_").decode('ascii', 'ignore')
            for line in group:
                # Cells holding at most one character are treated as empty.
                if len(line[column]) > 1:
                    current_sub = pysrt.SubRipItem()
                    current_sub.start = line[0].replace(',', '.')
                    current_sub.end = line[1].replace(',', '.')
                    current_sub.text = line[column].decode('utf-8')
                    subs.append(current_sub)
            subs.save('temp.vtt')
            new_filename = 'live/subtitles/' + shortname + tag + '.vtt'
            # Build the final file in-process instead of shelling out to
            # `echo`/`cat`: no shell injection through the file name, no
            # breakage on names with spaces, and no dependency on a Unix shell.
            with open(new_filename, 'wb') as vtt_file:
                vtt_file.write(b'WEBVTT\n')
                with open('temp.vtt', 'rb') as cue_file:
                    vtt_file.write(cue_file.read())
            # Remember the largest subtitle count seen for this base name.
            prog_dict[shortname] = max(prog_dict.get(shortname, 0), len(subs))
            file_info.append([shortname, len(subs), new_filename, language])
            print(new_filename)
    except AttributeError:
        # We would expect this to be because we've been handed a file that's
        # outside our type.
        # TODO: we should identify exactly where this error appears for
        # various types of tests
        pass
Ejemplo n.º 2
0
def import_from_srt(subtitulation, vars):
    """Load the uploaded SRT in ``vars.source.file`` into ``db.subtitle``.

    Args:
        subtitulation: record whose ``id`` the new subtitle rows reference.
        vars: form values; ``vars.source.file`` is read as the SRT input and
            ``vars.overwrite`` controls whether existing rows for this
            subtitulation are deleted first.

    Returns:
        dict with ``removed`` (rows deleted), ``inserted`` (rows added) and
        ``errors`` (one message per subtitle whose times were rejected).
    """
    import pysrt
    # Create the srt object
    mysrt = pysrt.SubRipFile(encoding=ENCODING)
    srtinput = [unicode(line, ENCODING)
                for line in vars.source.file.read().splitlines()]
    mysrt.read(srtinput)
    result = dict(removed=0, inserted=0, errors=[])
    if vars.overwrite:
        # Delete any existent subtitle
        result["removed"] = db(
            db.subtitle.subtitulation_id == subtitulation.id).count()
        db(db.subtitle.subtitulation_id == subtitulation.id).delete()
    for subtitle in mysrt:
        try:
            starts = subtitle.start.to_time()
            ends = subtitle.end.to_time()
        except ValueError:
            result["errors"].append(
                T("Invalid time input: %(start)s - %(end)s", lazy=False) %
                dict(start=subtitle.start, end=subtitle.end))
            # Skip the bad entry: without this the insert below would run
            # with the previous subtitle's times (or with undefined names on
            # the first iteration) and the entry would count as inserted.
            continue
        db.subtitle.insert(subtitulation_id=subtitulation.id,
                           body=subtitle.text,
                           starts=starts,
                           ends=ends)
        result["inserted"] += 1
    return result
Ejemplo n.º 3
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``((start, end), text)`` pairs as one SRT-formatted string.

    Each pair becomes a ``pysrt.SubRipItem`` numbered from 1.  The start is
    moved earlier by ``show_before`` (never below 0) and the end is moved
    later by ``show_after``.  The rendered items are joined with newlines.
    """
    srt_file = pysrt.SubRipFile()
    number = 0
    for (begin, finish), caption in subtitles:
        number += 1
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt_file.append(cue)
    rendered = [six.text_type(cue) for cue in srt_file]
    return '\n'.join(rendered)
Ejemplo n.º 4
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``(range, text)`` pairs as one SRT-formatted string.

    ``range`` must unpack into exactly ``(start, end)``.  The start is moved
    earlier by ``show_before`` (never below 0) and the end later by
    ``show_after``.  Item indexes are left at the ``SubRipItem`` default.
    """
    srt_file = pysrt.SubRipFile()
    for span, caption in subtitles:
        cue = pysrt.SubRipItem()
        cue.text = force_unicode(caption)
        begin, finish = span
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt_file.append(cue)
    return '\n'.join([unicode(cue) for cue in srt_file])
Ejemplo n.º 5
0
    def CreateCleanSubAndMuteList(self, cleanSubsFileSpec=None):
        """Write a censored subtitle file and build the matching mute filters.

        Steps:
          1. Copy the input subtitles to ``<name>_utf8<ext>`` and run
             ``UTF8Convert`` on the copy.
          2. Load ``self.swearsFileSpec`` into ``self.swearsMap``; each line is
             ``word`` or ``word|replacement`` (default replacement ``*****``).
          3. Save to ``self.cleanSubsFileSpec`` only the subtitles whose text
             changed after replacement.
          4. Store in ``self.muteTimeList`` one ``volume=enable=...`` filter
             string per changed subtitle, covering its start..end in seconds.

        Args:
            cleanSubsFileSpec: output path; defaults to
                ``<input name>_clean<input ext>``.
        """
        subFileParts = os.path.splitext(self.inputSubsFileSpec)

        self.tmpSubsFileSpec = subFileParts[0] + "_utf8" + subFileParts[1]
        shutil.copy2(self.inputSubsFileSpec, self.tmpSubsFileSpec)
        UTF8Convert(self.tmpSubsFileSpec)

        if cleanSubsFileSpec is not None:
            self.cleanSubsFileSpec = cleanSubsFileSpec
        else:
            self.cleanSubsFileSpec = (
                subFileParts[0] + "_clean" + subFileParts[1])

        with open(self.swearsFileSpec) as f:
            lines = [line.rstrip('\n') for line in f]

        for line in lines:
            lineMap = line.split("|")
            if not lineMap[0]:
                # A blank line would add an empty alternative to the regex,
                # which matches at every word boundary.
                continue
            if len(lineMap) > 1:
                self.swearsMap[lineMap[0]] = lineMap[1]
            else:
                self.swearsMap[lineMap[0]] = "*****"

        # Entries are inserted into the pattern as-is (not escaped), so a
        # swears-file entry may itself be a regular expression.
        words = [word for word in self.swearsMap.keys() if word]
        loweredMap = {word.lower(): self.swearsMap[word] for word in words}

        def replacement(match):
            # The pattern is case-insensitive, so the matched text may differ
            # in case from the key it came from; fall back to a lower-cased
            # lookup instead of raising KeyError.
            found = match.group()
            if found in self.swearsMap:
                return self.swearsMap[found]
            return loweredMap.get(found.lower(), "*****")

        replacer = None
        if words:
            replacer = re.compile(
                r'\b(' + '|'.join(words) + r')\b', re.IGNORECASE)

        subs = pysrt.open(self.tmpSubsFileSpec)
        newSubs = pysrt.SubRipFile()
        for sub in subs:
            if replacer is None:
                newText = sub.text
            else:
                newText = replacer.sub(replacement, sub.text)
            if newText != sub.text:
                sub.text = newText
                newSubs.append(sub)
        newSubs.save(self.cleanSubsFileSpec)

        def toSeconds(moment):
            # Convert a time-of-day value to fractional seconds.
            return ((moment.hour * 60.0 * 60.0) + (moment.minute * 60.0) +
                    moment.second + (moment.microsecond / 1000000.0))

        timePairs = [(sub.start.to_time(), sub.end.to_time())
                     for sub in newSubs]

        self.muteTimeList = []
        for startTime, endTime in timePairs:
            self.muteTimeList.append(
                "volume=enable='between(t," +
                format(toSeconds(startTime), '.3f') + "," +
                format(toSeconds(endTime), '.3f') + ")':volume=0")
Ejemplo n.º 6
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``(range, text)`` pairs as one SRT-formatted string.

    ``range`` must unpack into exactly three values; the first two are used
    as start and end, the third is ignored.  Items are numbered from 1, the
    start is moved earlier by ``show_before`` (never below 0) and the end
    later by ``show_after``.
    """
    srt_file = pysrt.SubRipFile()
    number = 1
    for span, caption in subtitles:
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = force_unicode(caption)
        begin, finish, _ignored = span
        cue.start.seconds = max(0, begin - show_before)
        cue.end.seconds = finish + show_after
        srt_file.append(cue)
        number += 1
    return '\n'.join([six.text_type(cue) for cue in srt_file])
Ejemplo n.º 7
0
def criarArquivoSRT(tempodalegenda, preenchimento_antes=0, preenchimento_depois=0):
    """Build SRT text from ``((start, end), text)`` pairs.

    Items are numbered from 1.  ``preenchimento_antes`` is subtracted from
    each start (never going below 0) and ``preenchimento_depois`` is added
    to each end.  Returns the rendered items joined with newlines.
    """
    arquivo = pysrt.SubRipFile()
    numero = 0
    for (comeco, termino), texto in tempodalegenda:
        numero += 1
        entrada = pysrt.SubRipItem()
        entrada.index = numero
        entrada.text = six.text_type(texto)
        entrada.start.seconds = max(0, comeco - preenchimento_antes)
        entrada.end.seconds = termino + preenchimento_depois
        arquivo.append(entrada)
    return '\n'.join([six.text_type(entrada) for entrada in arquivo])
Ejemplo n.º 8
0
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``(range, text)`` pairs as one SRT-formatted string.

    ``range[0]`` and ``range[1]`` are taken as start and end.  Items are
    numbered from 1, the start is moved earlier by ``show_before`` (never
    below 0) and the end later by ``show_after``.
    """
    srt_file = pysrt.SubRipFile()
    number = 0
    for span, caption in subtitles:
        number += 1
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = force_unicode(caption)
        cue.start.seconds = max(0, span[0] - show_before)
        cue.end.seconds = span[1] + show_after
        srt_file.append(cue)
    return '\n'.join(str(cue) for cue in srt_file)
Ejemplo n.º 9
0
 def auto_generate(self, widget, name):
     """Reset the work directory, load the audio and start generation.

     Recreates ``./splitAudio``, loads ``self.filename[8:]`` as the audio
     source, saves an empty UTF-8 ``.srt`` next to it, then starts
     ``self.start_generate`` in a separate process and
     ``self.show_generated`` in a new thread.  ``widget`` and ``name`` are
     not used.
     """
     work_dir = './splitAudio'
     shutil.rmtree(work_dir)
     os.mkdir(work_dir)

     # NOTE(review): the first 8 characters of self.filename are dropped --
     # presumably a URI prefix; confirm against the caller.
     media_path = self.filename[8:]
     self.sound_file = AudioSegment.from_file(media_path)
     self.len_file = len(self.sound_file)
     print("Length of track: ", self.len_file / second, "seconds")

     # Same path with its last four characters replaced by ".srt".
     srt_path = self.filename[8:-4] + ".srt"
     self.sub_write_file = pysrt.SubRipFile(encoding='utf-8')
     self.sub_write_file.save(srt_path, encoding='utf-8')

     worker = multiprocessing.Process(target=self.start_generate, args=())
     self.gen = worker
     worker.start()
     self.auto_generate_subtitles = thread.start_new_thread(
         self.show_generated, ())
Ejemplo n.º 10
0
def xml_to_srt(xml_data):
    """
    xml_data - ET
    Build a SubRipFile from the child elements of xml_data.

    Each child contributes one item: its unescaped text, a start taken from
    the "start" attribute and an end of "start" + "dur" (both read as
    floats and assigned to the seconds field).
    """
    srt_file = pysrt.SubRipFile()
    for node in xml_data:
        cue = pysrt.SubRipItem()
        cue.text = h.unescape(node.text)
        begin = float(node.attrib["start"])
        cue.start.seconds = begin
        cue.end.seconds = begin + float(node.attrib["dur"])
        srt_file.append(cue)
    return srt_file
Ejemplo n.º 11
0
def srt_formatter(subtitles, padding_before=0, padding_after=0):
    """
    Serialize ``((start, end), text)`` pairs as SRT text.

    Items are numbered from 1; ``padding_before`` is subtracted from every
    start (clamped at 0) and ``padding_after`` is added to every end.
    """
    def build_item(number, timing, caption):
        # One SubRipItem with padded timings.
        begin, finish = timing
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin - padding_before)
        cue.end.seconds = finish + padding_after
        return cue

    srt_file = pysrt.SubRipFile()
    for number, (timing, caption) in enumerate(subtitles, start=1):
        srt_file.append(build_item(number, timing, caption))
    return '\n'.join([six.text_type(cue) for cue in srt_file])
Ejemplo n.º 12
0
def join_srt_files(srt_top, srt_btm, srt_out):
    """Merge two subtitle files into one and save it to ``srt_out``.

    Items from ``srt_top`` have their text wrapped with
    ``TOP_SRT_TEMPLATE``; items from ``srt_btm`` are kept as they are.  The
    combined list is sorted and re-indexed before saving.
    """
    upper = pysrt.open(srt_top)
    lower = pysrt.open(srt_btm)

    combined = pysrt.SubRipFile(items=lower)
    for cue in upper:
        wrapped = TOP_SRT_TEMPLATE.format(cue.text)
        cue.text = wrapped
        combined.append(cue)

    combined.sort()
    combined.clean_indexes()
    combined.save(srt_out)
Ejemplo n.º 13
0
 def generate(self,
              subtitles,
              show_before=0,
              show_after=0,
              *args,
              **kwargs) -> str:
     """Render ``((start, end), text)`` pairs as one SRT-formatted string.

     Items are numbered from 1; ``show_before`` is subtracted from every
     start (clamped at 0) and ``show_after`` is added to every end.  Extra
     positional and keyword arguments are accepted and ignored.
     """
     srt_file = pysrt.SubRipFile()
     number = 0
     for (begin, finish), caption in subtitles:
         number += 1
         cue = pysrt.SubRipItem()
         cue.index = number
         cue.text = str(caption)
         cue.start.seconds = max(0, begin - show_before)
         cue.end.seconds = finish + show_after
         srt_file.append(cue)
     return '\n'.join([str(cue) for cue in srt_file])
Ejemplo n.º 14
0
def make_subtitles(frames_time, frames_annotation, user_id):
    """Write ``<MAIN_DIRECTORY><user_id>/subtitles.srt`` from frame data.

    For entry ``i``: the index is ``frames_time[i][0] + 1``, the start is
    ``frames_time[i][1]``, the text is ``frames_annotation[i][1]`` and the
    end is the next entry's start.  The last entry gets no explicit end.
    """
    srt_file = pysrt.SubRipFile(encoding='utf-8')
    last = len(frames_time) - 1

    # Every entry except the last ends where the following one starts.
    for pos in range(last):
        current = frames_time[pos]
        cue = pysrt.SubRipItem()
        cue.index = current[0] + 1
        cue.start.seconds = current[1]
        cue.end.seconds = frames_time[pos + 1][1]
        cue.text = frames_annotation[pos][1]
        srt_file.append(cue)

    # Final entry: there is no following start, so the end is left unset.
    closing = pysrt.SubRipItem()
    closing.index = frames_time[last][0] + 1
    closing.start.seconds = frames_time[last][1]
    closing.text = frames_annotation[last][1]
    srt_file.append(closing)

    target = MAIN_DIRECTORY + '%d/subtitles.srt' % user_id
    srt_file.save(target)
Ejemplo n.º 15
0
def write_transcripts(transcript_filename, transcript, reg):
    """Write ``transcript`` lines timed by ``reg`` to an SRT file.

    ``reg`` supplies one ``(start, end)`` pair per transcript entry.  Items
    are numbered from 0 and a negative start is clamped to 0.  The output is
    written UTF-8 encoded to ``output_filepath + transcript_filename``.
    Always returns True.
    """
    print(transcript)
    import six
    srt_file = pysrt.SubRipFile()
    for number, ((begin, finish), caption) in enumerate(zip(reg, transcript)):
        print(number, begin, finish, caption)
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin)
        cue.end.seconds = finish
        srt_file.append(cue)
    rendered = '\n'.join([six.text_type(cue) for cue in srt_file])
    payload = rendered.encode("utf-8")
    with open(output_filepath + transcript_filename, "wb") as out:
        out.write(payload)
    print("+ Successfully Generated Subtitles.")
    return True
Ejemplo n.º 16
0
def combine_srt(srt_list):
    """
    srt_list - a list of SubRipFiles
    Stack the texts of all files position by position into a new SubRipFile.

    The first file determines the number of items and their start/end
    times.  For each position the texts of every file are joined with
    newlines and trailing whitespace is stripped.  Returns None when
    srt_list is None or empty.
    """
    if srt_list is None or not len(srt_list):
        return None
    reference = srt_list[0]
    combined = pysrt.SubRipFile()
    for position, timing in enumerate(reference):
        cue = pysrt.SubRipItem()
        stacked = "".join(srt[position].text + "\n" for srt in srt_list)
        cue.text = (cue.text + stacked).rstrip()
        cue.start = timing.start
        cue.end = timing.end
        combined.append(cue)
    return combined
Ejemplo n.º 17
0
    def execute(self, context):
        """Build subtitles for every text strip in the edit channel.

        For each TEXT strip in ``scene.subtitle_edit_channel`` (ordered by
        ``frame_start``) the scene frame range is narrowed to the strip, the
        audio is mixed down to ``temp.wav``, the strip text is written with
        ``write_word_level`` and the items returned by ``make_subs`` are
        collected.  The collected items are passed to ``subsutils.addSubs``.

        The scene frame range is restored afterwards, even if a mixdown
        fails.  Returns ``{"FINISHED"}``.
        """
        scene = context.scene
        edit_channel = scene.subtitle_edit_channel

        fps = scene.render.fps / scene.render.fps_base

        # Saved so the temporary per-strip frame range can be undone.
        original_start = scene.frame_start
        original_end = scene.frame_end

        all_strips = sorted(scene.sequence_editor.sequences_all,
                            key=lambda strip: strip.frame_start)
        text_strips = [
            strip for strip in all_strips
            if strip.type == "TEXT" and strip.channel == edit_channel
        ]

        base_dir = os.path.dirname(__file__)
        wav_path = os.path.join(base_dir, 'temp.wav')
        txt_path = os.path.join(base_dir, 'temp.txt')
        srt_path = os.path.join(base_dir, 'temp.srt')

        subs = pysrt.SubRipFile()

        try:
            for strip in text_strips:
                frame_start = strip.frame_start
                frame_end = strip.frame_final_end - 1
                start = (frame_start + 1) / fps

                # The mixdown below uses the scene frame range, so narrow it
                # to the current strip.
                scene.frame_start = frame_start
                scene.frame_end = frame_end

                bpy.ops.sound.mixdown(filepath=wav_path,
                                      container="WAV",
                                      codec="PCM")
                write_word_level(strip.text, txt_path)
                subs.extend(make_subs(wav_path, txt_path, srt_path, start))
        finally:
            # Previously the saved values were never written back, leaving
            # the scene stuck on the last strip's frame range.
            scene.frame_start = original_start
            scene.frame_end = original_end

        subsutils.addSubs(context, subs, use_color=True)

        return {"FINISHED"}
Ejemplo n.º 18
0
def export_to_srt(subtitulation):
    """Return an in-memory SRT export of a subtitulation's subtitles.

    Rows of ``db.subtitle`` belonging to ``subtitulation.id`` are read in
    ``starts`` order and turned into items numbered from 0.  Bodies that are
    not already unicode are decoded with ``ENCODING``.  The result is a
    ``StringIO`` positioned at offset 0.
    """
    import pysrt
    import StringIO
    owned = db.subtitle.subtitulation_id == subtitulation.id
    rows = db(owned).select(orderby=db.subtitle.starts)
    stream = StringIO.StringIO()
    srt_file = pysrt.SubRipFile(encoding=ENCODING)
    for number, row in enumerate(rows):
        cue = pysrt.SubRipItem()
        if not isinstance(row.body, unicode):
            print("is not unicode")
            cue.text = unicode(row.body, ENCODING)
        else:
            print("is unicode")
            cue.text = row.body
        cue.start = pysrt.SubRipTime.from_time(row.starts)
        cue.end = pysrt.SubRipTime.from_time(row.ends)
        cue.index = number
        srt_file.append(cue)
    srt_file.write_into(stream)
    stream.seek(0)
    return stream
Ejemplo n.º 19
0
def split_subtitles(srt_file, invert_commercials, out_file):
    """Keep only subtitles inside the given segments, shifted to close gaps.

    Args:
        srt_file: path of the subtitle file to read.
        invert_commercials: sequence of segments; ``segment[0]`` is the
            segment start and ``segment[1]`` its end, or None for an
            open-ended segment.
        out_file: path the filtered, shifted subtitles are saved to.

    A subtitle belongs to a segment when its start is at or after the
    segment start and before the segment end.  Each kept subtitle is copied
    and shifted by the running offset ``shift``, which for every segment is
    decreased by the segment start and increased by the previous segment's
    end.
    """
    subs = pysrt.open(srt_file)
    parts = []
    prev = 0.0
    shift = 0
    for c in invert_commercials:
        shift = shift - float(c[0]) + prev
        if subs.data:
            # Segment bounds do not depend on the subtitle: convert once.
            lower = to_time(c[0])
            upper = to_time(c[1]) if c[1] is not None else None
            offset = None
            for item in subs.data:
                if item.start < lower:
                    continue
                if upper is not None and not (item.start < upper):
                    continue
                if offset is None:
                    # Converted lazily, only once a subtitle actually falls
                    # into this segment.
                    offset = to_time(shift)
                moved = copy(item)
                moved.shift(hours=offset['hours'],
                            minutes=offset['minutes'],
                            seconds=offset['seconds'],
                            milliseconds=offset['milliseconds'])
                parts.append(moved)
        # NOTE(review): an open-ended segment records -1 as its end, which
        # feeds into the next segment's shift -- confirm this is intended.
        prev = c[1] if c[1] is not None else -1
    subs = pysrt.SubRipFile(items=parts)
    subs.save(out_file)
Ejemplo n.º 20
0
def merge_sub(sub1, sub2, bar, driver):
    """Merge two subtitle sources into one dual-language SubRipFile.

    Both inputs are converted with ``dataframe_sub`` (tagged "en" and "ru"),
    concatenated and clustered on their ``start``/``end`` columns with
    agglomerative clustering.  For each cluster the texts of each language
    are joined with spaces, then the two languages are joined with the
    configured separator, and one subtitle item is emitted.

    Args:
        sub1: first subtitle source, tagged "en".
        sub2: second subtitle source, tagged "ru".
        bar: total progress-bar amount to spread over the clusters; also
            forwarded to ``translate_sub``.
        driver: forwarded unchanged to ``translate_sub``.

    Returns:
        The merged ``pysrt.SubRipFile`` with cleaned indexes.

    Reads the module-level GUI state ``space_var``, ``clusters_auto_var``,
    ``clusters_manual_entry``, ``clusters_method_combobox`` and
    ``translate_var``, and updates ``root`` and ``progressBar``.
    """
    # Separator placed between the two languages inside one subtitle.
    if space_var.get() == 1:
        space_sub = '\n&nbsp;\n'
    else:
        space_sub = '\n'
    sub1_df = dataframe_sub(sub1, "en")
    sub2_df = dataframe_sub(sub2, "ru")
    df = pd.concat([sub1_df, sub2_df], axis=0)
    df['sum'] = df[['start', 'end']].sum(axis=1)
    df['plus'] = (df['start'] + df['end']) / 2
    df = df.sort_values(by='start', ascending=True)
    # agglomerative clustering
    if clusters_auto_var.get() == 1:
        clusters_list = []
        # quality assessment using the silhouette score
        silhouette = []
        # Try 20 distance thresholds (0.2 .. 1, scaled by 10000) and record
        # the cluster count and silhouette score of each.
        for i in np.linspace(0.2, 1, 20):
            root.update()
            threshold = float(i) * 10000
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=threshold).fit(df[['start', 'end']])
            clusters = clustering.labels_
            clusters_list.append(len(pd.unique(clusters)))
            score = silhouette_score(df[['start', 'end']], clusters)
            silhouette.append(score)
        # Re-fit with the cluster count that scored best.
        max_silhouette = np.argmax(silhouette)
        clustering = AgglomerativeClustering(
            n_clusters=clusters_list[max_silhouette]).fit(df[['start', 'end']])
    else:
        # Manual mode: threshold and linkage come from the GUI widgets.
        threshold = float(clusters_manual_entry.get()) * 10000
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=threshold,
            linkage=clusters_method_combobox.get()).fit(df[['start', 'end']])
    clusters = clustering.labels_
    # attach the cluster labels that were found
    df['cluster'] = clusters
    # Progress-bar increment per cluster.
    bar_subs = float(bar) / float(len(pd.unique(clusters)))
    # create the new subtitle file
    double_sub = pysrt.SubRipFile(encoding='utf-8')
    translate_list = pysrt.SubRipFile(encoding='utf-8')
    for n, i in enumerate(pd.unique(clusters)):
        root.update()
        progressBar['value'] += bar_subs
        df_en = df[(df['language'] == 'en') & (df['cluster'] == i)]
        df_ru = df[(df['language'] == 'ru') & (df['cluster'] == i)]
        # Collapse each language's rows in this cluster into a single row.
        df_group_en = df_en.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group_ru = df_ru.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        # Combine the two language rows; 'language' becomes the
        # concatenation of the tags present in the cluster.
        df_group = df_group_en.merge(
            df_group_ru,
            on=['cluster', 'text', 'start', 'end', 'language'],
            how='outer').groupby('cluster').agg({
                'text': space_sub.join,
                'start': 'first',
                'end': 'first',
                'language': ''.join
            })
        sub = pysrt.SubRipItem(index=n + 1,
                               start=int(df_group.iloc[0]['start']),
                               end=int(df_group.iloc[0]['end']),
                               text=str(df_group.iloc[0]['text']))
        double_sub.append(sub)
        # Clusters containing only "en" text are queued for translation.
        if translate_var.get() == 1 and df_group['language'].values == 'en':
            translate_list.append(sub)
    if translate_var.get() == 1 and translate_list:
        translate_sub(translate_list, bar, driver)
    # re-index the subtitles
    double_sub.clean_indexes()
    return double_sub
# Parse the command-line options declared on `parser`.
args = parser.parse_args()

# Paths and names taken from the command line, plus the set of characters
# treated as end-of-sentence punctuation.
srtRawPath = args.srtRawPath
inputName = args.inputName
pathRealLine = args.pathRealLine
outputSRT = args.outputSRT
END_PUNCTUATION = '.?!'


# Read files and debugging files
srtOriginal = pysrt.open(srtRawPath+inputName+'.srt', encoding='utf-8')#, encoding='iso-8859-1'
fileLine = pathRealLine+inputName+'_linea.txt'
# NOTE(review): this handle is not closed anywhere in the visible code.
linesIterator = open(fileLine, encoding='utf-8')

# Output container and output file names, all derived from inputName.
srtTransformed = pysrt.SubRipFile()
srtTransformedName = outputSRT+inputName + '.srt'
srtTransformedName1  = outputSRT+inputName + '_trans1.srt'
srtTransformedName2 = outputSRT+inputName + '_trans2.srt'
# NOTE(review): opened for writing and left open here; presumably written to
# and closed by later code -- confirm.
alertNumLines = open(outputSRT+inputName+"_alerta.txt",'w')


# Number of lines in per-line file should be the same as the number of lines calculated in SRT-raw
# Counting consumes linesIterator: it is exhausted after this statement.
numLinesOrig = sum(1 for line in linesIterator)
numSubs = len(srtOriginal)
# Raises IndexError if the SRT file contains no subtitles.
first_sub = srtOriginal[0]


# Running state, seeded from the first subtitle.
prevText = ''
prevEnd = first_sub.end
prevStart = first_sub.start
Ejemplo n.º 22
0
def find_summary_regions(srt_filename, summarizer, duration, language,
                         bonusWords, stigmaWords, videonamepart):
    """Pick summary regions of about ``duration`` and write their subtitles.

    The sentence count is first estimated as ``duration`` divided by the
    average subtitle duration, then adjusted one sentence at a time:
    upwards while the summary is shorter than ``duration`` (stopping when
    the total stops changing), otherwise downwards while it is longer
    (stopping at two sentences).

    The selected subtitles are re-timed back to back starting at 0 and
    written to ``videonamepart + ".srt"``.

    Returns:
        The ``summary`` value produced by the last ``summarize`` call.
    """
    srt_file = pysrt.open(srt_filename)

    # Average time a single subtitle stays on screen.
    clip_ranges = list(map(srt_item_to_range, srt_file))
    avg_subtitle_duration = (
        total_duration_of_regions(clip_ranges) / len(srt_file))

    # Initial estimate of how many sentences the summary needs.
    n_sentences = duration / avg_subtitle_duration
    print("nsentance : " + str(n_sentences))

    def run_summarizer(count):
        # Summarize the loaded subtitles down to `count` sentences.
        return summarize(srt_file, summarizer, count, language, bonusWords,
                         stigmaWords)

    [summary, summarizedSubtitles] = run_summarizer(n_sentences)
    total_time = total_duration_of_regions(summary)
    print("total_time : " + str(total_time))

    if total_time < duration:
        # Summary too short: add sentences until it reaches the requested
        # duration or adding one more no longer changes the total.
        prev_total_time = -1
        while total_time < duration:
            if prev_total_time == total_time:
                print("1 : Maximum summarization time reached")
                break
            print("1 : total_time : duration " + str(total_time) + " " +
                  str(duration))
            n_sentences += 1
            [summary, summarizedSubtitles] = run_summarizer(n_sentences)
            prev_total_time = total_time
            total_time = total_duration_of_regions(summary)
    else:
        # Summary too long (or exact): drop sentences until it fits or only
        # two sentences remain.
        while total_time > duration:
            if n_sentences <= 2:
                print("2 : Minimum summarization time reached")
                break
            print("2 : total_time : duration " + str(total_time) +
                  str(duration))
            n_sentences -= 1
            [summary, summarizedSubtitles] = run_summarizer(n_sentences)
            total_time = total_duration_of_regions(summary)

    print("************ THis is summary array *********")
    print(summary)
    print("**********************************")

    print(
        "************************THis is summarizedSubtitles array *******************"
    )
    print(summarizedSubtitles)
    print("**********************************************************")

    # Re-time the chosen subtitles so each starts where the previous ended.
    cursor = 0
    sub_rip_file = pysrt.SubRipFile()
    for position, chosen in enumerate(summarizedSubtitles):
        span = summary[position][1] - summary[position][0]
        cue = pysrt.SubRipItem()
        cue.index = position
        cue.text = chosen.text
        cue.start.seconds = cursor
        cue.end.seconds = cursor + span
        sub_rip_file.append(cue)
        cursor = cursor + span

    print(sub_rip_file)

    path = videonamepart + ".srt"
    with open(path, "w+") as sf:
        for cue in sub_rip_file:
            sf.write(str(cue) + "\n")

    # return the resulting summarized region array
    return summary
Ejemplo n.º 23
0
    def CreateCleanSubAndMuteList(self):
        """Write a censored subtitle file, mute filters and optional EDL.

        Steps:
          1. Copy the input subtitles to ``<name>_utf8<ext>`` and run
             ``UTF8Convert`` on the copy.
          2. Load ``self.swearsFileSpec`` into ``self.swearsMap``; each line is
             ``word`` or ``word|replacement`` (default replacement ``*****``).
          3. Walk the subtitles with ``pairwise``.  A subtitle is selected
             when its text changed after replacement or, with a positive
             ``self.swearsPadMillisec``, when it lies within the pad of a
             changed neighbour.  Selected subtitles are saved to
             ``self.cleanSubsFileSpec``; with ``self.fullSubs`` the
             unselected ones are saved too.
          4. Store one ``volume=enable=...`` filter per selected subtitle in
             ``self.muteTimeList`` (changed subtitles are widened by the pad)
             and, when ``self.edl`` is set, write matching lines to
             ``self.edlFileSpec``.

        Raises:
            IOError: if the input subtitle file is unset or does not exist.
        """
        if (self.inputSubsFileSpec is None) or (not os.path.isfile(self.inputSubsFileSpec)):
            raise IOError(
                errno.ENOENT,
                f"Input subtitle file unspecified or not found ({os.strerror(errno.ENOENT)})",
                self.inputSubsFileSpec,
            )

        subFileParts = os.path.splitext(self.inputSubsFileSpec)

        self.tmpSubsFileSpec = subFileParts[0] + "_utf8" + subFileParts[1]
        shutil.copy2(self.inputSubsFileSpec, self.tmpSubsFileSpec)
        UTF8Convert(self.tmpSubsFileSpec)

        if not self.cleanSubsFileSpec:
            self.cleanSubsFileSpec = subFileParts[0] + "_clean" + subFileParts[1]

        if not self.edlFileSpec:
            cleanSubFileParts = os.path.splitext(self.cleanSubsFileSpec)
            self.edlFileSpec = cleanSubFileParts[0] + '.edl'

        with open(self.swearsFileSpec) as f:
            lines = [line.rstrip('\n') for line in f]

        for line in lines:
            lineMap = line.split("|")
            if not lineMap[0]:
                # A blank line would add an empty alternative to the regex,
                # which matches at every word boundary.
                continue
            if len(lineMap) > 1:
                self.swearsMap[lineMap[0]] = lineMap[1]
            else:
                self.swearsMap[lineMap[0]] = "*****"

        # Entries are inserted into the pattern as-is (not escaped), so a
        # swears-file entry may itself be a regular expression.
        words = [word for word in self.swearsMap.keys() if word]
        loweredMap = {word.lower(): self.swearsMap[word] for word in words}
        replacer = (
            re.compile(r'\b(' + '|'.join(words) + r')\b', re.IGNORECASE) if words else None
        )

        def replacement(match):
            # The pattern is case-insensitive, so the matched text may differ
            # in case from the key it came from; fall back to a lower-cased
            # lookup instead of raising KeyError.
            found = match.group()
            if found in self.swearsMap:
                return self.swearsMap[found]
            return loweredMap.get(found.lower(), "*****")

        def scrub(text):
            # Return text with every listed word replaced.
            return text if replacer is None else replacer.sub(replacement, text)

        pad = self.swearsPadMillisec
        subs = pysrt.open(self.tmpSubsFileSpec)
        newSubs = pysrt.SubRipFile()
        newTimestampPairs = []

        # for each subtitle in the set
        # if text contains profanity...
        # OR if the next text contains profanity and lies within the pad ...
        # OR if the previous text contained profanity and lies within the pad ...
        # then include the subtitle in the new set
        prevNaughtySub = None
        # NOTE(review): `pairwise` is assumed to yield (sub, next-or-None);
        # the None checks below rely on that -- confirm against its definition.
        for sub, subPeek in pairwise(subs):
            newText = scrub(sub.text)
            newTextPeek = scrub(subPeek.text) if (subPeek is not None) else None

            subScrubbed = newText != sub.text
            nextIsClose = (
                (newTextPeek is not None)
                and (newTextPeek != subPeek.text)
                and ((subPeek.start.ordinal - sub.end.ordinal) <= pad)
            )
            prevIsClose = (prevNaughtySub is not None) and (
                (sub.start.ordinal - prevNaughtySub.end.ordinal) <= pad
            )
            # NOTE(review): as in the original, the pad rules apply only when
            # a following subtitle exists, so the final subtitle is never
            # selected through its previous neighbour -- confirm intent.
            padded = (pad > 0) and (newTextPeek is not None) and (nextIsClose or prevIsClose)

            if subScrubbed or padded:
                sub.text = newText
                newSubs.append(sub)
                if subScrubbed:
                    prevNaughtySub = sub
                    newTimes = [
                        # Clamp at 0: a negative ordinal makes to_time() fail
                        # when the subtitle starts within `pad` of the start.
                        pysrt.SubRipTime.from_ordinal(max(0, sub.start.ordinal - pad)).to_time(),
                        pysrt.SubRipTime.from_ordinal(sub.end.ordinal + pad).to_time(),
                    ]
                else:
                    prevNaughtySub = None
                    newTimes = [sub.start.to_time(), sub.end.to_time()]
                newTimestampPairs.append(newTimes)
            else:
                if self.fullSubs:
                    newSubs.append(sub)
                prevNaughtySub = None

        newSubs.save(self.cleanSubsFileSpec)

        def toSeconds(moment):
            # Convert a time-of-day value to fractional seconds.
            return (
                (moment.hour * 60.0 * 60.0)
                + (moment.minute * 60.0)
                + moment.second
                + (moment.microsecond / 1000000.0)
            )

        self.muteTimeList = []
        edlLines = []
        for timePair in newTimestampPairs:
            lineStart = toSeconds(timePair[0])
            lineEnd = toSeconds(timePair[1])
            self.muteTimeList.append(
                "volume=enable='between(t," + format(lineStart, '.3f') + "," + format(lineEnd, '.3f') + ")':volume=0"
            )
            if self.edl:
                edlLines.append(f"{format(lineStart, '.1f')}\t{format(lineEnd, '.3f')}\t1")
        if self.edl and (len(edlLines) > 0):
            with open(self.edlFileSpec, 'w') as edlFile:
                for item in edlLines:
                    edlFile.write(f"{item}\n")
Ejemplo n.º 24
0
    def process(self):
        """Re-time subtitles from a word-alignment JSON file.

        Reads the alignment JSON (``self.jsonfile``) and the reference
        subtitles (``self.srtfile``).  The JSON ``transcript`` is split into
        one sentence per line and must have as many sentences as the
        reference file has items.  Each word of each sentence is matched, in
        order, against the aligned words; a sentence's start is the start of
        its first word with case ``'success'`` and its end is the end of its
        last such word.  Sentences with no successfully aligned word keep the
        reference item's times.  The result is saved to ``self.outfile`` as
        UTF-8.

        Raises:
            AssertionError: if the sentence count differs from the reference
                subtitle count, or a transcript word does not match the next
                aligned word.
        """
        gentle_file = self.jsonfile
        out_file = self.outfile
        srt_file = self.srtfile

        # Close the JSON file deterministically instead of leaking the handle.
        with open(gentle_file, 'r') as handle:
            g = json.load(handle)
        t = g['transcript']
        g_words = [
            w for w in g['words'] if w['case'] != 'not-found-in-transcript'
        ]

        sentences = [sent.replace('-', ' ') for sent in t.split('\n')]

        inputsrt_elems = pysrt.open(srt_file)
        assert (len(inputsrt_elems) == len(sentences))

        # Compiled once; previously rebuilt for every word.
        word_pattern = re.compile(r'(\w|\’\w|\'\w)+', re.UNICODE)

        srt_elems = pysrt.SubRipFile()
        # Position of the next unconsumed entry in g_words.
        counter = 0
        for sent_i, sent in enumerate(sentences):
            if not isinstance(sent, type(u'')):
                sent = sent.decode('utf-8')

            start_time_found = False
            for cur_word in sent.split():
                for w in word_pattern.finditer(cur_word):
                    word = w.group()
                    gentle_word = g_words[counter]
                    clean_word = word_pattern.search(word).group()
                    assert clean_word.lower() == gentle_word['word'].lower(), (
                        "transcript word %r does not match aligned word %r"
                        % (clean_word, gentle_word['word']))

                    if gentle_word['case'] == 'success':
                        # Retain first valid time boundary
                        if not start_time_found:
                            start_time = gentle_word['start']
                            start_time_found = True

                        # Keep scanning until the last valid time boundary
                        end_time = gentle_word['end']

                    counter += 1

            if not start_time_found:
                # No aligned word in this sentence: reuse the reference times.
                start_time = inputsrt_elems[sent_i].start
                end_time = inputsrt_elems[sent_i].end
            else:
                start_time = extract_time_tuple(start_time)
                end_time = extract_time_tuple(end_time)

            elem = pysrt.SubRipItem()
            elem.index = sent_i + 1
            elem.text = sent
            elem.start = start_time
            elem.end = end_time

            srt_elems.append(elem)

        srt_elems.save(out_file, encoding='utf-8')
Ejemplo n.º 25
0
            audio = r.record(source)
            text = r.recognize_sphinx(audio)
            file = pysrt.open('my_srt.srt', encoding='utf-8')
            sub = pysrt.SubRipItem()
            sub.index = counter
            counter += 1
            sub.start.milliseconds = start_chunk + splits[i]
            sub.end.milliseconds = start_chunk + splits[i + 1]
            sub.text = text
            file.append(sub)
            file.save('my_srt.srt', encoding='utf-8')
            print(text)


# Start from an empty working directory for the audio chunks.
shutil.rmtree('./splitAudio')
os.mkdir('./splitAudio')
sound_file = AudioSegment.from_file(sys.argv[1], "mp4")
len_file = len(sound_file)
print("Length of track: ", len_file / second, "seconds")
# Create an empty subtitle file on disk.
file = pysrt.SubRipFile(encoding='utf-8')
file.save('my_srt.srt', encoding='utf-8')

# Process the track in consecutive windows of chunk_size.
chunk_end = chunk_size
while (chunk_end < len_file):
    chunk_file = sound_file[chunk_end - chunk_size:chunk_end]
    do_subtitles_generation(chunk_file, chunk_end - chunk_size)
    chunk_end += chunk_size

# Process the remaining tail of the track.  The original call was spelled
# `do_subtiles_generation`, which raises NameError; the loop above uses
# `do_subtitles_generation`.
do_subtitles_generation(sound_file[chunk_end - chunk_size:],
                        chunk_end - chunk_size)
                    html = driver.page_source
                    soup = BeautifulSoup(html.encode('utf-8'),
                                         features='html.parser')
                    maindiv = soup.findAll("div", {"id": "show"})[0]
                    basicdiv = maindiv.findAll("b")
                    break
                except:
                    # html = driver.find_element_by_tag_name('html')
                    time.sleep(1)
                    driver.refresh()

            submissing = False
            if html.find(
                    "Sorry, there are no subtitle available for this video."
            ) != -1:
                file = pysrt.SubRipFile()
                sub = pysrt.SubRipItem(1,
                                       start='00:00:00,000',
                                       end='00:00:01,000',
                                       text="Sub was not found")
                file.append(sub)
                subtype = "F-"
                file.save("H:\#Everything Else\#Project Ashwini\SRT\\" +
                          channelName + "\\" + subtype + channelName + "-" +
                          str(videolinknum) + ".srt",
                          encoding='utf-8')
                submissing = True
                print("Sub missing")

            elif len(basicdiv) <= 1:
                file = pysrt.SubRipFile()
Ejemplo n.º 27
0
  def CreateCleanSubAndMuteList(self, cleanSubsFileSpec=None):
    """Write censored subtitle files and build the list of mute filters.

    Output paths:
      * If ``cleanSubsFileSpec`` is given it is used for the subtitles that
        were changed, and ``<base>_all_not_cleaned<ext>`` for the full set.
      * Otherwise both paths are derived from ``self.inputSubsFileSpec`` by
        splitting off two extensions, e.g. ``movie.en.srt`` becomes
        ``movie.clean.en.forced.srt`` (changed subtitles only) and
        ``movie.clean.en.srt`` (all subtitles); the input file, if it
        exists, is also copied to ``movie.orig.en.srt``.
      In both cases '[' and ']' are stripped from ``self.cleanSubsFileSpec``.

    The swears file (``self.swearsFileSpec``) holds one entry per line, either
    ``word`` or ``word|replacement``; entries without a replacement use
    ``*****``. Entries are added to ``self.swearsMap``. Matching is
    case-insensitive and on whole words.

    Side effects: sets ``self.cleanSubsFileSpec``,
    ``self.cleanSubsNotModFileSpec`` and ``self.muteTimeList`` (one
    ``volume=enable='between(t,START,END)':volume=0`` string per changed
    subtitle, times in seconds with millisecond precision), and writes the
    two subtitle files.

    Args:
      cleanSubsFileSpec: optional explicit path for the changed-subtitles file.
    """
    subFileParts = os.path.splitext(self.inputSubsFileSpec)
    if cleanSubsFileSpec is not None:
      self.cleanSubsFileSpec = cleanSubsFileSpec
      subFileParts = os.path.splitext(self.cleanSubsFileSpec)
      self.cleanSubsNotModFileSpec = subFileParts[0] + "_all_not_cleaned" + subFileParts[1]
    else:
      subFileFirstParts = os.path.splitext(subFileParts[0])
      self.cleanSubsFileSpec = subFileFirstParts[0] + ".clean" + subFileFirstParts[1] + ".forced" + subFileParts[1]
      self.cleanSubsNotModFileSpec = subFileFirstParts[0] + '.clean' + subFileFirstParts[1] + subFileParts[1]
      if os.path.isfile(self.inputSubsFileSpec):
        shutil.copyfile(self.inputSubsFileSpec, subFileFirstParts[0] + '.orig' + subFileFirstParts[1] + subFileParts[1])

    # remove brackets that interfere with ffmpeg subtitles filter
    self.cleanSubsFileSpec = self.cleanSubsFileSpec.translate({ord(x): '' for x in ['[',']']})

    with open(self.swearsFileSpec) as f:
      for line in f:
        lineMap = line.rstrip('\n').split("|")
        # A blank entry would become an empty regex alternative that matches
        # at every word boundary, so skip it.
        if not lineMap[0].strip():
          continue
        if len(lineMap) > 1:
          self.swearsMap[lineMap[0]] = lineMap[1]
        else:
          self.swearsMap[lineMap[0]] = "*****"

    # The pattern is case-insensitive, so replacements are looked up through a
    # lower-cased copy of the map; indexing self.swearsMap with the matched
    # text raised KeyError whenever the casing differed from the file entry.
    replacements = {word.lower(): repl for word, repl in self.swearsMap.items() if word.strip()}
    replacer = None
    if replacements:
      # Entries are literal words, so escape any regex metacharacters.
      replacer = re.compile(r'\b(' + '|'.join(re.escape(word) for word in replacements) + r')\b', re.IGNORECASE)

    # Let libmagic sniff the subtitle file's character encoding.
    with open(self.inputSubsFileSpec, 'rb') as subsFile:
      blob = subsFile.read()
    m = magic.open(magic.MAGIC_MIME_ENCODING)
    m.load()
    encoding = m.buffer(blob)

    subs = pysrt.open(self.inputSubsFileSpec, encoding=encoding)
    newSubs = pysrt.SubRipFile()
    newSubsNotMod = pysrt.SubRipFile()
    for sub in subs:
      if replacer is None:
        newText = sub.text
      else:
        newText = replacer.sub(lambda x: replacements[x.group().lower()], sub.text)
      if newText != sub.text:
        # The item is edited in place, so the same censored item is written
        # to both output files (as in the original implementation).
        sub.text = newText
        newSubs.append(sub)
      newSubsNotMod.append(sub)
    newSubs.save(self.cleanSubsFileSpec)
    newSubsNotMod.save(self.cleanSubsNotModFileSpec)

    def toSeconds(t):
      # Convert a datetime.time into fractional seconds.
      return (t.hour * 60.0 * 60.0) + (t.minute * 60.0) + t.second + (t.microsecond / 1000000.0)

    # One mute filter per censored subtitle, covering its display interval.
    self.muteTimeList = []
    for sub in newSubs:
      lineStart = toSeconds(sub.start.to_time())
      lineEnd = toSeconds(sub.end.to_time())
      self.muteTimeList.append("volume=enable='between(t," + format(lineStart, '.3f') + "," + format(lineEnd, '.3f') + ")':volume=0")