def main():
    """Extract Stephen's lines from every VTT file into ``captions.txt``.

    Every ``*.vtt`` file under ``DATA_DIR`` is read caption by caption, and
    each file starts with output switched on. A caption whose text is a
    single ``>> Jon: ...`` line switches output off and is dropped; a single
    ``>> Stephen: ...`` line switches it back on and contributes the text
    after the prefix. Captions seen while output is on have any remaining
    ``>> <speaker>:`` prefix removed, are flattened to one line and joined
    with spaces. One line per file is written to ``DATA_DIR/../captions.txt``,
    followed by an ``<|endoftext|>`` line.
    """
    # Compiled once: these run against every caption of every file.
    jon_turn = re.compile(r">> (Jon:|JON:|jon:) (.*)$")
    stephen_turn = re.compile(r">> (Stephen:|STEPHEN:|stephen:) (.*)$")
    # re.M lets "$" terminate the match on each line of a multi-line caption.
    any_prefix = re.compile(
        r">> (Stephen|STEPHEN|stephen)*(Jon|JON|jon)*:?(.*)$", re.M)

    vtt = webvtt.WebVTT()
    vtt_files = glob.iglob(f"{DATA_DIR}/*.vtt")
    txt_file = os.path.join(DATA_DIR, "../", "captions.txt")
    with open(txt_file, "w+") as f:
        for vtt_file in vtt_files:
            kept = []
            stephen_speaking = True
            for caption in vtt.read(vtt_file):
                t = caption.text
                if jon_turn.match(t):
                    stephen_speaking = False
                    continue
                stephen = stephen_turn.match(t)
                if stephen:
                    stephen_speaking = True
                    # The pattern is anchored at both ends, so the match
                    # already holds the only possible hit; group 2 is the
                    # text after the "Stephen:" prefix.
                    t = stephen.group(2)
                if not stephen_speaking:
                    continue
                # NOTE(review): a multi-line caption that starts with
                # ">> Jon:" is not caught by jon_turn (no re.M there) and
                # ends up here with only its prefix stripped -- confirm
                # whether that is intended.
                t = any_prefix.sub(r"\3", t)
                kept.append(t.replace("\n", " "))
            f.write(" ".join(kept) + "\n<|endoftext|>\n")
def test_webvtt():
    """Write a one-caption WebVTT document to ``test.vtt``.

    The still-empty document is saved first; a single two-line caption is
    then appended and the document is written again through an explicitly
    opened file handle.
    """
    target = 'test.vtt'
    document = webvtt.WebVTT()
    document.save(target)
    document.captions.append(
        webvtt.Caption(
            '00:00:00.500',
            '00:00:07.000',
            ['Caption line 1\nCaption line 2'],
        )
    )

    with open(target, 'w') as out:
        document.write(out)
def write2vtt(dict_list, output_file):
    """Write caption dictionaries to ``output_file`` in WebVTT format.

    :param dict_list: iterable of mappings, each with ``"start"``, ``"end"``
        and ``"content"`` keys; start and end are converted with
        ``float2string`` before being handed to ``webvtt.Caption``.
    :param output_file: path of the VTT file to write.
    """
    vtt = webvtt.WebVTT()
    # NOTE(review): this save of the still-empty document looks redundant
    # with the explicit write below; kept so the set of files touched does
    # not change -- confirm and remove.
    vtt.save(output_file)
    for entry in dict_list:
        vtt.captions.append(
            webvtt.Caption(
                float2string(entry["start"]),
                float2string(entry["end"]),
                [entry["content"]],
            )
        )

    with open(output_file, 'w') as fd:
        vtt.write(fd)
def write_vtt(df, filename):
    """Write the rows of ``df`` to ``filename`` as WebVTT captions.

    Each row becomes one caption. Comments longer than 80 characters are
    wrapped onto several caption lines of at most 80 characters each; a
    truthy ``speaker`` value is stored as the caption identifier.

    :param df: DataFrame with ``start_time``, ``end_time``, ``comment`` and
        ``speaker`` columns. ``".000"`` is appended to both times, so they
        must be strings without a fractional part (presumably ``HH:MM:SS``
        -- confirm against the caller).
    :param filename: path passed to ``WebVTT.save``.
    """
    # Local import so this function does not rely on the module's import block.
    import textwrap

    max_width = 80
    logging.info("Writing VTT")

    vtt = webvtt.WebVTT()

    for _, row in df.iterrows():
        comment = row["comment"]
        if len(comment) <= max_width:
            text = comment
        else:
            # textwrap.wrap keeps the final partial line and hard-breaks
            # words longer than the width, so no text is lost and a run of
            # 80 characters without a space cannot raise.
            text = textwrap.wrap(comment, width=max_width)

        caption = webvtt.Caption(
            start=row["start_time"] + ".000",
            end=row["end_time"] + ".000",
            text=text,
        )

        if row["speaker"]:
            caption.identifier = row["speaker"]

        vtt.captions.append(caption)

    vtt.save(filename)
    logging.info("VTT saved to %s", filename)
Example #5
0
 def test_save_no_filename(self):
     """Calling ``save`` with no filename on a new WebVTT must raise."""
     document = webvtt.WebVTT()
     with self.assertRaises(webvtt.errors.MissingFilenameError):
         document.save()
    def download(self,
                 filepath,
                 img_filepath=None,
                 annotation_format=entities.ViewAnnotationOptions.MASK,
                 height=None,
                 width=None,
                 thickness=1,
                 with_text=False):
        """
            Write the annotations to disk in the requested format.

            When ``filepath`` has no extension it is used as a directory and
            the file is named after ``self.item.name`` with a ``.json``,
            ``.png`` or ``.vtt`` extension, depending on the format.

        :param filepath: target file, or an extension-less directory path
        :param img_filepath: image drawn under the annotations; only opened
            for ANNOTATION_ON_IMAGE
        :param annotation_format: entities.ViewAnnotationOptions value; JSON,
            MASK, INSTANCE, ANNOTATION_ON_IMAGE and VTT are handled
        :param height: forwarded to ``self.show``
        :param width: forwarded to ``self.show``
        :param thickness: forwarded to ``self.show``
        :param with_text: forwarded to ``self.show``
        :return: the path the annotations were written to
        :raises PlatformException: for any other annotation_format
        """
        options = entities.ViewAnnotationOptions
        base, extension = os.path.splitext(filepath)

        def resolve(template):
            # An explicit file path wins; otherwise fill the template with
            # the directory and the item's name stripped of its extension.
            if extension:
                return filepath
            return template.format(base, os.path.splitext(self.item.name)[0])

        if annotation_format == options.JSON:
            filepath = resolve('{}/{}.json')
            payload = {
                '_id': self.item.id,
                'filename': self.item.filename,
                'annotations': [ann.to_json() for ann in self.annotations],
            }
            with open(filepath, 'w+') as f:
                json.dump(payload, f, indent=2)
            return filepath

        if annotation_format in (options.MASK,
                                 options.INSTANCE,
                                 options.ANNOTATION_ON_IMAGE):
            filepath = resolve('{}/{}.png')
            background = None
            if annotation_format == options.ANNOTATION_ON_IMAGE:
                # Rendered as a mask, drawn on top of the supplied image.
                annotation_format = options.MASK
                background = np.asarray(Image.open(img_filepath))
            rendered = self.show(image=background,
                                 thickness=thickness,
                                 with_text=with_text,
                                 height=height,
                                 width=width,
                                 annotation_format=annotation_format)
            Image.fromarray(rendered.astype(np.uint8)).save(filepath)
            return filepath

        if annotation_format == options.VTT:
            filepath = resolve('{}/{}.vtt')

            def as_timestamp(seconds):
                # str(timedelta) omits the fraction for whole seconds, so
                # add an explicit millisecond part in that case.
                stamp = str(datetime.timedelta(seconds=seconds))
                return stamp if '.' in stamp else stamp + '.000'

            subtitles = [{
                'start_time': annotation.start_time,
                'end_time': annotation.end_time,
                'text': annotation.coordinates['text']
            } for annotation in self.annotations
                         if annotation.type in ['subtitle']]
            subtitles.sort(key=lambda entry: entry['start_time'])

            vtt = webvtt.WebVTT()
            for entry in subtitles:
                begin = as_timestamp(entry['start_time'])
                finish = as_timestamp(entry['end_time'])
                vtt.captions.append(
                    webvtt.Caption(begin, finish,
                                   '{}'.format(entry['text'])))
            vtt.save(filepath)
            return filepath

        raise PlatformException(
            error="400",
            message="Unknown annotation option: {}".format(
                annotation_format))
 def test_captions_attribute(self):
     """A freshly constructed WebVTT exposes an empty ``captions`` list."""
     fresh = webvtt.WebVTT()
     self.assertListEqual([], fresh.captions)
 def test_webvtt_total_length_no_parser(self):
     """A freshly constructed WebVTT reports a ``total_length`` of 0."""
     fresh = webvtt.WebVTT()
     self.assertEqual(fresh.total_length, 0)