def test_write_to_sqlite(input_file):
    """
    Test production of sqlite output

    GIVEN an input file
    WHEN writing to sqlite
    THEN check output exists and contains content

    input_file is the transcript file handed to tscribe.write and
    tscribe.load_json (presumably supplied by a pytest fixture -- confirm
    against conftest).
    """

    # GIVEN an input file
    # WHEN writing to sqlite
    # A random name avoids collisions between parametrized runs.
    output_filename = Path(f"{uuid4().hex}.db")

    try:
        tscribe.write(input_file, save_as=output_filename, format="sqlite")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        conn = sqlite3.connect(str(output_filename))
        try:
            c = conn.cursor()
            c.execute("SELECT * FROM transcript")
            query = c.fetchall()
        finally:
            # Close before teardown so the database file is no longer open
            # when it gets removed.
            conn.close()

        data = tscribe.load_json(input_file)
        df = tscribe.decode_transcript(data)

        assert len(query) == len(df), "Database table should be length of dataframe"
    finally:
        # Teardown: runs even when an assertion above fails, so no stray
        # database files are left behind. The existence guard keeps a failed
        # write from being masked by a FileNotFoundError here.
        if output_filename.exists():
            os.remove(output_filename)
def test_write_to_csv(input_file):
    """
    Test production of csv output

    GIVEN an input file
    WHEN writing to csv
    THEN check output exists and contains content

    input_file is the transcript file handed to tscribe.write and
    tscribe.load_json (presumably supplied by a pytest fixture -- confirm
    against conftest).
    """

    # GIVEN an input file
    # WHEN writing to csv
    # A random name avoids collisions between parametrized runs.
    output_filename = Path(f"{uuid4().hex}.csv")

    try:
        tscribe.write(input_file, save_as=output_filename, format="csv")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        with open(output_filename, "r") as file:
            lines = file.readlines()

        data = tscribe.load_json(input_file)
        df = tscribe.decode_transcript(data)

        # One extra line is expected for the header row.
        assert len(lines) == len(df) + 1, "CSV should be length of dataframe + headers"
    finally:
        # Teardown: runs even when an assertion above fails, so no stray
        # csv files are left behind. The existence guard keeps a failed
        # write from being masked by a FileNotFoundError here.
        if output_filename.exists():
            os.remove(output_filename)
def test_decode_transcript(input_file):
    """
    Test transcript decoding function

    GIVEN a data dict
    WHEN calling decode_transcript(...)
    THEN a four-column pandas DataFrame is returned
    """

    # GIVEN a data dict
    transcript = tscribe.load_json(input_file)

    # WHEN calling decode_transcript(...)
    decoded = tscribe.decode_transcript(transcript)

    # THEN
    assert isinstance(
        decoded, pandas.DataFrame
    ), "decode_transcript should return a Pandas Data Frame"

    row_count, column_count = decoded.shape
    assert column_count == 4, "Dataframe should have four columns"

    if input_file == "sample_single.json":
        # TODO
        pass

    if input_file == "sample_multiple.json":
        # With multiple speakers, one row per speaker segment is expected.
        segments = transcript["results"]["speaker_labels"]["segments"]
        assert row_count == len(segments), "Rows should match number of segments"
def test_write_to_docx(input_file):
    """
    Test production of docx output

    GIVEN an input file
    WHEN writing to docx
    THEN check output exists and contains content

    input_file is the transcript file handed to tscribe.write and
    tscribe.load_json (presumably supplied by a pytest fixture -- confirm
    against conftest).
    """

    logging.info("test_write_to_docx")

    # GIVEN an input file
    # WHEN writing to docx
    # A random name avoids collisions between parametrized runs.
    output_filename = Path(f"{uuid4().hex}.docx")

    try:
        tscribe.write(input_file, save_as=output_filename, format="docx")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        document = Document(output_filename)

        assert (
            len(document.tables) == 2
        ), "Document should contain two tables, stats and transcript"

        # Exactly two tables are guaranteed by the assertion above.
        stats_table, transcript_table = document.tables

        t_conf = stats_table.cell(0, 0).text
        t_count = stats_table.cell(0, 1).text
        t_perc = stats_table.cell(0, 2).text
        assert (t_conf, t_count, t_perc) == (
            "Confidence",
            "Count",
            "Percentage",
        ), "First table should be stats headers"
        assert len(stats_table.rows) == 12, "Stats table should hold 12 rows"

        t_time = transcript_table.cell(0, 0).text
        t_speaker = transcript_table.cell(0, 1).text
        t_content = transcript_table.cell(0, 2).text
        assert (t_time, t_speaker, t_content) == (
            "Time",
            "Speaker",
            "Content",
        ), "Second table should be transcript headers"

        data = tscribe.load_json(input_file)
        df = tscribe.decode_transcript(data)

        # One extra row is expected for the header row.
        assert (
            len(transcript_table.rows) == len(df) + 1
        ), "Second table should be length of dataframe + headers"

        # The paragraph's raw XML is inspected to find the embedded image.
        assert (
            "chart.png" in document.paragraphs[6]._p.xml
        ), "Chart should be in paragraph six"
    finally:
        # Teardown: runs even when an assertion above fails, so no stray
        # docx files are left behind. The existence guard keeps a failed
        # write from being masked by a FileNotFoundError here.
        if output_filename.exists():
            os.remove(output_filename)
def test_write_to_vtt(input_file):
    """
    Test production of vtt format

    GIVEN an input file
    WHEN writing to vtt
    THEN check output exists and contains content

    input_file is the transcript file handed to tscribe.write and
    tscribe.load_json (presumably supplied by a pytest fixture -- confirm
    against conftest).
    """

    logging.info("test_write_to_vtt")

    # GIVEN an input file
    # WHEN writing to vtt
    # A random name avoids collisions between parametrized runs.
    output_filename = Path(f"{uuid4().hex}.vtt")

    try:
        tscribe.write(input_file, save_as=output_filename, format="vtt")

        # THEN check output exists and contains content
        # Assert existence first, as the other writer tests do, so a missing
        # file fails with a clear message rather than a reader error.
        assert output_filename.is_file(), "Output file should exist"

        vtt = webvtt.read(output_filename)

        data = tscribe.load_json(input_file)
        df = tscribe.decode_transcript(data)

        assert len(vtt.captions) == len(
            df
        ), "vtt file should have equal captions to df rows"

        for caption in vtt.captions:
            assert hasattr(caption, "start"), "each caption should have a start_time"
            assert hasattr(caption, "end"), "each caption should have a end_time"
            assert hasattr(caption, "text"), "each caption should have text"
            assert (
                len(caption.lines) >= len(caption.text) / 80
            ), "text should be split into max 80 long lines"

            # The identifier check is skipped for the single-speaker sample.
            if input_file != "sample_single.json":
                assert hasattr(
                    caption, "identifier"
                ), "each caption should have an identifier"
    finally:
        # Teardown: runs even when an assertion above fails, so no stray
        # vtt files are left behind. The existence guard keeps a failed
        # write from being masked by a FileNotFoundError here.
        if output_filename.exists():
            os.remove(output_filename)