def test_write_to_sqlite(input_file):
    """
    Test production of sqlite output

    GIVEN an input file
    WHEN writing to sqlite
    THEN check output exists and contains content
    """

    logging.info("test_write_to_sqlite")

    # GIVEN an input file
    # WHEN writing to sqlite
    output_filename = Path(f"{uuid4().hex}.db")

    try:
        tscribe.write(input_file, save_as=output_filename, format="sqlite")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        conn = sqlite3.connect(str(output_filename))
        try:
            query = conn.execute("SELECT * FROM transcript").fetchall()
        finally:
            # Close the connection before teardown; an open handle can
            # prevent the database file from being deleted on some platforms.
            conn.close()

        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)

        assert len(query) == len(df), "Database table should be length of dataframe"
    finally:
        # Teardown runs even when an assertion fails, so no stray .db files
        # are left behind in the working directory.
        if output_filename.is_file():
            os.remove(output_filename)
def test_calculate_confidence_statistics(input_file):
    """
    Test confidence stats data modeling

    GIVEN a data dict
    WHEN calling calculate_confidence_statistics(...)
    THEN return the data model with the right components
    """

    logging.info("test_calculate_confidence_statistics")

    # GIVEN a data dict
    data = tscribe.load_json_as_dict(input_file)

    # WHEN calling calculate_confidence_statistics(...)
    stats = tscribe.calculate_confidence_statistics(data)

    # THEN return the data model with the right components
    assert isinstance(stats, dict), "Stats should be of dict type"

    # Keys the stats dict is required to contain, checked in this order.
    expected_keys = (
        "timestamps",
        "9.8",
        "9",
        "8",
        "7",
        "6",
        "5",
        "4",
        "3",
        "2",
        "1",
        "0",
    )
    for key in expected_keys:
        assert key in stats, f"Data model should include {key}"
def test_write_to_csv(input_file):
    """
    Test production of csv output

    GIVEN an input file
    WHEN writing to csv
    THEN check output exists and contains content
    """

    logging.info("test_write_to_csv")

    # GIVEN an input file
    # WHEN writing to csv
    output_filename = Path(f"{uuid4().hex}.csv")

    try:
        tscribe.write(input_file, save_as=output_filename, format="csv")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        with open(output_filename, "r") as file:
            lines = file.readlines()

        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)

        assert len(lines) == len(df) + 1, "CSV should be length of dataframe + headers"
    finally:
        # Teardown runs even when an assertion fails, so no stray .csv files
        # are left behind in the working directory.
        if output_filename.is_file():
            os.remove(output_filename)
def test_sample_files(sample):
    """Confirm test files accessible and safe"""

    logging.info("test_sample_files")

    sample_path = Path(sample)
    assert sample_path.is_file(), "Sample file should exist"
    assert sample_path.suffix == ".json", "Sample files should be json files"

    # The sample's account id is expected to be the all-X placeholder value.
    account_id = tscribe.load_json_as_dict(sample)["accountId"]
    assert account_id == "XXXXXXXXXXXX"
def test_decode_transcript_to_dataframe(input_file):
    """
    Test transcript decoding function

    GIVEN a data dict
    WHEN calling decode_transcript_to_dataframe(...)
    THEN return a four-column pandas DataFrame
    """

    logging.info("test_decode_transcript_to_dataframe")

    # GIVEN a data dict
    transcript = tscribe.load_json_as_dict(input_file)

    # WHEN calling decode_transcript_to_dataframe(...)
    frame = tscribe.decode_transcript_to_dataframe(transcript)

    # THEN
    assert isinstance(
        frame, pandas.DataFrame
    ), "decode_transcript_to_dataframe should return a Pandas Data Frame"

    row_count, column_count = frame.shape
    assert column_count == 4, "Dataframe should have four columns"

    # TODO: no row-count expectation exists yet for "sample_single.json"

    if input_file == "sample_multiple.json":
        segments = transcript["results"]["speaker_labels"]["segments"]
        assert row_count == len(segments), "Rows should match number of segments"
def test_make_graph_png(input_file):
    """
    Test function for creating graphs from confidence stats

    GIVEN confidence stats from an input file
    WHEN calling make_graph_png(...)
    THEN produce chart.png
    """

    logging.info("test_make_graph_png")

    chart_path = Path("chart.png")

    # Start from a clean slate so a leftover chart cannot satisfy the check
    if chart_path.is_file():
        os.remove(chart_path)

    # GIVEN confidence stats from an input file
    stats = tscribe.calculate_confidence_statistics(
        tscribe.load_json_as_dict(input_file)
    )

    # WHEN calling make_graph_png(...)
    tscribe.make_graph_png(stats, "./")

    # THEN produce chart.png
    chart_created = chart_path.is_file()
    assert chart_created, "chart.png should be created"

    # Teardown
    os.remove(chart_path)
def test_write_to_docx(input_file):
    """
    Test production of docx output

    GIVEN an input file
    WHEN writing to docx
    THEN check output exists and contains content
    """

    logging.info("test_write_to_docx")

    # GIVEN an input file
    # WHEN writing to docx
    output_filename = Path(f"{uuid4().hex}.docx")

    try:
        tscribe.write(input_file, save_as=output_filename, format="docx")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        document = Document(output_filename)

        assert (
            len(document.tables) == 2
        ), "Document should contain two tables, stats and transcript"

        # First table: confidence statistics
        stats_table = document.tables[0]
        t_conf = stats_table.cell(0, 0).text
        t_count = stats_table.cell(0, 1).text
        t_perc = stats_table.cell(0, 2).text
        assert (t_conf, t_count, t_perc) == (
            "Confidence",
            "Count",
            "Percentage",
        ), "First table should be stats headers"
        assert len(stats_table.rows) == 12, "Stats table should hold 12 rows"

        # Second table: the transcript itself
        transcript_table = document.tables[1]
        t_time = transcript_table.cell(0, 0).text
        t_speaker = transcript_table.cell(0, 1).text
        t_content = transcript_table.cell(0, 2).text
        assert (t_time, t_speaker, t_content) == (
            "Time",
            "Speaker",
            "Content",
        ), "Second table should be transcript headers"

        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)
        assert (
            len(transcript_table.rows) == len(df) + 1
        ), "Second table should be length of dataframe + headers"

        # The chart is looked up by name in the raw XML of paragraph index 6.
        assert (
            "chart.png" in document.paragraphs[6]._p.xml
        ), "Chart should be in paragraph six"
    finally:
        # Teardown runs even when an assertion fails, so no stray .docx files
        # are left behind in the working directory.
        if output_filename.is_file():
            os.remove(output_filename)
def test_load_json_as_dict(input_file):
    """
    Test json to dict function

    GIVEN a sample json file
    WHEN calling tscribe.load_json_as_dict(...)
    THEN return a dict
    """

    logging.info("test_load_json_as_dict")

    # GIVEN a sample json file
    # provided through parametrize

    # WHEN calling tscribe.load_json_as_dict(...)
    data = tscribe.load_json_as_dict(input_file)

    # THEN return a dict
    assert isinstance(data, dict), "Data should be of dict type"
def test_write_to_vtt(input_file):
    """
    Test production of vtt format

    GIVEN an input file
    WHEN writing to vtt
    THEN check output exists and contains content
    """

    logging.info("test_write_to_vtt")

    # GIVEN an input file
    # WHEN writing to vtt
    output_filename = Path(f"{uuid4().hex}.vtt")

    try:
        tscribe.write(input_file, save_as=output_filename, format="vtt")

        # THEN check output exists and contains content
        # Explicit existence check, matching the other write-format tests.
        assert output_filename.is_file(), "Output file should exist"

        vtt = webvtt.read(output_filename)

        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)
        assert len(vtt.captions) == len(
            df
        ), "vtt file should have equal captions to df rows"

        for caption in vtt.captions:
            assert hasattr(caption, "start"), "each caption should have a start_time"
            assert hasattr(caption, "end"), "each caption should have a end_time"
            assert hasattr(caption, "text"), "each caption should have text"
            assert (
                len(caption.lines) >= len(caption.text) / 80
            ), "text should be split into max 80 long lines"

            # The identifier check is skipped for the single-speaker sample.
            if input_file != "sample_single.json":
                assert hasattr(
                    caption, "identifier"
                ), "each caption should have an identifier"
    finally:
        # Teardown runs even when an assertion fails, so no stray .vtt files
        # are left behind in the working directory.
        if output_filename.is_file():
            os.remove(output_filename)