def test_full_transcription_of_one_file(self):
    """Extract a single Archimob XML file and spot-check the result.

    The extraction result is expected to be a three-element sequence:
    success flag, input filename and a frame of transcripts. Individual
    utterances are looked up through the frame's ``Filename`` column and
    compared against known sentences (pause/vocal markers, unclear words,
    deletions). An utterance known to carry a ``<gap>`` tag must be absent.
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    adapter = ArchimobAdapter(cfg)
    xml_path = os.path.join(
        adapter._validateKorpusPath(), "Archimob_Release_2", "1007.xml")
    sentence_with_pause_and_vocal = (
        "#ehm s ich bin am sächsezwänzgischte jänner nünzehundertzwölf @ gibore")
    sentence_with_unclear = "maitschi und de"
    sentence_with_deletion = "de he det hend"
    utterance_with_gap = "d1007-T62"

    def transcript_of(frame, utterance_id):
        # First row whose Filename matches the utterance id.
        return frame[frame.Filename == utterance_id].iloc[0]["transcript"]

    # when
    result = adapter._extractSingleXmlFileThread(xml_path)

    # then
    assert len(result) == 3, \
        "Format should be: result(bool), filename(str), transcriptions(dataframe)"
    assert result[0] == True, "Should have successfully parsed"
    assert result[1] == xml_path, "Filename should be the same as inputted"
    assert "chönd sii" == result[2].loc[0, "transcript"], \
        "Some example for correct transcription"

    transcripts = result[2]
    assert sentence_with_pause_and_vocal == transcript_of(transcripts, "d1007-T5"), \
        "Output sentence does not look like it should"
    assert sentence_with_unclear in transcript_of(transcripts, "d1007-T40"), \
        "Output sentence does not contain unclear word"
    assert sentence_with_deletion in transcript_of(transcripts, "d1007-T977"), \
        "Output sentence does not containg deletion"
    assert utterance_with_gap not in set(result[2]["Filename"]), \
        "Should not contain known sentence with <gap> tag"
def _clearWorkingDirs():
    """Reset the working directories used by the output-adapter tests.

    Loads the sample configuration, clears the input working directories
    via ``clearWorkingDirsInput`` and then removes the base path of the
    ``FairseqWav2VecAdapter`` recursively. Errors during the removal are
    ignored (``ignore_errors=True``).
    """
    print("Cleaning working dirs")
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    clearWorkingDirsInput(cfg)
    output_root = FairseqWav2VecAdapter(cfg)._basePath()
    shutil.rmtree(output_root, ignore_errors=True)
def test_video_returns_media_session(self):
    """Convert the untranscribed-video corpus and check the metamodel shape.

    Expects exactly one session actor with id ``"UNKNOWN"`` and more than
    two media annotation bundles.

    NOTE(review): the last assertion message says "more than one" while the
    check is ``> 2`` - confirm which threshold is intended.
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    video_adapter = UntranscribedVideoAdapter(cfg)

    # when
    metamodel = video_adapter.toMetamodel()

    # then
    actors = metamodel.mediaSessionActors
    assert len(actors) == 1, "Muss genau einen Speaker (Unknown) enthalten"
    # pop() removes the actor from the collection; it is not needed afterwards.
    assert actors.pop().id == "UNKNOWN", "Muss genau einen Speaker (Unknown) enthalten"
    bundle_count = len(metamodel.mediaAnnotationBundles)
    assert bundle_count > 2, "Muss mehr als ein Media bundle enthalten"
def test_integration_test_archimob_input(self):
    """Run the full Archimob-to-metamodel conversion and print the result.

    There are no assertions; the test passes when the conversion completes
    without raising.
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)

    # when
    media_session = ArchimobAdapter(cfg).toMetamodel()

    # then
    print(media_session)
def test_small_integration_test_with_everything_already_in_place(self):
    """Run the video -> fairseq/wav2vec pipeline on already prepared files.

    Assumes a previous test run has succeeded and its generated files have
    not been deleted. There are no assertions; the test passes when both
    transformation steps complete without raising.
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    sources = _createInputAdapters(
        cfg, ExistingInputAdapter.UNTRANSCRIBED_VIDEO.value)
    targets = _createOutputAdapters(
        cfg, ExistingOutputAdapter.FAIRSEQ_WAV2VEC.value)

    # when
    # The produced outputs are not inspected by this test.
    _transformMetamodelsToOutputs(_transformInputsToMetamodel(sources), targets)
def clearWorkingDirs():
    """Delete generated wav files below the corpus paths of the input adapters.

    For each corpus path reported by the untranscribed-video,
    CH-Jugendsprache and Archimob adapters, every file matching
    ``*chunk*.wav`` and then every file matching ``*.mono.wav`` is removed
    (searched recursively). Each removal is announced on stdout.
    """
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)

    # All adapters are created before any path is validated or file deleted.
    corpus_roots = []
    for adapter in (UntranscribedVideoAdapter(cfg),
                    ChJugendspracheAdapter(cfg),
                    ArchimobAdapter(cfg)):
        corpus_roots.append(adapter._validateKorpusPath())

    for root in corpus_roots:
        for pattern in ("*chunk*.wav", "*.mono.wav"):
            matches = glob.glob(os.path.join(root, "**", pattern), recursive=True)
            for wav_path in matches:
                print("Triggered deleting files for folder {}".format(wav_path))
                os.remove(wav_path)
def main():
    """Console script for audio_korpora_pipeline.

    Parses the command line (``--config``, ``--input_corpora``,
    ``--output_corpora``; all required), creates the requested input and
    output adapters, transforms the inputs to metamodels and hands those to
    the output adapters.

    Returns:
        0 on success, 1 when the given config path is not an existing file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--config", dest="config",
        help="path to config file", required=True)
    parser.add_argument(
        "-i", "--input_corpora", dest="input",
        help="comma separated list of which corpora to transform",
        required=True)
    parser.add_argument(
        "-o", "--output_corpora", dest="output",
        help="comma separated list of which corpora to produce",
        required=True)
    args = parser.parse_args()

    config_path = args.config
    if not os.path.isfile(config_path):
        # Stop here: continuing would only fail later while loading a
        # config file that does not exist.
        print("Config file not found: {}".format(config_path))
        parser.print_help()
        return 1

    # Load the configuration once and reuse it for logging and adapters.
    config = load_config(config_path)
    config_logging(config)

    # Creating Adapters
    input_adapters = _createInputAdapters(config, args.input)
    output_adapters = _createOutputAdapters(config, args.output)
    print("Started with {} input corpora to transform".format(
        len(input_adapters)))
    print("Started with {} output corpora as target format".format(
        len(output_adapters)))

    # Creating metamodels
    metamodels = _transformInputsToMetamodel(input_adapters)

    # Doing output work
    _transformMetamodelsToOutputs(metamodels, output_adapters)
    return 0
def test_indicating_1063error(self):
    """Check that the duplicated ``1063/1063`` wav folder is detected.

    Precondition: the corpus path contains at least one wav file whose path
    includes the nested ``1063/1063`` directory. The adapter's
    ``_fixForDuplicateWavs1063Necessary`` must then report True.
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    adapter = ArchimobAdapter(cfg)
    # assuming this will have all original transcripts ready for testing
    wav_files = set(
        adapter._getAllMediaFilesInBasepath(adapter._validateKorpusPath(), {".wav"}))
    nested_1063 = os.path.sep.join(["", "1063", "1063", ""])
    assert any(nested_1063 in path for path in wav_files), \
        "We start with some wrong folders in place"

    # when
    fix_needed = adapter._fixForDuplicateWavs1063Necessary(wav_files)

    # then
    assert fix_needed, "Should return true, as we expect to have those files within"
def test_from_metamodel_integration_test(self):
    """Run all three input corpora through the fairseq/wav2vec output adapter.

    Working directories are cleared first. There are no assertions; the
    test passes when both transformation steps complete without raising.
    """
    # given
    _clearWorkingDirs()  # Clear directories
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    requested_inputs = ",".join((
        ExistingInputAdapter.ARCHIMOB.value,
        ExistingInputAdapter.CH_JUGENDSPRACHE.value,
        ExistingInputAdapter.UNTRANSCRIBED_VIDEO.value,
    ))
    sources = _createInputAdapters(cfg, requested_inputs)
    targets = _createOutputAdapters(
        cfg, ExistingOutputAdapter.FAIRSEQ_WAV2VEC.value)

    # when
    # The produced outputs are not inspected by this test.
    _transformMetamodelsToOutputs(_transformInputsToMetamodel(sources), targets)
def test_validate_tsv(self):
    """Validate a dummy tsv file against a list of wav names and print the result.

    A dummy file is created in the output adapter's base path, then
    ``_validate_tsv_file`` is called with the wav names, ``"dummy.tsv"`` and
    16000. There are no assertions; the returned value is only printed.
    """
    # given
    wavs_in_target_folder = [
        "1 gegen 100-1 gegen 100 – Jahresrückblick mit Angélique Beldner-0943170628_chunk_00014.wav",
        "1 gegen 100-1 gegen 100 – Jahresrückblick mit Angélique Beldner-0943170628_chunk_00016.wav",
        "shouldnetbeHere_butignored.wav",
    ]
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    fairseq_adapter = FairseqWav2VecAdapter(cfg)
    self._createDummyFileToValidate(fairseq_adapter._validateBasePath())

    # when
    to_process = fairseq_adapter._validate_tsv_file(
        wavs_in_target_folder, "dummy.tsv", 16000)

    # then
    print(to_process)
def test_filtering_1063flaw(self):
    """Check that the ``1063/1063`` duplicates are filtered from the file list.

    Precondition: the corpus path contains at least one wav file below the
    nested ``1063/1063`` directory. After ``_fixForDuplicateWavs1063`` the
    list must be shorter and must no longer be reported as needing the fix.
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    adapter = ArchimobAdapter(cfg)
    # assuming this will have all original transcripts ready for testing
    wav_files = set(
        adapter._getAllMediaFilesInBasepath(adapter._validateKorpusPath(), {".wav"}))
    nested_1063 = os.path.sep.join(["", "1063", "1063", ""])
    assert any(nested_1063 in path for path in wav_files), \
        "We start with some wrong folders in place"

    # when
    filtered = adapter._fixForDuplicateWavs1063(wav_files)

    # then
    assert len(filtered) < len(wav_files), "It should have filtered something"
    still_needed = adapter._fixForDuplicateWavs1063Necessary(filtered)
    # Explicit comparison kept: a None return must still fail this check.
    assert still_needed == False, \
        "The new list should not contain any fixable wavs anymore"
def test_fixing_1083flaw(self):
    """Check that wrongly named ``1082_2`` wav files get corrected.

    Precondition: the corpus path contains a file whose name includes
    ``1082_2d1082_2_TLI_3.wav``. After ``_fixForWrongFilenames1082`` the
    list must keep its length, differ from the input and no longer be
    reported as needing the fix.

    NOTE(review): the test name says 1083 while the fixture file and the
    adapter helpers all refer to 1082 - confirm which is intended.
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    adapter = ArchimobAdapter(cfg)
    # assuming this will have all original transcripts ready for testing
    wav_files = set(
        adapter._getAllMediaFilesInBasepath(adapter._validateKorpusPath(), {".wav"}))
    broken_name = "1082_2d1082_2_TLI_3.wav"
    assert any(broken_name in path for path in wav_files), \
        "We start with some wrong folders in place"

    # when
    renamed = adapter._fixForWrongFilenames1082(wav_files)

    # then
    assert len(renamed) == len(wav_files), "It should have same length entries"
    assert renamed != wav_files, "It should have changed something"
    still_needed = adapter._fixForWrongFilenames1082Necessary(renamed)
    # Explicit comparison kept: a None return must still fail this check.
    assert still_needed == False, \
        "The new list should not contain any fixable wavs anymore"
def test_transcription_plus_other(self):
    """Join extracted transcripts with existing wav files and build bundles.

    Extracts two Archimob XML files, matches the transcripts against the
    wav files found in the adapter's working directory and creates the
    media annotation bundles from the matched frame. The matched frame must
    expose the columns ``FullpathFilename`` and ``transcript``; the bundles
    are only printed.
    """
    # given
    config = load_config("config.cfg.sample")
    config_logging(config)
    adapter = ArchimobAdapter(config)
    release_dir = os.path.join(adapter._validateKorpusPath(), "Archimob_Release_2")
    fileToConvert1 = os.path.join(release_dir, "1007.xml")
    fileToConvert2 = os.path.join(release_dir, "1044.xml")
    # assuming wav generation was done properly
    filelist = set(
        adapter._getAllMediaFilesInBasepath(adapter._validateWorkdir(), {".wav"}))
    # assuming this works as expected
    transcriptions = adapter._extract([fileToConvert1, fileToConvert2])

    # when
    versa = adapter._onlyTranscriptionsWithMediaFilesAndViceVersa(
        transcriptions, filelist)
    bundles = adapter._createActualMediaAnnotationBundles(versa)

    # then
    # The message names the same columns the check actually tests for.
    assert {'FullpathFilename', 'transcript'}.issubset(versa.columns), \
        "Columns of frame should be FullpathFilename and transcript"
    print(bundles)
def test_full_transcription_of_two_files(self):
    """Extract two Archimob XML files and check the shape of the result.

    The extraction is expected to yield one ``(origin file, frame)`` pair
    per input file, where the frame is a ``DataFrame`` (or a subclass)
    exposing at least the columns ``Filename`` and ``transcript``.
    """
    # given
    config = load_config("config.cfg.sample")
    config_logging(config)
    adapter = ArchimobAdapter(config)
    release_dir = os.path.join(adapter._validateKorpusPath(), "Archimob_Release_2")
    fileToConvert1 = os.path.join(release_dir, "1007.xml")
    fileToConvert2 = os.path.join(release_dir, "1082_2.xml")

    # when
    extraction = adapter._extract([fileToConvert1, fileToConvert2])

    # then
    print(extraction)
    assert len(extraction) == 2, "Should have two speaker tuples back"
    first = extraction[0]
    assert len(first) == 2, "Should have a tuple back"
    origin, frame = first[0], first[1]
    # Either input may come first in the result.
    assert origin in (fileToConvert1, fileToConvert2), \
        "Should have file one or two set as origin"
    # isinstance also accepts DataFrame subclasses.
    assert isinstance(frame, DataFrame), "Should have a frame back"
    assert {'Filename', 'transcript'}.issubset(frame.columns), \
        "Columns of frame should be filename and transcript"