예제 #1
0
def main():
    """
    Console script for audio_korpora_pipeline.

    Parses the command line (path to the config file plus comma separated
    lists of input and output corpora), loads the configuration, creates the
    input/output adapters and runs the transformation
    inputs -> metamodels -> outputs.

    Returns:
        0 once the transformation calls have completed, 1 when the given
        config path does not point to an existing file.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("-c",
                        "--config",
                        dest="config",
                        help="path to config file",
                        required=True)
    parser.add_argument(
        "-i",
        "--input_corpora",
        dest="input",
        help="comma separated list of which corpora to transform",
        required=True)
    parser.add_argument(
        "-o",
        "--output_corpora",
        dest="output",
        help="comma separated list of which corpora to produce",
        required=True)

    args = parser.parse_args()
    config_path = args.config

    if not os.path.isfile(config_path):
        # Nothing below can work without a config file: show the usage and
        # stop here instead of running into load_config with a bad path.
        print("Config file {} not found".format(config_path))
        parser.print_help()
        return 1

    # Load the configuration once and reuse it for the logging setup.
    config = load_config(config_path)
    config_logging(config)

    # Creating Adapters
    input_adapters = _createInputAdapters(config, args.input)
    output_adapters = _createOutputAdapters(config, args.output)

    print("Started with {} input corpora to transform".format(
        len(input_adapters)))
    print("Started with {} output corpora as target format".format(
        len(output_adapters)))

    # Creating metamodels
    metamodels = _transformInputsToMetamodel(input_adapters)

    # Doing output work
    _transformMetamodelsToOutputs(metamodels, output_adapters)

    return 0
예제 #2
0
  def test_full_transcription_of_one_file(self):
    """Extract the single Archimob file 1007.xml and spot-check the returned transcript frame.

    The extraction result is expected to be a triple of
    result(bool), filename(str), transcriptions(dataframe).
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    archimob = ArchimobAdapter(cfg)
    xmlFile = os.path.join(archimob._validateKorpusPath(), "Archimob_Release_2", "1007.xml")

    sentenceWithPauseAndVocal = "#ehm s ich bin am sächsezwänzgischte jänner nünzehundertzwölf @ gibore"
    sentenceWithUnclear = "maitschi und de"
    sentenceWithDeletion = "de he det hend"
    filenameOfGapSentence = "d1007-T62"

    # when
    result = archimob._extractSingleXmlFileThread(xmlFile)

    # then
    assert len(result) == 3, "Format should be: result(bool), filename(str), transcriptions(dataframe)"
    assert result[0] == True, "Should have successfully parsed"
    assert result[1] == xmlFile, "Filename should be the same as inputted"

    frame = result[2]
    assert "chönd sii" == frame.loc[0, "transcript"], "Some example for correct transcription"

    def transcriptOf(name):
      # transcript of the first row whose Filename column equals the given name
      return frame[frame.Filename == name].iloc[0]["transcript"]

    assert sentenceWithPauseAndVocal == transcriptOf("d1007-T5"), "Output sentence does not look like it should"
    assert sentenceWithUnclear in transcriptOf("d1007-T40"), "Output sentence does not contain unclear word"
    assert sentenceWithDeletion in transcriptOf("d1007-T977"), "Output sentence does not containg deletion"

    # the row with this filename must have been dropped from the frame
    knownFilenames = set(frame["Filename"])
    assert filenameOfGapSentence not in knownFilenames, "Should not contain known sentence with <gap> tag"
예제 #3
0
def _clearWorkingDirs():
  """Reset working data before a test run.

  Loads the sample config, delegates to clearWorkingDirsInput and then
  removes the directory tree at the FairseqWav2Vec adapter's base path.
  """
  print("Cleaning working dirs")
  cfg = load_config("config.cfg.sample")
  config_logging(cfg)
  clearWorkingDirsInput(cfg)
  # ignore_errors=True: removal failures are not raised by rmtree
  shutil.rmtree(FairseqWav2VecAdapter(cfg)._basePath(), ignore_errors=True)
예제 #4
0
 def test_video_returns_media_session(self):
   """Convert the untranscribed video corpus and check actors and bundles of the metamodel."""
   # given
   cfg = load_config("config.cfg.sample")
   config_logging(cfg)
   videoAdapter = UntranscribedVideoAdapter(cfg)
   # when
   session = videoAdapter.toMetamodel()
   # then
   actors = session.mediaSessionActors
   assert len(actors) == 1, "Muss genau einen Speaker (Unknown) enthalten"
   # pop() removes the single actor from the collection while inspecting it
   assert actors.pop().id == "UNKNOWN", "Muss genau einen Speaker (Unknown) enthalten"
   bundles = session.mediaAnnotationBundles
   assert len(bundles) > 2, "Muss mehr als ein Media bundle enthalten"
예제 #5
0
  def test_integration_test_archimob_input(self):
    """Smoke test: run the Archimob adapter end to end and print the result (no assertions)."""
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    archimob = ArchimobAdapter(cfg)

    # when
    metamodel = archimob.toMetamodel()

    # then
    print(metamodel)
예제 #6
0
  def test_small_integration_test_with_everything_already_in_place(self):
    """Run untranscribed video -> fairseq wav2vec through the pipeline helpers (no assertions)."""
    # given
    # assuming test before has been run successfully and files are not deleted
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)

    sources = _createInputAdapters(cfg, ExistingInputAdapter.UNTRANSCRIBED_VIDEO.value)
    targets = _createOutputAdapters(cfg, ExistingOutputAdapter.FAIRSEQ_WAV2VEC.value)
    # when
    intermediate = _transformInputsToMetamodel(sources)
    # the returned outputs are not inspected; the test only checks that nothing raises
    _transformMetamodelsToOutputs(intermediate, targets)
예제 #7
0
def clearWorkingDirs():
  """Delete generated wav files below the corpus paths of the three input adapters.

  Removes every file matching "*chunk*.wav" and then every file matching
  "*.mono.wav", searched recursively, one corpus path after the other.
  """
  cfg = load_config("config.cfg.sample")
  config_logging(cfg)
  inputAdapters = [UntranscribedVideoAdapter(cfg), ChJugendspracheAdapter(cfg), ArchimobAdapter(cfg)]
  # resolve all corpus paths first, before anything gets deleted
  corpusRoots = [inputAdapter._validateKorpusPath() for inputAdapter in inputAdapters]

  generatedPatterns = ("*chunk*.wav", "*.mono.wav")
  for root in corpusRoots:
    for pattern in generatedPatterns:
      for wavFile in glob.glob(os.path.join(root, "**", pattern), recursive=True):
        print("Triggered deleting files for folder {}".format(wavFile))
        os.remove(wavFile)
예제 #8
0
  def test_indicating_1063error(self):
    """The 1063 duplicate-wav check must report True on the unmodified corpus file list."""
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    archimob = ArchimobAdapter(cfg)

    # assuming this will have all original transcripts ready for testing
    wavFiles = set(archimob._getAllMediaFilesInBasepath(archimob._validateKorpusPath(), {".wav"}))
    # path fragment of a "1063" folder nested inside another "1063" folder
    nested1063 = os.path.sep.join(["", "1063", "1063", ""])
    wronglyNested = [path for path in wavFiles if nested1063 in path]
    assert any(wronglyNested), "We start with some wrong folders in place"

    # when
    fixNeeded = archimob._fixForDuplicateWavs1063Necessary(wavFiles)
    assert fixNeeded, "Should return true, as we expect to have those files within"
예제 #9
0
  def test_from_metamodel_integration_test(self):
    """Run all three input corpora through to the fairseq wav2vec output (no assertions)."""
    # given
    _clearWorkingDirs()  # Clear directories
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)

    # comma separated list of the input corpora to transform
    requestedInputs = ",".join([
      ExistingInputAdapter.ARCHIMOB.value,
      ExistingInputAdapter.CH_JUGENDSPRACHE.value,
      ExistingInputAdapter.UNTRANSCRIBED_VIDEO.value,
    ])
    sources = _createInputAdapters(cfg, requestedInputs)
    targets = _createOutputAdapters(cfg, ExistingOutputAdapter.FAIRSEQ_WAV2VEC.value)
    # when
    intermediate = _transformInputsToMetamodel(sources)
    # the returned outputs are not inspected; the test only checks that nothing raises
    _transformMetamodelsToOutputs(intermediate, targets)
예제 #10
0
  def test_validate_tsv(self):
    """Validate a dummy tsv file against a fixed wav list and print the result (no assertions)."""
    # given
    wavsInTargetFolder = [
      "1 gegen 100-1 gegen 100 – Jahresrückblick mit Angélique Beldner-0943170628_chunk_00014.wav",
      "1 gegen 100-1 gegen 100 – Jahresrückblick mit Angélique Beldner-0943170628_chunk_00016.wav",
      "shouldnetbeHere_butignored.wav",
    ]

    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    fairseqAdapter = FairseqWav2VecAdapter(cfg)
    validatedBasePath = fairseqAdapter._validateBasePath()
    self._createDummyFileToValidate(validatedBasePath)
    # when
    # NOTE(review): 16000 is presumably the sample rate - confirm against _validate_tsv_file
    toProcess = fairseqAdapter._validate_tsv_file(wavsInTargetFolder, "dummy.tsv", 16000)
    # then
    print(toProcess)
예제 #11
0
  def test_filtering_1063flaw(self):
    """Applying the 1063 duplicate-wav fix must shrink the file list and leave nothing to fix."""
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    archimob = ArchimobAdapter(cfg)

    # assuming this will have all original transcripts ready for testing
    wavFiles = set(archimob._getAllMediaFilesInBasepath(archimob._validateKorpusPath(), {".wav"}))
    # path fragment of a "1063" folder nested inside another "1063" folder
    nested1063 = os.path.sep.join(["", "1063", "1063", ""])
    wronglyNested = [path for path in wavFiles if nested1063 in path]
    assert any(wronglyNested), "We start with some wrong folders in place"

    # when
    filtered = archimob._fixForDuplicateWavs1063(wavFiles)

    assert len(filtered) < len(wavFiles), "It should have filtered something"
    stillNeedsFix = archimob._fixForDuplicateWavs1063Necessary(filtered)
    assert stillNeedsFix == False, "The new list should not contain any fixable wavs anymore"
예제 #12
0
  def test_fixing_1083flaw(self):
    """The wrong-filename fix must keep the list length, change entries and leave nothing to fix.

    Note: the method name says 1083, but the code exercises the 1082 helpers
    (_fixForWrongFilenames1082 / _fixForWrongFilenames1082Necessary).
    """
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    archimob = ArchimobAdapter(cfg)

    # assuming this will have all original transcripts ready for testing
    wavFiles = set(archimob._getAllMediaFilesInBasepath(archimob._validateKorpusPath(), {".wav"}))
    wronglyNamed = [path for path in wavFiles if "1082_2d1082_2_TLI_3.wav" in path]
    assert any(wronglyNamed), "We start with some wrong folders in place"

    # when
    renamed = archimob._fixForWrongFilenames1082(wavFiles)

    assert len(renamed) == len(wavFiles), "It should have same length entries"
    assert renamed != wavFiles, "It should have changed something"
    stillNeedsFix = archimob._fixForWrongFilenames1082Necessary(renamed)
    assert stillNeedsFix == False, "The new list should not contain any fixable wavs anymore"
예제 #13
0
  def test_transcription_plus_other(self):
    """Match extracted transcriptions with existing wav files and build annotation bundles from them."""
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    archimob = ArchimobAdapter(cfg)
    xmlFiles = [
      os.path.join(archimob._validateKorpusPath(), "Archimob_Release_2", xmlName)
      for xmlName in ("1007.xml", "1044.xml")
    ]
    # assuming wav generation was done properly
    wavFiles = set(archimob._getAllMediaFilesInBasepath(archimob._validateWorkdir(), {".wav"}))
    # assuming this works as expected
    extracted = archimob._extract(xmlFiles)

    # when
    matched = archimob._onlyTranscriptionsWithMediaFilesAndViceVersa(extracted, wavFiles)
    annotationBundles = archimob._createActualMediaAnnotationBundles(matched)

    # then
    requiredColumns = {'FullpathFilename', 'transcript'}
    assert requiredColumns.issubset(matched.columns), "Columns of frame should be FilenameFullpath and transcript"

    print(annotationBundles)
예제 #14
0
  def test_full_transcription_of_two_files(self):
    """Extract two Archimob files and check the shape of the (origin file, frame) results."""
    # given
    cfg = load_config("config.cfg.sample")
    config_logging(cfg)
    archimob = ArchimobAdapter(cfg)
    firstXml = os.path.join(archimob._validateKorpusPath(), "Archimob_Release_2", "1007.xml")
    secondXml = os.path.join(archimob._validateKorpusPath(), "Archimob_Release_2", "1082_2.xml")

    # when
    results = archimob._extract([firstXml, secondXml])

    # then
    print(results)
    assert len(results) == 2, "Should have two speaker tuples back"
    firstResult = results[0]
    assert len(firstResult) == 2, "Should have a tuple back"
    # the order of the results is not fixed, so either input file may come first
    assert firstResult[0] in (firstXml, secondXml), "Should have file one or two set as origin"
    firstFrame = firstResult[1]
    assert type(firstFrame) == DataFrame, "Should have a frame bcak"
    requiredColumns = {'Filename', 'transcript'}
    assert requiredColumns.issubset(firstFrame.columns), "Columns of frame should be filename and transcript"