Exemple #1
0
def getSelectedTxtFiles(txtPath, wavPath):
    '''
    Copies the transcripts that have a matching recording into a subfolder

    For every .wav file listed for /wavPath/, the part of its name that
    comes before the first underscore is taken as a transcript name.  Each
    .txt file listed for /txtPath/ whose name (minus the extension) is one
    of those names is copied to /txtPath/selected_txt.
    '''
    outputPath = join(txtPath, "selected_txt")
    utils.makeDir(outputPath)

    # Kept as a set so that the membership test in the loop below does not
    # rescan the whole collection for every transcript
    wavNameList = utils.findFiles(wavPath, filterExt=".wav", stripExt=True)
    selectedNames = {wavName.split("_")[0] for wavName in wavNameList}

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        if name in selectedNames:
            txtFN = name + ".txt"
            shutil.copy(join(txtPath, txtFN), join(outputPath, txtFN))
Exemple #2
0
def getSelectedTxtFiles(txtPath, wavPath):
    '''
    Copies transcripts that correspond to a wav file into "selected_txt"

    A transcript corresponds to a wav file when its name, without the
    extension, equals the text before the first "_" in the wav file's
    name.  Matching .txt files are copied from /txtPath/ to
    /txtPath/selected_txt; everything else is left alone.
    '''
    outputPath = join(txtPath, "selected_txt")
    utils.makeDir(outputPath)

    # Build the lookup once, as a set, for fast "in" checks below
    prefixSet = set()
    for wavName in utils.findFiles(wavPath, filterExt=".wav", stripExt=True):
        prefixSet.add(wavName.split("_")[0])

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        if name not in prefixSet:
            continue
        shutil.copy(join(txtPath, name + ".txt"),
                    join(outputPath, name + ".txt"))
Exemple #3
0
def splitAudio(path):
    '''
    Runs the stereo splitter on every .wav file listed for /path/

    /path/split_audio is handed to utils.makeDir() and then used as the
    output folder for each call to audioScripts.splitStereoAudio().
    '''
    splitDir = join(path, "split_audio")
    utils.makeDir(splitDir)

    wavFNList = utils.findFiles(path, filterExt=".wav")
    for wavFN in wavFNList:
        audioScripts.splitStereoAudio(path, wavFN, splitDir)
Exemple #4
0
def splitAudio(path):
    '''
    Passes each .wav file in /path/ to audioScripts.splitStereoAudio()

    The output folder given to the splitter is /path/split_audio.
    '''
    destination = join(path, "split_audio")
    utils.makeDir(destination)

    stereoFNList = utils.findFiles(path, filterExt=".wav")
    for stereoFN in stereoFNList:
        audioScripts.splitStereoAudio(path, stereoFN, destination)
Exemple #5
0
def forceAlignCrest(wavPath, txtPath, outputPath, juliusScriptPath, soxPath):
    '''
    Force aligns the transcripts in /txtPath/ and reports what failed

    The names of the .TextGrid files already in /outputPath/ are given to
    utils.findFiles() as skipIfNameInList (presumably so that finished
    transcripts are not aligned again -- confirm against utils).

    A line of statistics is printed after each file, followed by a summary
    over all of the files that were processed.
    '''
    summaryTemplate = ("%d intervals of %d total intervals (%0.2f%%) and "
                       "%d phones of %d total phones (%0.2f%%) "
                       "failed to align")
    perFileTemplate = summaryTemplate + " for %s"

    phonesFailedSum = 0
    phonesSum = 0
    intervalsFailedSum = 0
    intervalsSum = 0

    alreadyAligned = utils.findFiles(outputPath,
                                     filterExt=".TextGrid",
                                     stripExt=True)
    pendingNames = utils.findFiles(txtPath,
                                   filterExt=".txt",
                                   skipIfNameInList=alreadyAligned,
                                   stripExt=True)
    for name in pendingNames:
        result = forceAlignFile(wavPath, name, txtPath, name + ".txt",
                                outputPath, juliusScriptPath, soxPath)
        phonesFailed, phones, intervalsFailed, intervals = result

        # Scaled to 0-100 for display
        phonePercent = utils.divide(phonesFailed, phones, 0) * 100
        intervalPercent = utils.divide(intervalsFailed, intervals, 0) * 100
        print(perFileTemplate % (intervalsFailed, intervals, intervalPercent,
                                 phonesFailed, phones, phonePercent, name))

        phonesFailedSum += phonesFailed
        phonesSum += phones
        intervalsFailedSum += intervalsFailed
        intervalsSum += intervals

    phonePercentSum = utils.divide(phonesFailedSum, phonesSum, 0) * 100
    intervalPercentSum = utils.divide(intervalsFailedSum,
                                      intervalsSum, 0) * 100
    print("====Summary====")
    print(summaryTemplate % (intervalsFailedSum, intervalsSum,
                             intervalPercentSum, phonesFailedSum,
                             phonesSum, phonePercentSum))
Exemple #6
0
def renameMP3Files(path):
    '''
    Moves .mp3 files whose name ends in "x" to /path/renamed, minus the "x"

    Only the final "x" before the extension is dropped (e.g. "001x.mp3" is
    moved to "renamed/001.mp3").  Files whose name does not end in "x" are
    left where they are.
    '''
    outputPath = join(path, "renamed")
    utils.makeDir(outputPath)

    for name in utils.findFiles(path, filterExt=".mp3", stripExt=True):
        # endswith() copes with an empty name, unlike indexing name[-1]
        if not name.endswith("x"):
            continue
        shutil.move(join(path, name + ".mp3"),
                    join(outputPath, name[:-1] + ".mp3"))
Exemple #7
0
def forceAlignCrest(wavPath, txtPath, outputPath, juliusScriptPath, soxPath):
    '''
    Runs forceAlignFile() over a folder of transcripts and prints statistics

    For each transcript name, the matching "<name>.txt" in /txtPath/ is
    aligned.  The .TextGrid names found in /outputPath/ are forwarded to
    utils.findFiles() through skipIfNameInList.

    Prints, per file and then in total, how many intervals and phones
    failed to align, as counts and as percentages.
    '''
    reportStr = ("%d intervals of %d total intervals (%0.2f%%) and %d phones "
                 "of %d total phones (%0.2f%%) failed to align")

    failedPhoneTotal, phoneTotal = 0, 0
    failedIntervalTotal, intervalTotal = 0, 0

    doneList = utils.findFiles(outputPath, filterExt=".TextGrid",
                               stripExt=True)
    todoList = utils.findFiles(txtPath, filterExt=".txt",
                               skipIfNameInList=doneList, stripExt=True)
    for name in todoList:
        txtFN = name + ".txt"
        (failedPhones, phones,
         failedIntervals, intervals) = forceAlignFile(wavPath, name, txtPath,
                                                      txtFN, outputPath,
                                                      juliusScriptPath,
                                                      soxPath)

        failedPhoneRate = utils.divide(failedPhones, phones, 0) * 100
        failedIntervalRate = utils.divide(failedIntervals, intervals, 0) * 100
        fileValues = (failedIntervals, intervals, failedIntervalRate,
                      failedPhones, phones, failedPhoneRate, name)
        print((reportStr + " for %s") % fileValues)

        failedPhoneTotal += failedPhones
        phoneTotal += phones
        failedIntervalTotal += failedIntervals
        intervalTotal += intervals

    failedPhoneRate = utils.divide(failedPhoneTotal, phoneTotal, 0) * 100
    failedIntervalRate = utils.divide(failedIntervalTotal,
                                      intervalTotal, 0) * 100
    print("====Summary====")
    print(reportStr % (failedIntervalTotal, intervalTotal,
                       failedIntervalRate, failedPhoneTotal,
                       phoneTotal, failedPhoneRate))
Exemple #8
0
def renameMP3Files(path):
    '''
    Strips a trailing "x" from .mp3 file names, moving the files as it goes

    Every .mp3 file in /path/ whose name (without the extension) ends in
    "x" is moved to /path/renamed under the name with that one "x"
    removed.  Other files are not touched.
    '''
    outputPath = join(path, "renamed")
    utils.makeDir(outputPath)

    for name in utils.findFiles(path, filterExt=".mp3", stripExt=True):
        # Indexing name[-1] would raise IndexError on an empty name
        if name.endswith("x"):
            newName = name[:-1]
            shutil.move(join(path, name + ".mp3"),
                        join(outputPath, newName + ".mp3"))
Exemple #9
0
def convertCorpusToUTF8(path):
    '''
    Re-encodes every .txt file in /path/ from cp932 to utf-8

    The originals are not modified; the converted copies are written to
    /path/output under the same file names.
    '''
    outputDir = join(path, "output")
    utils.makeDir(outputDir)

    for fn in utils.findFiles(path, filterExt=".txt"):
        # cp932 = Japanese
        # Plain "r" is used because text mode already translates newlines
        # and the "U" flag is rejected by python 3.11 and later
        with io.open(join(path, fn), "r", encoding="cp932") as fd:
            text = fd.read()
        with io.open(join(outputDir, fn), "w", encoding='utf-8') as fd:
            fd.write(text)
Exemple #10
0
def convertCorpusToUTF8(path):
    '''
    Writes a utf-8 copy of each cp932-encoded .txt file found in /path/

    Copies keep their file name and are placed in /path/output.
    '''
    outputDir = join(path, "output")
    utils.makeDir(outputDir)

    for fn in utils.findFiles(path, filterExt=".txt"):
        # cp932 = Japanese
        # The old "rU" mode raises ValueError on python 3.11+; "r" gives
        # the same newline translation in text mode
        with io.open(join(path, fn), "r", encoding="cp932") as srcFd:
            text = srcFd.read()
        with io.open(join(outputDir, fn), "w", encoding='utf-8') as dstFd:
            dstFd.write(text)
def forceAlignCorpus(wavPath,
                     txtPath,
                     outputPath,
                     juliusScriptPath=None,
                     soxPath=None,
                     perlPath=None):
    '''
    Force aligns every file and prints out summary statistics

    Each "<name>.txt" in /txtPath/ is aligned against "<name>.wav" by
    forceAlignFile().  After each file, and once more for the whole
    corpus, the number and percentage of intervals and phones that were
    successfully aligned is printed.
    '''
    totalNumPhonesFailed = 0
    totalNumPhones = 0

    totalNumIntervalsFailed = 0
    totalNumIntervals = 0

    utils.makeDir(outputPath)

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([name], wavPath, wavNameDict, txtPath,
                                name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)

        (numPhonesFailedAlignment, numPhones, numFailedIntervals,
         numIntervals) = output

        # NOTE(review): the third argument to utils.divide() is presumably
        # the value returned for a zero denominator -- confirm
        percentFailed = utils.divide(numPhonesFailedAlignment, numPhones,
                                     0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals, numIntervals,
                                              0) * 100

        # The failure rates are already on a 0-100 scale, so the success
        # rate is their complement to 100 (not 100 * (1 - rate))
        percentAligned = 100 - percentFailed
        percentAlignedIntervals = 100 - percentFailedIntervals
        print("%d intervals of %d total intervals (%0.2f%%) and %d phones "
              "of %d total phones (%0.2f%%) successfully aligned for %s" %
              (numIntervals - numFailedIntervals, numIntervals,
               percentAlignedIntervals,
               numPhones - numPhonesFailedAlignment, numPhones,
               percentAligned, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones

        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed, totalNumPhones,
                                      0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    print("%d intervals of %d total intervals (%0.2f%%) and %d phones of "
          "%d total phones (%0.2f%%) successfully aligned" %
          (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
           100 - totalPercentFailedIntervals,
           totalNumPhones - totalNumPhonesFailed, totalNumPhones,
           100 - totalPercentFailed))
Exemple #12
0
def textgridToCSV(inputPath, outputPath, outputExt='.csv'):
    '''
    Dumps the "utterances" tier of each textgrid in /inputPath/ to a file

    One line is written per tier entry, in the form:
    start,stop,label

    Each output file is named after its textgrid, with the extension
    replaced by /outputExt/, and is written to /outputPath/ as utf-8.
    '''
    utils.makeDir(outputPath)

    for tgFN in utils.findFiles(inputPath, filterExt=".TextGrid"):
        textgrid = tgio.openTextgrid(join(inputPath, tgFN))
        entryList = textgrid.tierDict["utterances"].entryList
        rowList = ["%s,%s,%s" % (start, stop, label)
                   for start, stop, label in entryList]

        baseName = os.path.splitext(tgFN)[0]
        csvFN = join(outputPath, "%s%s" % (baseName, outputExt))
        with io.open(csvFN, "w", encoding="utf-8") as fd:
            fd.write("\n".join(rowList))
def forceAlignCorpus(wavPath, txtPath, outputPath, juliusScriptPath=None,
                     soxPath=None, perlPath=None):
    '''
    Force aligns every file and prints out summary statistics

    For each transcript "<name>.txt" in /txtPath/, forceAlignFile() is
    called with "<name>.wav" as the recording.  The counts and percentages
    of successfully aligned intervals and phones are printed for each
    file and then for all files combined.
    '''
    reportStr = ("%d intervals of %d total intervals (%0.2f%%) and %d phones "
                 "of %d total phones (%0.2f%%) successfully aligned")

    totalNumPhonesFailed = 0
    totalNumPhones = 0

    totalNumIntervalsFailed = 0
    totalNumIntervals = 0

    utils.makeDir(outputPath)

    for name in utils.findFiles(txtPath, filterExt=".txt", stripExt=True):
        wavNameDict = {name: "%s.wav" % name}
        output = forceAlignFile([name, ], wavPath, wavNameDict, txtPath,
                                name + ".txt", outputPath, name,
                                juliusScriptPath, soxPath, perlPath)

        (numPhonesFailedAlignment, numPhones,
         numFailedIntervals, numIntervals) = output

        # Both rates are scaled to 0-100 here, so the share that succeeded
        # is "100 - rate"; the previous "100 * (1 - rate)" treated them as
        # fractions and printed values such as -900%
        # NOTE(review): the last argument to utils.divide() is presumably
        # its result when the denominator is 0 -- confirm
        percentFailed = utils.divide(numPhonesFailedAlignment,
                                     numPhones, 0) * 100
        percentFailedIntervals = utils.divide(numFailedIntervals,
                                              numIntervals, 0) * 100
        print((reportStr + " for %s") %
              (numIntervals - numFailedIntervals, numIntervals,
               100 - percentFailedIntervals,
               numPhones - numPhonesFailedAlignment, numPhones,
               100 - percentFailed, name))

        totalNumPhonesFailed += numPhonesFailedAlignment
        totalNumPhones += numPhones

        totalNumIntervalsFailed += numFailedIntervals
        totalNumIntervals += numIntervals

    totalPercentFailed = utils.divide(totalNumPhonesFailed,
                                      totalNumPhones, 0) * 100
    totalPercentFailedIntervals = utils.divide(totalNumIntervalsFailed,
                                               totalNumIntervals, 0) * 100
    print("====Summary====")
    print(reportStr %
          (totalNumIntervals - totalNumIntervalsFailed, totalNumIntervals,
           100 - totalPercentFailedIntervals,
           totalNumPhones - totalNumPhonesFailed, totalNumPhones,
           100 - totalPercentFailed))
def convertCorpusToKanaAndRomaji(inputPath,
                                 outputPath,
                                 cabochaEncoding,
                                 cabochaPath=None,
                                 encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text

    Each line of output is of the form:
    name,startTime,stopTime;original text;words;kana;romaji
    where /name/ is the input file name without its extension.

    Output files are written to /outputPath/ as utf-8, under the same
    file names as the input.  Input lines that do not have at least three
    comma-separated fields are reported with "error" and skipped.
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # Plain "r" is used because text mode already translates newlines
        # and the "U" flag is rejected by python 3.11 and later
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            text = fd.read()
        textList = text.split("\n")

        # The same for every line of the file, so computed once
        name = os.path.splitext(fn)[0]

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            line = line.strip()
            try:
                startTime, stopTime, line = line.split(",", 2)
            except ValueError:
                print("error")
                continue
            origLine = line

            dataPrepTuple = juliusAlignment.formatTextForJulius(
                line, cabochaEncoding, cabochaPath)

            # NOTE(review): the join below requires the word, kana, and
            # romaji values to be strings despite their names -- confirm
            # against formatTextForJulius()
            (line, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # Tallied per file so that the report below is meaningful; the
            # corpus totals are updated once the file is finished
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            outputList = [
                u"%s,%s,%s" % (name, startTime, stopTime), origLine,
                tmpWordList, tmpKanaList, tmpRomajiList
            ]
            outputStr = ";".join(outputList)

            speakerInfoList.append(outputStr)

        # Only files that had a problem are reported
        if (numUnnamedEntitiesForFN > 0
                or numUnidentifiedUtterancesForFN > 0):
            print(fn)
            print("Number of unnamed entities for fn: %d" %
                  numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d" %
                  numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
    print("Number of words processed without error: %d" %
          numWordsProcessedWithNoError)
Exemple #15
0
def convertCRESTToKanaAndRomaji(inputPath,
                                outputPath,
                                cabochaEncoding,
                                cabochaPath,
                                encoding="cp932"):
    '''
    Converts space-delimited transcripts to word, kana, and romaji listings

    Each line of input is expected to be of the form:
    speakerCode startTime stopTime text
    Lines with fewer than four space-separated fields are ignored.

    Each line of output is of the form:
    speakerCode,startTime,stopTime;original text;words;kana;romaji

    Output files are written as utf-8 to
    /outputPath/speaker_info_and_utterance_timing.  The .txt names already
    in that folder are given to utils.findFiles() as skipIfNameInList.
    '''
    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")
    utils.makeDir(timeInfoPath)

    # Removed outright before the text is chunked
    removeList = [
        u"(", u")", u" ", u".", u"?", u"「", u"」", u"[", u"]",
        u"@W", u"@S", u"<", u">", u" ", u"。"
    ]
    # Swapped for a space (cabocha does not accept "・")
    spacerList = [u"・", u"·"]

    unnamedTotal = 0
    unidentifiedTotal = 0
    doneFNList = utils.findFiles(timeInfoPath, filterExt=".txt")
    todoFNList = utils.findFiles(inputPath,
                                 filterExt=".txt",
                                 skipIfNameInList=doneFNList)
    for fn in todoFNList:
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            rawLineList = fd.read().split("\n")

        unnamedCount = 0
        unidentifiedCount = 0
        rowList = []
        for rawLine in rawLineList:
            fieldList = rawLine.strip().split(" ", 3)
            if len(fieldList) != 4:
                continue
            speakerCode, startTime, stopTime, origLine = fieldList

            cleanLine = origLine
            for char in removeList:
                cleanLine = cleanLine.replace(char, "")
            for char in spacerList:
                cleanLine = cleanLine.replace(char, " ")
            cleanLine = cleanLine.strip()

            # Used as-is whenever the text cannot be converted
            wordList, kanaList, romajiList = [""], [""], [""]
            try:
                wordList, kanaList, romajiList = (
                    jProcessingSnippet.getChunkedKana(
                        cleanLine, cabochaEncoding, cabochaPath))
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as e:
                print(u"%s, %s" % (str(e), origLine))
                unidentifiedCount += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as e:
                # A word made up only of "X" counts as an unnamed entity
                if all(char == u"X" for char in e.word):
                    unnamedCount += 1
                else:
                    print(u"%s" % str(e))
                    unidentifiedCount += 1
            except jProcessingSnippet.EmptyStrError:
                # Nothing to convert and nothing to count
                pass
            except Exception:
                print(cleanLine)
                raise

            timingStr = u"%s,%s,%s" % (speakerCode, startTime, stopTime)
            rowList.append(";".join([
                timingStr, origLine, ','.join(wordList),
                ",".join(kanaList), ",".join(romajiList)
            ]))

        print(fn)
        print("Number of unnamed entities for fn: %d" % unnamedCount)
        print("Number of unidentified utterances for fn: %d" %
              unidentifiedCount)
        unnamedTotal += unnamedCount
        unidentifiedTotal += unidentifiedCount

        with io.open(join(timeInfoPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(rowList))

    print("\n")
    print("Number of unnamed entities: %d" % unnamedTotal)
    print("Number of unidentified utterances: %d" % unidentifiedTotal)
Exemple #16
0
def convertCRESTToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                cabochaPath, encoding="cp932"):
    '''
    Reduces transcripts with speaker and timing info to kana and romaji

    Input lines are split on spaces into four parts:
    speakerCode startTime stopTime text
    A line that does not yield four parts is skipped.

    For each remaining line, one output line is written:
    speakerCode,startTime,stopTime;original text;words;kana;romaji
    The last three fields hold an empty string when the text could not be
    converted.

    Results are saved, utf-8 encoded and under the input file's name, in
    /outputPath/speaker_info_and_utterance_timing.
    '''
    timeInfoPath = join(outputPath, "speaker_info_and_utterance_timing")
    utils.makeDir(timeInfoPath)

    charsToDelete = (u"(", u")", u" ", u".", u"?", u"「", u"」",
                     u"[", u"]", u"@W", u"@S", u"<", u">", u" ", u"。")
    charsToBlank = (u"・", u"·")

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    finishedList = utils.findFiles(timeInfoPath, filterExt=".txt")
    for fn in utils.findFiles(inputPath, filterExt=".txt",
                              skipIfNameInList=finishedList):
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            textList = fd.read().split("\n")

        unnamedForFN = 0
        unidentifiedForFN = 0
        outputRows = []
        for entry in textList:
            partList = entry.strip().split(" ", 3)
            if len(partList) < 4:
                continue
            speakerCode, startTime, stopTime, origLine = partList

            # Strip the characters that get in the way of chunking
            text = origLine
            for char in charsToDelete:
                text = text.replace(char, "")
            for char in charsToBlank:
                text = text.replace(char, " ")
            text = text.strip()

            # Fallback values; replaced only if the conversion succeeds
            words = [""]
            kana = [""]
            romaji = [""]
            try:
                chunked = jProcessingSnippet.getChunkedKana(text,
                                                            cabochaEncoding,
                                                            cabochaPath)
                words, kana, romaji = chunked
            except (jProcessingSnippet.ChunkingError,
                    jProcessingSnippet.NonKatakanaError) as e:
                print(u"%s, %s" % (str(e), origLine))
                unidentifiedForFN += 1
            except jProcessingSnippet.UnidentifiedJapaneseText as e:
                isUnnamed = all(char == u"X" for char in e.word)
                if isUnnamed:
                    unnamedForFN += 1
                else:
                    print(u"%s" % str(e))
                    unidentifiedForFN += 1
            except jProcessingSnippet.EmptyStrError:
                # Empty text is kept in the output with blank fields
                pass
            except Exception:
                # Show the offending text before letting the error through
                print(text)
                raise

            header = u"%s,%s,%s" % (speakerCode, startTime, stopTime)
            fieldList = [header, origLine, ','.join(words),
                         ",".join(kana), ",".join(romaji)]
            outputRows.append(";".join(fieldList))

        print(fn)
        print("Number of unnamed entities for fn: %d" % unnamedForFN)
        print("Number of unidentified utterances for fn: %d" %
              unidentifiedForFN)
        numUnnamedEntities += unnamedForFN
        numUnidentifiedUtterances += unidentifiedForFN

        with io.open(join(timeInfoPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(outputRows))

    print("\n")
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
def convertCorpusToKanaAndRomaji(inputPath, outputPath, cabochaEncoding,
                                 cabochaPath=None, encoding="cp932"):
    '''
    Reduces a corpus of typical Japanese text to both kana and romaji

    Each line of input should be of the form:
    startTime, stopTime, Japanese text

    Each line of output is of the form:
    name,startTime,stopTime;original text;words;kana;romaji
    with /name/ being the input file name stripped of its extension.

    One utf-8 output file is written to /outputPath/ per input file, using
    the input file's name.  A line with fewer than three comma-separated
    fields causes "error" to be printed and is otherwise ignored.
    '''
    utils.makeDir(outputPath)

    numUnnamedEntities = 0
    numUnidentifiedUtterances = 0
    numWordsProcessedWithNoError = 0

    fnList = utils.findFiles(inputPath, filterExt=".txt")
    for fn in fnList:
        # "rU" raises ValueError on python 3.11+; plain "r" translates
        # newlines in the same way
        with io.open(join(inputPath, fn), "r", encoding=encoding) as fd:
            textList = fd.read().split("\n")

        # Does not change from line to line, so it is looked up once
        name = os.path.splitext(fn)[0]

        numUnnamedEntitiesForFN = 0
        numUnidentifiedUtterancesForFN = 0
        speakerInfoList = []
        for line in textList:
            try:
                startTime, stopTime, origLine = line.strip().split(",", 2)
            except ValueError:
                print("error")
                continue

            dataPrepTuple = juliusAlignment.formatTextForJulius(
                origLine, cabochaEncoding, cabochaPath)

            # NOTE(review): tmpWordList, tmpKanaList, and tmpRomajiList
            # must be strings for the join below to work -- confirm what
            # formatTextForJulius() returns
            (_, tmpWordList, tmpKanaList, tmpRomajiList,
             unidentifiedUtterance, unnamedEntity,
             tmpWordCount) = dataPrepTuple

            # Counted against the current file first; previously these went
            # straight into the corpus totals, which left the per-file
            # report below stuck at 0
            numUnnamedEntitiesForFN += unnamedEntity
            numUnidentifiedUtterancesForFN += unidentifiedUtterance
            numWordsProcessedWithNoError += tmpWordCount

            outputList = [u"%s,%s,%s" % (name, startTime, stopTime), origLine,
                          tmpWordList, tmpKanaList, tmpRomajiList]
            speakerInfoList.append(";".join(outputList))

        hadProblems = (numUnnamedEntitiesForFN > 0 or
                       numUnidentifiedUtterancesForFN > 0)
        if hadProblems:
            print(fn)
            print("Number of unnamed entities for fn: %d" %
                  numUnnamedEntitiesForFN)
            print("Number of unidentified utterances for fn: %d" %
                  numUnidentifiedUtterancesForFN)

        numUnnamedEntities += numUnnamedEntitiesForFN
        numUnidentifiedUtterances += numUnidentifiedUtterancesForFN

        with io.open(join(outputPath, fn), "w", encoding="utf-8") as fd:
            fd.write("\n".join(speakerInfoList))

    print("\n")
    print("Number of transcripts converted: %d" % len(fnList))
    print("Number of unnamed entities: %d" % numUnnamedEntities)
    print("Number of unidentified utterances: %d" % numUnidentifiedUtterances)
    print("Number of words processed without error: %d" %
          numWordsProcessedWithNoError)