Beispiel #1
0
def extractPraatPitchForEpochs(pitchPath, epochPath, tgInfoPath, outputPath):
    '''
    Extracts pitch measures for each epoch of each file in pitchPath

    For every .txt file in pitchPath, the files with the same name in
    epochPath and tgInfoPath are loaded.  Only the pitch samples that fall
    inside one of the intervals listed in the tgInfoPath file are kept.
    Those samples are then grouped by epoch and each group is summarized
    with praat_pi.extractPitchMeasuresForSegment().

    pitchPath - folder containing the pitch listings
    epochPath - folder containing rows of: epoch number, start, stop
    tgInfoPath - folder containing rows of: start, stop, label
    outputPath - folder where <<name>>.txt is written for each input file
                 (one line per epoch)
    '''
    utils.makeDir(outputPath)

    for fn in utils.findFiles(pitchPath, filterExt=".txt"):
        name = os.path.splitext(fn)[0]

        # Parenthesized so that this runs under both python 2 and python 3
        print(name)

        epochList = utils.openCSV(epochPath, fn)
        epochList = [(epochNum, float(start), float(stop))
                     for epochNum, start, stop in epochList]

        entryList = utils.openCSV(tgInfoPath, fn)
        entryList = [(float(start), float(stop), label)
                     for start, stop, label in entryList]

        dataList = praat_pi.loadPitchAndTime(pitchPath, fn)

        # Get F0 values for the intervals when the mother was speaking
        speechDataList = []
        for start, stop, _label in entryList:
            speechDataList.extend(
                praat_pi.getAllValuesInTime(start, stop, dataList))

        # Get F0 values for the times the mother is speaking for each epoch
        # (start and stop were already converted to floats above)
        pitchData = []
        for epochNum, start, stop in epochList:
            epochValueList = praat_pi.getAllValuesInTime(start, stop,
                                                         speechDataList)
            f0List = [f0Val for _time, f0Val, _intVal in epochValueList]

            pitchData.append(
                praat_pi.extractPitchMeasuresForSegment(
                    f0List, name, epochNum,
                    medianFilterWindowSize=None,
                    filterZeroFlag=True))

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, "%s.txt" % name), "w") as fd:
            fd.write("\n".join(pitchData) + "\n")
Beispiel #2
0
def aggregateSpeechRate(tgInfoPath, speechRatePath, outputPath, samplingRate):
    '''
    Converts subset-relative sample numbers into absolute times in seconds

    Each .txt file in tgInfoPath lists the subsets (rows of: start, stop,
    label) that a larger recording was split into.  The files in
    speechRatePath whose names match the recording hold, for each subset,
    sample numbers counted from the start of that subset.  Each sample
    number is divided by samplingRate and offset by the start of its
    subset, so that it is expressed relative to the start of the larger
    recording.

    Files that already have an output in outputPath are skipped.

    tgInfoPath - folder with one subset listing per recording
    speechRatePath - folder with one file per subset; the first column
                     holds the sample numbers
    outputPath - folder where the output is written; one line per subset,
                 with comma-separated times
    samplingRate - number of samples per second
    '''
    utils.makeDir(outputPath)

    finishedList = utils.findFiles(outputPath, filterExt=".txt")

    for fn in utils.findFiles(tgInfoPath, filterExt=".txt",
                              skipIfNameInList=finishedList):

        # Load subset speech rate
        name = os.path.splitext(fn)[0]
        speechRateFNList = utils.findFiles(speechRatePath, filterExt=".txt",
                                           filterPattern=name)

        subSplitList = utils.openCSV(tgInfoPath, fn)

        outputList = []
        for splitInfo, speechRateFN in utils.safeZip([subSplitList,
                                                      speechRateFNList],
                                                     enforceLength=True):
            start, _stop, _label = splitInfo

            # Invariant for every sample of this subset, so convert once
            subsetStart = float(start)
            rate = float(samplingRate)

            speechRateList = utils.openCSV(speechRatePath, speechRateFN,
                                           valueIndex=0)
            speechRateList = [str(subsetStart + float(sampleNum) / rate)
                              for sampleNum in speechRateList
                              if sampleNum != '']

            outputList.append(",".join(speechRateList))

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(outputList) + "\n")
def manualPhoneCount(tgInfoPath, isleFN, outputPath, skipList=None):
    '''
    Writes the syllable and phone count of every labeled interval

    For each .txt file in tgInfoPath (rows of: start, stop, label) the
    label of every row is passed to isletool.getNumPhones() and the
    resulting counts are written, one "syllableCount,phoneCount" line per
    row, to a file of the same name in outputPath.

    Files that already exist in outputPath are not regenerated.

    tgInfoPath - folder containing the interval listings
    isleFN - the file that isletool.LexicalTool() is built from
    outputPath - folder where the counts are written
    skipList - labels that are not looked up; they are written as 0,0
    '''
    if skipList is None:
        skipList = []

    utils.makeDir(outputPath)

    isleDict = isletool.LexicalTool(isleFN)

    # Only previously written .txt outputs should cause a file to be
    # skipped, so filter on the extension (filterExt, not filterPaths)
    existFNList = utils.findFiles(outputPath, filterExt=".txt")
    for fn in utils.findFiles(tgInfoPath, filterExt=".txt",
                              skipIfNameInList=existFNList):

        if os.path.exists(join(outputPath, fn)):
            continue
        print(fn)

        dataList = utils.openCSV(tgInfoPath, fn)
        labelList = [row[2] for row in dataList]  # start, stop, tmpLabel
        outputList = []
        for tmpLabel in labelList:
            if tmpLabel in skipList:
                syllableCount, phoneCount = 0, 0
            else:
                syllableCount, phoneCount = isletool.getNumPhones(isleDict,
                                                                  tmpLabel,
                                                                  maxFlag=True)

            outputList.append("%d,%d" % (syllableCount, phoneCount))

        outputTxt = "\n".join(outputList)

        with open(join(outputPath, fn), "w") as fd:
            fd.write(outputTxt)
Beispiel #4
0
def toAbsoluteTime(namePrefix, matlabOutputPath, startTimeList):
    '''
    Shifts subset-relative times so they are relative to the source file

    A source file may have been split into several pieces.  All .txt files
    in matlabOutputPath that match namePrefix are treated as the pieces of
    one source file.

    namePrefix - the pattern used to select the files of one source file
    matlabOutputPath - the folder holding the files to convert; the times
                       are read from the first column of each file
    startTimeList - the start time of each piece; there must be exactly
                    one entry for each matching file

    Returns a list of lists.  Each sublist holds the shifted times (as
    strings) of one matching file.
    '''
    pieceFNList = utils.findFiles(matlabOutputPath,
                                  filterExt=".txt",
                                  filterPattern=namePrefix)
    pairedList = utils.safeZip([startTimeList, pieceFNList],
                               enforceLength=True)

    absoluteTimeList = []
    for pieceStart, pieceFN in pairedList:
        relativeTimeList = utils.openCSV(matlabOutputPath,
                                         pieceFN,
                                         valueIndex=0)

        shiftedList = []
        for relativeTime in relativeTimeList:
            # Empty cells carry no time value
            if relativeTime == '':
                continue
            shiftedList.append(str(float(pieceStart) + float(relativeTime)))

        absoluteTimeList.append(shiftedList)

    return absoluteTimeList
Beispiel #5
0
def toAbsoluteTime(namePrefix, matlabOutputPath, startTimeList):
    '''
    Converts times measured within a piece into times within the whole

    The files in matlabOutputPath whose names match namePrefix are assumed
    to be consecutive pieces of the same source file.  The start time of a
    piece is added to every time found in the first column of its file.

    namePrefix - the pattern that identifies the pieces of one source file
    matlabOutputPath - the folder that contains the .txt files to convert
    startTimeList - one start time per matching file

    Returns one list per matching file; each holds the converted times
    formatted as strings.
    '''
    def _shiftTimes(offset, fn):
        # Blank entries are dropped before the offset is applied
        valueList = utils.openCSV(matlabOutputPath, fn, valueIndex=0)
        nonEmptyList = [value for value in valueList if value != '']
        return [str(float(offset) + float(value)) for value in nonEmptyList]

    matchedFNList = utils.findFiles(matlabOutputPath, filterExt=".txt",
                                    filterPattern=namePrefix)

    return [_shiftTimes(offset, fn)
            for offset, fn in utils.safeZip([startTimeList, matchedFNList],
                                            enforceLength=True)]
Beispiel #6
0
def removeFilledPauses(inputPath, outputPath):
    '''
    Copies the interval listings, keeping only the rows labeled "MS"

    Each .txt file in inputPath holds rows of: start, stop, label.  Rows
    with any other label are dropped and the remaining rows are written to
    a file of the same name in outputPath.

    inputPath - folder containing the interval listings
    outputPath - folder where the filtered listings are written
    '''
    utils.makeDir(outputPath)

    for fn in utils.findFiles(inputPath, filterExt=".txt"):
        dataList = utils.openCSV(inputPath, fn)
        keptRowList = [",".join([start, stop, label])
                       for start, stop, label in dataList
                       if label == "MS"]

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(keptRowList) + "\n")
Beispiel #7
0
def adjustEpochNumbers(inputPath, outputPath):
    '''
    Increments the epoch number of every row by one

    Each .txt file in inputPath holds rows of: epoch number, start, stop.
    The epoch number is incremented and zero-padded to two digits; start
    and stop are copied unchanged.  The result is written to a file of the
    same name in outputPath.

    inputPath - folder containing the epoch listings
    outputPath - folder where the renumbered listings are written
    '''
    utils.makeDir(outputPath)

    for fn in utils.findFiles(inputPath, filterExt=".txt"):
        dataList = utils.openCSV(inputPath, fn)
        # epochNum rather than id, which would shadow the builtin
        dataList = ["%02d,%s,%s" % (int(epochNum) + 1, start, stop)
                    for epochNum, start, stop in dataList]

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(dataList) + "\n")
def manualPhoneCountForEpochs(manualCountsPath, tgInfoPath, epochPath,
                              outputPath):
    '''
    Sums the syllable counts, phone counts, and speech time of each epoch

    The files of the same name in epochPath, tgInfoPath, and
    manualCountsPath are read together.  Row i of the counts file belongs
    to row i of the interval file.  An interval contributes to an epoch in
    proportion to the value percentInside() returns for that pair.

    Files that already have an output in outputPath are skipped.

    manualCountsPath - folder with rows of: syllable count, phone count
    tgInfoPath - folder with rows whose first two values are start, stop
    epochPath - folder with rows of: epoch number, start, stop
    outputPath - folder where one line per epoch is written:
                 syllable count, phone count, speech duration
    '''
    utils.makeDir(outputPath)

    finishedFNList = utils.findFiles(outputPath, filterExt=".txt")
    pendingFNList = utils.findFiles(tgInfoPath, filterExt=".txt",
                                    skipIfNameInList=finishedFNList)

    for fn in pendingFNList:
        epochRowList = utils.openCSV(epochPath, fn)
        intervalRowList = utils.openCSV(tgInfoPath, fn)
        countRowList = utils.openCSV(manualCountsPath, fn)

        rowList = []
        for epochRow in epochRowList:
            # epochRow is: epoch number, start, stop
            epochStart = float(epochRow[1])
            epochStop = float(epochRow[2])

            syllableTotal = 0
            phoneTotal = 0
            durationTotal = 0
            pairedRowList = utils.safeZip([intervalRowList, countRowList],
                                          enforceLength=True)
            for intervalRow, countRow in pairedRowList:
                start = float(intervalRow[0])
                stop = float(intervalRow[1])
                syllableCount = float(countRow[0])
                phoneCount = float(countRow[1])

                # Weights intervals that straddle an epoch boundary
                weight = percentInside(start, stop, epochStart, epochStop)

                durationTotal += (stop - start) * weight
                syllableTotal += syllableCount * weight
                phoneTotal += phoneCount * weight

            rowList.append("%f,%f,%f" % (syllableTotal,
                                         phoneTotal,
                                         durationTotal))

        outputTxt = "\n".join(rowList)
        with open(join(outputPath, fn), "w") as fd:
            fd.write(outputTxt)
Beispiel #9
0
def generateEpochRowHeader(epochPath, outputPath, sessionCode):
    '''
    Builds the leading columns that identify each epoch of each file

    Each .txt file in epochPath holds rows of: epoch number, start, stop.
    For every row, one line is written with the columns:
    file id, sessionCode, epoch number, start, stop, duration

    The file id is the third "_"-separated field of the file name.

    epochPath - folder containing the epoch listings
    outputPath - folder where the header rows are written
    sessionCode - a string that is copied into every row
    '''
    utils.makeDir(outputPath)

    for fn in utils.findFiles(epochPath, filterExt=".txt"):
        epochList = utils.openCSV(epochPath, fn)

        # fileID rather than id, which would shadow the builtin
        fileID = fn.split("_")[2]

        outputList = []
        for epoch, epochStart, epochEnd in epochList:
            duration = str(float(epochEnd) - float(epochStart))
            outputList.append(",".join([fileID, sessionCode, epoch,
                                        epochStart, epochEnd, duration]))

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(outputList) + "\n")
def aggregateFeatures(featurePath, featureList, headerStr=None):
    '''
    Merges the per-feature files into one table per file and one overall

    Each feature has its own subfolder of featurePath.  For every .txt
    file that exists in all of the feature subfolders, the rows of the
    features are placed side by side, prefixed with the file name (no
    extension), and written to <<featurePath>>/aggr/<<name>>.csv.  All
    feature files of one name must have the same number of rows.

    Afterwards every .csv file in the aggr folder (other than all.csv) is
    concatenated into aggr/all.csv.

    featurePath - folder containing one subfolder per feature
    featureList - the names of the feature subfolders, in column order;
                  if empty, no per-file tables are generated
    headerStr - if not None, written as the first line of all.csv
    '''
    outputDir = join(featurePath, "aggr")
    utils.makeDir(outputDir)

    # Find the files that exist in all features
    fnList = [utils.findFiles(join(featurePath, feature), filterExt=".txt")
              for feature in featureList]

    actualFNList = []
    if fnList:  # Guards fnList[0] against an empty featureList
        # Sets make each membership test constant time
        otherFNSetList = [set(subList) for subList in fnList[1:]]
        actualFNList = [featureFN for featureFN in fnList[0]
                        if all(featureFN in fnSet
                               for fnSet in otherFNSetList)]

    for featureFN in actualFNList:
        dataList = []
        for feature in featureList:
            featureDataList = utils.openCSV(join(featurePath, feature),
                                            featureFN,
                                            encoding="utf-8")
            dataList.append([",".join(row) for row in featureDataList])

        name = os.path.splitext(featureFN)[0]

        # The first column repeats the file name on every row
        dataList.insert(0, [name] * len(dataList[0]))
        tDataList = utils.safeZip(dataList, enforceLength=True)
        outputList = [",".join(row) for row in tDataList]
        outputTxt = "\n".join(outputList)

        outputFN = join(outputDir, name + ".csv")
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write(outputTxt)

    # Cat all files together
    aggrOutput = []

    if headerStr is not None:
        aggrOutput.append(headerStr)

    for fn in utils.findFiles(outputDir, filterExt=".csv"):
        # A previous run's combined output must not be folded into itself
        if fn == "all.csv":
            continue
        with io.open(join(outputDir, fn), "r", encoding='utf-8') as fd:
            aggrOutput.append(fd.read())

    with io.open(join(outputDir, "all.csv"), "w", encoding='utf-8') as fd:
        fd.write("\n".join(aggrOutput))
def aggregateFeatures(featurePath, featureList, headerStr=None):
    '''
    Joins the feature files column-wise and concatenates the results

    Every feature lives in a subfolder of featurePath.  Only the .txt
    files that are present in every feature subfolder are processed.  For
    each of them the rows of all features are joined side by side, with
    the file name (no extension) as the first column, and saved as
    <<featurePath>>/aggr/<<name>>.csv.  The feature files for one name must
    all have the same number of rows.

    Finally, all .csv files in the aggr folder except all.csv are
    concatenated into aggr/all.csv.

    featurePath - folder containing one subfolder per feature
    featureList - the names of the feature subfolders, in column order;
                  if empty, no per-file tables are generated
    headerStr - if not None, written as the first line of all.csv
    '''
    outputDir = join(featurePath, "aggr")
    utils.makeDir(outputDir)

    # Find the files that exist in all features
    fnList = []
    for feature in featureList:
        fnSubList = utils.findFiles(join(featurePath, feature),
                                    filterExt=".txt")
        fnList.append(fnSubList)

    # With no features there is nothing to intersect; without this guard
    # fnList[0] would raise an IndexError
    actualFNList = []
    if len(fnList) > 0:
        # Sets make each membership test constant time
        fnSetList = [set(subList) for subList in fnList]
        for featureFN in fnList[0]:
            if all(featureFN in fnSet for fnSet in fnSetList):
                actualFNList.append(featureFN)

    for featureFN in actualFNList:
        name = os.path.splitext(featureFN)[0]

        columnList = []
        for feature in featureList:
            featureDataList = utils.openCSV(join(featurePath, feature),
                                            featureFN, encoding="utf-8")
            columnList.append([",".join(row) for row in featureDataList])

        # The first column repeats the file name on every row
        nameColumn = [name] * len(columnList[0])
        tDataList = utils.safeZip([nameColumn] + columnList,
                                  enforceLength=True)
        outputTxt = "\n".join([",".join(row) for row in tDataList])

        outputFN = join(outputDir, name + ".csv")
        with io.open(outputFN, "w", encoding="utf-8") as fd:
            fd.write(outputTxt)

    # Cat all files together
    aggrOutput = []

    if headerStr is not None:
        aggrOutput.append(headerStr)

    for fn in utils.findFiles(outputDir, filterExt=".csv"):
        # A previous run's combined output must not be folded into itself
        if fn == "all.csv":
            continue
        with io.open(join(outputDir, fn), "r", encoding='utf-8') as fd:
            aggrOutput.append(fd.read())

    with io.open(join(outputDir, "all.csv"), "w", encoding='utf-8') as fd:
        fd.write("\n".join(aggrOutput))
Beispiel #12
0
def uwePhoneCountForEpochs(epochPath, tgInfoPath, manualCountsPath, outputPath):
    '''
    Sums the syllable counts of the intervals that fall in each epoch

    The files of the same name in epochPath, tgInfoPath, and
    manualCountsPath are read together.  Row i of the counts file belongs
    to row i of the interval file; the syllable count of an interval is
    the number of values in its counts row.  An interval contributes to an
    epoch in proportion to the value _percentInside() returns for that
    pair.

    epochPath - folder with rows of: epoch number, start, stop
    tgInfoPath - folder with rows of: start, stop, words
    manualCountsPath - folder with one row of values per interval
    outputPath - folder where one syllable count per epoch is written
    '''
    utils.makeDir(outputPath)

    for fn in utils.findFiles(tgInfoPath, filterExt=".txt"):
        # Parenthesized so that this runs under both python 2 and python 3
        print(fn)
        epochList = utils.openCSV(epochPath, fn)
        tgInfo = utils.openCSV(tgInfoPath, fn)
        manualCounts = utils.openCSV(manualCountsPath, fn)

        epochOutputList = []
        for _epochNumber, epochStart, epochStop in epochList:
            epochStart, epochStop = float(epochStart), float(epochStop)

            # Find all of the intervals that are at least partially
            # contained within the current epoch
            epochSyllableCount = 0
            for info, nucleusList in utils.safeZip([tgInfo, manualCounts],
                                                   enforceLength=True):
                start, stop, _wordList = info
                start, stop = float(start), float(stop)

                syllableCount = len(nucleusList)

                # Accounts for intervals that straddle an epoch boundary
                multiplicationFactor = _percentInside(start, stop, epochStart,
                                                      epochStop)

                epochSyllableCount += syllableCount * multiplicationFactor

            epochOutputList.append("%f" % (epochSyllableCount))

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(epochOutputList) + "\n")
Beispiel #13
0
def addEpochsToTextgrids(tgPath, epochPath, outputPath):
    '''
    Adds an interval tier named "epochs" to every textgrid in tgPath

    For each <<name>>.TextGrid in tgPath, the epochs listed in
    <<name>>.txt in epochPath (rows of: label, start, end) become the
    intervals of the new tier.  The tier spans from 0 to the textgrid's
    maxTimestamp.  The modified textgrid is saved under the same name in
    outputPath.

    tgPath - folder containing the textgrids
    epochPath - folder containing the epoch listings
    outputPath - folder where the modified textgrids are saved
    '''
    utils.makeDir(outputPath)

    for name in utils.findFiles(tgPath, filterExt=".TextGrid", stripExt=True):
        # Parenthesized so that this runs under both python 2 and python 3
        print(name)
        tg = tgio.openTextGrid(join(tgPath, name + ".TextGrid"))

        # The listing stores the label first; a tier entry stores it last
        entryList = utils.openCSV(epochPath, name + ".txt")
        entryList = [(float(start), float(end), label)
                     for label, start, end in entryList]

        tier = tgio.IntervalTier("epochs", entryList,
                                 minT=0, maxT=tg.maxTimestamp)

        tg.addTier(tier)
        tg.save(join(outputPath, name + ".TextGrid"))
Beispiel #14
0
def findFrequenciesForWordLists(featurePath, countObj, frequencyNormFunc):
    '''
    Looks up the frequency of every word in the files of the words folder

    The first column of each file in <<featurePath>>/words is read as a
    list of words.  For every word, countObj.getFrequency() is queried and
    the three values it returns are written as one "count,freq,logFreq"
    line to a file of the same name in <<featurePath>>/frequency.

    featurePath - folder that contains the "words" subfolder; the
                  "frequency" subfolder is created inside of it
    countObj - an object with a getFrequency() method
    frequencyNormFunc - passed through to countObj.getFrequency()
    '''
    def _toRow(word):
        # getFrequency() must return exactly three values
        count, freq, logFreq = countObj.getFrequency(word,
                                                     frequencyNormFunc,
                                                     outOfDictionaryValue=1)
        return "%f,%f,%f" % (count, freq, logFreq)

    frequencyPath = join(featurePath, "frequency")
    utils.makeDir(frequencyPath)

    wordsPath = join(featurePath, "words")

    for fn in utils.findFiles(wordsPath):
        wordList = utils.openCSV(wordsPath, fn, valueIndex=0, encoding="utf-8")
        outputTxt = "\n".join([_toRow(word) for word in wordList])

        with open(join(frequencyPath, fn), "w") as fd:
            fd.write(outputTxt)
Beispiel #15
0
def findFrequenciesForWordLists(featurePath, countObj, frequencyNormFunc):
    '''
    Writes the frequency information of each word of each word list

    Every file in <<featurePath>>/words is treated as a word list (one
    word per row, taken from the first column).  Each word is passed to
    countObj.getFrequency(); the count, frequency, and log frequency that
    come back are written as one comma-separated line.  The output file
    has the same name as the word list and is placed in
    <<featurePath>>/frequency.

    featurePath - folder that contains the "words" subfolder; the
                  "frequency" subfolder is created inside of it
    countObj - an object with a getFrequency() method
    frequencyNormFunc - passed through to countObj.getFrequency()
    '''
    inputDir = join(featurePath, "words")
    outputDir = join(featurePath, "frequency")
    utils.makeDir(outputDir)

    for wordListFN in utils.findFiles(inputDir):
        rowList = []
        for word in utils.openCSV(inputDir, wordListFN, valueIndex=0,
                                  encoding="utf-8"):
            count, freq, logFreq = countObj.getFrequency(
                word, frequencyNormFunc, outOfDictionaryValue=1)
            rowList.append("%f,%f,%f" % (count, freq, logFreq))

        outputFN = join(outputDir, wordListFN)
        with open(outputFN, "w") as fd:
            fd.write("\n".join(rowList))
Beispiel #16
0
def medianFilter(f0Path, outputPath, windowSize):
    '''
    Applies a median filter to the f0 column of every file in f0Path

    Each .txt file in f0Path holds rows of: time, f0Val, intensityVal.
    f0 values of "--undefined--" are treated as 0 before filtering.  The
    filtered f0 values are written with three decimal places; the time and
    intensity columns are copied unchanged.

    f0Path - folder containing the pitch listings
    outputPath - folder where the filtered listings are written
    windowSize - the size of the median filter window; must be odd
    '''
    # windowSize must be odd
    assert (windowSize % 2 != 0), "windowSize must be odd"

    utils.makeDir(outputPath)

    for fn in utils.findFiles(f0Path, filterExt=".txt"):
        valueList = utils.openCSV(f0Path, fn)

        f0List = [
            float(row[1]) if row[1] != "--undefined--" else 0
            for row in valueList
        ]  # time, f0Val, intensityVal
        f0Filtered = filters.medianFilter(f0List,
                                          windowSize,
                                          useEdgePadding=True)

        outputList = [
            "%s,%0.3f,%s" % (row[0], f0Val, row[2])
            for row, f0Val in zip(valueList, f0Filtered)
        ]

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, fn), "w") as fd:
            fd.write("\n".join(outputList) + "\n")
Beispiel #17
0
def eventStructurePerEpoch(epochPath, fullyFilteredTGPath,
                           childFilteredTGPath, noiseFilteredTGPath,
                           unfilteredTGPath, outputPath,
                           speechTierName, laughterTierName):
    '''
    How frequent and with what duration did laughter, pauses, and speech occur

    For each <<name>>.txt in epochPath (rows of: epoch number, start,
    stop), <<name>>.TextGrid is opened in each of the four textgrid
    folders.  Every textgrid is cropped to each epoch and the intervals
    labeled "MS" (speech), "FP" (pauses), and "LA" (laughter) are measured.

    One line is written per epoch, with the columns:
        speech duration and speech count (fully filtered),
        speech duration (child filtered),
        speech duration (noise filtered),
        speech duration (unfiltered),
        unfiltered minus fully filtered speech duration,
        pause duration and pause count (fully filtered),
        laughter duration and laughter count (fully filtered)

    epochPath - folder containing the epoch listings
    fullyFilteredTGPath, childFilteredTGPath, noiseFilteredTGPath,
    unfilteredTGPath - folders containing the four versions of the
                       textgrids
    outputPath - folder where <<name>>.txt is written for each input file
    speechTierName - the tier holding the "MS" and "FP" intervals
    laughterTierName - the tier holding the "LA" intervals
    '''

    def _getCountsAndDurations(tier, searchLabel):
        # Returns (total duration, count) of the entries tier.find() yields
        entryList = tier.find(searchLabel)
        durationList = [float(stop) - float(start)
                        for start, stop, label in entryList]
        count = len(entryList)

        return sum(durationList), count

    def _cropToEpoch(sourceTG, start, stop):
        # Every textgrid is cropped with the same settings
        return sourceTG.crop(strictFlag=False, softFlag=False,
                             startTime=start, endTime=stop)

    def _speechDuration(sourceTG, start, stop):
        # Total duration of the "MS" intervals of the speech tier in an epoch
        speechTier = _cropToEpoch(sourceTG, start, stop).tierDict[speechTierName]
        return _getCountsAndDurations(speechTier, "MS")[0]

    utils.makeDir(outputPath)

    for name in utils.findFiles(epochPath, filterExt=".txt", stripExt=True):

        epochList = utils.openCSV(epochPath, name + ".txt")
        epochList = [(epochNum, float(start), float(stop))
                     for epochNum, start, stop in epochList]

        tgFN = name + ".TextGrid"
        tg = tgio.openTextGrid(join(fullyFilteredTGPath, tgFN))
        childFilteredTG = tgio.openTextGrid(join(childFilteredTGPath, tgFN))
        noiseFilteredTG = tgio.openTextGrid(join(noiseFilteredTGPath, tgFN))
        origTG = tgio.openTextGrid(join(unfilteredTGPath, tgFN))

        outputList = []
        for _epochNum, start, stop in epochList:
            subTG = _cropToEpoch(tg, start, stop)

            speechTier = subTG.tierDict[speechTierName]
            laughterTier = subTG.tierDict[laughterTierName]

            pauseDur, numPauses = _getCountsAndDurations(speechTier, "FP")
            speechDur, numSpeech = _getCountsAndDurations(speechTier, "MS")
            laughDur, numLaughter = _getCountsAndDurations(laughterTier, "LA")

            # Only the durations of the other textgrids are reported
            csFiltSpeech = _speechDuration(childFilteredTG, start, stop)
            nsFiltSpeech = _speechDuration(noiseFilteredTG, start, stop)
            fullSpeechDur = _speechDuration(origTG, start, stop)

            epochTuple = (speechDur, numSpeech, csFiltSpeech, nsFiltSpeech,
                          fullSpeechDur, fullSpeechDur - speechDur,
                          pauseDur, numPauses, laughDur, numLaughter)
            outputList.append("%.02f, %d, %.02f, %.02f, %.02f, %.02f, %.02f, %d, %.02f, %d" % epochTuple)

        # The context manager guarantees the file is flushed and closed
        with open(join(outputPath, name + ".txt"), "w") as fd:
            fd.write("\n".join(outputList) + "\n")