Example #1
def getFeatureFilesDetails(featureNames):
    '''
    Get the files for each named feature, with their details, and save the
    table to 'Feature File Details.csv'
    '''
    fileDetails = []

    for featureName in featureNames:
        # Get names of feature folders
        rootPath = FFP.getRootPath(featureName)
        featuresPath = FFP.getFeatureFolderPath(rootPath, featureName)
        pieceFolders = getFolderNames(featuresPath, orderAlphabetically=True)
        # Iterate over pieces
        for pieceFolder in pieceFolders:
            performanceFiles = getFileNames(featuresPath + pieceFolder,
                                            endsWith='.csv',
                                            orderAlphabetically=True)
            # Iterate over performances
            for performanceFile in performanceFiles:
                fileDetails.append({
                    'Feature': featureName,
                    'Piece': pieceFolder,
                    'Performance': rcut(performanceFile,
                                        FFP.fileSuffix[featureName]),
                    'Filename': performanceFile
                })

    df = pd.DataFrame(fileDetails)
    df.to_csv('Feature File Details.csv')
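A minimal usage sketch for Example #1; the feature names are taken from the docstring in Example #11 and may need adjusting to the features actually present on disk:

# Writes 'Feature File Details.csv' to the working directory
getFeatureFilesDetails(['chroma', 'mfcc'])
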
Example #2
def getFeatureValuesDataFrame(featureName,
                              numFolders,
                              numFilesPerFolder,
                              featureFileType='.csv'):
    '''
    Returns a dataframe of the feature values for a specified number of performances 
    '''
    # Get names of feature folders
    rootPath = FFP.getRootPath(featureName)
    pieceFolders = getFolderNames(rootPath,
                                  contains='mazurka',
                                  orderAlphabetically=True)
    if numFolders is not None:
        pieceFolders = pieceFolders[:numFolders]

    featureDataFrames = []
    # Iterate over pieces
    for pieceFolder in pieceFolders:
        print 'processing folder: %s' % pieceFolder
        featuresPath = FFP.getFeatureFolderPath(rootPath + pieceFolder,
                                                featureName)
        performanceFiles = getFileNames(
            featuresPath, endsWith=featureFileType,
            orderAlphabetically=True)[:numFilesPerFolder]
        # Iterate over performances
        for performanceFile in performanceFiles:
            print '\tprocessing file: %s' % performanceFile
            featureFn = os.path.join(featuresPath, performanceFile)
            if featureFileType == '.csv':
                featureDataFrames.append(
                    pd.read_csv(featureFn, header=None, index_col=0))
            elif featureFileType == '.pkl':
                featureDataFrames.append(pd.read_pickle(featureFn))

    dfAllPerformances = pd.concat(featureDataFrames, ignore_index=True)
    return dfAllPerformances
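A hedged call sketch for Example #2, assuming the 'chroma' feature and CSV feature files (parameter values are illustrative):

# First 5 CSV performances from every mazurka folder, concatenated row-wise
dfChroma = getFeatureValuesDataFrame('chroma', numFolders=None, numFilesPerFolder=5)
print dfChroma.shape
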
Example #3
def getFeatureFileDict(piecesPath,
                       pieceFolder,
                       featuresPath,
                       featureName,
                       numFiles=None):
    '''
    Returns a dictionary of all the performances of a given piece.
    The dictionary keys are the filenames with the standard suffix for that feature
    removed.
    The dictionary values are FeatureFileProps objects holding the FileName,
    FilePath and PieceId.
    If numFiles is None then all files are returned; otherwise only the first
    numFiles files (alphabetically) are returned.
    '''
    featureFileDict = {}
    featureFileSuffix = FeatureFileProps.fileSuffix[featureName]
    featureFileNames = sorted(
        getFileNames(featuresPath, featureFileSuffix, True))
    if numFiles is not None:
        featureFileNames = featureFileNames[:numFiles]
    featureFilePaths = [featuresPath + fName for fName in featureFileNames]
    featureFileIds = [
        fName[:-len(featureFileSuffix)] for fName in featureFileNames
    ]
    for i in np.arange(len(featureFileIds)):
        fileName = featureFileNames[i]
        filePath = featureFilePaths[i]
        pieceId = pieceFolder
        featureFileDict[featureFileIds[i]] = FeatureFileProps(
            fileName, filePath, pieceId)

    return featureFileDict
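A sketch of how the returned dictionary might be consumed; the folder names and paths are illustrative, and the attribute name follows the FeatureFileProps usage in Example #11:

piecesPath = '/data/mazurkas/'                       # illustrative root folder
featuresPath = piecesPath + 'mazurka06-1/chroma/'    # illustrative features folder
fileDict = getFeatureFileDict(piecesPath, 'mazurka06-1', featuresPath, 'chroma', numFiles=10)
for perfId in sorted(fileDict.keys()):
    print perfId, fileDict[perfId].filePath
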
def loadCRPfiles():
    '''
    Loads all CRP files into memory
    '''
    CRPfiles = sorted(getFileNames(CRPpath, '.npy'))
    CRPs = []
    for CRPfile in CRPfiles:
        fCRP = open(CRPpath + CRPfile, 'rb')
        CRPdata = fCRP.read()
        fCRP.close()
        CRPs.append(CRPdata)

    return CRPfiles, CRPs
Example #5
def cleanRunFolder(runName=None, cleanCRPfolder=True, cleanNCDfolder=True):

    # Remove CRP files
    if cleanCRPfolder:
        crpFiles = getFileNames(CRPpath, endsWith='.npy')
        for crpFile in crpFiles:
            os.remove(CRPpath + crpFile)

    # Remove NCD files
    if cleanNCDfolder:
        ncdFiles = getFileNames(NCDpath, endsWith='.pkl')
        for ncdFile in ncdFiles:
            os.remove(NCDpath + ncdFile)

    # Empty Run Folder and Run History Folder
    if runName is not None:
        resultsPath = NCDpath + runName + '/'
        resultsFiles = getFileNames(resultsPath)
        for resultsFile in resultsFiles:
            os.remove(resultsPath + resultsFile)
        historyPath = runHistoryPath + runName + '/'
        historyFiles = getFileNames(historyPath)
        for historyFile in historyFiles:
            os.remove(historyPath + historyFile)
Example #6
def getRunHistoryDataFrame():
    '''
    Loads the history of runs and returns a dataframe of the settings
    '''
    lstRunHistory = []
    runHistoryFiles = getFileNames(runHistoryPath,
                                   endsWith='.pkl',
                                   orderAlphabetically=True)
    for rhFile in runHistoryFiles:
        lstRunHistory.append(pickle.load(open(runHistoryPath + rhFile, 'rb')))

    df = pd.DataFrame(lstRunHistory)
    return df
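A short usage note for Example #6; the column names assumed here are the run-dictionary keys filtered on in Example #16:

dfRuns = getRunHistoryDataFrame()
print dfRuns[['featureName', 'method', 'dimension', 'sequenceLength']].head()
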
Example #7
    def getFeatureFileDictTestSet(cls, pieceFolder, featuresPath, featureName):

        featureFileDict = {}
        featureFileSuffix = FeatureFileProps.getFileSuffix(featureName)
        featureFileNames = getFileNames(featuresPath,
                                        endsWith=featureFileSuffix,
                                        orderAlphabetically=True)
        featureFileNames = getTestSetPerformances(featureFileNames)
        featureFilePaths = [featuresPath + fName for fName in featureFileNames]
        featureFileIds = [
            fName[:-len(featureFileSuffix)] for fName in featureFileNames
        ]
        for i in range(len(featureFileIds)):
            fileName = featureFileNames[i]
            filePath = featureFilePaths[i]
            pieceId = pieceFolder
            featureFileDict[featureFileIds[i]] = FeatureFileProps(
                fileName, filePath, pieceId, featureFileIds[i])

        return featureFileDict
Example #8
def getFeatureFrequenciesDataFrame(featureName, weightMatrix, biasesMatrix,
                                   featureOffset, featureScaling,
                                   NNtimeStacking, numFolders,
                                   numFilesPerFolder):

    # Get the folders (performances)
    piecesPath = FFP.getRootPath(featureName)
    # 'contains' parameter added to avoid the new powerspectrum folder
    piecesFolders = getFolderNames(piecesPath,
                                   contains='mazurka',
                                   orderAlphabetically=True)[:20]
    if numFolders is not None:
        piecesFolders = piecesFolders[:numFolders]

    # For each piece
    featureDataFrames = []
    for piecesFolder in piecesFolders:
        # Get performances of the piece
        featuresPath = FFP.getFeatureFolderPath(piecesPath + piecesFolder,
                                                featureName)
        performances = getFileNames(featuresPath,
                                    orderAlphabetically=True,
                                    endsWith='.csv')
        if numFilesPerFolder is not None:
            performances = performances[:numFilesPerFolder]
        pf = 0
        for performance in performances:
            pf += 1
            print 'Transforming Features %i' % pf
            # Load feature file and transform
            dfTransformedFeatures = loadAndTransformFeatureFile(
                featuresPath + performance, featureOffset, featureScaling,
                NNtimeStacking, weightMatrix, biasesMatrix)
            featureDataFrames.append(dfTransformedFeatures)

    # Calculate Histogram of the transformed features
    dfAllPerformances = pd.concat(featureDataFrames, ignore_index=True)

    return dfAllPerformances
def getFeatureFileDict(piecesPath, pieceFolder, featuresPath, featureName):
    '''
    Returns a dictionary of all the performances of a given piece
    The dictionary keys are the filenames with the standard suffix for that feature
    removed.
    The dictionary values are new dictionaries with keys and associated values for
    FileName, FilePath and PieceId
    '''
    featureFileDict = {}
    featureFileSuffix = featuresDict[featureName]['file suffix']
    featureFileNames = getFileNames(featuresPath, featureFileSuffix)
    featureFilePaths = [featuresPath + fName for fName in featureFileNames]
    featureFileIds = [
        fName[:-len(featureFileSuffix)] for fName in featureFileNames
    ]
    for i in np.arange(len(featureFileIds)):
        featureFileDict[featureFileIds[i]] = {}
        featureFileDict[featureFileIds[i]]['FileName'] = featureFileNames[i]
        featureFileDict[featureFileIds[i]]['FilePath'] = featureFilePaths[i]
        featureFileDict[featureFileIds[i]]['PieceId'] = pieceFolder

    return featureFileDict
def getFeatureFileDict(piecesPath, pieceFolder, featuresPath, featureName,
                       runType):
    '''
    Returns a dictionary of all the performances of a given piece.
    The dictionary keys are the filenames with the standard suffix for that feature
    removed.
    The dictionary values are FeatureFileProps objects holding the FileName,
    FilePath and PieceId.
    runType selects which subset of performances is returned:
        'training', 'validation' or 'test'.
    '''
    featureFileDict = {}
    featureFileSuffix = FeatureFileProps.getFileSuffix(featureName)
    featureFileNames = getFileNames(featuresPath,
                                    endsWith=featureFileSuffix,
                                    orderAlphabetically=True)

    if runType == 'training':
        featureFileNames = getTrainingSetPerformances(featureFileNames)
    elif runType == 'validation':
        featureFileNames = getValidationSetPerformances(featureFileNames)
    elif runType == 'test':
        featureFileNames = getTestSetPerformances(featureFileNames)

    featureFilePaths = [featuresPath + fName for fName in featureFileNames]
    featureFileIds = [
        fName[:-len(featureFileSuffix)] for fName in featureFileNames
    ]
    for i in np.arange(len(featureFileIds)):
        fileName = featureFileNames[i]
        filePath = featureFilePaths[i]
        pieceId = pieceFolder
        featureFileDict[featureFileIds[i]] = FeatureFileProps(
            fileName, filePath, pieceId, featureFileIds[i])

    return featureFileDict
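A call sketch for the runType variant, selecting only the training-set performances (paths and folder names illustrative):

featuresPath = '/data/mazurkas/mazurka06-1/chroma/'   # illustrative features folder
trainDict = getFeatureFileDict('/data/mazurkas/', 'mazurka06-1', featuresPath, 'chroma',
                               runType='training')
print '%i training performances' % len(trainDict)
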
Example #11
def createNCDfiles(existingNCDs,
                   processPool,
                   featureName,
                   downSampleFactor,
                   timeDelay,
                   dimension,
                   method,
                   neighbourhoodSize,
                   numFilesPerFolder,
                   sequenceLength,
                   weightMatrix=None,
                   biases=None,
                   featureOffset=0.0,
                   featureScaling=1.0):
    '''
    Inputs:
        :existingNCDs: a list of existing NCD files in order to avoid duplication
        :processPool: a pool of multiprocessing processes to use for running the script
        :featureName: the name of the feature e.g. 'chroma', 'mfcc'
        :downSampleFactor: the factor to use in downsampling the original signals before creating CRPs
        :timeDelay: the time delay to use in creating the CRPs
        :dimension: the embedding dimension to use in creating the CRPs
        :method: the method to use in creating the CRPs
        :neighbourhoodSize: the neighbourhood size to use in creating the CRPs
        :numFilesPerFolder: the number of performances of each piece to use - set to None to use all performances
        :sequenceLength: fixed sequence length to normalise CRPs to (use 'var' for variable length)
        :weightMatrix: a matrix of weights (inputFeatureLength rows x outputFeatureLength columns)
                       to transform the input feature files with before calculating the CRPs
    '''

    mazurkasPath = FeatureFileProps.rootPath[featureName]

    mazurkaIds = getFolderNames(mazurkasPath, True)[:20]
    if existingNCDs is not None:
        existingNCDs = set(existingNCDs)  # makes checking faster

    # Get performances from folders
    featureFileDict = getFeatureFileDictAllFolders(mazurkasPath, mazurkaIds,
                                                   featureName,
                                                   numFilesPerFolder)

    # Create list of required NCD files
    requiredNCDs = []
    featureFileIds = featureFileDict.keys()
    numFeatureFiles = len(featureFileIds)
    print 'Checking for existing NCD files...'
    for f1 in np.arange(numFeatureFiles - 1):
        featureFilePath1 = featureFileDict[featureFileIds[f1]].filePath
        pc1Id = featureFileDict[featureFileIds[f1]].pieceId
        pc1pfId = featureFileIds[f1]
        for f2 in np.arange(f1, numFeatureFiles):
            featureFilePath2 = featureFileDict[featureFileIds[f2]].filePath
            pc2Id = featureFileDict[featureFileIds[f2]].pieceId
            pc2pfId = featureFileIds[f2]
            ncdProps = NCDprops(pc1Id, pc1pfId, pc2Id, pc2pfId, method,
                                dimension, timeDelay, neighbourhoodSize,
                                downSampleFactor, sequenceLength, featureName,
                                featureFilePath1, featureFilePath2)
            if not NCDexists(ncdProps.getFileName(),
                             existingNCDs=existingNCDs):
                requiredNCDs.append(ncdProps)
    print 'Number of NCD files missing for combination: %i' % len(requiredNCDs)

    # Create Required CRPs for NCD files
    if len(requiredNCDs) > 0:

        # Create CRP files and save to the CRPs folder
        print 'Calculating # of required CRP files'
        requiredCRPs = []
        sourceCRPs = []
        for requiredNCD in requiredNCDs:
            crp1 = requiredNCD.getCRP1()
            crp2 = requiredNCD.getCRP2()
            sourceCRPs.append(crp1)
            sourceCRPs.append(crp2)
            if not crp1.hasExistingFile():
                requiredCRPs.append(crp1)
            if not crp2.hasExistingFile():
                requiredCRPs.append(crp2)
        requiredCRPs = CRPprops.uniqueCRPprops(requiredCRPs)
        sourceCRPs = CRPprops.uniqueCRPprops(sourceCRPs)
        numRequiredCRPs = len(requiredCRPs)
        print 'Creating %i required CRP files' % numRequiredCRPs
        if numRequiredCRPs > 0:
            CRPargList = []
            for crp in requiredCRPs:
                crp.weightMatrix = weightMatrix
                crp.biases = biases
                crp.featureOffset = featureOffset
                crp.featureScaling = featureScaling
                CRPargList.append((crp, ))
            processPool.map(multi_createCRPfile, CRPargList)

        # Load CRP files into memory
        print 'Loading %i CRP files' % len(sourceCRPs)
        CRPfiles = loadCRPfiles(sourceCRPs)

        # Create NCD files
        numNCDs = len(requiredNCDs)
        print 'Creating %i NCD files' % numNCDs
        NCDindex = 0
        while NCDindex < numNCDs:
            NCDargList = []
            for iNCD in np.arange(NCDindex, min(NCDindex + 100, numNCDs)):
                requiredNCD = requiredNCDs[iNCD]
                NCDfn = requiredNCD.getFileName()
                CRPtuple1 = requiredNCD.getCRP1().toTuple(False)
                CRPtuple2 = requiredNCD.getCRP2().toTuple(False)
                try:
                    NCDargList.append(
                        (NCDfn, CRPfiles[CRPtuple1], CRPfiles[CRPtuple2]))
                except:
                    pass
            if NCDargList:
                processPool.map(multi_createNCDfile, NCDargList)
            NCDindex += 100
            print '\r%i...' % NCDindex,

        # Delete CRP files
        print 'Deleting CRP files'
        for CRPfilename in getFileNames(CRPpath, '.npy', True):
            try:
                os.remove(CRPpath + CRPfilename)
            except:
                pass
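A hedged call sketch for Example #11; all parameter values are illustrative, and the CRP method string in particular is an assumption rather than a value taken from these examples:

from multiprocessing import Pool

pool = Pool(4)
createNCDfiles(existingNCDs=None,       # recompute everything
               processPool=pool,
               featureName='chroma',
               downSampleFactor=2,
               timeDelay=1,
               dimension=1,
               method='fan',            # illustrative CRP method name
               neighbourhoodSize=0.1,
               numFilesPerFolder=5,
               sequenceLength=500)
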
# Load weights and biases
if NNtype is not None:
    weightMatrix, biases, featureOffset, featureScaling = get_NN_NCD_params(
        NNtype, featureName, learningRate, learningRateBoostFactor,
        corruptionLevel, numOriginalFeatures, numNewFeatures, batchSize,
        freqStd=frequencyStandardisation, NNnumFolders=numFolders,
        NNnumFilesPerFolder=numFilesPerFolder, NNtimeStacking=timeStacking)
# Load (and optionally transform) the feature files
p = 0
featuresDataFrames = []
for piecesFolder in piecesFolders:
    performancesPath = FFP.getFeatureFolderPath(piecesPath + piecesFolder + '/', featureName)
    performances = getFileNames(performancesPath,
                                orderAlphabetically=True,
                                endsWith='.csv')
    if numFilesPerFolder is not None:
        performances = performances[:numFilesPerFolder]
    for performance in performances:
        p += 1
        print '\rloading feature file %i...' % p,

        performanceFilePath = performancesPath + performance
        if NNtype is None:
            featuresDataFrames.append(loadFeatureFile(performanceFilePath))
        else:
            featuresDataFrames.append(loadAndTransformFeatureFile(performanceFilePath, 
                                                                  featureOffset, featureScaling,
                                                                  timeStacking, weightMatrix, biases))
    print
Example #13
def cleanNCDfolder():

    # Remove NCD files
    ncdFiles = getFileNames(NCDpath, endsWith='.pkl')
    for ncdFile in ncdFiles:
        os.remove(NCDpath + ncdFile)
Example #14
def cleanCRPfolder():

    # Remove CRP files
    crpFiles = getFileNames(CRPpath, endsWith='.npy')
    for crpFile in crpFiles:
        os.remove(CRPpath + crpFile)
Example #15
def convertNCDfiles(dataFrameFileName):
    '''
    Converts NCD results files in the NCD folder into a pandas dataframe
    If the dataframe already exists with old results then the new results are appended
    '''

    dataFrameFileName = rcut(dataFrameFileName, '.pkl.res') + '.pkl.res'

    # Load new NCD files
    NCDfiles = [
        fn for fn in getFileNames(NCDpath, endsWith='.pkl')
        if reNCDfilename.search(fn)
    ]
    print 'Total number of files: %i' % len(NCDfiles)
    lstNCDs = []
    iFile = 0

    print 'Reading files...'
    for NCDfile in NCDfiles:
        try:
            NCDfileDict = pickle.load(open(NCDpath + NCDfile, 'rb'))
            NCDfileDict['FileName'] = rcut(NCDfile, '.pkl')
            m = reNCDfilename.search(NCDfile)
            NCDfileDict['Piece 1 Id'] = m.group(1)
            NCDfileDict['Piece 1 Performance Id'] = m.group(2)
            NCDfileDict['Piece 2 Id'] = m.group(3)
            NCDfileDict['Piece 2 Performance Id'] = m.group(4)
            NCDfileDict['CRP Method'] = m.group(5)
            NCDfileDict['CRP Dimension'] = float(m.group(6))
            NCDfileDict['CRP Time Delay'] = float(m.group(7))
            NCDfileDict['CRP Neighbourhood Size'] = float(m.group(8))
            NCDfileDict['Downsample Factor'] = m.group(9)
            NCDfileDict['Feature'] = m.group(10)
            NCDfileDict['Sequence Length'] = m.group(11)
            NCDfileDict['File DateTime'] = time.ctime(
                os.path.getmtime(NCDpath + NCDfile))

            lstNCDs.append(copy.deepcopy(NCDfileDict))
        except:
            print 'Error reading file: %s' % NCDfile

        iFile += 1
        if iFile % 10000 == 0:
            print 'Processing file #%i' % iFile

    print 'Creating dataframe from results files'
    dfNewNCDs = pd.DataFrame(lstNCDs)

    # Check for existing NCD dataframe
    if os.path.exists(NCDpath + dataFrameFileName):
        # Read old NCDs dataframe and concatenate new NCDs
        print 'Reading existing results dataframe...'
        dfOldNCDs = pd.read_pickle(NCDpath + dataFrameFileName)
        dfAllNCDs = pd.concat([dfOldNCDs, dfNewNCDs], ignore_index=True)
    else:
        dfAllNCDs = dfNewNCDs

    # Save file
    print 'Saving results dataframe %s...' % dataFrameFileName
    dfAllNCDs.to_pickle(NCDpath + dataFrameFileName)

    # Delete old NCD files
    print 'Deleting old results files...'
    for NCDfile in NCDfiles:
        os.remove(NCDpath + NCDfile)

    return dfAllNCDs
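A usage sketch for Example #15; the dataframe file name is illustrative and gets the '.pkl.res' suffix appended by the function:

dfNCDs = convertNCDfiles('run_2013-01-01')
print '%i NCD results loaded' % len(dfNCDs)
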
Example #16
def getNCDresults(subFolder='',
                  featureNames=None,
                  downSampleFactors=None,
                  methods=None,
                  dimensions=None,
                  timeDelays=None,
                  neighbourhoodSizes=None,
                  numFilesPerFolder=None,
                  sequenceLengths=None):
    '''
    Loads NCD results from results dataframes in the NCD folder
    If you want to only select some results then for each parameter you want to filter,
    include a list of the values you want to keep
    '''
    if subFolder != '':
        subFolder = subFolder.rstrip('/') + '/'
    runHistoryFiles = getFileNames(runHistoryPath + subFolder,
                                   endsWith='.pkl',
                                   orderAlphabetically=True)
    resultsDataFrames = []
    for rhFile in runHistoryFiles:

        # Load history file
        runDict = pickle.load(open(runHistoryPath + subFolder + rhFile, 'rb'))
        useFile = True

        # Check filters
        if featureNames is not None:
            if runDict['featureName'] not in featureNames:
                useFile = False
        if downSampleFactors is not None:
            if runDict['downSampleFactor'] not in downSampleFactors:
                useFile = False
        if methods is not None:
            if runDict['method'] not in methods:
                useFile = False
        if dimensions is not None:
            if runDict['dimension'] not in dimensions:
                useFile = False
        if timeDelays is not None:
            if runDict['timeDelay'] not in timeDelays:
                useFile = False
        if neighbourhoodSizes is not None:
            if runDict['neighbourhoodSize'] not in neighbourhoodSizes:
                useFile = False
        if numFilesPerFolder is not None:
            if runDict['numFilesPerFolder'] not in numFilesPerFolder:
                useFile = False
        if sequenceLengths is not None:
            if runDict['sequenceLength'] not in sequenceLengths:
                useFile = False

        # Load results file
        if useFile:
            # Read dataframe and append results
            print 'Reading %s...' % (rhFile + '.res')
            resultsDataFrames.append(
                pd.read_pickle(NCDpath + subFolder + rhFile + '.res'))

    # Create and return dataframe of all results
    print 'Creating results dataframe'
    dfAll = pd.concat(resultsDataFrames)
    return dfAll
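A filtering sketch for Example #16; the feature name and file count are illustrative values matching the filters the function supports:

dfChromaResults = getNCDresults(featureNames=['chroma'],
                                numFilesPerFolder=[5])
print '%i rows of results' % len(dfChromaResults)
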
                    'numFilesPerFolder': numFilesPerFolder,
                    'sequenceLength': setting['Sequence Length']
                }
                runTime = str(datetime.now()).replace(':', '-')
                pickle.dump(runDict,
                            open(runHistoryPath + runTime + '.pkl', 'wb'))
                # Convert NCD files into a dataframe
                convertNCDfiles(runTime)
                # Create subfolders and move results files into them
                NCDdest = NCDpath + subFolder + '/'
                runHistDest = runHistoryPath + subFolder + '/'
                if not os.path.exists(NCDdest):
                    os.makedirs(NCDdest)
                if not os.path.exists(runHistDest):
                    os.makedirs(runHistDest)
                for fn in getFileNames(NCDpath, '.pkl.res'):
                    shutil.move(NCDpath + fn, NCDdest)
                for fn in getFileNames(runHistoryPath, '.pkl'):
                    shutil.move(runHistoryPath + fn, runHistDest)
                # Get the overall MAP of the run and add to the setting
                MAPresult = getMAPresult(
                    featureName, CRPmethod, setting['Dimension'],
                    setting['Neighbourhood Size'], setting['Time Delay'],
                    setting['DownSample Factor'], numFilesPerFolder,
                    setting['Sequence Length'], subFolder)
                if MAPresult is not None:
                    print 'Mean Average Precision: %0.3f\n' % MAPresult
                else:
                    print 'No MAP result found!'
                setting['Mean Average Precision'] = MAPresult
def csvsToTheanoDataSet2(inputPaths,
                         outputFn,
                         numFilesPerFolder,
                         timeStepsPerFeature,
                         cropFeaturesToSize=None,
                         frequencyStandardisation=False,
                         trainPercentage=70.0,
                         validationPercentage=15.0,
                         testPercentage=15.0):
    '''
    Convert a batch of .csv files created by SonicAnnotator to a .pkl.gz training and
    testing set for input into Theano, with sequential stacking of features to incorporate
    temporal effects
    
    Inputs:
        :inputPaths:  The input folders to get examples from
        :outputFn:  Path to the output file
        :numFilesPerFolder:     The number of files to use from each folder (set to None to use all files)
        :timeStepsPerFeature:   The number of time steps of original features to include in each new feature
        :cropFeaturesToSize:    Set to an integer if only some features should be used - features from the lower end of 
                                 the range will be used i.e. lower frequencies
        :frequencyStandardisation: whether to standardise the range of each frequency band individually
        :trainPercentage:   The percentage of examples to use for training
        :validationPercentage:  The percentage of examples to use for validation
        :testPercentage:    The percentage of examples to use for testing
        TODO: implement a standardisation function argument
    '''

    allFeatures = None
    pieceIndex = 0

    # For each folder (piece)
    for inputPath in inputPaths:

        print 'Converting features in folder %s' % inputPath
        # Get list of numFilesPerFolder feature files
        inputFiles = getFileNames(inputPath,
                                  endsWith='.csv',
                                  orderAlphabetically=True)
        if numFilesPerFolder is not None:
            inputFiles = inputFiles[:numFilesPerFolder]

        # For each file (performance)
        for inputFn in inputFiles:
            print '\t%s' % inputFn
            # Read file
            fileFeatures = np.genfromtxt(inputPath + inputFn, delimiter=',')
            # Drop first column (time)
            fileFeatures = fileFeatures[:, 1:]
            # Drop upper columns if specified
            if cropFeaturesToSize is not None:
                fileFeatures = fileFeatures[:, :cropFeaturesToSize]
            # Drop rows where all columns are zero
            fileFeatures = fileFeatures[~np.all(fileFeatures == 0, axis=1)]
            # Stack features according to the time argument
            numExamples = fileFeatures.shape[0]
            numFeatures = fileFeatures.shape[1]
            ffNew = np.zeros([
                numExamples - timeStepsPerFeature + 1,
                timeStepsPerFeature * numFeatures
            ])
            for ts in range(timeStepsPerFeature):
                ffNew[:, ts * numFeatures:(ts + 1) *
                      numFeatures] = fileFeatures[ts:numExamples + 1 + ts -
                                                  timeStepsPerFeature, :]
            fileFeatures = ffNew
            # Add a label column to the end
            numFeatures = fileFeatures.shape[1]
            labelledFileFeatures = np.ones(
                [fileFeatures.shape[0], numFeatures + 1]) * pieceIndex
            labelledFileFeatures[:, :-1] = fileFeatures
            # Add to allFeatures array
            if allFeatures is None:
                allFeatures = copy.deepcopy(labelledFileFeatures)
            else:
                allFeatures = np.vstack(
                    (allFeatures, copy.deepcopy(labelledFileFeatures)))

        pieceIndex += 1

    # Standardise feature range from 0 to 1
    print 'Standardising range...'
    standardisationFn = rcut(outputFn,
                             '.pkl.gz') + '_standardisationValues.pkl.gz'
    if frequencyStandardisation:
        minFeatureValue = np.min(allFeatures[:, 0:numFeatures], axis=0)
        maxFeatureValue = np.max(allFeatures[:, 0:numFeatures], axis=0)
    else:
        minFeatureValue = np.min(np.min(allFeatures[:, 0:numFeatures]))
        maxFeatureValue = np.max(np.max(allFeatures[:, 0:numFeatures]))
    standardisationDict = {
        'Min Value': minFeatureValue,
        'Max Value': maxFeatureValue
    }
    pickle.dump(standardisationDict, open(standardisationFn, 'wb'))
    allFeatures[:, 0:numFeatures] = (allFeatures[:, 0:numFeatures] -
                                     minFeatureValue) / (maxFeatureValue -
                                                         minFeatureValue)
    print 'minFeatureValue = %s\nmaxFeatureValue = %s' % (minFeatureValue,
                                                          maxFeatureValue)

    # Shuffle Features 10 times
    print 'Shuffling...'
    for _ in np.arange(10):
        np.random.shuffle(allFeatures)

    # Extract training, validation and test sets
    numExamples = allFeatures.shape[0]
    numFeatures = allFeatures.shape[1] - 1

    trainingExamples = allFeatures[0:int(trainPercentage * numExamples / 100)]
    validationExamples = allFeatures[int(trainPercentage * numExamples /
                                         100):int((trainPercentage +
                                                   validationPercentage) *
                                                  numExamples / 100)]
    testExamples = allFeatures[int((trainPercentage + validationPercentage) *
                                   numExamples / 100):]

    train_set = (trainingExamples[:, :numFeatures],
                 trainingExamples[:, numFeatures])
    valid_set = (validationExamples[:, :numFeatures],
                 validationExamples[:, numFeatures])
    test_set = (testExamples[:, :numFeatures], testExamples[:, numFeatures])

    # Write file for Theano
    print 'Writing Theano file...'
    outputFn = rcut(outputFn, '.pkl.gz') + '.pkl.gz'
    f = gzip.open(outputFn, 'wb')
    cPickle.dump((train_set, valid_set, test_set), f)
    f.close()
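A call-and-load sketch; the input folder and output name are illustrative, and the load step simply mirrors the gzip/cPickle format the function writes:

import gzip
import cPickle

csvsToTheanoDataSet2(['/data/mazurkas/mazurka06-1/chroma/'],  # illustrative input folder
                     'chroma_stacked',                        # '.pkl.gz' is appended
                     numFilesPerFolder=5,
                     timeStepsPerFeature=4)

f = gzip.open('chroma_stacked.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()
print train_set[0].shape, train_set[1].shape
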
def csvsToTheanoDataSet(inputPaths,
                        outputFn,
                        numFilesPerFolder,
                        trainPercentage=70.0,
                        validationPercentage=15.0,
                        testPercentage=15.0):
    '''
    Convert a batch of .csv files created by SonicAnnotator to a .pkl.gz training and
    testing set for input into Theano
    
    Inputs:
        :inputPaths:  The input folders to get examples from
        :outputFn:  Path to the output file
        :numFilesPerFolder:     The number of files to use from each folder (set to None to use all files)
        :trainPercentage:   The percentage of examples to use for training
        :validationPercentage:  The percentage of examples to use for validation
        :testPercentage:    The percentage of examples to use for testing
    '''

    allFeatures = None
    pieceIndex = 0

    # For each folder (piece)
    for inputPath in inputPaths:

        print 'Converting features in folder %s' % inputPath

        # Get list of numFilesPerFolder feature files
        inputFiles = getFileNames(inputPath,
                                  endsWith='.csv',
                                  orderAlphabetically=True)
        if numFilesPerFolder is not None:
            inputFiles = inputFiles[:numFilesPerFolder]

        # For each file (performance)
        for inputFn in inputFiles:
            print '\t%s' % inputFn
            # Read file
            fileFeatures = np.genfromtxt(inputPath + inputFn, delimiter=',')
            # Drop first column (time)
            fileFeatures = fileFeatures[:, 1:]
            # Drop rows where all columns are zero
            fileFeatures = fileFeatures[~np.all(fileFeatures == 0, axis=1)]
            # Add a label column to the end
            numFeatures = fileFeatures.shape[1]
            labelledFileFeatures = np.ones(
                [fileFeatures.shape[0], numFeatures + 1]) * pieceIndex
            labelledFileFeatures[:, :-1] = fileFeatures
            # Add to allFeatures array
            if allFeatures is None:
                allFeatures = copy.deepcopy(labelledFileFeatures)
            else:
                allFeatures = np.vstack(
                    (allFeatures, copy.deepcopy(labelledFileFeatures)))

        pieceIndex += 1

    # Standardise feature range from 0 to 1
    print 'Standardising range...'
    minFeatureValue = np.min(allFeatures[:, 0:numFeatures])
    maxFeatureValue = np.max(allFeatures[:, 0:numFeatures])
    print 'minFeatureValue = %f, maxFeatureValue = %f' % (minFeatureValue,
                                                          maxFeatureValue)
    allFeatures[:, 0:numFeatures] = (allFeatures[:, 0:numFeatures] -
                                     minFeatureValue) / (maxFeatureValue -
                                                         minFeatureValue)

    # Shuffle Features 10 times
    print 'Shuffling...'
    for _ in np.arange(10):
        np.random.shuffle(allFeatures)

    # Extract training, validation and test sets
    numExamples = allFeatures.shape[0]
    numFeatures = allFeatures.shape[1] - 1

    trainingExamples = allFeatures[0:int(trainPercentage * numExamples / 100)]
    validationExamples = allFeatures[int(trainPercentage * numExamples /
                                         100):int((trainPercentage +
                                                   validationPercentage) *
                                                  numExamples / 100)]
    testExamples = allFeatures[int((trainPercentage + validationPercentage) *
                                   numExamples / 100):]

    train_set = (trainingExamples[:, :numFeatures],
                 trainingExamples[:, numFeatures])
    valid_set = (validationExamples[:, :numFeatures],
                 validationExamples[:, numFeatures])
    test_set = (testExamples[:, :numFeatures], testExamples[:, numFeatures])

    # Write file for Theano
    print 'Writing Theano file...'
    outputFn = rcut(outputFn, '.pkl.gz') + '.pkl.gz'
    f = gzip.open(outputFn, 'wb')
    cPickle.dump((train_set, valid_set, test_set), f)
    f.close()
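The unstacked variant is called the same way, minus the stacking and cropping arguments (paths illustrative):

csvsToTheanoDataSet(['/data/mazurkas/mazurka06-1/chroma/'], 'chroma_plain',
                    numFilesPerFolder=5)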