def getUserLabels(self):
    """
    Build a CorpusLabel for every path returned by
    absoluteFilePaths(self.labelDir).

    The labels are stored in self.userLabels and, when at least one exists,
    their count is stored in self.nlabelers.

    @raise ValueError when no label paths were produced, i.e. self.userLabels
                      ends up empty. self.userLabels is still assigned (to an
                      empty list) in that case; self.nlabelers is not.
    """
    collected = []
    for labelPath in absoluteFilePaths(self.labelDir):
        collected.append(CorpusLabel(labelPath, self.corpus))
    self.userLabels = collected

    # Guard clause: nothing to count if no label could be built.
    if not self.userLabels:
        raise ValueError("No users labels found")

    self.nlabelers = len(self.userLabels)
def getDataFiles(self):
    """
    Collect all CSV data files from self.srcRoot directory.

    Only paths containing the substring ".csv" are turned into DataFile
    objects.

    @return (dict) Keys are relative paths (from self.srcRoot), written with
                   "/" separators, and values are the corresponding data
                   files.

    @raise ValueError (from str.index) if self.srcRoot does not occur inside a
                      data file's srcPath.
    """
    filePaths = absoluteFilePaths(self.srcRoot)
    dataSets = [DataFile(path) for path in filePaths if ".csv" in path]

    def getRelativePath(srcRoot, srcPath):
        # Keep only the part of srcPath that follows srcRoot, trim the
        # surrounding platform separators and normalise them to "/" so the
        # keys look the same on every platform.
        start = srcPath.index(srcRoot) + len(srcRoot)
        return srcPath[start:].strip(os.path.sep).replace(os.path.sep, "/")

    # Return the mapping promised by the docstring; without this statement the
    # helper above is never used and the method returns None.
    return {getRelativePath(self.srcRoot, d.srcPath): d for d in dataSets}
def getRawLabels(self):
    """
    Build a CorpusLabel for every path returned by
    absoluteFilePaths(self.labelDir) and sort the results into two lists.

    Paths containing the substring "known" go to self.knownLabels; every other
    path goes to self.userLabels. self.nLabelers is set to the number of user
    labels.

    @raise ValueError when no user labels were collected (self.nLabelers is 0).
    """
    rawPaths = absoluteFilePaths(self.labelDir)
    self.userLabels, self.knownLabels = [], []

    for rawPath in rawPaths:
        # Pick the destination list first, then build the label into it.
        bucket = self.knownLabels if "known" in rawPath else self.userLabels
        bucket.append(CorpusLabel(rawPath, self.corpus))

    self.nLabelers = len(self.userLabels)
    if self.nLabelers == 0:
        raise ValueError("No users labels found")
def getDataFiles(self):
    """
    Collect all CSV data files from self.srcRoot directory.

    Only paths containing the substring ".csv" are turned into DataFile
    objects.

    @return (dict) Keys are relative paths (from self.srcRoot), written with
                   "/" separators, and values are the corresponding data
                   files.

    @raise ValueError (from str.index) if the absolute form of self.srcRoot
                      does not occur inside a data file's srcPath.
    """
    # Imported locally so this method does not depend on a module-level import.
    import os

    filePaths = absoluteFilePaths(self.srcRoot)
    dataSets = [DataFile(path) for path in filePaths if ".csv" in path]

    def getRelativePath(srcRoot, srcPath):
        # srcRoot may be given relative to the working directory; make it
        # absolute so it can be located inside srcPath.
        # NOTE(review): assumes srcPath is an absolute path, as the name
        # absoluteFilePaths suggests - confirm.
        srcRoot = os.path.abspath(srcRoot)
        start = srcPath.index(srcRoot) + len(srcRoot)
        # Trim the platform separator (a literal "/" would leave a leading
        # "\\" in place on Windows) and normalise the key to "/".
        return srcPath[start:].strip(os.path.sep).replace(os.path.sep, "/")

    return {getRelativePath(self.srcRoot, d.srcPath): d for d in dataSets}
def getDataFiles(self):
    """
    Collect the data files under self.srcRoot into a dictionary keyed by each
    file's path relative to self.srcRoot.

    Every path returned by absoluteFilePaths(self.srcRoot) is wrapped in a
    DataFile; no filtering by extension is applied.

    @return (dict) Keys are relative paths (from self.srcRoot), written with
                   "/" separators, and values are the corresponding DataFile
                   objects.

    @raise ValueError (from str.index) if the absolute form of self.srcRoot
                      does not occur inside a data file's srcPath.
    """
    # Imported locally so this method does not depend on a module-level import.
    import os

    filePaths = absoluteFilePaths(self.srcRoot)
    dataFiles = [DataFile(path) for path in filePaths]

    def getRelativePath(srcRoot, srcPath):
        # Resolve a relative srcRoot before searching for it in srcPath.
        # NOTE(review): assumes srcPath is an absolute path, as the name
        # absoluteFilePaths suggests - confirm.
        srcRoot = os.path.abspath(srcRoot)
        start = srcPath.index(srcRoot) + len(srcRoot)
        # Use the platform separator rather than a hard-coded "/", then
        # normalise so keys are identical across platforms.
        return srcPath[start:].strip(os.path.sep).replace(os.path.sep, "/")

    return {getRelativePath(self.srcRoot, d.srcPath): d for d in dataFiles}
def getDataFiles(self):
    """
    Collect the data files under self.srcRoot, keyed by their path relative to
    self.srcRoot.

    Every path returned by absoluteFilePaths(self.srcRoot) is wrapped in a
    DataFile; no filtering by extension is applied.

    @return (dict) Dictionary of key/value pairs where the key is a relative
                   path (from self.srcRoot, written with "/" separators) and
                   the value is the corresponding DataFile.

    @raise ValueError (from str.index) if the absolute form of self.srcRoot
                      does not occur inside a data file's srcPath.
    """
    # Imported locally so this method does not depend on a module-level import.
    import os

    filePaths = absoluteFilePaths(self.srcRoot)
    dataSets = [DataFile(path) for path in filePaths]

    def getRelativePath(srcRoot, srcPath):
        # A relative srcRoot would not be found inside srcPath (or could match
        # at the wrong position), so resolve it first.
        # NOTE(review): assumes srcPath is an absolute path, as the name
        # absoluteFilePaths suggests - confirm.
        absRoot = os.path.abspath(srcRoot)
        remainder = srcPath[srcPath.index(absRoot) + len(absRoot):]
        # Trim with the platform separator instead of a literal "/" and
        # normalise the result so keys always use "/".
        return remainder.strip(os.path.sep).replace(os.path.sep, "/")

    return {
        getRelativePath(self.srcRoot, d.srcPath): d
        for d in dataSets
    }
def getDataFiles(self):
    """
    Gather the CSV data files reported by absoluteFilePaths(self.srcRoot).

    Only paths containing the substring ".csv" are wrapped in a DataFile.

    @return (dict) Maps each file's path relative to self.srcRoot (written
                   with "/" separators) to its DataFile object.
    """
    csvFiles = []
    for filePath in absoluteFilePaths(self.srcRoot):
        if ".csv" in filePath:
            csvFiles.append(DataFile(filePath))

    relativeToFile = {}
    for dataFile in csvFiles:
        root, fullPath = self.srcRoot, dataFile.srcPath
        # The root may have been given as a relative path; make it absolute
        # before looking for it inside the file's path.
        absRoot = os.path.abspath(root)
        tail = fullPath[fullPath.index(absRoot) + len(absRoot):]
        # Drop the surrounding separators and normalise them to "/".
        key = tail.strip(os.path.sep).replace(os.path.sep, "/")
        relativeToFile[key] = dataFile
    return relativeToFile
#%% print('\n ==Preprocessing==========================================') #construct a dict per collection of datasets #the idea is that each collection has datasets with similar formats collections = { #"nab" : {'folder':'/nab/','delimiter':',','timestamp':True}, "synth": { 'folder': '/synth/', 'delimiter': ',', 'timestamp': False } } #now use the collections to create preprocessors and automatically preprocess each folder for c in collections: rawPaths = absoluteFilePaths(rawDir + collections[c]['folder']) for f in rawPaths: print('\n **Preprocessing ' + f) pre = Preprocessor(fileName=f, delimiter=collections[c]['delimiter'], timestamp=collections[c]['timestamp'], folderStruct=collections[c]['folder']) pre.autoPP() print('\n ==Finished Preprocessing==================================') #create a dict via nab.corpus that contain filenames as keys and dataframes as values print('\n Loading preprocessed data files to corpus') corpus = Corpus(dataDir) data = corpus.dataFiles print('The following datasets will be analysed') for d in data:
def getUserLabels(self):
    """
    Build a UserLabel for each path returned by
    absoluteFilePaths(self.labelRoot).

    The labels are stored in self.userLabels and their count in
    self.nlabelers. An empty result is accepted: no error is raised when no
    paths are returned.
    """
    loaded = []
    for labelPath in absoluteFilePaths(self.labelRoot):
        loaded.append(UserLabel(labelPath, corp=self.corpus))

    self.userLabels = loaded
    self.nlabelers = len(self.userLabels)