def procDir(args, paramLst):
    """Run ``procFile`` on every entry of each directory named in ``args``.

    ``args[2]`` is forwarded to ``procFile`` as the vector file and
    ``args[3:]`` are the directories to scan.  The label handed to
    ``procFile`` is the last path component of the directory, ignoring any
    trailing slashes.  ``paramLst`` is forwarded to ``procFile`` unchanged.
    Returns None.
    """
    vector_f = args[2]
    for dir_path in args[3:]:
        if not dir_path:
            # An empty argument names no directory; skip it rather than
            # failing on an index into an empty string.
            continue
        docs = listDir(dir_path)
        # Strip every trailing '/', so "data/spam//" still yields the label
        # "spam" and file paths never contain a doubled slash.
        root = dir_path.rstrip('/')
        label = root.rpartition('/')[2]
        for doc in docs:
            procFile(paramLst, root + '/' + doc, label, vector_f)
return [fname.rstrip() for fname in p.stdout.readlines()] ##read in the dir names, process files and output to train_vector_file and test_vector_file def procDir(args,paramLst): vector_f = args[2] dirlst = args[3:] for dir in dirlst: docs = listDir(dir) if dir[-1] != '/': idx_base = dir.rfind('/') label = dir[idx_base+1:] else: idx_base = dir[:-1].rfind('/') label = dir[idx_base+1:-1] dir = dir[:-1] num = 0 for doc in docs: num += 1 fname = dir+'/'+doc procFile(paramLst,fname,label,vector_f) # print 'docs',num if __name__ == "__main__": option,args = readCommand(argv) paramLst = getParams(args) if option.procFile: ##if '-f', process a single file procFile(paramLst,*args[2:]) #exclude paramfname from args else: ##without '-f', process all files under given dirs procDir(args,paramLst)