def featureSelection(path):
    """Build per-class document-frequency tables from a directory tree.

    ``path`` is walked with ``os.walk``.  Every directory that contains no
    sub-directories is treated as one class, named after the directory's
    basename, and every file inside it is treated as one document.  For each
    class the function counts, per lower-cased token, how many of the
    class's files contain that token at least once.  Tokens for which
    ``token in punctuation`` is true are skipped.

    Args:
        path: Root directory to walk.

    Returns:
        The ``DataSet`` that received one
        ``SNLPClass(class_name, freq_map, class_count)`` per processed
        directory, where ``freq_map`` maps token -> number of files
        containing it and ``class_count`` is the number of files in the
        directory.
    """
    dataset = DataSet()
    class_name = ''
    for root, dirs, files in os.walk(path):
        # Only directories without sub-directories are processed.
        if dirs:
            continue

        leaf_name = os.path.basename(root)
        # NOTE(review): the name-change check is used here only to announce
        # a new class name; every leaf directory is processed.  Confirm this
        # matches the intended handling of consecutive leaf directories that
        # share a basename.
        if leaf_name != class_name:
            class_name = leaf_name
            print(class_name)

        freq_map = {}
        for file_name in files:
            # A set, so a token counts once per file however often it occurs.
            seen_tokens = set()
            with open(os.path.join(root, file_name), 'r') as fin:
                for line in fin:
                    for token in wordpunct_tokenize(line):
                        if token not in punctuation:
                            seen_tokens.add(token.lower())
            for token in seen_tokens:
                freq_map[token] = freq_map.get(token, 0) + 1

        dataset.add_new_class(SNLPClass(class_name, freq_map, len(files)))
    return dataset