# Example #1
0
def _unique_tokens_in_file(file_path):
    """Return the set of lower-cased tokens that occur in one file.

    Each line is split with ``wordpunct_tokenize``; tokens for which
    ``token in punctuation`` is true are dropped. A token is recorded at
    most once no matter how often it occurs in the file.
    """
    tokens = set()
    with open(file_path, 'r') as fin:
        # Stream the file line by line rather than materialising it with
        # readlines(); the resulting token set is the same.
        for line in fin:
            for token in wordpunct_tokenize(line):
                # NOTE(review): if `punctuation` is string.punctuation this
                # is a substring test, so punctuation runs such as "..."
                # are not filtered out -- confirm whether that is intended.
                if token not in punctuation:
                    tokens.add(token.lower())
    return tokens


def featureSelection(path):
    """Build a DataSet of per-class token document frequencies.

    Walks ``path`` with ``os.walk``. Every directory that has no
    sub-directories is treated as one class, named after the directory's
    basename. For each such class a frequency map is built that records,
    per lower-cased token, the number of files in the directory that
    contain it. The class is then added to the data set as
    ``SNLPClass(class_name, freq_map, class_count)``, where ``class_count``
    is the number of files in the directory.

    A leaf directory whose basename equals the class name processed
    immediately before it is skipped.

    Args:
        path: Root directory to walk.

    Returns:
        The populated ``DataSet`` instance.
    """
    dataset = DataSet()
    class_name = ''
    for root, dirs, files in os.walk(path):
        # Only leaf directories (no sub-directories) hold class documents.
        if dirs:
            continue

        leaf_name = os.path.basename(root)
        if leaf_name == class_name:
            continue
        class_name = leaf_name

        # Parenthesised form prints the same text on Python 2 and Python 3.
        print(class_name)

        class_count = len(files)
        freq_map = {}
        for f in files:
            # Count each token once per file (document frequency).
            for token in _unique_tokens_in_file(os.path.join(root, f)):
                # `dict.has_key` no longer exists on Python 3; `get` works
                # on both major versions.
                freq_map[token] = freq_map.get(token, 0) + 1

        dataset.add_new_class(SNLPClass(class_name, freq_map, class_count))

    return dataset