def run(self): dataset_dir = self.pass_in output_dir = self.pass_out for root, dirs, files in os.walk(dataset_dir): for fname in files: matchObj = re.match(r"(train\d+).*", fname) if matchObj is None: continue dataset = dpDataset.load(os.path.join(dataset_dir, fname)) # The returned data structure will be in the following # format: # [effective feature set for a class] * 4 # and the each feature set is again # [user_id] * 10 e = Effective(dataset).getFeatureList(n_count=10) out_prefix = matchObj.group(1) out_fname = out_prefix + "_effectives.yaml" yout = open(os.path.join(output_dir, out_fname), "w") yout.write(yaml.dump(e)) yout.close() print "Finished %s" % out_fname
def trainFrom(self, fname):
    '''
    Build and return a trained maxent model from the feature dataset
    stored in the given file.

    @param fname the file which includes the feature dataset to train on
    @return the trained maxent model
    '''
    # Load the dataset and delegate training to the maxent trainer.
    loaded = dpDataset.load(fname)
    return meSimple.METrainer().trainedModelOn(loaded)
def getSeeds(): fname = '../feature_set2/ver2.8-efollowing.libsvm' dataset = dp_dataset.load(fname) dataset = dp_dataset.shuffle(dataset) e = Effective(dataset) nested_seeds = e.getFeatureList(n_count = 10) def _flatten(l): for el in l: if (isinstance(el, collections.Iterable) and not isinstance(el, basestring)): for sub in _flatten(el): yield sub else: yield el print nested_seeds return list(set([x for x in _flatten(nested_seeds)]))
def getFeatureList(self, n_count = 10): ''' Return a list of n_count effective features for each class. Therefore, the total number of returned features will be n_count * n_class. ''' dataset = self.dataset m = self.trainedModelOn(dataset) features = self._extractFeatures(dataset) e_features = self._mostEffective(m, features, n_count) # you might want to call reduce(lambda x, y: x + y, effectives) to # flatten the returned value return e_features if __name__ == '__main__': fname = '../feature_set2/ver2.8-etext.libsvm' dataset = dp_dataset.load(fname) e = Effective(dataset) print e.getFeatureList(n_count = 10)