def run(self):
        '''
        Dump the effective-feature list of every training dataset to YAML.

        Walks self.pass_in recursively. Every file whose name starts with
        "train<digits>" is loaded with dpDataset.load, passed through
        Effective(...).getFeatureList(n_count=10), and the result is written
        to "train<digits>_effectives.yaml" inside self.pass_out.

        The output name depends only on the "train<digits>" prefix, so input
        files sharing a prefix overwrite each other's output; the last one
        processed wins.
        '''
        dataset_dir = self.pass_in
        output_dir = self.pass_out

        # Loop-invariant, so compile it once. match() anchors at the start of
        # the file name, exactly like the re.match call it replaces.
        train_pattern = re.compile(r"(train\d+).*")

        # The directory list yielded by os.walk is not needed here.
        for root, _dirs, files in os.walk(dataset_dir):
            for fname in files:
                matchObj = train_pattern.match(fname)

                if matchObj is None:
                    continue

                # Join with `root`, not `dataset_dir`: os.walk descends into
                # subdirectories, and `fname` is relative to the directory
                # currently being visited.
                dataset = dpDataset.load(os.path.join(root, fname))

                # Per the original author's note, the returned structure is
                #   [effective feature set for a class] * 4
                # and each feature set is in turn
                #   [user_id] * 10
                e = Effective(dataset).getFeatureList(n_count=10)

                out_prefix = matchObj.group(1)
                out_fname = out_prefix + "_effectives.yaml"

                # `with` closes the file even if dumping or writing raises.
                with open(os.path.join(output_dir, out_fname), "w") as yout:
                    yout.write(yaml.dump(e))

                # Single parenthesised argument: prints the same text under
                # both the Python 2 print statement and the print function.
                print("Finished %s" % out_fname)
    def trainFrom(self, fname):
        '''
        Train a maxent model on the feature dataset stored in a file.

        @param fname    path of the file holding the feature dataset; it is
                        read with dpDataset.load
        @return         whatever METrainer.trainedModelOn produces for that
                        dataset (the trained model)
        '''
        # Load first, then build the trainer, matching the original order.
        features = dpDataset.load(fname)
        return meSimple.METrainer().trainedModelOn(features)
def getSeeds():
    '''
    Return the distinct features picked by Effective for the "efollowing"
    feature set.

    Loads and shuffles the dataset, calls getFeatureList(n_count=10), prints
    the nested result, then flattens it and removes duplicates. The order of
    the returned list is whatever the intermediate set yields.
    '''
    def _flatten(items):
        # Depth-first walk that yields leaves only. Strings are treated as
        # leaves so they are never split into characters.
        for item in items:
            is_leaf = (isinstance(item, basestring) or
                       not isinstance(item, collections.Iterable))
            if is_leaf:
                yield item
                continue
            for leaf in _flatten(item):
                yield leaf

    source_path = '../feature_set2/ver2.8-efollowing.libsvm'
    loaded = dp_dataset.load(source_path)
    shuffled = dp_dataset.shuffle(loaded)

    nested_seeds = Effective(shuffled).getFeatureList(n_count=10)

    print(nested_seeds)

    flat_seeds = list(_flatten(nested_seeds))
    return list(set(flat_seeds))

    def getFeatureList(self, n_count=10):
        '''
        Pick the n_count most effective features for each class.

        A model is trained on self.dataset, the dataset's features are
        extracted, and both are handed to self._mostEffective together with
        n_count. The result is returned unflattened, so overall it holds
        n_count * n_class features.

        @param n_count  number of features to keep per class
        @return         the value produced by self._mostEffective
        '''
        data = self.dataset
        model = self.trainedModelOn(data)

        # The result is nested per class; callers that want one flat list can
        # apply reduce(lambda x, y: x + y, result) to it.
        return self._mostEffective(model, self._extractFeatures(data), n_count)



if __name__ == '__main__':
    # Script entry point: load the "etext" feature set and print the
    # feature list that Effective selects for it with n_count=10.
    fname = '../feature_set2/ver2.8-etext.libsvm'
    dataset = dp_dataset.load(fname)

    e = Effective(dataset)
    print(e.getFeatureList(n_count=10))