Ejemplo n.º 1
0
    def run(self):
        """Split a hard-label file into per-fold train/test files.

        Reads ``self.pass_in``, a text file holding one ``uid<TAB>label``
        record per line, shuffles the records with ``dpDataset.shuffle`` and
        then writes every (train, test) pair produced by
        ``dpDataset.kFolds(records, FOLDS)`` into the directory
        ``self.pass_out`` as ``train<i>`` and ``test<i>``, where ``<i>`` is
        the zero-based index of the fold.

        Raises:
            ValueError: if a non-empty line does not consist of exactly two
                tab-separated fields.
        """
        label_file = self.pass_in
        out_dir = self.pass_out

        def _write_pairs(path, pairs):
            # Emit (uid, label) pairs in the same tab-separated layout as the
            # input; ``with`` closes the file even when a write fails.
            with open(path, 'w') as fout:
                for uid, label in pairs:
                    fout.write(uid + '\t' + label + '\n')

        # Read the hard label file as [(uid, label), (uid, label), ...].
        hard_labels = []
        with open(label_file) as fin:
            for line in fin:
                line = line.rstrip('\n')
                if not line:
                    # An empty line (typically a trailing one) holds no
                    # record; unpacking it below would raise ValueError.
                    continue
                uid, label = line.split('\t')
                hard_labels.append((uid, label))

        hard_labels = dpDataset.shuffle(hard_labels)

        # Write each fold's train/test split into the output directory.
        for round_count, (train, test) in enumerate(
                dpDataset.kFolds(hard_labels, FOLDS)):
            _write_pairs(
                os.path.join(out_dir, "train" + str(round_count)), train)
            _write_pairs(
                os.path.join(out_dir, "test" + str(round_count)), test)
def getSeeds(fname='../feature_set2/ver2.8-efollowing.libsvm', n_count=10):
    """Return the distinct seeds that ``Effective`` reports for a dataset.

    The dataset is loaded with ``dp_dataset.load`` and shuffled with
    ``dp_dataset.shuffle``.  ``Effective(dataset).getFeatureList`` is then
    asked for a (possibly nested) collection of seeds, which is printed to
    stdout as-is, flattened and de-duplicated.

    Args:
        fname: Path handed to ``dp_dataset.load``.  Defaults to the file
            this function originally hard-coded.
        n_count: Value forwarded as ``n_count`` to ``getFeatureList``.
            Defaults to the originally hard-coded 10.

    Returns:
        A list of the unique leaf values found in the nested result.  The
        values pass through a ``set``, so they must be hashable and their
        order in the returned list is unspecified.
    """
    dataset = dp_dataset.load(fname)
    dataset = dp_dataset.shuffle(dataset)

    e = Effective(dataset)
    nested_seeds = e.getFeatureList(n_count=n_count)

    def _flatten(l):
        # Recursively yield the leaves of an arbitrarily nested iterable.
        # Strings are treated as leaves: they are iterable themselves, and
        # descending into them would split them into characters.
        for el in l:
            if (isinstance(el, collections.Iterable) and
                    not isinstance(el, basestring)):
                for sub in _flatten(el):
                    yield sub
            else:
                yield el

    # Parenthesised single-argument form: prints exactly what the Python 2
    # statement ``print nested_seeds`` printed.
    print(nested_seeds)

    # set() consumes the generator directly; no intermediate list is needed.
    return list(set(_flatten(nested_seeds)))