def generate_arffs(self, output_dir='arffs_output'):
    """Write the feature vectors to two ARFF files in ``<output_dir>/<self.dir>``.

    Two calls are made to ``arff_writer.write``:

    * ``all.arff`` receives ``self.feature_vectors`` unchanged.
    * ``all_collapsed.arff`` receives a copy of every vector in which each
      feature named ``<TYPE>_uni_<rest>`` is *additionally* stored under a
      collapsed name: ``commitment: <rest>`` when ``<TYPE>`` is ``NONE`` or
      ``CONSEQUENT``, and ``non_commitment: <rest>`` when ``<TYPE>`` is
      ``ANTECEDENT``, ``QUOTE`` or ``QUESTION``.  Every original feature is
      kept as well.

    Nothing is written when ``self.feature_vectors`` is empty.

    Args:
        output_dir: Root directory; the files are placed in the
            ``self.dir`` sub-directory beneath it.

    Returns:
        None.
    """
    if not self.feature_vectors:
        return

    target_dir = "{}/{}".format(output_dir, self.dir)
    # 1% of the instance count, but never below 2.  Forwarded to the writer
    # as ``minimum_instance_counts_for_features`` (presumably a pruning
    # threshold for rare features -- confirm in arff_writer).
    minimum_inst = max(2, int(0.01 * len(self.feature_vectors)))

    arff_writer.write("{}/all.arff".format(target_dir),
                      self.feature_vectors,
                      classification_feature=self.classification_feature,
                      write_many=False,
                      minimum_instance_counts_for_features=minimum_inst)

    # Group 1 is greedy, so for a key containing several "_uni_" markers the
    # split happens at the LAST one.
    uni_pattern = re.compile(r'(.*)_uni_(.*)$')
    # Feature type -> name of the collapsed class it belongs to.
    collapsed_class = {
        'NONE': 'commitment',
        'CONSEQUENT': 'commitment',
        'ANTECEDENT': 'non_commitment',
        'QUOTE': 'non_commitment',
        'QUESTION': 'non_commitment',
    }

    collapsed_dicts = []
    for vector in self.feature_vectors:
        collapsed = {}
        # ``items()`` works on both Python 2 and Python 3 mappings.
        for key, value in vector.items():
            found = uni_pattern.match(key)
            if found:
                group_name = collapsed_class.get(found.group(1))
                if group_name is not None:
                    # NOTE(review): when several types of the same class
                    # share <rest> (e.g. NONE_uni_x and CONSEQUENT_uni_x),
                    # the last one iterated wins; the values are not
                    # merged.  Confirm this is the intended behaviour.
                    collapsed['{}: {}'.format(group_name, found.group(2))] = value
            # The original feature is always carried over.
            collapsed[key] = value
        collapsed_dicts.append(collapsed)

    arff_writer.write("{}/all_collapsed.arff".format(target_dir),
                      collapsed_dicts,
                      classification_feature=self.classification_feature,
                      write_many=False,
                      minimum_instance_counts_for_features=minimum_inst)
 def write(self, vectors, topic, featureset):
     """Hand the vectors stored for ``topic`` to ``arff_writer.write``.

     The target path is ``arffs/<topic>/<featureset>.arff`` and the class
     attribute passed to the writer is ``self.label``.

     Args:
         vectors: Only its length is used, to derive the value passed as
             ``minimum_instance_counts_for_features``.
         topic: Key into ``self.feature_vectors_by_topic`` and path component.
         featureset: Base name of the output file.
     """
     # NOTE(review): the data written comes from
     # self.feature_vectors_by_topic[topic], not from ``vectors`` -- confirm
     # that callers always pass the same collection.

     # 1% of the supplied vectors, with a floor of 2.
     threshold = int(0.01 * len(vectors))
     if threshold < 2:
         threshold = 2

     destination = "{directory}/{topic}/{featureset}.arff".format(
         directory='arffs', topic=topic, featureset=featureset)
     topic_vectors = self.feature_vectors_by_topic[topic]

     arff_writer.write(destination,
                       topic_vectors,
                       classification_feature=self.label,
                       write_many=False,
                       minimum_instance_counts_for_features=threshold)
Ejemplo n.º 3
0
    def main(self, features=None):
        """Derive, balance and export feature vectors to ``baseline/arffs``.

        Steps, all executed inside ``utils.random_guard(95064)`` (presumably
        pins the random state for reproducibility -- confirm in utils):

        1. Obtain vectors from ``self.get_feature_vectors(features)``.
        2. Pass them through ``utils.balance`` together with
           ``self.classification_feature``.
        3. Write them with ``arff_writer.write`` (``write_many=True``) and
           forward the two values it returns to
           ``self.write_arff_instance_data``.

        Args:
            features: Forwarded unchanged to ``self.get_feature_vectors``.
        """
        with utils.random_guard(95064):
            vectors = self.get_feature_vectors(features)
            print('Derived features for {} instances'.format(len(vectors)))

            vectors = utils.balance(vectors, self.classification_feature)
            print('writing arff file with {} instances'.format(len(vectors)))

            # An earlier variant built the destination from
            # self.results_dir + self.classification_feature + '/baseline/arffs/'.
            destination = 'baseline/arffs'
            written = arff_writer.write(
                destination,
                vectors,
                classification_feature=self.classification_feature,
                write_many=True,
                minimum_instance_counts_for_features=2)
            detailed_features, instance_keys = written
            self.write_arff_instance_data(destination, instance_keys, detailed_features)