def generate_arffs(self, output_dir='arffs_output'):
    if not self.feature_vectors:
        return
    types = set()
    output_dir = "{}/{}".format(output_dir, self.dir)
    # Keep a feature only if it occurs in at least 1% of instances (and at least twice).
    minimum_inst = max(2, int(0.01 * len(self.feature_vectors)))
    arff_writer.write("{}/all.arff".format(output_dir), self.feature_vectors,
                      classification_feature=self.classification_feature,
                      write_many=False,
                      minimum_instance_counts_for_features=minimum_inst)
    # Collapse per-type unigram features (keys of the form '<TYPE>_uni_<word>')
    # into two coarser groups: commitment vs. non-commitment.
    regex = re.compile(r'(.*)_uni_(.*)$')
    commitment = ['NONE', 'CONSEQUENT']
    non_commitment = ['ANTECEDENT', 'QUOTE', 'QUESTION']
    collapsed_dicts = []
    for vector in self.feature_vectors:
        _modified = dict()
        for key, value in vector.iteritems():
            result = regex.match(key)
            if result:
                types.add(result.group(1))
                if result.group(1) in commitment:
                    _modified['commitment: {}'.format(result.group(2))] = value
                elif result.group(1) in non_commitment:
                    _modified['non_commitment: {}'.format(result.group(2))] = value
            # Keep the original feature alongside any collapsed version.
            _modified[key] = value
        collapsed_dicts.append(_modified)
    arff_writer.write("{}/all_collapsed.arff".format(output_dir), collapsed_dicts,
                      classification_feature=self.classification_feature,
                      write_many=False,
                      minimum_instance_counts_for_features=minimum_inst)
def write(self, vectors, topic, featureset):
    # Keep a feature only if it occurs in at least 1% of the passed vectors (and at least twice).
    minimum_inst = max(2, int(0.01 * len(vectors)))
    output_dir = 'arffs'
    arff_writer.write("{directory}/{topic}/{featureset}.arff".format(directory=output_dir,
                                                                     topic=topic,
                                                                     featureset=featureset),
                      self.feature_vectors_by_topic[topic],
                      classification_feature=self.label,
                      write_many=False,
                      minimum_instance_counts_for_features=minimum_inst)
def main(self, features=None):
    # Fix the random seed (via utils.random_guard) so feature derivation and
    # balancing are reproducible across runs.
    with utils.random_guard(95064):
        feature_vectors = self.get_feature_vectors(features)
        print 'Derived features for', len(feature_vectors), 'instances'
        # Balance instances across values of the classification feature.
        feature_vectors = utils.balance(feature_vectors, self.classification_feature)
        print 'writing arff file with', len(feature_vectors), 'instances'
        # filename = self.results_dir + self.classification_feature + '/baseline/arffs/'
        filename = 'baseline/arffs'
        detailed_features, instance_keys = arff_writer.write(
            filename, feature_vectors,
            classification_feature=self.classification_feature,
            write_many=True,
            minimum_instance_counts_for_features=2)
        self.write_arff_instance_data(filename, instance_keys, detailed_features)