def take_action(self, parsed_args):
    self._learner = PreProcessingWrapper(learner=self._get_learner(parsed_args),
                                         upsample=parsed_args.upsample,
                                         majority_vote=parsed_args.majority_vote)

    if not parsed_args.train_input.exists():
        raise IOError("failed to open training data file at {}".format(parsed_args.train_input))
    if not parsed_args.eval_input.exists():
        raise IOError("failed to open evaluation data file at {}".format(parsed_args.eval_input))

    train_data = load(parsed_args.train_input)
    eval_data = load(parsed_args.eval_input)

    if parsed_args.train_partitions is not None:
        train_data = train_data.partitions(parsed_args.train_partitions)
    if parsed_args.eval_partitions is not None:
        eval_data = eval_data.partitions(parsed_args.eval_partitions)

    self.log.info("training classifier")
    self._learner.fit(train_data)

    predictions = self._learner.predict(eval_data)

    # map numeric predictions back to nominal labels, in key order
    inverse_label_map = dict(map(reversed, train_data.label_map.items()))
    predictions = [(item[0], inverse_label_map[item[1]])
                   for item in sorted(predictions.items(), key=lambda item: item[0])]

    self.log.info("writing predictions to %s", parsed_args.output)

    if not parsed_args.output.parent.exists():
        parsed_args.output.parent.mkdir(parents=True)

    output = pd.DataFrame.from_records(predictions, index=range(len(predictions)))
    output.to_csv(parsed_args.output, sep="\t", index=False, header=False)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("failed to open data set at {}".format( parsed_args.input)) steps = parsed_args.steps if steps is None: steps = [None] if len(steps) != len(parsed_args.output): raise ValueError( "There must be one output file for each global step") input_data = load(parsed_args.input) for step, output_file in zip(steps, parsed_args.output): generated_data = self._wrapper.generate( model_filename=parsed_args.model_dir / "model", global_step=step, data_set=input_data, batch_size=parsed_args.batch_size) if not output_file.parent.exists(): output_file.parent.mkdir(parents=True) generated_data.save(output_file)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("failed to open data set at {}".format( parsed_args.input)) data_set = load(parsed_args.input) features = np.reshape(data_set.features, [data_set.num_instances, -1]) if features.shape[1] > 50: self.log.info("applying PCA") pca = PCA(n_components=200) pca.fit(features) features = pca.transform(features) self.log.info("computing T-SNE embedding") tsne = TSNE(perplexity=parsed_args.perplexity, learning_rate=parsed_args.learning_rate, verbose=self.app_args.verbose_level) embedding = tsne.fit_transform(features) self.log.info("plotting embedding") self.plot_with_labels(data_set, embedding)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("failed to open data set at {}".format(parsed_args.input)) data_set = load(parsed_args.input) constraint_violations = check_integrity(data_set) if constraint_violations is None: print("PASS validation") else: num_digits = int(math.ceil(math.log10(data_set.num_instances))) constraint_headers = [ "Instances with the same filename must have the same nominal labels:", "Instances with the same filename must have the same numeric labels:", "Instances with the same filename must have the same cross validation information:", "Instances with the same filename must have the same partition:", "If a label map is present, all labels must conform to this map:", "For each filename, there must be the same number of chunks:", "For each filename, chunk numbers must be [0, 1, ..., num_chunks]:" ] print("FAIL validation") print() for index, header in enumerate(constraint_headers): print("%d. %-81s %{}d violations".format(num_digits) % ( index + 1, header, len(constraint_violations[index]))) if parsed_args.detailed: for violation in constraint_violations[index]: print("\t%s" % violation)
def take_action(self, parsed_args):
    for file in parsed_args.input:
        if not file.exists():
            raise IOError("failed to open data set at {}".format(file))

    data_sets = [load(file) for file in parsed_args.input]

    self.log.info("fusing %d data sets along dimension '%s'",
                  len(parsed_args.input), parsed_args.dimension)

    result = concat_features(data_sets, parsed_args.dimension)
    result.save(parsed_args.output)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("failed to open data set at {}".format( parsed_args.input)) data_set = load(parsed_args.input) self.log.info("fusing chunks along dimension '%s'", parsed_args.dimension) data_set = concat_chunks(data_set, parsed_args.dimension) data_set.save(parsed_args.output)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("failed to open data set at {}".format(parsed_args.input)) data_set = load(parsed_args.input) if not data_set.is_fully_labeled: raise ValueError("data set must be fully labeled for upsampling") if parsed_args.partitions is not None and not data_set.has_partition_info: raise ValueError("data set must have partition information for upsampling of specific partitions") partitions = parsed_args.partitions if parsed_args.partitions is not None else None upsampled_data_set = upsample(data_set, partitions) upsampled_data_set.save(parsed_args.output)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("failed to open data set at {}".format( parsed_args.input)) if parsed_args.name is None: name = parsed_args.input.with_suffix("").name else: name = parsed_args.name data_set = load(parsed_args.input) export(basedir=parsed_args.output, name=name, data_set=data_set, labels_last=parsed_args.labels_last, fmt=parsed_args.format)
def _setup_io(self, parsed_args, tempdir: Path):
    data_files = parsed_args.input

    for file in data_files:
        if not file.exists():
            raise IOError("failed to open data set file at {}".format(file))

    self.record_files = []
    self.num_instances = 0

    # convert data sets to tfrecords and collect metadata
    for index, file in enumerate(data_files):
        record_file = tempdir / (file.name + ("-%d" % index))
        self.record_files.append(record_file)

        self.log.info("created temporary file %s for data set %s", record_file, file)

        data_set = load(file)

        if self.feature_shape is None:
            self.feature_shape = data_set.feature_shape
        elif self.feature_shape != data_set.feature_shape:
            raise ValueError("data sets have different feature shapes")

        self.num_instances += data_set.num_instances

        export_tfrecords(record_file, data_set)

    # create output dirs
    output_dir = parsed_args.run_name

    if not output_dir.exists():
        output_dir.mkdir(parents=True)

    self.model_filename = output_dir / "logs" / "model"

    if not self.model_filename.parent.exists():
        self.model_filename.parent.mkdir()
def handle(self, *args, **options):
    path = options['path']
    if not os.path.isfile(path):
        raise Exception('File {} not found'.format(path))

    database_name = options['database_name']
    dm_name = options['dm_name']
    database = get_or_error(Database, dict(name__iexact=database_name))

    dataset = data_set.load(Path(path))
    features = dataset.features
    filenames = dataset.filenames

    # segment ids are encoded in the filenames; strip the file extension to recover them
    sids = [int(x[:-4]) for x in filenames]

    nobs, ndims = dataset.features.shape

    # fetch the segments in the same order as the feature rows
    preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
    segments = Segment.objects.filter(id__in=sids).order_by(preserved)
    tids = segments.values_list('tid', flat=True)

    col_inds = {'s2s_autoencoded': [0, ndims]}

    dm = DataMatrix(database=database)
    dm.name = dm_name
    dm.ndims = ndims
    dm.features_hash = 's2s_autoencoded'
    dm.aggregations_hash = ''
    dm.save()

    full_sids_path = dm.get_sids_path()
    full_tids_path = dm.get_tids_path()
    full_bytes_path = dm.get_bytes_path()
    full_cols_path = dm.get_cols_path()

    ndarray_to_bytes(features, full_bytes_path)
    ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
    ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

    with open(full_cols_path, 'w', encoding='utf-8') as f:
        json.dump(col_inds, f)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("failed to open data set at {}".format( parsed_args.input)) data_set = load(parsed_args.input) if parsed_args.add_cv_setup is not None: data_set = create_cv_setup(data_set, num_folds=parsed_args.add_cv_setup) elif parsed_args.add_partitioning is not None: data_set = create_partitioning( data_set, partitions=parsed_args.add_partitioning) elif parsed_args.remove_partitioning: data_set = data_set.copy() for index in data_set: data_set[index].partition = None elif parsed_args.remove_cv_setup: data_set = create_cv_setup(data_set, num_folds=0) data_set.save(parsed_args.output)
def take_action(self, parsed_args): if not parsed_args.input.exists(): raise IOError("unable to open data set at {}".format( parsed_args.input)) data_set = load(parsed_args.input) formatter = TableFormatter() # print global information global_information = [ ("number of instances", data_set.num_instances), ("cross validation info", data_set.has_cv_info), ("partition info", data_set.has_partition_info), ("fully labeled", data_set.is_fully_labeled), ("feature dimensions", data_set.feature_dims), ] print() formatter.print(data=global_information, header="global data set information") print() # print instance information if parsed_args.instance is not None: instance = data_set[parsed_args.instance] instance_information = [ ("data file", instance.filename), ("chunk number", instance.chunk_nr), ("label", "{} ({})".format(instance.label_nominal, instance.label_numeric)), ("cross validation splits", ", ".join([ "None" if x is None else x.name for x in instance.cv_folds ]) or None), ("partition", None if instance.partition is None else instance.partition.name), ("shape", instance.feature_shape), ] formatter.print(data=instance_information, header="instance {} information:".format( parsed_args.instance)) print() if parsed_args.detailed_folds and data_set.has_cv_info and data_set.is_fully_labeled: formatter = TableFormatter(alignment="lrrrr") inverse_label_map = dict(map(reversed, data_set.label_map.items())) for fold in range(data_set.num_folds): train_split = data_set.split(fold, Split.TRAIN) valid_split = data_set.split(fold, Split.VALID) labels, train_counts = np.unique(train_split.labels_numeric, return_counts=True) _, valid_counts = np.unique(valid_split.labels_numeric, return_counts=True) train_total = sum(train_counts) valid_total = sum(valid_counts) fold_information = [] for i in range(len(labels)): train_count = train_counts[i] valid_count = valid_counts[i] train_relative = 100 * train_count / train_total valid_relative = 100 * valid_count / valid_total fold_information.append( (inverse_label_map[labels[i]], train_count, "%2.2f%%" % train_relative, valid_count, "%2.2f%%" % valid_relative)) fold_information.append( ("total", train_total, "", valid_total, "")) formatter.print(data=fold_information, header="fold {} information:".format(fold + 1), dividers=[len(labels) - 1]) print()
def take_action(self, parsed_args):
    self._learner = self._get_learner(parsed_args)

    if not parsed_args.input.exists():
        raise IOError("failed to open data file at {}".format(parsed_args.input))

    if parsed_args.train_partitions is None and parsed_args.eval_partitions is None \
            and not parsed_args.cross_validate:
        raise ValueError("must select either cross validated or partitioned evaluation")
    if (parsed_args.train_partitions is not None or parsed_args.eval_partitions is not None) \
            and parsed_args.cross_validate:
        raise ValueError("partitioned evaluation and cross validated evaluation are mutually exclusive")
    if not parsed_args.cross_validate and ((parsed_args.train_partitions is not None)
                                           ^ (parsed_args.eval_partitions is not None)):
        raise ValueError("partitioned evaluation requires both train and eval partitions")

    data_set = load(parsed_args.input)

    accuracies = []
    uars = []
    confusion_matrices = []

    if parsed_args.cross_validate:
        accuracy_confidence_intervals = []
        uar_confidence_intervals = []

        for _ in range(parsed_args.repeat):
            evaluation = CrossValidatedEvaluation(learner=self._learner,
                                                  upsample=parsed_args.upsample,
                                                  majority_vote=parsed_args.majority_vote)
            evaluation.run(data_set)

            accuracies.append(evaluation.accuracy)
            uars.append(evaluation.uar)
            accuracy_confidence_intervals.append(evaluation.accuracy_confidence_interval)
            uar_confidence_intervals.append(evaluation.uar_confidence_interval)
            confusion_matrices.append(evaluation.confusion_matrix)

        accuracy = np.mean(accuracies)
        accuracy_confidence_interval = np.mean(accuracy_confidence_intervals)
        uar = np.mean(uars)
        uar_confidence_interval = np.mean(uar_confidence_intervals)

        self.log.info("cross validation accuracy: %2.2f%% (+/- %2.2f%%)",
                      100 * accuracy, 100 * accuracy_confidence_interval)
        self.log.info("cross validation UAR: %2.2f%% (+/- %2.2f%%)",
                      100 * uar, 100 * uar_confidence_interval)
    else:
        for _ in range(parsed_args.repeat):
            # noinspection PyTypeChecker
            evaluation = PartitionedEvaluation(learner=self._learner,
                                               train_partitions=parsed_args.train_partitions,
                                               eval_partitions=parsed_args.eval_partitions,
                                               upsample=parsed_args.upsample,
                                               majority_vote=parsed_args.majority_vote)
            evaluation.run(data_set)

            accuracies.append(evaluation.accuracy)
            uars.append(evaluation.uar)
            confusion_matrices.append(evaluation.confusion_matrix)

        accuracy = np.mean(accuracies)
        uar = np.mean(uars)

        # noinspection PyTypeChecker,PyStringFormat
        self.log.info("accuracy on %s: %2.2f%% (UAR %2.2f%%)" % (
            " & ".join([p.name for p in parsed_args.eval_partitions]), 100 * accuracy, 100 * uar))

    confusion_matrix = np.sum(confusion_matrices, axis=0)

    formatter = ConfusionMatrixFormatter()
    self.log.info("confusion matrix:\n%s", formatter.format(confusion_matrix, data_set.label_map))

    if self.app_args.verbose_level == 0:
        # support for piping the output of this command
        # noinspection PyStringFormat
        print("%.4f,%.4f" % (accuracy, uar))
    else:
        self.plot_confusion_matrix(confusion_matrix,
                                   sorted(data_set.label_map.keys()),
                                   normalize=True)