Example #1
    def take_action(self, parsed_args):
        self._learner = PreProcessingWrapper(learner=self._get_learner(parsed_args),
                                             upsample=parsed_args.upsample,
                                             majority_vote=parsed_args.majority_vote)

        if not parsed_args.train_input.exists():
            raise IOError("failed to open training data file at {}".format(parsed_args.train_input))
        if not parsed_args.eval_input.exists():
            raise IOError("failed to open evaluation data file at {}".format(parsed_args.eval_input))

        train_data = load(parsed_args.train_input)
        eval_data = load(parsed_args.eval_input)

        if parsed_args.train_partitions is not None:
            train_data = train_data.partitions(parsed_args.train_partitions)

        if parsed_args.eval_partitions is not None:
            eval_data = eval_data.partitions(parsed_args.eval_partitions)

        self.log.info("training classifier")

        self._learner.fit(train_data)
        predictions = self._learner.predict(eval_data)

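        # invert the label map so that numeric predictions can be mapped back to nominal labels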
        inverse_label_map = dict(map(reversed, train_data.label_map.items()))
        predictions = [(item[0], inverse_label_map[item[1]]) for item in
                       sorted(predictions.items(), key=lambda item: item[0])]

        self.log.info("writing predictions to %s", parsed_args.output)

        if not parsed_args.output.parent.exists():
            parsed_args.output.parent.mkdir(parents=True)

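        # write one (instance key, nominal label) record per line as tab-separated values without a header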
        output = pd.DataFrame.from_records(predictions, index=range(len(predictions)))
        output.to_csv(parsed_args.output, sep="\t", index=False, header=False)
Example #2
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(
                parsed_args.input))

        steps = parsed_args.steps

        if steps is None:
            steps = [None]

        if len(steps) != len(parsed_args.output):
            raise ValueError(
                "There must be one output file for each global step")

        input_data = load(parsed_args.input)

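        # generate one output data set per requested global step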
        for step, output_file in zip(steps, parsed_args.output):
            generated_data = self._wrapper.generate(
                model_filename=parsed_args.model_dir / "model",
                global_step=step,
                data_set=input_data,
                batch_size=parsed_args.batch_size)
            if not output_file.parent.exists():
                output_file.parent.mkdir(parents=True)

            generated_data.save(output_file)
Example #3
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(
                parsed_args.input))

        data_set = load(parsed_args.input)

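        # flatten each instance's features into a single vector for PCA / t-SNE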
        features = np.reshape(data_set.features, [data_set.num_instances, -1])

        if features.shape[1] > 50:
            self.log.info("applying PCA")

            pca = PCA(n_components=200)
            pca.fit(features)
            features = pca.transform(features)

        self.log.info("computing T-SNE embedding")
        tsne = TSNE(perplexity=parsed_args.perplexity,
                    learning_rate=parsed_args.learning_rate,
                    verbose=self.app_args.verbose_level)

        embedding = tsne.fit_transform(features)

        self.log.info("plotting embedding")
        self.plot_with_labels(data_set, embedding)
Example #4
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(parsed_args.input))

        data_set = load(parsed_args.input)
        constraint_violations = check_integrity(data_set)

        if constraint_violations is None:
            print("PASS validation")
        else:
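            # field width for right-aligning the violation counts below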
            num_digits = int(math.ceil(math.log10(data_set.num_instances)))

            constraint_headers = [
                "Instances with the same filename must have the same nominal labels:",
                "Instances with the same filename must have the same numeric labels:",
                "Instances with the same filename must have the same cross validation information:",
                "Instances with the same filename must have the same partition:",
                "If a label map is present, all labels must conform to this map:",
                "For each filename, there must be the same number of chunks:",
                "For each filename, chunk numbers must be [0, 1, ..., num_chunks]:"
            ]

            print("FAIL validation")
            print()

            for index, header in enumerate(constraint_headers):
                print("%d. %-81s %{}d violations".format(num_digits) % (
                    index + 1, header, len(constraint_violations[index])))

                if parsed_args.detailed:
                    for violation in constraint_violations[index]:
                        print("\t%s" % violation)
Example #5
    def take_action(self, parsed_args):
        for file in parsed_args.input:
            if not file.exists():
                raise IOError("failed to open data set at {}".format(file))

        data_sets = [load(file) for file in parsed_args.input]

        self.log.info("fusing %d data sets along dimension '%s'",
                      len(parsed_args.input), parsed_args.dimension)

        result = concat_features(data_sets, parsed_args.dimension)
        result.save(parsed_args.output)
Example #6
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(
                parsed_args.input))

        data_set = load(parsed_args.input)

        self.log.info("fusing chunks along dimension '%s'",
                      parsed_args.dimension)

        data_set = concat_chunks(data_set, parsed_args.dimension)

        data_set.save(parsed_args.output)
Example #7
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(parsed_args.input))

        data_set = load(parsed_args.input)

        if not data_set.is_fully_labeled:
            raise ValueError("data set must be fully labeled for upsampling")
        if parsed_args.partitions is not None and not data_set.has_partition_info:
            raise ValueError("data set must have partition information for upsampling of specific partitions")

        partitions = parsed_args.partitions

        upsampled_data_set = upsample(data_set, partitions)
        upsampled_data_set.save(parsed_args.output)
Example #8
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(
                parsed_args.input))

        if parsed_args.name is None:
            name = parsed_args.input.with_suffix("").name
        else:
            name = parsed_args.name

        data_set = load(parsed_args.input)
        export(basedir=parsed_args.output,
               name=name,
               data_set=data_set,
               labels_last=parsed_args.labels_last,
               fmt=parsed_args.format)
Example #9
    def _setup_io(self, parsed_args, tempdir: Path):
        data_files = parsed_args.input

        for file in data_files:
            if not file.exists():
                raise IOError(
                    "failed to open data set file at {}".format(file))

        self.record_files = []
        self.num_instances = 0

        # convert data sets to tfrecords and collect metadata
        for index, file in enumerate(data_files):
            record_file = tempdir / (file.name + ("-%d" % index))
            self.record_files.append(record_file)

            self.log.info("created temporary file %s for data set %s",
                          record_file, file)

            data_set = load(file)

            if self.feature_shape is None:
                self.feature_shape = data_set.feature_shape
            elif self.feature_shape != data_set.feature_shape:
                raise ValueError("data sets have different feature shapes")

            self.num_instances += data_set.num_instances

            export_tfrecords(record_file, data_set)

        # create output dirs
        output_dir = parsed_args.run_name

        if not output_dir.exists():
            output_dir.mkdir(parents=True)

        self.model_filename = output_dir / "logs" / "model"

        if not self.model_filename.parent.exists():
            self.model_filename.parent.mkdir()
Example #10
    def handle(self, *args, **options):
        path = options['path']
        if not os.path.isfile(path):
            raise Exception('File {} not found'.format(path))

        database_name = options['database_name']
        dm_name = options['dm_name']
        database = get_or_error(Database, dict(name__iexact=database_name))

        dataset = data_set.load(Path(path))
        features = dataset.features
        filenames = dataset.filenames
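        # recover segment ids from the filenames, assuming a fixed 4-character extension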
        sids = [int(x[:-4]) for x in filenames]

        nobs, ndims = dataset.features.shape

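        # query the segments in the same order as sids (the Case/When expression preserves the given ordering)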
        preserved = Case(*[When(id=id, then=pos) for pos, id in enumerate(sids)])
        segments = Segment.objects.filter(id__in=sids).order_by(preserved)
        tids = segments.values_list('tid', flat=True)

        col_inds = {'s2s_autoencoded': [0, ndims]}

        dm = DataMatrix(database=database)
        dm.name = dm_name
        dm.ndims = ndims
        dm.features_hash = 's2s_autoencoded'
        dm.aggregations_hash = ''
        dm.save()

        full_sids_path = dm.get_sids_path()
        full_tids_path = dm.get_tids_path()
        full_bytes_path = dm.get_bytes_path()
        full_cols_path = dm.get_cols_path()

        ndarray_to_bytes(features, full_bytes_path)
        ndarray_to_bytes(np.array(sids, dtype=np.int32), full_sids_path)
        ndarray_to_bytes(np.array(tids, dtype=np.int32), full_tids_path)

        with open(full_cols_path, 'w', encoding='utf-8') as f:
            json.dump(col_inds, f)
Example #11
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("failed to open data set at {}".format(
                parsed_args.input))

        data_set = load(parsed_args.input)

        if parsed_args.add_cv_setup is not None:
            data_set = create_cv_setup(data_set,
                                       num_folds=parsed_args.add_cv_setup)
        elif parsed_args.add_partitioning is not None:
            data_set = create_partitioning(
                data_set, partitions=parsed_args.add_partitioning)
        elif parsed_args.remove_partitioning:
            data_set = data_set.copy()

            for index in data_set:
                data_set[index].partition = None
        elif parsed_args.remove_cv_setup:
            data_set = create_cv_setup(data_set, num_folds=0)

        data_set.save(parsed_args.output)
Example #12
    def take_action(self, parsed_args):
        if not parsed_args.input.exists():
            raise IOError("unable to open data set at {}".format(
                parsed_args.input))

        data_set = load(parsed_args.input)
        formatter = TableFormatter()

        # print global information
        global_information = [
            ("number of instances", data_set.num_instances),
            ("cross validation info", data_set.has_cv_info),
            ("partition info", data_set.has_partition_info),
            ("fully labeled", data_set.is_fully_labeled),
            ("feature dimensions", data_set.feature_dims),
        ]

        print()
        formatter.print(data=global_information,
                        header="global data set information")
        print()

        # print instance information
        if parsed_args.instance is not None:
            instance = data_set[parsed_args.instance]

            instance_information = [
                ("data file", instance.filename),
                ("chunk number", instance.chunk_nr),
                ("label", "{} ({})".format(instance.label_nominal,
                                           instance.label_numeric)),
                ("cross validation splits", ", ".join([
                    "None" if x is None else x.name for x in instance.cv_folds
                ]) or None),
                ("partition", None
                 if instance.partition is None else instance.partition.name),
                ("shape", instance.feature_shape),
            ]

            formatter.print(data=instance_information,
                            header="instance {} information:".format(
                                parsed_args.instance))
            print()

        if parsed_args.detailed_folds and data_set.has_cv_info and data_set.is_fully_labeled:
            formatter = TableFormatter(alignment="lrrrr")

            inverse_label_map = dict(map(reversed, data_set.label_map.items()))

            for fold in range(data_set.num_folds):
                train_split = data_set.split(fold, Split.TRAIN)
                valid_split = data_set.split(fold, Split.VALID)

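                # per-class instance counts in the train and validation splits of this fold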
                labels, train_counts = np.unique(train_split.labels_numeric,
                                                 return_counts=True)
                _, valid_counts = np.unique(valid_split.labels_numeric,
                                            return_counts=True)

                train_total = sum(train_counts)
                valid_total = sum(valid_counts)

                fold_information = []

                for i in range(len(labels)):
                    train_count = train_counts[i]
                    valid_count = valid_counts[i]

                    train_relative = 100 * train_count / train_total
                    valid_relative = 100 * valid_count / valid_total

                    fold_information.append(
                        (inverse_label_map[labels[i]], train_count,
                         "%2.2f%%" % train_relative, valid_count,
                         "%2.2f%%" % valid_relative))

                fold_information.append(
                    ("total", train_total, "", valid_total, ""))

                formatter.print(data=fold_information,
                                header="fold {} information:".format(fold + 1),
                                dividers=[len(labels) - 1])
                print()
Example #13
    def take_action(self, parsed_args):
        self._learner = self._get_learner(parsed_args)

        if not parsed_args.input.exists():
            raise IOError("failed to open data file at {}".format(parsed_args.input))

        if parsed_args.train_partitions is None and parsed_args.eval_partitions is None and not parsed_args.cross_validate:
            raise ValueError("must select either cross validated or partitioned evaluation")
        if (parsed_args.train_partitions is not None or parsed_args.eval_partitions is not None) \
                and parsed_args.cross_validate:
            raise ValueError("partitioned evaluation and cross validated evaluation are mutually exclusive")
        if not parsed_args.cross_validate and ((parsed_args.train_partitions is not None)
                                                   ^ (parsed_args.eval_partitions is not None)):
            raise ValueError("at least one train and eval partition required")

        data_set = load(parsed_args.input)

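        # metrics are accumulated over the repeated evaluation runs and averaged below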
        accuracies = []
        uars = []
        confusion_matrices = []

        if parsed_args.cross_validate:
            accuracy_confidence_intervals = []
            uar_confidence_intervals = []

            for _ in range(parsed_args.repeat):
                evaluation = CrossValidatedEvaluation(learner=self._learner,
                                                      upsample=parsed_args.upsample,
                                                      majority_vote=parsed_args.majority_vote)
                evaluation.run(data_set)

                accuracies.append(evaluation.accuracy)
                uars.append(evaluation.uar)
                accuracy_confidence_intervals.append(evaluation.accuracy_confidence_interval)
                uar_confidence_intervals.append(evaluation.uar_confidence_interval)
                confusion_matrices.append(evaluation.confusion_matrix)

            accuracy = np.mean(accuracies)
            accuracy_confidence_interval = np.mean(accuracy_confidence_intervals)
            uar = np.mean(uars)
            uar_confidence_interval = np.mean(uar_confidence_intervals)

            self.log.info("cross validation accuracy: %2.2f%% (+/- %2.2f%%)", 100 * accuracy,
                          100 * accuracy_confidence_interval)
            self.log.info("cross validation UAR: %2.2f%% (+/- %2.2f%%)", 100 * uar, 100 * uar_confidence_interval)
        else:
            for _ in range(parsed_args.repeat):
                # noinspection PyTypeChecker
                evaluation = PartitionedEvaluation(learner=self._learner,
                                                   train_partitions=parsed_args.train_partitions,
                                                   eval_partitions=parsed_args.eval_partitions,
                                                   upsample=parsed_args.upsample,
                                                   majority_vote=parsed_args.majority_vote)
                evaluation.run(data_set)

                accuracies.append(evaluation.accuracy)
                uars.append(evaluation.uar)
                confusion_matrices.append(evaluation.confusion_matrix)

            accuracy = np.mean(accuracies)
            uar = np.mean(uars)

            # noinspection PyTypeChecker,PyStringFormat
            self.log.info("accuracy on %s: %2.2f%% (UAR %2.2f%%)" % (
                " & ".join([p.name for p in parsed_args.eval_partitions]), 100 * accuracy, 100 * uar))

        confusion_matrix = np.sum(confusion_matrices, axis=0)

        formatter = ConfusionMatrixFormatter()
        self.log.info("confusion matrix:\n%s", formatter.format(confusion_matrix, data_set.label_map))

        if self.app_args.verbose_level == 0:
            # support for piping the output of this command
            # noinspection PyStringFormat
            print(("%.4f,%.4f" % (accuracy, uar)))
        else:
            self.plot_confusion_matrix(confusion_matrix, sorted(data_set.label_map.keys()), normalize=True)
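
Note: the UAR reported in Example #13 is the unweighted average recall, i.e. the mean of the per-class recalls. A minimal sketch of how it can be computed from a confusion matrix, assuming the usual convention of rows as true classes and columns as predicted classes (the helper name is hypothetical and not part of the example code):

    import numpy as np

    def unweighted_average_recall(confusion_matrix: np.ndarray) -> float:
        # recall of each class: correct predictions divided by the number of true instances
        per_class_recall = np.diag(confusion_matrix) / confusion_matrix.sum(axis=1)
        # UAR weights every class equally, regardless of how frequent it is
        return float(np.mean(per_class_recall))

    # e.g. unweighted_average_recall(np.array([[8, 2], [1, 9]])) == (0.8 + 0.9) / 2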