                values_train = np.array(values_train, dtype="float32")
                labels_train = np.array(labels_train, dtype="float32")
                values_test = np.array(values_test, dtype="float32")
                labels_test = np.array(labels_test, dtype="float32")

            else:

                # initialize word vector embedder
                embedder = WordVectorEmbedder(embedder_model)

                # initialize lists (will be converted later into numpy arrays)
                values = []
                labels = []

                # get equal-sized subsets of each class
                data_sampler = DataSampler(klass, file_path=data_params["path"], num_classes=2)
                data = data_sampler.sample_balanced(
                    min_samples=data_args.get("min_samples", None),
                    rng_seed=data_args.get("load", {}).get("rng_seed", None),
                )

                # load dataset
                logger.info("processing {} samples from {}...".format(len(data), data_params["path"]))
                profile_results = timed_dataload(loader, data, data_args, embedder, values, labels)

                # store loading time
                seconds_loading = profile_results.timer.total_tt

                # shuffle if necessary
                if data_args["shuffle_after_load"]:
Example #2
        values = []
        labels = []

        # initialize vector embedder
        prebuilt_model_path = data_args.get('models', {}).get(embedder_model, {}).get('prebuilt_model_path', None)
        embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)

        # load pre-sampled data from disk
        if prebuilt_model_path:
            with open(WordVectorBuilder.filename_train(prebuilt_model_path), 'rb') as f:
                data = pickle.load(f)
        else:

            # get equal-sized subsets of each class
            min_samples = data_args.get('min_samples', None)
            data_sampler = DataSampler(klass, file_path=data_params['path'], num_classes=2)
            data = data_sampler.sample_balanced(min_samples)

        # load dataset
        logger.info("processing {} samples from {}...".format(len(data), data_params['path']))
        profile_results = timed_dataload(data, data_args, values, labels)

        # store loading time
        seconds_loading = profile_results.timer.total_tt

        # shuffle if necessary
        if data_args['shuffle_after_load']:
            indices = np.arange(len(labels))
            np.random.shuffle(indices)
            values = [values[i] for i in indices]
            labels = [labels[i] for i in indices]
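Shuffling through a single index permutation, as above, keeps each value paired with its label; shuffling the two lists independently would break that correspondence. Once the lists are converted to numpy arrays (as later examples do), the same joint shuffle can use fancy indexing. A small self-contained sketch with toy data:

import numpy as np

# toy stand-ins for the converted values/labels arrays
values = np.asarray([[0.1], [0.2], [0.3], [0.4]], dtype="float32")
labels = np.asarray([0, 1, 0, 1], dtype="float32")

# one shared permutation reorders both arrays identically
perm = np.random.permutation(len(labels))
values, labels = values[perm], labels[perm]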
Example #3
            # shuffle if necessary (one shared permutation per split keeps
            # each value aligned with its label)
            if data_args['shuffle_after_load']:
                train_indices = np.random.permutation(len(labels_train))
                values_train = values_train[train_indices]
                labels_train = labels_train[train_indices]
                test_indices = np.random.permutation(len(labels_test))
                values_test = values_test[test_indices]
                labels_test = labels_test[test_indices]

        else:

            # initialize lists (will be converted later into numpy arrays)
            values = []
            labels = []

            # get equal-sized subsets of each class
            data_sampler = DataSampler(klass, file_path=data_params['path'], num_classes=2)
            data = data_sampler.sample_balanced(
                min_samples=data_args.get('min_samples', None),
                rng_seed=data_args.get('load', {}).get('rng_seed', None),
            )

            # load dataset
            logger.info("processing {} samples from {}...".format(len(data), data_params['path']))
            profile_results = timed_dataload(data, data_args, values, labels)

            # store loading time
            seconds_loading = profile_results.timer.total_tt

            # convert into nparray for sklearn
            values = np.array(values, dtype="float32")
            labels = np.array(labels, dtype="float32")
            logger.info("Loaded {} samples...".format(len(values)))

            # shuffle if necessary
            if data_args['shuffle_after_load']:
                indices = np.arange(len(labels))
                np.random.shuffle(indices)
                values = values[indices]
                labels = labels[indices]
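DataSampler.sample_balanced draws equal-sized subsets of each class so both labels are represented evenly. A minimal sketch of that idea over an in-memory list of (text, label) pairs; the function name and signature here are hypothetical, and the project class reads from file_path instead:

import random

def sample_balanced_sketch(pairs, min_samples=None, rng_seed=None):
    # group records by label
    rng = random.Random(rng_seed)
    by_class = {}
    for text, label in pairs:
        by_class.setdefault(label, []).append((text, label))
    # truncate every class to the smallest class size (or min_samples)
    size = min(len(records) for records in by_class.values())
    if min_samples is not None:
        size = min(size, min_samples)
    sample = []
    for records in by_class.values():
        rng.shuffle(records)
        sample.extend(records[:size])
    return sample

data = sample_balanced_sketch([("good", 1), ("bad", 0), ("fine", 1)], rng_seed=42)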
Example #4
        prebuilt_model_path = data_args.get('models', {}).get(
            embedder_model, {}).get('prebuilt_model_path', None)
        embedder = WordVectorEmbedder(embedder_model, prebuilt_model_path)

        # load pre-sampled data from disk
        if prebuilt_model_path:
            with open(WordVectorBuilder.filename_train(prebuilt_model_path),
                      'rb') as f:
                data = pickle.load(f)
        else:

            # get equal-sized subsets of each class
            min_samples = data_args.get('min_samples', None)
            data_sampler = DataSampler(klass,
                                       file_path=data_params['path'],
                                       num_classes=2)
            data = data_sampler.sample_balanced(min_samples)

        # load dataset
        logger.info("processing {} samples from {}...".format(
            len(data), data_params['path']))
        profile_results = timed_dataload(data, data_args, values, labels)

        # store loading time
        seconds_loading = profile_results.timer.total_tt

        # shuffle if necessary
        if data_args['shuffle_after_load']:
            indices = np.arange(len(labels))
            np.random.shuffle(indices)