def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
             label_col='y', id_col='id', class_map=None, sparse=True,
             feature_hasher=False, num_features=None):
    super(Reader, self).__init__()
    self.path_or_list = path_or_list
    self.quiet = quiet
    self.ids_to_floats = ids_to_floats
    self.label_col = label_col
    self.id_col = id_col
    self.class_map = class_map
    self._progress_msg = ''
    if feature_hasher:
        self.vectorizer = FeatureHasher(n_features=num_features)
    else:
        self.vectorizer = DictVectorizer(sparse=sparse)
def __init__(self, name, ids, labels=None, features=None, vectorizer=None):
    super(FeatureSet, self).__init__()
    self.name = name
    if isinstance(ids, list):
        ids = np.array(ids)
    self.ids = ids
    if isinstance(labels, list):
        labels = np.array(labels)
    self.labels = labels
    self.features = features
    self.vectorizer = vectorizer
    # Convert list of dicts to numpy array
    if isinstance(self.features, list):
        if self.vectorizer is None:
            self.vectorizer = NewDictVectorizer(sparse=True)
        self.features = self.vectorizer.fit_transform(self.features)
    if self.features is not None:
        num_feats = self.features.shape[0]
        if self.ids is None:
            raise ValueError('A list of IDs is required')
        num_ids = self.ids.shape[0]
        if num_feats != num_ids:
            raise ValueError(('Number of IDs (%s) does not equal '
                              'number of feature rows (%s)') %
                             (num_ids, num_feats))
        if self.labels is None:
            self.labels = np.empty(num_feats)
            self.labels.fill(None)
        num_labels = self.labels.shape[0]
        if num_feats != num_labels:
            raise ValueError(('Number of labels (%s) does not equal '
                              'number of feature rows (%s)') %
                             (num_labels, num_feats))
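
# A minimal usage sketch of the constructor above, assuming the class is
# importable as `skll.data.FeatureSet`. The set name, IDs, labels, and
# feature values are illustrative only. Because `features` is a list of
# dicts and no vectorizer is given, __init__ fits a sparse DictVectorizer
# internally.
from skll.data import FeatureSet

fs = FeatureSet('example',
                ids=['EXAMPLE_0', 'EXAMPLE_1', 'EXAMPLE_2'],
                labels=[1, 0, 1],
                features=[{'f1': 1.0, 'f2': 0.5},
                          {'f1': 0.0, 'f2': 2.5},
                          {'f1': 3.0, 'f2': 1.0}])

print(fs.features.shape)  # (3, 2) sparse feature matrix
print(fs.labels)          # array([1, 0, 1])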
def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
             label_col='y', id_col='id', class_map=None, sparse=True,
             feature_hasher=False, num_features=None, logger=None):
    super(Reader, self).__init__()
    self.path_or_list = path_or_list
    self.quiet = quiet
    self.ids_to_floats = ids_to_floats
    self.label_col = label_col
    self.id_col = id_col
    self.class_map = class_map
    self._progress_msg = ''
    self._use_pandas = False
    if feature_hasher:
        self.vectorizer = FeatureHasher(n_features=num_features)
    else:
        self.vectorizer = DictVectorizer(sparse=sparse)
    self.logger = logger if logger else logging.getLogger(__name__)
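
# A sketch of the two vectorizer paths selected in __init__ above, using
# CSVReader (a concrete Reader sub-class in skll.data.readers); the file
# path is hypothetical. Per the docstring, num_features should be a power
# of 2 larger than the true feature count to limit hash collisions.
from skll.data.readers import CSVReader

hashing_reader = CSVReader('train.csv',
                           feature_hasher=True,
                           num_features=2 ** 10)

# Default path: a DictVectorizer, sparse unless sparse=False is passed.
dict_reader = CSVReader('train.csv', sparse=True)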
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data. This replaces ``ExamplesTuple`` from older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.
    :type ids: np.array
    :param labels: labels for this set.
    :type labels: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer`
                     is also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created the feature matrix.
    :type vectorizer: DictVectorizer or FeatureHasher

    .. note::
       If ids, labels, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') %
                                 (num_ids, num_feats))
            if self.labels is None:
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') %
                                 (num_labels, num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in set.
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        .. note::
           We consider feature values to be equal if any differences are in
           the sixth decimal place or higher.
        """
        # We need to sort the indices for the underlying
        # feature sparse matrix in case we haven't done
        # so already.
        if not self.features.has_sorted_indices:
            self.features.sort_indices()
        if not other.features.has_sorted_indices:
            other.features.sort_indices()

        return (self.ids.shape == other.ids.shape and
                self.labels.shape == other.labels.shape and
                self.features.shape == other.features.shape and
                (self.ids == other.ids).all() and
                (self.labels == other.labels).all() and
                np.allclose(self.features.data, other.features.data,
                            rtol=1e-6) and
                (self.features.indices == other.features.indices).all() and
                (self.features.indptr == other.features.indptr).all() and
                self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through '
                                 'if they use a DictVectorizer for their '
                                 'feature vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels,
                                          self.features):
                # When calling inverse_transform we have to add [0] to get
                # the results for the current instance because it always
                # returns a 2D array
                yield (id_, label_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs in the same
        order.
        """
        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not in the same order in each '
                             'feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that use '
                                 'FeatureHashers with different values for '
                                 'the n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]
        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the FeatureSet
        depending on the passed-in parameters.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If
                       `None`, no label filtering takes place.
        :type labels: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will
                         be split on the first occurrence and the prefix will
                         be checked to see if it is in `features`. If `None`,
                         no feature filtering takes place. Cannot be used if
                         FeatureSet uses a FeatureHasher for vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If
                       `None`, no label filtering takes place.
        :type labels: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will
                         be split on the first occurrence and the prefix will
                         be checked to see if it is in `features`. If `None`,
                         no feature filtering takes place. Cannot be used if
                         FeatureSet uses a FeatureHasher for vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if '
                             'they use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue

            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict)
                             if (inverse != (name in features or
                                             name.split('=', 1)[0] in
                                             features))}
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        :returns: a copy of ``self`` with all features in ``other`` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        :returns: Whether or not this FeatureSet has any finite labels.
        """
        if self.labels is not None:
            return not (np.issubdtype(self.labels.dtype, float) and
                        np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        :returns: A specific example by row number, or if given a slice,
                  a new FeatureSet containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value]
                            if self.features is not None else None)
            sliced_labels = (self.labels[value]
                             if self.labels is not None else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0]
                        if self.features is not None else {})
            return self.ids[value], label, features
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=args.label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()

    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is DelimitedFileWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
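
# Because main() accepts an argv list, the converter can be driven from
# Python as well as from the shell (assuming the usual `skll_convert`
# entry point); the file and column names below are illustrative.
main(['-i', 'row_id', '-l', 'target', 'train.csv', 'train.jsonlines'])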
class Reader(object):

    """
    A little helper class to make picklable iterators out of example
    dictionary generators.

    :param path_or_list: Path or a list of example dictionaries.
    :type path_or_list: str or list of dict
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param ids_to_floats: Convert IDs to float to save memory. Will raise
                          error if we encounter a non-numeric ID.
    :type ids_to_floats: bool
    :param id_col: Name of the column which contains the instance IDs for
                   ARFF/CSV/TSV files. If no column with that name exists,
                   or `None` is specified, the IDs will be generated
                   automatically.
    :type id_col: str
    :param label_col: Name of the column which contains the class labels
                      for ARFF/CSV/TSV files. If no column with that name
                      exists, or `None` is specified, the data is considered
                      to be unlabelled.
    :type label_col: str
    :param class_map: Mapping from original class labels to new ones. This
                      is mainly used for collapsing multiple labels into a
                      single class. Anything not in the mapping will be kept
                      the same.
    :type class_map: dict from str to str
    :param sparse: Whether or not to store the features in a numpy CSR
                   matrix when using a DictVectorizer to vectorize the
                   features.
    :type sparse: bool
    :param feature_hasher: Whether or not a FeatureHasher should be used to
                           vectorize the features.
    :type feature_hasher: bool
    :param num_features: If using a FeatureHasher, how many features should
                         the resulting matrix have?  You should set this to
                         a power of 2 greater than the actual number of
                         features to avoid collisions.
    :type num_features: int
    """

    def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                 label_col='y', id_col='id', class_map=None, sparse=True,
                 feature_hasher=False, num_features=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        :param path_or_list: The path to the file to load the examples from,
                             or a list of example dictionaries.
        :type path_or_list: str or list of dict
        :param quiet: Do not print "Loading..." status message to stderr.
        :type quiet: bool
        :param sparse: Whether or not to store the features in a numpy CSR
                       matrix.
        :type sparse: bool
        :param id_col: Name of the column which contains the instance IDs
                       for ARFF/CSV/TSV files. If no column with that name
                       exists, or `None` is specified, the IDs will be
                       generated automatically.
        :type id_col: str
        :param label_col: Name of the column which contains the class labels
                          for ARFF/CSV/TSV files. If no column with that
                          name exists, or `None` is specified, the data is
                          considered to be unlabelled.
        :type label_col: str
        :param ids_to_floats: Convert IDs to float to save memory. Will
                              raise error if we encounter a non-numeric ID.
        :type ids_to_floats: bool
        :param class_map: Mapping from original class labels to new ones.
                          This is mainly used for collapsing multiple
                          classes into a single class. Anything not in the
                          mapping will be kept the same.
        :type class_map: dict from str to str

        :returns: New instance of the :class:`Reader` sub-class that is
                  appropriate for the given path, or :class:`DictListReader`
                  if given a list of dictionaries.
        """
        if not isinstance(path_or_list, string_types):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .libsvm, .megam, .ndj, '
                                  'or .tsv format. You specified: '
                                  '{}').format(path_or_list))
            return EXT_TO_READER[ext](path_or_list, **kwargs)

    def _sub_read(self, f):
        """
        Does the actual reading of the given file or list.

        :param f: An open file to iterate through
        :type f: file
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Little helper to print out progress numbers in proper format.
        Nothing gets printed if ``self.quiet`` is ``True``.

        :param progress_num: Progress indicator value. Usually either a line
                             number or a percentage.
        :type progress_num: anything that can be converted to str
        :param end: The string to put at the end of the line.  "\\r" should
                    be used for every update except for the final one.
        :type end: str
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end, file=sys.stderr)
            sys.stderr.flush()

    def read(self):
        """
        Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``,
        ``.libsvm``, ``.megam``, ``.ndj``, or ``.tsv`` formats.

        :returns: :class:`~skll.data.featureset.FeatureSet` representing the
                  file we read in.
        """
        # Setup logger
        logger = logging.getLogger(__name__)

        logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        # Get labels and IDs
        ids = []
        labels = []
        with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f),
                                                      start=1):
                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_,
                                                       self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter
        total = ex_num

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(100 *
                                                             (ex_num /
                                                              total)))
                self._print_progress("100%")

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(feat_dict_generator())

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list, ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
class Reader(object):

    """
    A helper class to make picklable iterators out of example
    dictionary generators.

    Parameters
    ----------
    path_or_list : str or list of dict
        Path or a list of example dictionaries.
    quiet : bool, optional
        Do not print "Loading..." status message to stderr.
        Defaults to ``True``.
    ids_to_floats : bool, optional
        Convert IDs to float to save memory. Will raise error
        if we encounter a non-numeric ID.
        Defaults to ``False``.
    label_col : str, optional
        Name of the column which contains the class labels
        for ARFF/CSV/TSV files. If no column with that name
        exists, or ``None`` is specified, the data is
        considered to be unlabelled.
        Defaults to ``'y'``.
    id_col : str, optional
        Name of the column which contains the instance IDs.
        If no column with that name exists, or ``None`` is
        specified, example IDs will be automatically generated.
        Defaults to ``'id'``.
    class_map : dict, optional
        Mapping from original class labels to new ones. This is
        mainly used for collapsing multiple labels into a single
        class. Anything not in the mapping will be kept the same.
        Defaults to ``None``.
    sparse : bool, optional
        Whether or not to store the features in a numpy CSR
        matrix when using a DictVectorizer to vectorize the
        features.
        Defaults to ``True``.
    feature_hasher : bool, optional
        Whether or not a FeatureHasher should be used to
        vectorize the features.
        Defaults to ``False``.
    num_features : int, optional
        If using a FeatureHasher, how many features should the
        resulting matrix have?  You should set this to a power
        of 2 greater than the actual number of features to
        avoid collisions.
        Defaults to ``None``.
    logger : logging.Logger, optional
        A logger instance to use to log messages instead of creating
        a new one by default.
        Defaults to ``None``.
    """

    def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                 label_col='y', id_col='id', class_map=None, sparse=True,
                 feature_hasher=False, num_features=None, logger=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)
        self.logger = logger if logger else logging.getLogger(__name__)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        Instantiate the appropriate Reader sub-class based on the
        file extension of the given path. Or use a dictionary reader
        if the input is a list of dictionaries.

        Parameters
        ----------
        path_or_list : str or list of dicts
            A path or list of example dictionaries.
        kwargs : dict, optional
            The arguments to the Reader object being instantiated.

        Returns
        -------
        reader : skll.Reader
            A new instance of the Reader sub-class that is
            appropriate for the given path.

        Raises
        ------
        ValueError
            If file does not have a valid extension.
        """
        if not isinstance(path_or_list, string_types):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .megam, .ndj, or .tsv '
                                  'format. You specified: '
                                  '{}').format(path_or_list))
            return EXT_TO_READER[ext](path_or_list, **kwargs)

    def _sub_read(self, f):
        """
        Does the actual reading of the given file or list.

        Parameters
        ----------
        f : file buffer
            An open file to iterate through.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Helper method to print out progress numbers in proper format.
        Nothing gets printed if ``self.quiet`` is ``True``.

        Parameters
        ----------
        progress_num
            Progress indicator value. Usually either a line
            number or a percentage. Must be able to convert to string.
        end : str, optional
            The string to put at the end of the line.  "\\r" should be
            used for every update except for the final one.
            Defaults to ``'\r'``.
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end, file=sys.stderr)
            sys.stderr.flush()

    def read(self):
        """
        Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
        `.megam`, `.ndj`, or `.tsv` formats.

        Returns
        -------
        feature_set : skll.FeatureSet
            ``FeatureSet`` instance representing the input file.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        self.logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        # Get labels and IDs
        ids = []
        labels = []
        ex_num = 0
        with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f),
                                                      start=1):
                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_,
                                                       self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter
        total = ex_num
        if total == 0:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(100 *
                                                             (ex_num /
                                                              total)))
                self._print_progress("100%")

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(feat_dict_generator())

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list, ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
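
# read() above is a template method: sub-classes only implement
# _sub_read(), which must yield (id, label, feature_dict) triples.
# A hypothetical sub-class for a made-up "label feat:value ..." line
# format, to illustrate the contract (ToyReader is not part of SKLL):
class ToyReader(Reader):

    def _sub_read(self, f):
        for line_num, line in enumerate(f):
            fields = line.strip().split()
            label = fields[0]
            feat_dict = {}
            for pair in fields[1:]:
                name, value = pair.split(':', 1)
                feat_dict[name] = float(value)
            # This format carries no IDs, so generate them.
            yield 'EXAMPLE_{}'.format(line_num), label, feat_dict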
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l', '--label_col',
                             help='Name of the column which contains the \
                                   class labels in ARFF, CSV, or TSV files. \
                                   For ARFF files, this must be the final \
                                   column to count as the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has \
                                   no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    label_col = None if args.no_labels else args.label_col

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()

    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
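
# With the mutually exclusive group above, converting unlabelled data is
# a single flag (file names are illustrative); combining --no_labels
# with --label_col is rejected by argparse.
main(['--no_labels', 'unlabelled.csv', 'unlabelled.jsonlines'])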
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data. This replaces `ExamplesTuple` from older versions of SKLL.

    Parameters
    ----------
    name : str
        The name of this feature set.
    ids : np.array
        Example IDs for this set.
    labels : np.array, optional
        labels for this set.
        Defaults to ``None``.
    features : list of dict or array-like, optional
        The features for each instance represented as either a
        list of dictionaries or an array-like (if `vectorizer` is
        also specified).
        Defaults to ``None``.
    vectorizer : DictVectorizer or FeatureHasher, optional
        Vectorizer which will be used to generate the feature matrix.
        Defaults to ``None``.

    Warnings
    --------
    FeatureSets can only be equal if the order of the instances is
    identical because these are stored as lists/arrays. Since scikit-learn's
    `DictVectorizer` automatically sorts the underlying feature matrix
    if it is sparse, we do not do any sorting before checking for equality.
    This is not a problem because we _always_ use sparse matrices with
    `DictVectorizer` when creating FeatureSets.

    Notes
    -----
    If ids, labels, and/or features are not None, the number of rows in
    each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') %
                                 (num_ids, num_feats))
            if self.labels is None:
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') %
                                 (num_labels, num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in the FeatureSet.

        Parameters
        ----------
        value
            The value to check.
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to check equivalence with.

        Note
        ----
        We consider feature values to be equal if any differences are in
        the sixth decimal place or higher.
        """
        return (self.ids.shape == other.ids.shape and
                self.labels.shape == other.labels.shape and
                self.features.shape == other.features.shape and
                (self.ids == other.ids).all() and
                (self.labels == other.labels).all() and
                np.allclose(self.features.data, other.features.data,
                            rtol=1e-6) and
                (self.features.indices == other.features.indices).all() and
                (self.features.indptr == other.features.indptr).all() and
                self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through '
                                 'if they use a DictVectorizer for their '
                                 'feature vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels,
                                          self.features):
                # reshape to a 2D matrix if we are not using a sparse matrix
                # to store the features
                feats = (feats.reshape(1, -1) if not sp.issparse(feats)
                         else feats)
                # When calling inverse_transform we have to add [0] to get
                # the results for the current instance because it always
                # returns a 2D array
                yield (id_, label_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        """
        The number of rows in the ``FeatureSet`` instance.
        """
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs in the same
        order.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to add to this one.

        Raises
        ------
        ValueError
            If IDs are not in the same order in each ``FeatureSet`` instance.
        ValueError
            If vectorizers are different between the two ``FeatureSet``
            instances.
        ValueError
            If there are duplicate feature names.
        ValueError
            If there are conflicting labels.
        """
        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('IDs are not in the same order in each '
                             'feature set')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that use '
                                 'FeatureHashers with different values for '
                                 'the n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]
        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the ``FeatureSet``
        depending on the parameters. Filtering is done in-place.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the FeatureSet. If `None`, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If `None`,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the FeatureSet. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the FeatureSet that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in `features`. If `None`, no
            feature filtering takes place. Cannot be used if
            FeatureSet uses a FeatureHasher for vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Raises
        ------
        ValueError
            If attempting to use features to filter a ``FeatureSet`` that
            uses a ``FeatureHasher`` vectorizer.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({
                feat_num for feat_name, feat_num in
                self.vectorizer.vocabulary_.items()
                if (feat_name in features or
                    feat_name.split('=', 1)[0] in features)
            }))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of `__iter__` that retains only the specified features
        and/or examples from the output.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the ``FeatureSet``. If ``None``, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If ``None``,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the ``FeatureSet``. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the ``FeatureSet`` that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in ``features``. If `None`, no
            feature filtering takes place. Cannot be used if
            ``FeatureSet`` uses a FeatureHasher for vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Yields
        ------
        id_ : str
            The ID of the example.
        label_ : str
            The label of the example.
        feat_dict : dict
            The feature dictionary, with feature name as the key
            and example value as the value.

        Raises
        ------
        ValueError
            If the vectorizer is not a `DictVectorizer`.
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if '
                             'they use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue

            # reshape to a 2D matrix if we are not using a sparse matrix
            # to store the features
            feats = (feats.reshape(1, -1) if not sp.issparse(feats)
                     else feats)
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {
                    name: value for name, value in feat_dict.items()
                    if (inverse != (name in features or
                                    name.split('=', 1)[0] in features))
                }
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        Subset ``FeatureSet`` instance by removing all the features from the
        other ``FeatureSet`` instance.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` containing the features that should
            be removed from this ``FeatureSet``.

        Returns
        -------
        A copy of `self` with all features in `other` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        Check if ``FeatureSet`` has finite labels.

        Returns
        -------
        has_labels : bool
            Whether or not this FeatureSet has any finite labels.
        """
        # make sure that labels is not None or a list of Nones
        if self.labels is not None and not all(label is None
                                               for label in self.labels):
            # then check that they are not a list of NaNs
            return not (np.issubdtype(self.labels.dtype, np.floating) and
                        np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        Parameters
        ----------
        value
            The value to retrieve.

        Returns
        -------
        A specific example by row number or, if given a slice, a new
        ``FeatureSet`` instance containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value]
                            if self.features is not None else None)
            sliced_labels = (self.labels[value]
                             if self.labels is not None else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0]
                        if self.features is not None else {})
            return self.ids[value], label, features

    @staticmethod
    def split_by_ids(fs, ids_for_split1, ids_for_split2=None):
        """
        Split the ``FeatureSet`` into two new ``FeatureSet`` instances
        based on the given IDs for the two splits.

        Parameters
        ----------
        fs : skll.FeatureSet
            The ``FeatureSet`` instance to split.
        ids_for_split1 : list of int
            A list of example IDs which will be split out into
            the first ``FeatureSet`` instance. Note that the
            FeatureSet instance will respect the order of the
            specified IDs.
        ids_for_split2 : list of int, optional
            An optional list of example IDs which will be
            split out into the second ``FeatureSet`` instance.
            Note that the ``FeatureSet`` instance will respect
            the order of the specified IDs. If this is
            not specified, then the second ``FeatureSet``
            instance will contain the complement of the
            first set of IDs sorted in ascending order.
            Defaults to ``None``.

        Returns
        -------
        fs1 : skll.FeatureSet
            The first ``FeatureSet``.
        fs2 : skll.FeatureSet
            The second ``FeatureSet``.
        """
        # Note: an alternative way to implement this is to make copies
        # of the given FeatureSet instance and then use the `filter()`
        # method but that wastes too much memory since it requires making
        # two copies of the original FeatureSet which may be huge. With
        # the current implementation, we are creating new objects but
        # they should be much smaller than the original FeatureSet.
        ids1 = fs.ids[ids_for_split1]
        labels1 = fs.labels[ids_for_split1]
        features1 = fs.features[ids_for_split1]
        if ids_for_split2 is None:
            ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)]
            labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)]
            features2 = fs.features[~np.in1d(fs.ids, ids_for_split1)]
        else:
            ids2 = fs.ids[ids_for_split2]
            labels2 = fs.labels[ids_for_split2]
            features2 = fs.features[ids_for_split2]

        fs1 = FeatureSet('{}_1'.format(fs.name),
                         ids1,
                         labels=labels1,
                         features=features1,
                         vectorizer=fs.vectorizer)
        fs2 = FeatureSet('{}_2'.format(fs.name),
                         ids2,
                         labels=labels2,
                         features=features2,
                         vectorizer=fs.vectorizer)
        return fs1, fs2

    @staticmethod
    def from_data_frame(df, name, labels_column=None, vectorizer=None):
        """
        Helper function to create a ``FeatureSet`` instance from a
        `pandas.DataFrame`. Will raise an Exception if pandas is not
        installed in your environment. The ``ids`` in the ``FeatureSet``
        will be the index from the given frame.

        Parameters
        ----------
        df : pd.DataFrame
            The pandas.DataFrame object to use as a ``FeatureSet``.
        name : str
            The name of the output ``FeatureSet`` instance.
        labels_column : str, optional
            The name of the column containing the labels (data to predict).
            Defaults to ``None``.
        vectorizer : DictVectorizer or FeatureHasher, optional
            Vectorizer which will be used to generate the feature matrix.
            Defaults to ``None``.

        Returns
        -------
        feature_set : skll.FeatureSet
            A ``FeatureSet`` instance generated from the given data frame.
        """
        if labels_column:
            feature_columns = [column for column in df.columns
                               if column != labels_column]
            labels = df[labels_column].tolist()
        else:
            feature_columns = df.columns
            labels = None

        features = df[feature_columns].to_dict(orient='records')
        return FeatureSet(name,
                          ids=df.index.tolist(),
                          labels=labels,
                          features=features,
                          vectorizer=vectorizer)
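
# Sketch of the two static helpers above; requires pandas, and the
# column names are illustrative. The frame's index becomes the
# FeatureSet IDs and the labels column is pulled out before the
# remaining columns are vectorized.
import pandas as pd

df = pd.DataFrame({'f1': [1.0, 0.0, 3.0],
                   'f2': [0.5, 2.5, 1.0],
                   'y': [1, 0, 1]})
fs = FeatureSet.from_data_frame(df, 'from_df', labels_column='y')

# Rows 0 and 2 go to the first split; the complement (row 1) forms
# the second split since ids_for_split2 is omitted.
fs_1, fs_2 = FeatureSet.split_by_ids(fs, [0, 2])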
class Reader(object):

    """
    A helper class to make picklable iterators out of example
    dictionary generators.

    Parameters
    ----------
    path_or_list : str or list of dict
        Path or a list of example dictionaries.
    quiet : bool, optional
        Do not print "Loading..." status message to stderr.
        Defaults to ``True``.
    ids_to_floats : bool, optional
        Convert IDs to float to save memory. Will raise error
        if we encounter a non-numeric ID.
        Defaults to ``False``.
    label_col : str, optional
        Name of the column which contains the class labels
        for ARFF/CSV/TSV files. If no column with that name
        exists, or ``None`` is specified, the data is
        considered to be unlabelled.
        Defaults to ``'y'``.
    id_col : str, optional
        Name of the column which contains the instance IDs.
        If no column with that name exists, or ``None`` is
        specified, example IDs will be automatically generated.
        Defaults to ``'id'``.
    class_map : dict, optional
        Mapping from original class labels to new ones. This is
        mainly used for collapsing multiple labels into a single
        class. Anything not in the mapping will be kept the same.
        Defaults to ``None``.
    sparse : bool, optional
        Whether or not to store the features in a numpy CSR
        matrix when using a DictVectorizer to vectorize the
        features.
        Defaults to ``True``.
    feature_hasher : bool, optional
        Whether or not a FeatureHasher should be used to
        vectorize the features.
        Defaults to ``False``.
    num_features : int, optional
        If using a FeatureHasher, how many features should the
        resulting matrix have?  You should set this to a power
        of 2 greater than the actual number of features to
        avoid collisions.
        Defaults to ``None``.
    logger : logging.Logger, optional
        A logger instance to use to log messages instead of creating
        a new one by default.
        Defaults to ``None``.
    """

    def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                 label_col='y', id_col='id', class_map=None, sparse=True,
                 feature_hasher=False, num_features=None, logger=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        self._use_pandas = False
        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)
        self.logger = logger if logger else logging.getLogger(__name__)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        Instantiate the appropriate Reader sub-class based on the
        file extension of the given path. Or use a dictionary reader
        if the input is a list of dictionaries.

        Parameters
        ----------
        path_or_list : str or list of dicts
            A path or list of example dictionaries.
        kwargs : dict, optional
            The arguments to the Reader object being instantiated.

        Returns
        -------
        reader : skll.Reader
            A new instance of the Reader sub-class that is
            appropriate for the given path.

        Raises
        ------
        ValueError
            If file does not have a valid extension.
        """
        if not isinstance(path_or_list, str):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .libsvm, .ndj, or '
                                  '.tsv format. You specified: '
                                  '{}').format(path_or_list))
            return EXT_TO_READER[ext](path_or_list, **kwargs)

    def _sub_read(self, file):
        """
        Does the actual reading of the given file or list.
        For `Reader` objects that do not rely on `pandas`
        (and therefore read row-by-row), this function will
        be called by `_sub_read_rows()` and will take a file
        buffer rather than a file path. Otherwise, it will
        take a path and will be called directly in the `read()`
        method.

        Parameters
        ----------
        file : file buffer or str
            Either a file buffer, if ``_sub_read_rows()``
            is calling this method, or a path to a file,
            if it is being read with ``pandas``.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Helper method to print out progress numbers in proper format.
        Nothing gets printed if ``self.quiet`` is ``True``.

        Parameters
        ----------
        progress_num
            Progress indicator value. Usually either a line
            number or a percentage. Must be able to convert to string.
        end : str, optional
            The string to put at the end of the line.  "\\r" should be
            used for every update except for the final one.
            Defaults to ``'\r'``.
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end, file=sys.stderr)
            sys.stderr.flush()

    def _sub_read_rows(self, file):
        """
        Read the file in row-by-row. This method is used for `Reader`
        objects that do not rely on `pandas`, and are instead read
        line-by-line into a FeatureSet object, unlike pandas-based
        reader objects, which will read everything into memory in a
        data frame object before converting to a `FeatureSet`.

        Parameters
        ----------
        file : str
            The path to a file.

        Returns
        -------
        ids : np.array
            The ids array.
        labels : np.array
            The labels array.
        features : list of dicts
            The features dictionaries.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        # Get labels and IDs
        ids = []
        labels = []
        ex_num = 0
        with open(file, 'r', encoding='utf-8') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f),
                                                      start=1):
                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_,
                                                       self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter
        total = ex_num
        if total == 0:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r', encoding='utf-8') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(100 *
                                                             (ex_num /
                                                              total)))
                self._print_progress("100%")

        # extract the feature dictionary generator
        features = feat_dict_generator()

        return ids, labels, features

    def _parse_dataframe(self, df, id_col, label_col,
                         replace_blanks_with=None, drop_blanks=False):
        """
        Parse the data frame into ids, labels, and features.
        For `Reader` objects that rely on `pandas`, this function
        will be called in the `_sub_read()` method to parse the
        data frame into the expected format. It will not be used
        by `Reader` classes that read row-by-row (and therefore
        use the `_sub_read_rows()` function).

        Parameters
        ----------
        df : pd.DataFrame
            The pandas data frame to parse.
        id_col : str or None
            The id column.
        label_col : str or None
            The label column.
        replace_blanks_with : value, ``dict``, or ``None``, optional
            Specifies a new value with which to replace blank values.
            Options are ::

                - value = A (numeric) value with which to replace
                          blank values.
                - ``dict`` = A dictionary specifying the replacement
                             value for each column.
                - ``None`` = Blank values will be left as blanks, and
                             not replaced.

            Defaults to ``None``.
        drop_blanks : bool, optional
            If ``True``, remove lines/rows that have any blank values.
            Defaults to ``False``.

        Returns
        -------
        ids : np.array
            The ids for the feature set.
        labels : np.array
            The labels for the feature set.
        features : list of dicts
            The features for the feature set.
        """
        if df.empty:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        if drop_blanks and replace_blanks_with is not None:
            raise ValueError("You cannot both drop blanks and replace them. "
                             "'replace_blanks_with' can only have a value "
                             "when 'drop_blanks' is `False`.")

        # should we replace blank values with something?
        if replace_blanks_with is not None:
            self.logger.info('Blank values in all rows/lines will be '
                             'replaced with user-specified value(s).')
            df = df.fillna(replace_blanks_with)

        # should we remove lines that have any NaNs?
        if drop_blanks:
            self.logger.info('Rows/lines with any blank values will be '
                             'dropped.')
            df = df.dropna().reset_index(drop=True)

        # if the id column exists,
        # get them from the data frame and
        # delete the column; otherwise, just
        # generate the ids automatically
        if id_col is not None and id_col in df:
            ids = df[id_col].astype(str)
            del df[id_col]
            # if `ids_to_floats` is True,
            # then convert the ids to floats
            if self.ids_to_floats:
                ids = ids.astype(float)
            ids = ids.values
        else:
            # create ids with the prefix `EXAMPLE_`
            ids = np.array(['EXAMPLE_{}'.format(i)
                            for i in range(df.shape[0])])

        # if the label column exists,
        # get them from the data frame and
        # delete the column; otherwise, just
        # set the labels to None
        if label_col is not None and label_col in df:
            labels = df[label_col]
            del df[label_col]
            # if `class_map` exists, then
            # map the new classes to the labels;
            # otherwise, just convert them to floats
            if self.class_map is not None:
                labels = labels.apply(safe_float,
                                      replace_dict=self.class_map)
            else:
                labels = labels.apply(safe_float)
            labels = labels.values
        else:
            # create an array of Nones
            labels = np.array([None] * df.shape[0])

        # convert the remaining features to
        # a list of dictionaries
        features = df.to_dict(orient='records')

        return ids, labels, features

    def read(self):
        """
        Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
        `.ndj`, or `.tsv` formats.

        Returns
        -------
        feature_set : skll.FeatureSet
            ``FeatureSet`` instance representing the input file.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        self.logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        if self._use_pandas:
            ids, labels, features = self._sub_read(self.path_or_list)
        else:
            ids, labels, features = self._sub_read_rows(self.path_or_list)

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(features)

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list, ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
class FeatureSet(object): """ Encapsulation of all of the features, values, and metadata about a given set of data. This replaces ExamplesTuple in older versions. :param name: The name of this feature set. :type name: str :param ids: Example IDs for this set. :type ids: np.array :param classes: Classes for this set. :type classes: np.array :param features: The features for each instance represented as either a list of dictionaries or an array-like (if `feat_vectorizer` is also specified). :type features: list of dict or array-like :param vectorizer: Vectorizer that created feature matrix. :type vectorizer: DictVectorizer or FeatureHasher .. note:: If ids, classes, and/or features are not None, the number of rows in each array must be equal. """ def __init__(self, name, ids=None, classes=None, features=None, vectorizer=None): super(FeatureSet, self).__init__() self.name = name if isinstance(ids, list): ids = np.array(ids) self.ids = ids if isinstance(classes, list): classes = np.array(classes) self.classes = classes self.features = features self.vectorizer = vectorizer # Convert list of dicts to numpy array if isinstance(self.features, list): if self.vectorizer is None: self.vectorizer = NewDictVectorizer(sparse=True) self.features = self.vectorizer.fit_transform(self.features) if self.features is not None: num_feats = self.features.shape[0] if self.ids is None: self.ids = np.empty(num_feats) self.ids.fill(None) num_ids = self.ids.shape[0] if num_feats != num_ids: raise ValueError(('Number of IDs (%s) does not equal ' 'number of feature rows (%s)') % (num_ids, num_feats)) if self.classes is None: self.classes = np.empty(num_feats) self.classes.fill(None) num_classes = self.classes.shape[0] if num_feats != num_classes: raise ValueError(('Number of classes (%s) does not equal ' 'number of feature rows (%s)') % (num_classes, num_feats)) def __contains__(self, value): ''' Check if example ID is in set ''' return value in self.ids def __iter__(self): ''' Iterate through (ID, class, feature_dict) tuples in feature set. ''' if self.features is not None: if not isinstance(self.vectorizer, DictVectorizer): raise ValueError('FeatureSets can only be iterated through if ' 'they use a DictVectorizer for their feature ' 'vectorizer.') for id_, class_, feats in zip(self.ids, self.classes, self.features): # When calling inverse_transform we have to add [0] to get the # results for the current instance because it always returns a # 2D array yield (id_, class_, self.vectorizer.inverse_transform(feats)[0]) else: return def __len__(self): return self.features.shape[0] def __add__(self, other): ''' Combine two feature sets to create a new one. This is done assuming they both have the same instances with the same IDs in the same order.
''' new_set = FeatureSet('+'.join(sorted([self.name, other.name]))) # Combine feature matrices and vectorizers if self.features is not None: if not isinstance(self.vectorizer, type(other.vectorizer)): raise ValueError('Cannot combine FeatureSets because they are ' 'not both using the same type of feature ' 'vectorizer (e.g., DictVectorizer, ' 'FeatureHasher)') feature_hasher = isinstance(self.vectorizer, FeatureHasher) if feature_hasher: if (self.vectorizer.n_features != other.vectorizer.n_features): raise ValueError('Cannot combine FeatureSets that use ' 'FeatureHashers with different values of ' 'the n_features setting.') else: # Check for duplicate feature names if (set(self.vectorizer.feature_names_) & set(other.vectorizer.feature_names_)): raise ValueError('Cannot combine FeatureSets because they ' 'have duplicate feature names.') num_feats = self.features.shape[1] new_set.features = sp.hstack([self.features, other.features], 'csr') new_set.vectorizer = deepcopy(self.vectorizer) if not feature_hasher: for feat_name, index in other.vectorizer.vocabulary_.items(): new_set.vectorizer.vocabulary_[feat_name] = (index + num_feats) other_names = other.vectorizer.feature_names_ new_set.vectorizer.feature_names_.extend(other_names) else: new_set.features = deepcopy(other.features) new_set.vectorizer = deepcopy(other.vectorizer) # Check that IDs are in the same order if self.has_ids: if other.has_ids and not np.all(self.ids == other.ids): raise ValueError('IDs are not in the same order in each ' 'feature set') else: new_set.ids = deepcopy(self.ids) else: new_set.ids = deepcopy(other.ids) # If either set has labels, check that they don't conflict if self.has_classes: # Classes should be the same for each ExamplesTuple, so store once if other.has_classes and not np.all(self.classes == other.classes): raise ValueError('Feature sets have conflicting labels for ' 'examples with the same ID.') else: new_set.classes = deepcopy(self.classes) else: new_set.classes = deepcopy(other.classes) return new_set def filter(self, ids=None, classes=None, features=None, inverse=False): ''' Removes or keeps features and/or examples from the FeatureSet depending on the passed-in parameters. :param ids: Examples to keep in the FeatureSet. If `None`, no ID filtering takes place. :type ids: list of str/float :param classes: Classes that we want to retain examples for. If `None`, no class filtering takes place. :type classes: list of str/float :param features: Features to keep in the FeatureSet. To help with filtering string-valued features that were converted to sequences of boolean features when read in, any features in the FeatureSet that contain a `=` will be split on the first occurrence and the prefix will be checked to see if it is in `features`. If `None`, no feature filtering takes place. Cannot be used if FeatureSet uses a FeatureHasher for vectorization. :type features: list of str :param inverse: Instead of keeping features and/or examples in lists, remove them.
:type inverse: bool ''' # Construct mask that indicates which examples to keep mask = np.ones(len(self), dtype=bool) if ids is not None: mask = np.logical_and(mask, np.in1d(self.ids, ids)) if classes is not None: mask = np.logical_and(mask, np.in1d(self.classes, classes)) if inverse and (ids is not None or classes is not None): mask = np.logical_not(mask) # Remove examples not in mask self.ids = self.ids[mask] self.classes = self.classes[mask] self.features = self.features[mask, :] # Filter features if features is not None: if isinstance(self.vectorizer, FeatureHasher): raise ValueError('FeatureSets with FeatureHasher vectorizers' ' cannot be filtered by feature.') columns = np.array(sorted({feat_num for feat_name, feat_num in iteritems(self.vectorizer.vocabulary_) if (feat_name in features or feat_name.split('=', 1)[0] in features)})) if inverse: all_columns = np.arange(self.features.shape[1]) columns = all_columns[np.logical_not(np.in1d(all_columns, columns))] self.features = self.features[:, columns] self.vectorizer.restrict(columns, indices=True) def filtered_iter(self, ids=None, classes=None, features=None, inverse=False): ''' A version of ``__iter__`` that retains only the specified features and/or examples from the output. :param ids: Examples in the FeatureSet to keep. If `None`, no ID filtering takes place. :type ids: list of str/float :param classes: Classes that we want to retain examples for. If `None`, no class filtering takes place. :type classes: list of str/float :param features: Features in the FeatureSet to keep. To help with filtering string-valued features that were converted to sequences of boolean features when read in, any features in the FeatureSet that contain a `=` will be split on the first occurrence and the prefix will be checked to see if it is in `features`. If `None`, no feature filtering takes place. Cannot be used if FeatureSet uses a FeatureHasher for vectorization. :type features: list of str :param inverse: Instead of keeping features and/or examples in lists, remove them. :type inverse: bool ''' if self.features is not None and not isinstance(self.vectorizer, DictVectorizer): raise ValueError('FeatureSets can only be iterated through if they' ' use a DictVectorizer for their feature ' 'vectorizer.') for id_, class_, feats in zip(self.ids, self.classes, self.features): # Skip instances with IDs not in filter if ids is not None and (id_ in ids) == inverse: continue # Skip instances with classes not in filter if classes is not None and (class_ in classes) == inverse: continue feat_dict = self.vectorizer.inverse_transform(feats)[0] if features is not None: feat_dict = {name: value for name, value in iteritems(feat_dict) if (inverse != (name in features or name.split('=', 1)[0] in features))} elif not inverse: feat_dict = {} yield id_, class_, feat_dict def __sub__(self, other): ''' Return a copy of ``self`` with all features in ``other`` removed. ''' new_set = deepcopy(self) new_set.filter(features=other.vectorizer.feature_names_, inverse=True) return new_set @property def has_classes(self): ''' Whether or not this FeatureSet has any finite classes. ''' if self.classes is not None: return not (np.issubdtype(self.classes.dtype, float) and np.isnan(np.min(self.classes))) else: return False @property def has_ids(self): ''' Whether or not this FeatureSet has any finite IDs. ''' if self.ids is not None: return not (np.issubdtype(self.ids.dtype, float) and np.isnan(np.min(self.ids))) else: return False @property def feat_vectorizer(self): ''' Backward-compatible name for vectorizer ''' warn('FeatureSet.feat_vectorizer will be removed in SKLL 1.0.0. 
' 'Please switch to using FeatureSet.vectorizer to access the ' 'feature vectorizer.', DeprecationWarning) return self.vectorizer def __str__(self): ''' Return a string representation of FeatureSet ''' return str(self.__dict__) def __repr__(self): ''' Return a string representation of FeatureSet ''' return repr(self.__dict__)
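# Example (sketch, commented out): combining two of the FeatureSets defined
# above with `+`. The operands must use the same vectorizer type, share no
# feature names, and (in this version) contain the same instances with the
# same IDs in the same order. All names and values below are hypothetical.
#
#     fs_a = FeatureSet('lexical', ids=['ex1', 'ex2'], classes=['pos', 'neg'],
#                       features=[{'len': 3}, {'len': 7}])
#     fs_b = FeatureSet('syntax', ids=['ex1', 'ex2'], classes=['pos', 'neg'],
#                       features=[{'depth': 2}, {'depth': 5}])
#     combined = fs_a + fs_b  # named 'lexical+syntax'; has both feature sets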
class Reader(object): """ A little helper class to make picklable iterators out of example dictionary generators. :param path_or_list: Path or a list of example dictionaries. :type path_or_list: str or list of dict :param quiet: Do not print "Loading..." status message to stderr. :type quiet: bool :param ids_to_floats: Convert IDs to float to save memory. Will raise an error if we encounter a non-numeric ID. :type ids_to_floats: bool :param id_col: Name of the column which contains the instance IDs for ARFF/CSV/TSV files. If no column with that name exists, or `None` is specified, the IDs will be generated automatically. :type id_col: str :param label_col: Name of the column which contains the class labels for ARFF/CSV/TSV files. If no column with that name exists, or `None` is specified, the data is considered to be unlabelled. :type label_col: str :param class_map: Mapping from original class labels to new ones. This is mainly used for collapsing multiple labels into a single class. Anything not in the mapping will be kept the same. :type class_map: dict from str to str :param sparse: Whether or not to store the features in a numpy CSR matrix when using a DictVectorizer to vectorize the features. :type sparse: bool :param feature_hasher: Whether or not a FeatureHasher should be used to vectorize the features. :type feature_hasher: bool :param num_features: If using a FeatureHasher, how many features should the resulting matrix have? You should set this to a power of 2 greater than the actual number of features to avoid collisions. :type num_features: int """ def __init__(self, path_or_list, quiet=True, ids_to_floats=False, label_col='y', id_col='id', class_map=None, sparse=True, feature_hasher=False, num_features=None): super(Reader, self).__init__() self.path_or_list = path_or_list self.quiet = quiet self.ids_to_floats = ids_to_floats self.label_col = label_col self.id_col = id_col self.class_map = class_map self._progress_msg = '' if feature_hasher: self.vectorizer = FeatureHasher(n_features=num_features) else: self.vectorizer = DictVectorizer(sparse=sparse) @classmethod def for_path(cls, path_or_list, **kwargs): """ :param path: The path to the file to load the examples from, or a list of example dictionaries. :type path: str or list of dict :param quiet: Do not print "Loading..." status message to stderr. :type quiet: bool :param sparse: Whether or not to store the features in a numpy CSR matrix. :type sparse: bool :param id_col: Name of the column which contains the instance IDs for ARFF/CSV/TSV files. If no column with that name exists, or `None` is specified, the IDs will be generated automatically. :type id_col: str :param label_col: Name of the column which contains the class labels for ARFF/CSV/TSV files. If no column with that name exists, or `None` is specified, the data is considered to be unlabelled. :type label_col: str :param ids_to_floats: Convert IDs to float to save memory. Will raise an error if we encounter a non-numeric ID. :type ids_to_floats: bool :param class_map: Mapping from original class labels to new ones. This is mainly used for collapsing multiple classes into a single class. Anything not in the mapping will be kept the same. :type class_map: dict from str to str :returns: New instance of the :class:`Reader` sub-class that is appropriate for the given path, or :class:`DictListReader` if given a list of dictionaries.
""" if not isinstance(path_or_list, string_types): return DictListReader(path_or_list) else: # Get lowercase extension for file extension checking ext = '.' + path_or_list.rsplit('.', 1)[-1].lower() if ext not in EXT_TO_READER: raise ValueError(('Example files must be in either .arff, ' '.csv, .jsonlines, .megam, .ndj, or .tsv ' 'format. You specified: ' '{}').format(path_or_list)) return EXT_TO_READER[ext](path_or_list, **kwargs) def _sub_read(self, f): """ Does the actual reading of the given file or list. :param f: An open file to iterate through :type f: file """ raise NotImplementedError def _print_progress(self, progress_num, end="\r"): """ Little helper to print out progress numbers in proper format. Nothing gets printed if ``self.quiet`` is ``True``. :param progress_num: Progress indicator value. Usually either a line number or a percentage. :type progress_num: anything that can be converted to str :param end: The string to put at the end of the line. "\\r" should be used for every update except for the final one. :type end: str """ # Print out status if not self.quiet: print("{}{:>15}".format(self._progress_msg, progress_num), end=end, file=sys.stderr) sys.stderr.flush() def read(self): """ Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.libsvm``, ``.megam``, ``.ndj``, or ``.tsv`` formats. :returns: :class:`~skll.data.featureset.FeatureSet` representing the file we read in. """ # Setup logger logger = logging.getLogger(__name__) logger.debug('Path: %s', self.path_or_list) if not self.quiet: self._progress_msg = "Loading {}...".format(self.path_or_list) print(self._progress_msg, end="\r", file=sys.stderr) sys.stderr.flush() # Get labels and IDs ids = [] labels = [] with open(self.path_or_list, 'r' if PY3 else 'rb') as f: for ex_num, (id_, class_, _) in enumerate(self._sub_read(f)): # Update lists of IDs, clases, and features if self.ids_to_floats: try: id_ = float(id_) except ValueError: raise ValueError(('You set ids_to_floats to true,' ' but ID {} could not be ' 'converted to float in ' '{}').format(id_, self.path_or_list)) ids.append(id_) labels.append(class_) if ex_num % 100 == 0: self._print_progress(ex_num) self._print_progress(ex_num) # Remember total number of examples for percentage progress meter total = ex_num # Convert everything to numpy arrays ids = np.array(ids) labels = np.array(labels) def feat_dict_generator(): with open(self.path_or_list, 'r' if PY3 else 'rb') as f: for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)): yield feat_dict if ex_num % 100 == 0: self._print_progress('{:.8}%'.format( 100 * ((ex_num + 1) / total))) self._print_progress("100%") # Convert everything to numpy arrays features = self.vectorizer.fit_transform(feat_dict_generator()) # Report that loading is complete self._print_progress("done", end="\n") # Make sure we have the same number of ids, labels, and features assert ids.shape[0] == labels.shape[0] == features.shape[0] if ids.shape[0] != len(set(ids)): raise ValueError('The example IDs are not unique in %s.' % self.path_or_list) return FeatureSet(self.path_or_list, ids, labels=labels, features=features, vectorizer=self.vectorizer)
class Reader(object): """ A helper class to make picklable iterators out of example dictionary generators. Parameters ---------- path_or_list : str or list of dict Path or a list of example dictionaries. quiet : bool, optional Do not print "Loading..." status message to stderr. Defaults to ``True``. ids_to_floats : bool, optional Convert IDs to float to save memory. Will raise an error if we encounter a non-numeric ID. Defaults to ``False``. label_col : str, optional Name of the column which contains the class labels for ARFF/CSV/TSV files. If no column with that name exists, or ``None`` is specified, the data is considered to be unlabelled. Defaults to ``'y'``. id_col : str, optional Name of the column which contains the instance IDs. If no column with that name exists, or ``None`` is specified, example IDs will be automatically generated. Defaults to ``'id'``. class_map : dict, optional Mapping from original class labels to new ones. This is mainly used for collapsing multiple labels into a single class. Anything not in the mapping will be kept the same. Defaults to ``None``. sparse : bool, optional Whether or not to store the features in a numpy CSR matrix when using a DictVectorizer to vectorize the features. Defaults to ``True``. feature_hasher : bool, optional Whether or not a FeatureHasher should be used to vectorize the features. Defaults to ``False``. num_features : int, optional If using a FeatureHasher, how many features should the resulting matrix have? You should set this to a power of 2 greater than the actual number of features to avoid collisions. Defaults to ``None``. logger : logging.Logger, optional A logger instance to use to log messages instead of creating a new one by default. Defaults to ``None``. """ def __init__(self, path_or_list, quiet=True, ids_to_floats=False, label_col='y', id_col='id', class_map=None, sparse=True, feature_hasher=False, num_features=None, logger=None): super(Reader, self).__init__() self.path_or_list = path_or_list self.quiet = quiet self.ids_to_floats = ids_to_floats self.label_col = label_col self.id_col = id_col self.class_map = class_map self._progress_msg = '' self._use_pandas = False if feature_hasher: self.vectorizer = FeatureHasher(n_features=num_features) else: self.vectorizer = DictVectorizer(sparse=sparse) self.logger = logger if logger else logging.getLogger(__name__) @classmethod def for_path(cls, path_or_list, **kwargs): """ Instantiate the appropriate Reader sub-class based on the file extension of the given path, or use a dictionary reader if the input is a list of dictionaries. Parameters ---------- path_or_list : str or list of dicts A path or list of example dictionaries. kwargs : dict, optional The arguments to the Reader object being instantiated. Returns ------- reader : skll.Reader A new instance of the Reader sub-class that is appropriate for the given path. Raises ------ ValueError If the file does not have a valid extension. """ if not isinstance(path_or_list, string_types): return DictListReader(path_or_list) else: # Get lowercase extension for file extension checking ext = '.' + path_or_list.rsplit('.', 1)[-1].lower() if ext not in EXT_TO_READER: raise ValueError(('Example files must be in either .arff, ' '.csv, .jsonlines, .megam, .ndj, or .tsv ' 'format. You specified: ' '{}').format(path_or_list)) return EXT_TO_READER[ext](path_or_list, **kwargs) def _sub_read(self, f): """ Does the actual reading of the given file or list.
For `Reader` objects that do not rely on `pandas` (and therefore read row-by-row), this function will be called by `_sub_read_rows()` and will take a file buffer rather than a file path. Otherwise, it will take a path and will be called directly in the `read()` method. Parameters ---------- f : file buffer or str Either a file buffer, if ``_sub_read_rows()`` is calling this method, or a path to a file, if it is being read with ``pandas``. Raises ------ NotImplementedError """ raise NotImplementedError def _print_progress(self, progress_num, end="\r"): """ Helper method to print out progress numbers in proper format. Nothing gets printed if ``self.quiet`` is ``True``. Parameters ---------- progress_num Progress indicator value. Usually either a line number or a percentage. Must be able to convert to string. end : str, optional The string to put at the end of the line. "\\r" should be used for every update except for the final one. Defaults to ``'\r'``. """ # Print out status if not self.quiet: print("{}{:>15}".format(self._progress_msg, progress_num), end=end, file=sys.stderr) sys.stderr.flush() def _sub_read_rows(self, path): """ Read the file row-by-row. This method is used by `Reader` objects that do not rely on `pandas` and instead read the input line-by-line into a `FeatureSet` object, unlike pandas-based `Reader` objects, which read everything into memory in a data frame before converting it to a `FeatureSet`. Parameters ---------- path : str The path to the file. Returns ------- ids : np.array The ids array. labels : np.array The labels array. features : list of dicts The features dictionary. Raises ------ ValueError If ``ids_to_floats`` is True, but IDs cannot be converted. ValueError If no features are found. ValueError If the example IDs are not unique. """ # Get labels and IDs ids = [] labels = [] ex_num = 0 with open(path, 'r' if PY3 else 'rb') as f: for ex_num, (id_, class_, _) in enumerate(self._sub_read(f), start=1): # Update lists of IDs, classes, and features if self.ids_to_floats: try: id_ = float(id_) except ValueError: raise ValueError(('You set ids_to_floats to true,' ' but ID {} could not be ' 'converted to float in ' '{}').format(id_, self.path_or_list)) ids.append(id_) labels.append(class_) if ex_num % 100 == 0: self._print_progress(ex_num) self._print_progress(ex_num) # Remember total number of examples for percentage progress meter total = ex_num if total == 0: raise ValueError("No features found in possibly " "empty file '{}'.".format(self.path_or_list)) # Convert everything to numpy arrays ids = np.array(ids) labels = np.array(labels) def feat_dict_generator(): with open(self.path_or_list, 'r' if PY3 else 'rb') as f: for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)): yield feat_dict if ex_num % 100 == 0: self._print_progress('{:.8}%'.format(100 * ((ex_num / total)))) self._print_progress("100%") # the feature dictionaries are extracted lazily by the generator features = feat_dict_generator() return ids, labels, features def _parse_dataframe(self, df, id_col, label_col, features=None): """ Parse the data frame into ids, labels, and features. For `Reader` objects that rely on `pandas`, this function will be called in the `_sub_read()` method to parse the data frame into the expected format. It will not be used by `Reader` classes that read row-by-row (and therefore use the `_sub_read_rows()` function). Parameters ---------- df : pd.DataFrame The pandas data frame to parse. id_col : str or None The id column. label_col : str or None The label column.
features : list of dict or None The features, if they already exist; if not, then they will be extracted from the data frame. Defaults to None. Returns ------- ids : np.array The ids for the feature set. labels : np.array The labels for the feature set. features : list of dicts The features for the feature set. """ if df.empty: raise ValueError("No features found in possibly " "empty file '{}'.".format(self.path_or_list)) # if the id column exists, # get them from the data frame and # delete the column; otherwise, just # set it to None if id_col is not None and id_col in df: ids = df[id_col] del df[id_col] # if `ids_to_floats` is True, # then convert the ids to floats if self.ids_to_floats: ids = ids.astype(float) ids = ids.values else: # create ids with the prefix `EXAMPLE_` ids = np.array(['EXAMPLE_{}'.format(i) for i in range(df.shape[0])]) # if the label column exists, # get them from the data frame and # delete the column; otherwise, just # set it to None if label_col is not None and label_col in df: labels = df[label_col] del df[label_col] # if `class_map` exists, then # map the new classes to the labels; # otherwise, just convert them to floats if self.class_map is not None: labels = labels.apply(safe_float, replace_dict=self.class_map) else: labels = labels.apply(safe_float) labels = labels.values else: # create an array of Nones labels = np.array([None] * df.shape[0]) # convert the remaining features to # a list of dictionaries, if no # features argument was passed if features is None: features = df.to_dict(orient='records') return ids, labels, features def read(self): """ Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`, `.megam`, `.ndj`, or `.tsv` formats. Returns ------- feature_set : skll.FeatureSet ``FeatureSet`` instance representing the input file. Raises ------ ValueError If ``ids_to_floats`` is True, but IDs cannot be converted. ValueError If no features are found. ValueError If the example IDs are not unique. """ self.logger.debug('Path: %s', self.path_or_list) if not self.quiet: self._progress_msg = "Loading {}...".format(self.path_or_list) print(self._progress_msg, end="\r", file=sys.stderr) sys.stderr.flush() if self._use_pandas: ids, labels, features = self._sub_read(self.path_or_list) else: ids, labels, features = self._sub_read_rows(self.path_or_list) # Convert everything to numpy arrays features = self.vectorizer.fit_transform(features) # Report that loading is complete self._print_progress("done", end="\n") # Make sure we have the same number of ids, labels, and features assert ids.shape[0] == labels.shape[0] == features.shape[0] if ids.shape[0] != len(set(ids)): raise ValueError('The example IDs are not unique in %s.' % self.path_or_list) return FeatureSet(self.path_or_list, ids, labels=labels, features=features, vectorizer=self.vectorizer)
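# Example (sketch, commented out): a pandas-backed sub-class of the Reader
# defined above sets `self._use_pandas = True` so that `read()` hands
# `_sub_read()` a file path instead of an open buffer, and then delegates
# ID/label/feature extraction to `_parse_dataframe()`. This is a minimal
# illustration, not the actual CSV reader implementation.
#
#     import pandas as pd
#
#     class ToyCSVReader(Reader):  # hypothetical sub-class
#         def __init__(self, *args, **kwargs):
#             super(ToyCSVReader, self).__init__(*args, **kwargs)
#             self._use_pandas = True
#
#         def _sub_read(self, path):
#             df = pd.read_csv(path)
#             return self._parse_dataframe(df, self.id_col, self.label_col)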
class FeatureSet(object): """ Encapsulation of all of the features, values, and metadata about a given set of data. This replaces `ExamplesTuple` from older versions of SKLL. Parameters ---------- name : str The name of this feature set. ids : np.array Example IDs for this set. labels : np.array, optional Labels for this set. Defaults to ``None``. features : list of dict or array-like, optional The features for each instance represented as either a list of dictionaries or an array-like (if `vectorizer` is also specified). Defaults to ``None``. vectorizer : DictVectorizer or FeatureHasher, optional Vectorizer which will be used to generate the feature matrix. Defaults to ``None``. Warnings -------- FeatureSets can only be equal if the order of the instances is identical because these are stored as lists/arrays. Since scikit-learn's `DictVectorizer` automatically sorts the underlying feature matrix if it is sparse, we do not do any sorting before checking for equality. This is not a problem because we _always_ use sparse matrices with `DictVectorizer` when creating FeatureSets. Notes ----- If ids, labels, and/or features are not None, the number of rows in each array must be equal. """ def __init__(self, name, ids, labels=None, features=None, vectorizer=None): super(FeatureSet, self).__init__() self.name = name if isinstance(ids, list): ids = np.array(ids) self.ids = ids if isinstance(labels, list): labels = np.array(labels) self.labels = labels self.features = features self.vectorizer = vectorizer # Convert list of dicts to numpy array if isinstance(self.features, list): if self.vectorizer is None: self.vectorizer = NewDictVectorizer(sparse=True) self.features = self.vectorizer.fit_transform(self.features) if self.features is not None: num_feats = self.features.shape[0] if self.ids is None: raise ValueError('A list of IDs is required') num_ids = self.ids.shape[0] if num_feats != num_ids: raise ValueError(('Number of IDs (%s) does not equal ' 'number of feature rows (%s)') % (num_ids, num_feats)) if self.labels is None: self.labels = np.empty(num_feats) self.labels.fill(None) num_labels = self.labels.shape[0] if num_feats != num_labels: raise ValueError(('Number of labels (%s) does not equal ' 'number of feature rows (%s)') % (num_labels, num_feats)) def __contains__(self, value): """ Check if example ID is in the FeatureSet. Parameters ---------- value The value to check. """ return value in self.ids def __eq__(self, other): """ Check whether two featuresets are the same. Parameters ---------- other : skll.FeatureSet The other ``FeatureSet`` to check equivalence with. Note ---- We consider feature values to be equal if any differences are in the sixth decimal place or higher. """ return (self.ids.shape == other.ids.shape and self.labels.shape == other.labels.shape and self.features.shape == other.features.shape and (self.ids == other.ids).all() and (self.labels == other.labels).all() and np.allclose(self.features.data, other.features.data, rtol=1e-6) and (self.features.indices == other.features.indices).all() and (self.features.indptr == other.features.indptr).all() and self.vectorizer == other.vectorizer) def __iter__(self): """ Iterate through (ID, label, feature_dict) tuples in feature set.
""" if self.features is not None: if not isinstance(self.vectorizer, DictVectorizer): raise ValueError('FeatureSets can only be iterated through if ' 'they use a DictVectorizer for their feature ' 'vectorizer.') for id_, label_, feats in zip(self.ids, self.labels, self.features): # reshape to a 2D matrix if we are not using a sparse matrix # to store the features feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats # When calling inverse_transform we have to add [0] to get the # results for the current instance because it always returns a # 2D array yield (id_, label_, self.vectorizer.inverse_transform(feats)[0]) else: return def __len__(self): """ The number of rows in the ``FeatureSet`` instance. """ return self.features.shape[0] def __add__(self, other): """ Combine two feature sets to create a new one. This is done assuming they both have the same instances with the same IDs in the same order. Parameters ---------- other : skll.FeatureSet The other ``FeatureSet`` to add to this one. Raises ------ ValueError If IDs are not in the same order in each ``FeatureSet`` instance. ValueError If vectorizers are different between the two ``FeatureSet`` instances. ValueError If there are duplicate feature names. ValueError If there are conflicting labels. """ # Check that the sets of IDs are equal if set(self.ids) != set(other.ids): raise ValueError('IDs are not in the same order in each ' 'feature set') # Compute the relative ordering of IDs for merging the features # and labels. ids_indices = dict((y, x) for x, y in enumerate(other.ids)) relative_order = [ids_indices[self_id] for self_id in self.ids] # Initialize the new feature set with a name and the IDs. new_set = FeatureSet('+'.join(sorted([self.name, other.name])), deepcopy(self.ids)) # Combine feature matrices and vectorizers. if not isinstance(self.vectorizer, type(other.vectorizer)): raise ValueError('Cannot combine FeatureSets because they are ' 'not both using the same type of feature ' 'vectorizer (e.g., DictVectorizer, ' 'FeatureHasher)') uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher) if uses_feature_hasher: if (self.vectorizer.n_features != other.vectorizer.n_features): raise ValueError('Cannot combine FeatureSets that uses ' 'FeatureHashers with different values of ' 'n_features setting.') else: # Check for duplicate feature names. if (set(self.vectorizer.feature_names_) & set(other.vectorizer.feature_names_)): raise ValueError('Cannot combine FeatureSets because they ' 'have duplicate feature names.') num_feats = self.features.shape[1] new_set.features = sp.hstack([self.features, other.features[relative_order]], 'csr') new_set.vectorizer = deepcopy(self.vectorizer) if not uses_feature_hasher: for feat_name, index in other.vectorizer.vocabulary_.items(): new_set.vectorizer.vocabulary_[feat_name] = (index + num_feats) other_names = other.vectorizer.feature_names_ new_set.vectorizer.feature_names_.extend(other_names) # If either set has labels, check that they don't conflict. if self.has_labels: # labels should be the same for each FeatureSet, so store once. 
if other.has_labels and \ not np.all(self.labels == other.labels[relative_order]): raise ValueError('Feature sets have conflicting labels for ' 'examples with the same ID.') new_set.labels = deepcopy(self.labels) else: new_set.labels = deepcopy(other.labels[relative_order]) return new_set def filter(self, ids=None, labels=None, features=None, inverse=False): """ Removes or keeps features and/or examples from the ``FeatureSet`` depending on the passed-in parameters. Filtering is done in-place. Parameters ---------- ids : list of str/float, optional Examples to keep in the FeatureSet. If `None`, no ID filtering takes place. Defaults to ``None``. labels : list of str/float, optional Labels that we want to retain examples for. If `None`, no label filtering takes place. Defaults to ``None``. features : list of str, optional Features to keep in the FeatureSet. To help with filtering string-valued features that were converted to sequences of boolean features when read in, any features in the FeatureSet that contain a `=` will be split on the first occurrence and the prefix will be checked to see if it is in `features`. If `None`, no feature filtering takes place. Cannot be used if FeatureSet uses a FeatureHasher for vectorization. Defaults to ``None``. inverse : bool, optional Instead of keeping features and/or examples in lists, remove them. Defaults to ``False``. Raises ------ ValueError If attempting to use features to filter a ``FeatureSet`` that uses a ``FeatureHasher`` vectorizer. """ # Construct mask that indicates which examples to keep mask = np.ones(len(self), dtype=bool) if ids is not None: mask = np.logical_and(mask, np.in1d(self.ids, ids)) if labels is not None: mask = np.logical_and(mask, np.in1d(self.labels, labels)) if inverse and (labels is not None or ids is not None): mask = np.logical_not(mask) # Remove examples not in mask self.ids = self.ids[mask] self.labels = self.labels[mask] self.features = self.features[mask, :] # Filter features if features is not None: if isinstance(self.vectorizer, FeatureHasher): raise ValueError('FeatureSets with FeatureHasher vectorizers' ' cannot be filtered by feature.') columns = np.array(sorted({feat_num for feat_name, feat_num in iteritems(self.vectorizer.vocabulary_) if (feat_name in features or feat_name.split('=', 1)[0] in features)})) if inverse: all_columns = np.arange(self.features.shape[1]) columns = all_columns[np.logical_not(np.in1d(all_columns, columns))] self.features = self.features[:, columns] self.vectorizer.restrict(columns, indices=True) def filtered_iter(self, ids=None, labels=None, features=None, inverse=False): """ A version of `__iter__` that retains only the specified features and/or examples from the output. Parameters ---------- ids : list of str/float, optional Examples to keep in the ``FeatureSet``. If ``None``, no ID filtering takes place. Defaults to ``None``. labels : list of str/float, optional Labels that we want to retain examples for. If ``None``, no label filtering takes place. Defaults to ``None``. features : list of str, optional Features to keep in the ``FeatureSet``. To help with filtering string-valued features that were converted to sequences of boolean features when read in, any features in the ``FeatureSet`` that contain a `=` will be split on the first occurrence and the prefix will be checked to see if it is in ``features``. If `None`, no feature filtering takes place. Cannot be used if ``FeatureSet`` uses a FeatureHasher for vectorization. Defaults to ``None``.
inverse : bool, optional Instead of keeping features and/or examples in lists, remove them. Defaults to ``False``. Yields ------ id_ : str The ID of the example. label_ : str The label of the example. feat_dict : dict The feature dictionary, with feature name as the key and example value as the value. Raises ------ ValueError If the vectorizer is not a `DictVectorizer`. """ if self.features is not None and not isinstance(self.vectorizer, DictVectorizer): raise ValueError('FeatureSets can only be iterated through if they' ' use a DictVectorizer for their feature ' 'vectorizer.') for id_, label_, feats in zip(self.ids, self.labels, self.features): # Skip instances with IDs not in filter if ids is not None and (id_ in ids) == inverse: continue # Skip instances with labels not in filter if labels is not None and (label_ in labels) == inverse: continue # reshape to a 2D matrix if we are not using a sparse matrix # to store the features feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats feat_dict = self.vectorizer.inverse_transform(feats)[0] if features is not None: feat_dict = {name: value for name, value in iteritems(feat_dict) if (inverse != (name in features or name.split('=', 1)[0] in features))} elif not inverse: feat_dict = {} yield id_, label_, feat_dict def __sub__(self, other): """ Subset ``FeatureSet`` instance by removing all the features from the other ``FeatureSet`` instance. Parameters ---------- other : skll.FeatureSet The other ``FeatureSet`` containing the features that should be removed from this ``FeatureSet``. Returns ------- A copy of `self` with all features in `other` removed. """ new_set = deepcopy(self) new_set.filter(features=other.vectorizer.feature_names_, inverse=True) return new_set @property def has_labels(self): """ Check if ``FeatureSet`` has finite labels. Returns ------- has_labels : bool Whether or not this FeatureSet has any finite labels. """ # make sure that labels is not None or a list of Nones if self.labels is not None and not all(label is None for label in self.labels): # then check that they are not a list of NaNs return not (np.issubdtype(self.labels.dtype, np.floating) and np.isnan(np.min(self.labels))) else: return False def __str__(self): """ Returns ------- A string representation of ``FeatureSet``. """ return str(self.__dict__) def __repr__(self): """ Returns ------- A string representation of ``FeatureSet``. """ return repr(self.__dict__) def __getitem__(self, value): """ Parameters ---------- value The value to retrieve. Returns ------- A specific example by row number or, if given a slice, a new ``FeatureSet`` instance containing a subset of the data. """ # Check if we're slicing if isinstance(value, slice): sliced_ids = self.ids[value] sliced_feats = (self.features[value] if self.features is not None else None) sliced_labels = (self.labels[value] if self.labels is not None else None) return FeatureSet('{}_{}'.format(self.name, value), sliced_ids, features=sliced_feats, labels=sliced_labels, vectorizer=self.vectorizer) else: label = self.labels[value] if self.labels is not None else None feats = self.features[value, :] features = (self.vectorizer.inverse_transform(feats)[0] if self.features is not None else {}) return self.ids[value], label, features @staticmethod def split_by_ids(fs, ids_for_split1, ids_for_split2=None): """ Split the ``FeatureSet`` into two new ``FeatureSet`` instances based on the given IDs for the two splits. Parameters ---------- fs : skll.FeatureSet The ``FeatureSet`` instance to split. 
ids_for_split1 : list of int A list of example IDs which will be split out into the first ``FeatureSet`` instance. Note that the FeatureSet instance will respect the order of the specified IDs. ids_for_split2 : list of int, optional An optional list of example IDs which will be split out into the second ``FeatureSet`` instance. Note that the ``FeatureSet`` instance will respect the order of the specified IDs. If this is not specified, then the second ``FeatureSet`` instance will contain the complement of the first set of IDs sorted in ascending order. Defaults to ``None``. Returns ------- fs1 : skll.FeatureSet The first ``FeatureSet``. fs2 : skll.FeatureSet The second ``FeatureSet``. """ # Note: an alternative way to implement this is to make copies # of the given FeatureSet instance and then use the `filter()` # method but that wastes too much memory since it requires making # two copies of the original FeatureSet which may be huge. With # the current implementation, we are creating new objects but # they should be much smaller than the original FeatureSet. ids1 = fs.ids[ids_for_split1] labels1 = fs.labels[ids_for_split1] features1 = fs.features[ids_for_split1] if ids_for_split2 is None: ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)] labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)] features2 = fs.features[~np.in1d(fs.ids, ids_for_split1)] else: ids2 = fs.ids[ids_for_split2] labels2 = fs.labels[ids_for_split2] features2 = fs.features[ids_for_split2] fs1 = FeatureSet('{}_1'.format(fs.name), ids1, labels=labels1, features=features1, vectorizer=fs.vectorizer) fs2 = FeatureSet('{}_2'.format(fs.name), ids2, labels=labels2, features=features2, vectorizer=fs.vectorizer) return fs1, fs2 @staticmethod def from_data_frame(df, name, labels_column=None, vectorizer=None): """ Helper function to create a ``FeatureSet`` instance from a `pandas.DataFrame`. Will raise an Exception if pandas is not installed in your environment. The ``ids`` in the ``FeatureSet`` will be the index from the given frame. Parameters ---------- df : pd.DataFrame The pandas.DataFrame object to use as a ``FeatureSet``. name : str The name of the output ``FeatureSet`` instance. labels_column : str, optional The name of the column containing the labels (data to predict). Defaults to ``None``. vectorizer : DictVectorizer or FeatureHasher, optional Vectorizer which will be used to generate the feature matrix. Defaults to ``None``. Returns ------- feature_set : skll.FeatureSet A ``FeatureSet`` instance generated from the given data frame. """ if labels_column: feature_columns = [column for column in df.columns if column != labels_column] labels = df[labels_column].tolist() else: feature_columns = df.columns labels = None features = df[feature_columns].to_dict(orient='records') return FeatureSet(name, ids=df.index.tolist(), labels=labels, features=features, vectorizer=vectorizer)
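# Example (sketch, commented out): building a FeatureSet from a pandas data
# frame with the `from_data_frame()` helper above, then splitting it with
# `split_by_ids()`. Note that `split_by_ids()` indexes by row position, so the
# list below contains positions into `fs.ids`, not label values. The frame
# contents are hypothetical.
#
#     import pandas as pd
#
#     df = pd.DataFrame({'f1': [1.0, 2.0, 3.0, 4.0],
#                        'f2': [0.5, 0.25, 0.75, 1.0],
#                        'y': ['a', 'b', 'a', 'b']})
#     fs = FeatureSet.from_data_frame(df, 'demo', labels_column='y')
#     train, test = FeatureSet.split_by_ids(fs, [0, 1, 2])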