Example 1
0
 def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
              label_col='y', id_col='id', class_map=None, sparse=True,
              feature_hasher=False, num_features=None):
     super(Reader, self).__init__()
     self.path_or_list = path_or_list
     self.quiet = quiet
     self.ids_to_floats = ids_to_floats
     self.label_col = label_col
     self.id_col = id_col
     self.class_map = class_map
     self._progress_msg = ''
     if feature_hasher:
         self.vectorizer = FeatureHasher(n_features=num_features)
     else:
         self.vectorizer = DictVectorizer(sparse=sparse)
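
A minimal standalone sketch (assuming scikit-learn is installed; the toy dicts are invented) of the two vectorizer branches above: FeatureHasher needs its width fixed up front via n_features, while DictVectorizer learns its vocabulary from the data.

from sklearn.feature_extraction import DictVectorizer, FeatureHasher

feats = [{'f1': 1.0, 'f2': 2.0}, {'f2': 3.0}]

# FeatureHasher: fixed output width, no stored vocabulary, collisions possible.
hashed = FeatureHasher(n_features=8).transform(feats)
# DictVectorizer: output width equals the number of distinct feature names.
vectorized = DictVectorizer(sparse=True).fit_transform(feats)
print(hashed.shape, vectorized.shape)  # (2, 8) (2, 2)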
Example 2
0
 def __init__(self, name, ids, labels=None, features=None,
              vectorizer=None):
     super(FeatureSet, self).__init__()
     self.name = name
     if isinstance(ids, list):
         ids = np.array(ids)
     self.ids = ids
     if isinstance(labels, list):
         labels = np.array(labels)
     self.labels = labels
     self.features = features
     self.vectorizer = vectorizer
     # Convert list of dicts to numpy array
     if isinstance(self.features, list):
         if self.vectorizer is None:
             self.vectorizer = NewDictVectorizer(sparse=True)
         self.features = self.vectorizer.fit_transform(self.features)
     if self.features is not None:
         num_feats = self.features.shape[0]
         if self.ids is None:
             raise ValueError('A list of IDs is required')
         num_ids = self.ids.shape[0]
         if num_feats != num_ids:
             raise ValueError(('Number of IDs (%s) does not equal '
                               'number of feature rows (%s)') % (num_ids,
                                                                 num_feats))
          if self.labels is None:
              # np.empty creates a float array, so fill(None) stores NaN
              # placeholders rather than literal None values.
              self.labels = np.empty(num_feats)
              self.labels.fill(None)
         num_labels = self.labels.shape[0]
         if num_feats != num_labels:
             raise ValueError(('Number of labels (%s) does not equal '
                               'number of feature rows (%s)') % (num_labels,
                                                                 num_feats))
Example 4
0
    def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                 label_col='y', id_col='id', class_map=None, sparse=True,
                 feature_hasher=False, num_features=None,
                 logger=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        self._use_pandas = False

        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)
        self.logger = logger if logger else logging.getLogger(__name__)
Example 5
0
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    This replaces ``ExamplesTuple`` from older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.
    :type ids: np.array
    :param labels: The labels for this set.
    :type labels: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer` is
                     also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created feature matrix.
    :type vectorizer: DictVectorizer or FeatureHasher

    .. note::
       If ids, labels, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.labels is None:
                # np.empty creates a float array, so fill(None) stores NaN
                # placeholders rather than literal None values.
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') % (num_labels,
                                                                    num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in set
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        .. note::
           We consider feature values to be equal if any differences are in the
           sixth decimal place or higher.
        """

        # We need to sort the indices for the underlying
        # feature sparse matrix in case we haven't done
        # so already.
        if not self.features.has_sorted_indices:
            self.features.sort_indices()
        if not other.features.has_sorted_indices:
            other.features.sort_indices()

        return (self.ids.shape == other.ids.shape and
                self.labels.shape == other.labels.shape and
                self.features.shape == other.features.shape and
                (self.ids == other.ids).all() and
                (self.labels == other.labels).all() and
                np.allclose(self.features.data, other.features.data,
                            rtol=1e-6) and
                (self.features.indices == other.features.indices).all() and
                (self.features.indptr == other.features.indptr).all() and
                self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels,
                                          self.features):
                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, label_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both contain the same instances with the same IDs (though not
        necessarily in the same order).
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('Cannot combine FeatureSets because they do '
                             'not contain the same IDs')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that use '
                                 'FeatureHashers with different values '
                                 'for the n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the FeatureSet
        depending on the parameters passed in.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: Labels that we want to retain examples for. If `None`,
                       no label filtering takes place.
        :type labels: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: Labels that we want to retain examples for. If `None`,
                       no label filtering takes place.
        :type labels: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        :returns: a copy of ``self`` with all features in ``other`` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        :returns: Whether or not this FeatureSet has any finite labels.
        """
        if self.labels is not None:
            return not (np.issubdtype(self.labels.dtype, float) and
                        np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        :returns:  a string representation of FeatureSet
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        :returns: A specific example by row number, or if given a slice,
                  a new FeatureSet containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0] if
                        self.features is not None else {})
            return self.ids[value], label, features
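
A short usage sketch for the class above (assuming skll is installed; the data is invented): when given a list of dicts and no vectorizer, FeatureSet vectorizes the features itself, and then supports len(), membership tests, and iteration.

from skll.data import FeatureSet

ids = ['ex1', 'ex2', 'ex3']
labels = ['cat', 'dog', 'cat']
features = [{'f1': 1.0, 'f2': 0.5},
            {'f1': 0.0, 'f3': 2.0},
            {'f2': 1.5}]

fs = FeatureSet('toy', ids, labels=labels, features=features)
print(len(fs))        # 3 feature rows
print('ex2' in fs)    # True, via __contains__
for id_, label_, feat_dict in fs:
    print(id_, label_, feat_dict)  # dicts come back via inverse_transform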
Example 6
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i',
                        '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-l',
                        '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q',
                        '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(
                _pair_to_dict_tuple(pair)
                for pair in feat_map_str.strip().split())
            label_map.update(
                _pair_to_dict_tuple(pair)
                for pair in label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=args.label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()
    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is DelimitedFileWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
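
Because main() accepts an explicit argv list, the converter above can be driven programmatically as well as from the shell; a sketch with hypothetical file names:

# Convert a CSV feature file to .jsonlines; equivalent to passing the same
# arguments to the script on the command line.
main(['examples.csv', 'examples.jsonlines', '--label_col', 'y'])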
Example 7
0
class Reader(object):

    """
    A little helper class to make picklable iterators out of example
    dictionary generators.

    :param path_or_list: Path or a list of example dictionaries.
    :type path_or_list: str or list of dict
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param ids_to_floats: Convert IDs to float to save memory. Will raise an
                          error if we encounter a non-numeric ID.
    :type ids_to_floats: bool
    :param id_col: Name of the column which contains the instance IDs for
                   ARFF/CSV/TSV files. If no column with that name exists, or
                   `None` is specified, the IDs will be generated
                   automatically.
    :type id_col: str
    :param label_col: Name of the column which contains the class labels
                      for ARFF/CSV/TSV files. If no column with that name
                      exists, or `None` is specified, the data is
                      considered to be unlabelled.
    :type label_col: str
    :param class_map: Mapping from original class labels to new ones. This is
                      mainly used for collapsing multiple labels into a single
                      class. Anything not in the mapping will be kept the same.
    :type class_map: dict from str to str
    :param sparse: Whether or not to store the features in a numpy CSR
                   matrix when using a DictVectorizer to vectorize the
                   features.
    :type sparse: bool
    :param feature_hasher: Whether or not a FeatureHasher should be used to
                           vectorize the features.
    :type feature_hasher: bool
    :param num_features: If using a FeatureHasher, how many features should the
                         resulting matrix have?  You should set this to a power
                         of 2 greater than the actual number of features to
                         avoid collisions.
    :type num_features: int
    """

    def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                 label_col='y', id_col='id', class_map=None, sparse=True,
                 feature_hasher=False, num_features=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        :param path_or_list: The path to the file to load the examples from,
                             or a list of example dictionaries.
        :type path_or_list: str or list of dict
        :param quiet: Do not print "Loading..." status message to stderr.
        :type quiet: bool
        :param sparse: Whether or not to store the features in a numpy CSR
                       matrix.
        :type sparse: bool
        :param id_col: Name of the column which contains the instance IDs for
                       ARFF/CSV/TSV files. If no column with that name exists,
                       or `None` is specified, the IDs will be generated
                       automatically.
        :type id_col: str
        :param label_col: Name of the column which contains the class labels
                          for ARFF/CSV/TSV files. If no column with that name
                          exists, or `None` is specified, the data is
                          considered to be unlabelled.
        :type label_col: str
        :param ids_to_floats: Convert IDs to float to save memory. Will raise
                              an error if we encounter a non-numeric ID.
        :type ids_to_floats: bool
        :param class_map: Mapping from original class labels to new ones. This
                          is mainly used for collapsing multiple classes into a
                          single class. Anything not in the mapping will be
                          kept the same.
        :type class_map: dict from str to str

        :returns: New instance of the :class:`Reader` sub-class that is
                  appropriate for the given path, or :class:`DictListReader` if
                  given a list of dictionaries.
        """
        if not isinstance(path_or_list, string_types):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .libsvm, .megam, .ndj, '
                                  'or .tsv format. You specified: '
                                  '{}').format(path_or_list))
        return EXT_TO_READER[ext](path_or_list, **kwargs)

    def _sub_read(self, f):
        """
        Does the actual reading of the given file or list.

        :param f: An open file to iterate through
        :type f: file
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Little helper to print out progress numbers in proper format.

        Nothing gets printed if ``self.quiet`` is ``True``.

        :param progress_num: Progress indicator value.  Usually either a line
                             number or a percentage.
        :type progress_num: anything that can be converted to str
        :param end: The string to put at the end of the line.  "\\r" should be
                    used for every update except for the final one.
        :type end: str
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end, file=sys.stderr)
            sys.stderr.flush()

    def read(self):
        """
        Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.libsvm``,
        ``.megam``, ``.ndj``, or ``.tsv`` formats.

        :returns: :class:`~skll.data.featureset.FeatureSet` representing the
                  file we read in.
        """
        # Setup logger
        logger = logging.getLogger(__name__)

        logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        # Get labels and IDs
        ids = []
        labels = []
        ex_num = 0  # guard in case the file yields no examples
        with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f), start=1):
                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_,
                                                       self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter
        total = ex_num

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(100 * ((ex_num /
                                                                    total))))
                self._print_progress("100%")

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(feat_dict_generator())

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list, ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
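
A brief usage sketch for the class above (assuming skll is installed; the path is hypothetical): for_path dispatches on the file extension, and read() produces the FeatureSet.

from skll.data.readers import Reader

reader = Reader.for_path('train.csv', label_col='y', id_col='id')
fs = reader.read()  # FeatureSet with ids, labels, and a sparse feature matrix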
Example 8
0
class Reader(object):
    """
    A helper class to make picklable iterators out of example
    dictionary generators.

    Parameters
    ----------
    path_or_list : str or list of dict
        Path or a list of example dictionaries.
    quiet : bool, optional
        Do not print "Loading..." status message to stderr.
        Defaults to ``True``.
    ids_to_floats : bool, optional
        Convert IDs to float to save memory. Will raise an error
        if we encounter a non-numeric ID.
        Defaults to ``False``.
    label_col : str, optional
        Name of the column which contains the class labels
        for ARFF/CSV/TSV files. If no column with that name
        exists, or ``None`` is specified, the data is
        considered to be unlabelled.
        Defaults to ``'y'``.
    id_col : str, optional
        Name of the column which contains the instance IDs.
        If no column with that name exists, or ``None`` is
        specified, example IDs will be automatically generated.
        Defaults to ``'id'``.
    class_map : dict, optional
        Mapping from original class labels to new ones. This is
        mainly used for collapsing multiple labels into a single
        class. Anything not in the mapping will be kept the same.
        Defaults to ``None``.
    sparse : bool, optional
        Whether or not to store the features in a numpy CSR
        matrix when using a DictVectorizer to vectorize the
        features.
        Defaults to ``True``.
    feature_hasher : bool, optional
        Whether or not a FeatureHasher should be used to
        vectorize the features.
        Defaults to ``False``.
    num_features : int, optional
        If using a FeatureHasher, how many features should the
        resulting matrix have?  You should set this to a power
        of 2 greater than the actual number of features to
        avoid collisions.
        Defaults to ``None``.
    logger : logging.Logger, optional
        A logger instance to use to log messages instead of creating
        a new one by default.
        Defaults to ``None``.
    """
    def __init__(self,
                 path_or_list,
                 quiet=True,
                 ids_to_floats=False,
                 label_col='y',
                 id_col='id',
                 class_map=None,
                 sparse=True,
                 feature_hasher=False,
                 num_features=None,
                 logger=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)
        self.logger = logger if logger else logging.getLogger(__name__)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        Instantiate the appropriate Reader sub-class based on the
        file extension of the given path. Or use a dictionary reader
        if the input is a list of dictionaries.

        Parameters
        ----------
        path_or_list : str or list of dicts
            A path or list of example dictionaries.
        kwargs : dict, optional
            The arguments to the Reader object being instantiated.

        Returns
        -------
        reader : skll.Reader
            A new instance of the Reader sub-class that is
            appropriate for the given path.

        Raises
        ------
        ValueError
            If file does not have a valid extension.
        """
        if not isinstance(path_or_list, string_types):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .libsvm, .megam, .ndj, '
                                  'or .tsv format. You specified: '
                                  '{}').format(path_or_list))
        return EXT_TO_READER[ext](path_or_list, **kwargs)

    def _sub_read(self, f):
        """
        Does the actual reading of the given file or list.

        Parameters
        ----------
        f : file buffer
            An open file to iterate through.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Helper method to print out progress numbers in proper format.
        Nothing gets printed if ``self.quiet`` is ``True``.

        Parameters
        ----------
        progress_num
            Progress indicator value. Usually either a line
            number or a percentage. Must be able to convert to string.

        end : str, optional
            The string to put at the end of the line.  "\\r" should be
            used for every update except for the final one.
            Defaults to ``'\r'``.
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end,
                  file=sys.stderr)
            sys.stderr.flush()

    def read(self):
        """
        Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
        `.megam`, `.ndj`, or `.tsv` formats.

        Returns
        -------
        feature_set : skll.FeatureSet
            ``FeatureSet`` instance representing the input file.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        self.logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        # Get labels and IDs
        ids = []
        labels = []
        ex_num = 0
        with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f),
                                                      start=1):
                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_, self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter
        total = ex_num
        if total == 0:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(
                            100 * ((ex_num / total))))
                self._print_progress("100%")

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(feat_dict_generator())

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list,
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=self.vectorizer)
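
_sub_read is the extension point here: each concrete reader yields (id, label, feature_dict) tuples from an open file. A minimal hypothetical subclass for an invented "id<TAB>label<TAB>key=value ..." format might look like this (not an actual SKLL reader):

class ToyTabReader(Reader):
    def _sub_read(self, f):
        for line in f:
            id_, label, feats = line.rstrip('\n').split('\t')
            # Feature values arrive as strings; cast to float so the
            # vectorizer gets numeric values.
            feat_dict = {k: float(v) for k, v in
                         (kv.split('=', 1) for kv in feats.split())}
            yield id_, label, feat_dict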
Example 9
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l',
                             '--label_col',
                             help='Name of the column which contains the class \
                                   labels in ARFF, CSV, or TSV files. For ARFF \
                                   files, this must be the final column to count as\
                                   the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    label_col = None if args.no_labels else args.label_col

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()
    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
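
The --no_labels flag added in this version threads label_col=None through to both the reader and the writer; a sketch with hypothetical file names:

# Convert unlabelled data; the reader treats every row as unlabelled and
# the writer leaves the label column out.
main(['unlabelled.csv', 'unlabelled.jsonlines', '--no_labels'])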
Example 10
0
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    This replaces ``ExamplesTuple`` from older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.
    :type ids: np.array
    :param labels: The labels for this set.
    :type labels: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if `vectorizer` is
                     also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created feature matrix.
    :type vectorizer: DictVectorizer or FeatureHasher

    .. note::
       If ids, labels, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.labels is None:
                # np.empty creates a float array, so fill(None) stores NaN
                # placeholders rather than literal None values.
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') % (num_labels,
                                                                    num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in set
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        .. note::
           We consider feature values to be equal if any differences are in the
           sixth decimal place or higher.
        """

        # We need to sort the indices for the underlying
        # feature sparse matrix in case we haven't done
        # so already.
        if not self.features.has_sorted_indices:
            self.features.sort_indices()
        if not other.features.has_sorted_indices:
            other.features.sort_indices()

        return (self.ids.shape == other.ids.shape and
                self.labels.shape == other.labels.shape and
                self.features.shape == other.features.shape and
                (self.ids == other.ids).all() and
                (self.labels == other.labels).all() and
                np.allclose(self.features.data, other.features.data,
                            rtol=1e-6) and
                (self.features.indices == other.features.indices).all() and
                (self.features.indptr == other.features.indptr).all() and
                self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels,
                                          self.features):
                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, label_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both contain the same instances with the same IDs (though not
        necessarily in the same order).
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('Cannot combine FeatureSets because they do '
                             'not contain the same IDs')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that use '
                                 'FeatureHashers with different values '
                                 'for the n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the FeatureSet
        depending on the passed-in parameters.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                        no label filtering takes place.
        :type labels: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param labels: labels that we want to retain examples for. If `None`,
                       no label filtering takes place.
        :type labels: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        :returns: a copy of ``self`` with all features in ``other`` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        :returns: Whether or not this FeatureSet has any finite labels.
        """
        if self.labels is not None:
            return not (np.issubdtype(self.labels.dtype, np.floating) and
                        np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        :returns: a string representation of FeatureSet
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        :returns: A specific example by row number, or if given a slice,
                  a new FeatureSet containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0] if
                        self.features is not None else {})
            return self.ids[value], label, features
Example no. 11
class FeatureSet(object):
    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data. This replaces `ExamplesTuple` from older versions of SKLL.

    Parameters
    ----------
    name : str
        The name of this feature set.
    ids : np.array
        Example IDs for this set.
    labels : np.array, optional
        Labels for this set.
        Defaults to ``None``.
    features : list of dict or array-like, optional
        The features for each instance represented as either a
        list of dictionaries or an array-like (if `vectorizer` is
        also specified).
        Defaults to ``None``.
    vectorizer : DictVectorizer or FeatureHasher, optional
        Vectorizer which will be used to generate the feature matrix.
        Defaults to ``None``.

    Warnings
    --------
    FeatureSets can only be equal if the order of the instances is
    identical because these are stored as lists/arrays. Since scikit-learn's
    `DictVectorizer` automatically sorts the underlying feature matrix
    if it is sparse, we do not do any sorting before checking for equality.
    This is not a problem because we _always_ use sparse matrices with
    `DictVectorizer` when creating FeatureSets.

    Notes
    -----
    If ids, labels, and/or features are not None, the number of rows in
    each array must be equal.
    """
    def __init__(self, name, ids, labels=None, features=None, vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(
                    ('Number of IDs (%s) does not equal '
                     'number of feature rows (%s)') % (num_ids, num_feats))
            if self.labels is None:
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(
                    ('Number of labels (%s) does not equal '
                     'number of feature rows (%s)') % (num_labels, num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in the FeatureSet.

        Parameters
        ----------
        value
            The value to check.
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to check equivalence with.

        Note
        ----
        We consider feature values to be equal if any differences are in the
        sixth decimal place or higher.
        """

        return (self.ids.shape == other.ids.shape
                and self.labels.shape == other.labels.shape
                and self.features.shape == other.features.shape
                and (self.ids == other.ids).all()
                and (self.labels == other.labels).all() and np.allclose(
                    self.features.data, other.features.data, rtol=1e-6)
                and (self.features.indices == other.features.indices).all()
                and (self.features.indptr == other.features.indptr).all()
                and self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels,
                                          self.features):

                # reshape to a 2D matrix if we are not using a sparse matrix
                # to store the features
                feats = feats.reshape(1,
                                      -1) if not sp.issparse(feats) else feats

                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, label_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        """
        The number of rows in the ``FeatureSet`` instance.
        """
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both contain the same instances with the same IDs (rows are
        aligned by ID, so the order may differ).

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to add to this one.

        Raises
        ------
        ValueError
            If IDs are not in the same order in each ``FeatureSet`` instance.
        ValueError
            If vectorizers are different between the two ``FeatureSet`` instances.
        ValueError
            If there are duplicate feature names.
        ValueError
            If there are conflicting labels.
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('Cannot combine FeatureSets because they do '
                             'not contain the same set of IDs.')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features != other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that use '
                                 'FeatureHashers with different values of '
                                 'the n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_)
                    & set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack(
            [self.features, other.features[relative_order]], 'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index + num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the ``FeatureSet``
        depending on the passed-in parameters. Filtering is done in-place.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the FeatureSet. If `None`, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If `None`,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the FeatureSet. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the FeatureSet that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in `features`.
            If `None`, no feature filtering takes place.
            Cannot be used if FeatureSet uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Raises
        ------
        ValueError
            If attempting to use features to filter a ``FeatureSet`` that
            uses a ``FeatureHasher`` vectorizer.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(
                sorted({
                    feat_num
                    for feat_name, feat_num in
                    self.vectorizer.vocabulary_.items()
                    if (feat_name in features
                        or feat_name.split('=', 1)[0] in features)
                }))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(
                    np.in1d(all_columns, columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self,
                      ids=None,
                      labels=None,
                      features=None,
                      inverse=False):
        """
        A version of `__iter__` that retains only the specified features
        and/or examples from the output.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the ``FeatureSet``. If ``None``, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If ``None``,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the ``FeatureSet``. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the ``FeatureSet`` that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in ``features``.
            If `None`, no feature filtering takes place.
            Cannot be used if ``FeatureSet`` uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Yields
        ------
        id_ : str
            The ID of the example.
        label_ : str
            The label of the example.
        feat_dict : dict
            The feature dictionary, with feature names as keys
            and feature values as values.

        Raises
        ------
        ValueError
            If the vectorizer is not a `DictVectorizer`.
        """
        if self.features is not None and not isinstance(
                self.vectorizer, DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue

            # reshape to a 2D matrix if we are not using a sparse matrix
            # to store the features
            feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {
                    name: value
                    for name, value in feat_dict.items() if (inverse != (
                        name in features or name.split('=', 1)[0] in features))
                }
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict

    def __sub__(self, other):
        """
        Subset ``FeatureSet`` instance by removing all the features from the
        other ``FeatureSet`` instance.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` containing the features that should
            be removed from this ``FeatureSet``.

        Returns
        -------
        A copy of `self` with all features in `other` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_, inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        Check if ``FeatureSet`` has finite labels.

        Returns
        -------
        has_labels : bool
            Whether or not this FeatureSet has any finite labels.
        """
        # make sure that labels is not None or a list of Nones
        if self.labels is not None and not all(label is None
                                               for label in self.labels):
            # then check that they are not a list of NaNs
            return not (np.issubdtype(self.labels.dtype, np.floating)
                        and np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        Parameters
        ----------
        value
            The value to retrieve.

        Returns
        -------
        A specific example by row number or, if given a slice,
        a new ``FeatureSet`` instance containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value]
                            if self.features is not None else None)
            sliced_labels = (self.labels[value]
                             if self.labels is not None else None)
            return FeatureSet('{}_{}'.format(self.name, value),
                              sliced_ids,
                              features=sliced_feats,
                              labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0]
                        if self.features is not None else {})
            return self.ids[value], label, features

    @staticmethod
    def split_by_ids(fs, ids_for_split1, ids_for_split2=None):
        """
        Split the ``FeatureSet`` into two new ``FeatureSet`` instances based on
        the given IDs for the two splits.

        Parameters
        ----------
        fs : skll.FeatureSet
            The ``FeatureSet`` instance to split.
        ids_for_split1 : list of int
            A list of example IDs which will be split out into
            the first ``FeatureSet`` instance. Note that the
            FeatureSet instance will respect the order of the
            specified IDs.
        ids_for_split2 : list of int, optional
            An optional list of example IDs which will be
            split out into the second ``FeatureSet`` instance.
            Note that the ``FeatureSet`` instance will respect
            the order of the specified IDs. If this is
            not specified, then the second ``FeatureSet``
            instance will contain the complement of the
            first set of IDs sorted in ascending order.
            Defaults to ``None``.

        Returns
        -------
        fs1 : skll.FeatureSet
            The first ``FeatureSet``.
        fs2 : skll.FeatureSet
            The second ``FeatureSet``.
        """

        # Note: an alternative way to implement this is to make copies
        # of the given FeatureSet instance and then use the `filter()`
        # method but that wastes too much memory since it requires making
        # two copies of the original FeatureSet which may be huge. With
        # the current implementation, we are creating new objects but
        # they should be much smaller than the original FeatureSet.
        ids1 = fs.ids[ids_for_split1]
        labels1 = fs.labels[ids_for_split1]
        features1 = fs.features[ids_for_split1]
        if ids_for_split2 is None:
            ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)]
            labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)]
            features2 = fs.features[~np.in1d(fs.ids, ids_for_split1)]
        else:
            ids2 = fs.ids[ids_for_split2]
            labels2 = fs.labels[ids_for_split2]
            features2 = fs.features[ids_for_split2]

        fs1 = FeatureSet('{}_1'.format(fs.name),
                         ids1,
                         labels=labels1,
                         features=features1,
                         vectorizer=fs.vectorizer)
        fs2 = FeatureSet('{}_2'.format(fs.name),
                         ids2,
                         labels=labels2,
                         features=features2,
                         vectorizer=fs.vectorizer)
        return fs1, fs2

    @staticmethod
    def from_data_frame(df, name, labels_column=None, vectorizer=None):
        """
        Helper function to create a ``FeatureSet`` instance from a `pandas.DataFrame`.
        Will raise an Exception if pandas is not installed in your environment.
        The ``ids`` in the ``FeatureSet`` will be the index from the given frame.

        Parameters
        ----------
        df : pd.DataFrame
            The pandas.DataFrame object to use as a ``FeatureSet``.
        name : str
            The name of the output ``FeatureSet`` instance.
        labels_column : str, optional
            The name of the column containing the labels (data to predict).
            Defaults to ``None``.
        vectorizer : DictVectorizer or FeatureHasher, optional
            Vectorizer which will be used to generate the feature matrix.
            Defaults to ``None``.

        Returns
        -------
        feature_set : skll.FeatureSet
            A ``FeatureSet`` instance generated from the given data frame.
        """
        if labels_column:
            feature_columns = [
                column for column in df.columns if column != labels_column
            ]
            labels = df[labels_column].tolist()
        else:
            feature_columns = df.columns
            labels = None

        features = df[feature_columns].to_dict(orient='records')
        return FeatureSet(name,
                          ids=df.index.tolist(),
                          labels=labels,
                          features=features,
                          vectorizer=vectorizer)
Example no. 12
class Reader(object):
    """
    A helper class to make picklable iterators out of example
    dictionary generators.

    Parameters
    ----------
    path_or_list : str or list of dict
        Path or a list of example dictionaries.
    quiet : bool, optional
        Do not print "Loading..." status message to stderr.
        Defaults to ``True``.
    ids_to_floats : bool, optional
        Convert IDs to float to save memory. Will raise an
        error if we encounter a non-numeric ID.
        Defaults to ``False``.
    label_col : str, optional
        Name of the column which contains the class labels
        for ARFF/CSV/TSV files. If no column with that name
        exists, or ``None`` is specified, the data is
        considered to be unlabelled.
        Defaults to ``'y'``.
    id_col : str, optional
        Name of the column which contains the instance IDs.
        If no column with that name exists, or ``None`` is
        specified, example IDs will be automatically generated.
        Defaults to ``'id'``.
    class_map : dict, optional
        Mapping from original class labels to new ones. This is
        mainly used for collapsing multiple labels into a single
        class. Anything not in the mapping will be kept the same.
        Defaults to ``None``.
    sparse : bool, optional
        Whether or not to store the features in a numpy CSR
        matrix when using a DictVectorizer to vectorize the
        features.
        Defaults to ``True``.
    feature_hasher : bool, optional
        Whether or not a FeatureHasher should be used to
        vectorize the features.
        Defaults to ``False``.
    num_features : int, optional
        If using a FeatureHasher, how many features should the
        resulting matrix have?  You should set this to a power
        of 2 greater than the actual number of features to
        avoid collisions.
        Defaults to ``None``.
    logger : logging.Logger, optional
        A logger instance to use to log messages instead of creating
        a new one by default.
        Defaults to ``None``.
    """

    def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                 label_col='y', id_col='id', class_map=None, sparse=True,
                 feature_hasher=False, num_features=None,
                 logger=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        self._use_pandas = False

        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)
        self.logger = logger if logger else logging.getLogger(__name__)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        Instantiate the appropriate Reader sub-class based on the
        file extension of the given path, or use a dictionary reader
        if the input is a list of dictionaries.

        Parameters
        ----------
        path_or_list : str or list of dicts
            A path or list of example dictionaries.
        kwargs : dict, optional
            The arguments to the Reader object being instantiated.

        Returns
        -------
        reader : skll.Reader
            A new instance of the Reader sub-class that is
            appropriate for the given path.

        Raises
        ------
        ValueError
            If file does not have a valid extension.
        """
        if not isinstance(path_or_list, str):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .libsvm, .ndj, or .tsv '
                                  'format. You specified: '
                                  '{}').format(path_or_list))
        return EXT_TO_READER[ext](path_or_list, **kwargs)

    def _sub_read(self, file):
        """
        Does the actual reading of the given file or list.
        For `Reader` objects that do not rely on `pandas`
        (and therefore read row-by-row), this function will
        be called by `_sub_read_rows()` and will take a file
        buffer rather than a file path. Otherwise, it will
        take a path and will be called directly in the `read()`
        method.

        Parameters
        ----------
        file : file buffer or str
            Either a file buffer, if ``_sub_read_rows()``
            is calling this method, or a path to a file,
            if it is being read with ``pandas``.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Helper method to print out progress numbers in proper format.
        Nothing gets printed if ``self.quiet`` is ``True``.

        Parameters
        ----------
        progress_num
            Progress indicator value. Usually either a line
            number or a percentage. Must be able to convert to string.
        end : str, optional
            The string to put at the end of the line.  "\\r" should be
            used for every update except for the final one.
            Defaults to ``'\r'``.
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end, file=sys.stderr)
            sys.stderr.flush()

    def _sub_read_rows(self, file):
        """
        Read the file row-by-row. This method is used for
        `Reader` objects that do not rely on `pandas` and instead
        read files line-by-line into a `FeatureSet` object, unlike
        the pandas-based reader objects, which read everything
        into memory in a data frame before converting it to
        a `FeatureSet`.

        Parameters
        ----------
        file : str
            The path to a file.

        Returns
        -------
        ids : np.array
            The ids array.
        labels : np.array
            The labels array.
        features : generator of dicts
            A generator over the feature dictionaries.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        # Get labels and IDs
        ids = []
        labels = []
        ex_num = 0
        with open(file, 'r', encoding='utf-8') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f), start=1):

                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_,
                                                       self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter
        total = ex_num
        if total == 0:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r', encoding='utf-8') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(100 * ((ex_num / total))))
                self._print_progress("100%")

        # extract the features dictionary
        features = feat_dict_generator()

        return ids, labels, features

    def _parse_dataframe(self,
                         df,
                         id_col,
                         label_col,
                         replace_blanks_with=None,
                         drop_blanks=False):
        """
        Parse the data frame into ids, labels, and features.
        For `Reader` objects that rely on `pandas`, this function
        will be called in the `_sub_read()` method to parse the
        data frame into the expected format. It will not be used
        by `Reader` classes that read row-by-row (and therefore
        use the `_sub_read_rows()` function).

        Parameters
        ----------
        df : pd.DataFrame
            The pandas data frame to parse.
        id_col : str or None
            The id column.
        label_col : str or None
            The label column.
        replace_blanks_with : value, ``dict``, or ``None``, optional
            Specifies a new value with which to replace blank values.
            Options are ::

                -  value = A (numeric) value with which to replace blank values.
                -  ``dict`` = A dictionary specifying the replacement value for each column.
                -  ``None`` = Blank values will be left as blanks, and not replaced.

            Defaults to ``None``.
        drop_blanks : bool, optional
            If ``True``, remove lines/rows that have any blank
            values.
            Defaults to ``False``.

        Returns
        -------
        ids : np.array
            The ids for the feature set.
        labels : np.array
            The labels for the feature set.
        features : list of dicts
            The features for the feature set.
        """
        if df.empty:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        if drop_blanks and replace_blanks_with is not None:
            raise ValueError("You cannot both drop blanks and replace them. "
                             "'replace_blanks_with' can only have a value when "
                             "'drop_blanks' is `False`.")

        # should we replace blank values with something?
        if replace_blanks_with is not None:
            self.logger.info('Blank values in all rows/lines will be replaced with '
                             'user-specified value(s).')
            df = df.fillna(replace_blanks_with)

        # should we remove lines that have any NaNs?
        if drop_blanks:
            self.logger.info('Rows/lines with any blank values will be dropped.')
            df = df.dropna().reset_index(drop=True)

        # if the id column exists,
        # get them from the data frame and
        # delete the column; otherwise, just
        # set it to None
        if id_col is not None and id_col in df:
            ids = df[id_col].astype(str)
            del df[id_col]
            # if `ids_to_floats` is True,
            # then convert the ids to floats
            if self.ids_to_floats:
                ids = ids.astype(float)
            ids = ids.values
        else:
            # create ids with the prefix `EXAMPLE_`
            ids = np.array(['EXAMPLE_{}'.format(i) for i in range(df.shape[0])])

        # if the label column exists,
        # get them from the data frame and
        # delete the column; otherwise, just
        # set it to None
        if label_col is not None and label_col in df:
            labels = df[label_col]
            del df[label_col]
            # if `class_map` exists, then
            # map the new classes to the labels;
            # otherwise, just convert them to floats
            if self.class_map is not None:
                labels = labels.apply(safe_float, replace_dict=self.class_map)
            else:
                labels = labels.apply(safe_float)
            labels = labels.values
        else:
            # create an array of Nones
            labels = np.array([None] * df.shape[0])

        # convert the remaining features to
        # a list of dictionaries
        features = df.to_dict(orient='records')

        return ids, labels, features

    def read(self):
        """
        Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
        `.ndj`, or `.tsv` formats.

        Returns
        -------
        feature_set : skll.FeatureSet
            ``FeatureSet`` instance representing the input file.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        self.logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        if self._use_pandas:
            ids, labels, features = self._sub_read(self.path_or_list)
        else:
            ids, labels, features = self._sub_read_rows(self.path_or_list)

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(features)

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list, ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
Example no. 13
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data.

    This replaces ExamplesTuple in older versions.

    :param name: The name of this feature set.
    :type name: str
    :param ids: Example IDs for this set.
    :type ids: np.array
    :param classes: Classes for this set.
    :type classes: np.array
    :param features: The features for each instance represented as either a
                     list of dictionaries or an array-like (if
                     `feat_vectorizer` is also specified).
    :type features: list of dict or array-like
    :param vectorizer: Vectorizer that created feature matrix.
    :type vectorizer: DictVectorizer or FeatureHasher

    .. note::
       If ids, classes, and/or features are not None, the number of rows in
       each array must be equal.
    """

    def __init__(self, name, ids=None, classes=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(classes, list):
            classes = np.array(classes)
        self.classes = classes
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                self.ids = np.empty(num_feats)
                self.ids.fill(None)
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.classes is None:
                self.classes = np.empty(num_feats)
                self.classes.fill(None)
            num_classes = self.classes.shape[0]
            if num_feats != num_classes:
                raise ValueError(('Number of classes (%s) does not equal '
                                  'number of feature rows (%s)') % (num_classes,
                                                                    num_feats))

    def __contains__(self, value):
        '''
        Check whether the given example ID is in the FeatureSet.
        '''
        return value in self.ids

    def __iter__(self):
        '''
        Iterate through (ID, class, feature_dict) tuples in feature set.
        '''
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, class_, feats in zip(self.ids, self.classes,
                                          self.features):
                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, class_,
                       self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        # Number of rows (examples), which is what the example mask
        # constructed in filter() expects.
        return self.features.shape[0]

    def __add__(self, other):
        '''
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs in the same order.
        '''
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])))
        # Combine feature matrices and vectorizers
        if self.features is not None:
            if not isinstance(self.vectorizer, type(other.vectorizer)):
                raise ValueError('Cannot combine FeatureSets because they are '
                                 'not both using the same type of feature '
                                 'vectorizer (e.g., DictVectorizer, '
                                 'FeatureHasher)')
            feature_hasher = isinstance(self.vectorizer, FeatureHasher)
            if feature_hasher:
                if (self.vectorizer.n_features !=
                        other.vectorizer.n_features):
                    raise ValueError('Cannot combine FeatureSets that use '
                                     'FeatureHashers with different values of '
                                     'the n_features setting.')
            else:
                # Check for duplicate feature names
                if (set(self.vectorizer.feature_names_) &
                        set(other.vectorizer.feature_names_)):
                    raise ValueError('Cannot combine FeatureSets because they '
                                     'have duplicate feature names.')
            num_feats = self.features.shape[1]
            new_set.features = sp.hstack([self.features, other.features],
                                         'csr')
            new_set.vectorizer = deepcopy(self.vectorizer)
            if not feature_hasher:
                for feat_name, index in other.vectorizer.vocabulary_.items():
                    new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                                 num_feats)
                other_names = other.vectorizer.feature_names_
                new_set.vectorizer.feature_names_.extend(other_names)
        else:
            new_set.features = deepcopy(other.features)
            new_set.vectorizer = deepcopy(other.vectorizer)

        # Check that IDs are in the same order
        if self.has_ids:
            if other.has_ids and not np.all(self.ids == other.ids):
                raise ValueError('IDs are not in the same order in each '
                                 'feature set')
            else:
                new_set.ids = deepcopy(self.ids)
        else:
            new_set.ids = deepcopy(other.ids)

        # If either set has labels, check that they don't conflict
        if self.has_classes:
            # Classes should be the same for each ExamplesTuple, so store once
            if other.has_classes and not np.all(self.classes == other.classes):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            else:
                new_set.classes = deepcopy(self.classes)
        else:
            new_set.classes = deepcopy(other.classes)
        return new_set

    def filter(self, ids=None, classes=None, features=None, inverse=False):
        '''
        Removes or keeps features and/or examples from the Featureset depending
        on the passed in parameters.

        :param ids: Examples to keep in the FeatureSet. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param classes: Classes that we want to retain examples for. If `None`,
                        no class filtering takes place.
        :type classes: list of str/float
        :param features: Features to keep in the FeatureSet. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        '''
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if classes is not None:
            mask = np.logical_and(mask, np.in1d(self.classes, classes))

        if inverse and (classes is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.classes = self.classes[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}))
            if inverse:
                # Take the complement of the selected column indices.
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            # `indices=True` tells restrict() that `columns` holds column
            # indices rather than a boolean support mask.
            self.vectorizer.restrict(columns, indices=True)

    def filtered_iter(self, ids=None, classes=None, features=None,
                      inverse=False):
        '''
        A version of ``__iter__`` that retains only the specified features
        and/or examples from the output.

        :param ids: Examples in the FeatureSet to keep. If `None`, no ID
                    filtering takes place.
        :type ids: list of str/float
        :param classes: Classes that we want to retain examples for. If `None`,
                        no class filtering takes place.
        :type classes: list of str/float
        :param features: Features in the FeatureSet to keep. To help with
                         filtering string-valued features that were converted
                         to sequences of boolean features when read in, any
                         features in the FeatureSet that contain a `=` will be
                         split on the first occurrence and the prefix will be
                         checked to see if it is in `features`.
                         If `None`, no feature filtering takes place.
                         Cannot be used if FeatureSet uses a FeatureHasher for
                         vectorization.
        :type features: list of str
        :param inverse: Instead of keeping features and/or examples in lists,
                        remove them.
        :type inverse: bool
        '''
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, class_, feats in zip(self.ids, self.classes, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with classes not in filter
            if classes is not None and (class_ in classes) == inverse:
                continue
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                feat_dict = {}
            yield id_, class_, feat_dict


    def __sub__(self, other):
        '''
        Return a copy of ``self`` with all features in ``other`` removed.
        '''
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_classes(self):
        '''
        Whether or not this FeatureSet has any finite classes.
        '''
        if self.classes is not None:
            return not (np.issubdtype(self.classes.dtype, np.floating) and
                        np.isnan(np.min(self.classes)))
        else:
            return False

    @property
    def has_ids(self):
        '''
        Whether or not this FeatureSet has any finite IDs.
        '''
        if self.ids is not None:
            return not (np.issubdtype(self.ids.dtype, np.floating) and
                        np.isnan(np.min(self.ids)))
        else:
            return False

    @property
    def feat_vectorizer(self):
        ''' Backward compatible name for vectorizer '''
        warn('FeatureSet.feat_vectorizer will be removed in SKLL 1.0.0. '
             'Please switch to using FeatureSet.vectorizer to access the '
             'feature vectorizer.', DeprecationWarning)
        return self.vectorizer

    def __str__(self):
        ''' Return a string representation of FeatureSet '''
        return str(self.__dict__)

    def __repr__(self):
        ''' Return a string representation of FeatureSet '''
        return repr(self.__dict__)
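A note on the ``filter`` method above: taking the complement of the kept
columns must be done against the full column range, since negating an
integer index array with ``~`` produces bitwise-negated indices rather than
"all other columns". A minimal standalone numpy sketch of that complement
step, using toy values that are not from the source:

import numpy as np

num_columns = 5                      # width of a toy feature matrix
keep = np.array([0, 2])              # columns matched by the filter
all_columns = np.arange(num_columns)
# Complement: every column index not in `keep`
drop = all_columns[np.logical_not(np.in1d(all_columns, keep))]
print(drop)  # [1 3 4]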
Example no. 14
class Reader(object):
    """
    A little helper class to make picklable iterators out of example
    dictionary generators

    :param path_or_list: Path or a list of example dictionaries.
    :type path_or_list: str or list of dict
    :param quiet: Do not print "Loading..." status message to stderr.
    :type quiet: bool
    :param ids_to_floats: Convert IDs to float to save memory. Will raise a
                          ValueError if a non-numeric ID is encountered.
    :type ids_to_floats: bool
    :param id_col: Name of the column which contains the instance IDs for
                   ARFF/CSV/TSV files. If no column with that name exists, or
                   `None` is specified, the IDs will be generated
                   automatically.
    :type id_col: str
    :param label_col: Name of the column which contains the class labels
                      for ARFF/CSV/TSV files. If no column with that name
                      exists, or `None` is specified, the data is
                      considered to be unlabelled.
    :type label_col: str
    :param class_map: Mapping from original class labels to new ones. This is
                      mainly used for collapsing multiple labels into a single
                      class. Anything not in the mapping will be kept the same.
    :type class_map: dict from str to str
    :param sparse: Whether or not to store the features in a numpy CSR
                   matrix when using a DictVectorizer to vectorize the
                   features.
    :type sparse: bool
    :param feature_hasher: Whether or not a FeatureHasher should be used to
                           vectorize the features.
    :type feature_hasher: bool
    :param num_features: If using a FeatureHasher, how many features should the
                         resulting matrix have?  You should set this to a power
                         of 2 greater than the actual number of features to
                         avoid collisions.
    :type num_features: int
    """
    def __init__(self,
                 path_or_list,
                 quiet=True,
                 ids_to_floats=False,
                 label_col='y',
                 id_col='id',
                 class_map=None,
                 sparse=True,
                 feature_hasher=False,
                 num_features=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        :param path_or_list: The path to the file to load the examples from,
                             or a list of example dictionaries.
        :type path_or_list: str or list of dict
        :param quiet: Do not print "Loading..." status message to stderr.
        :type quiet: bool
        :param sparse: Whether or not to store the features in a numpy CSR
                       matrix.
        :type sparse: bool
        :param id_col: Name of the column which contains the instance IDs for
                       ARFF/CSV/TSV files. If no column with that name exists,
                       or `None` is specified, the IDs will be generated
                       automatically.
        :type id_col: str
        :param label_col: Name of the column which contains the class labels
                          for ARFF/CSV/TSV files. If no column with that name
                          exists, or `None` is specified, the data is
                          considered to be unlabelled.
        :type label_col: str
        :param ids_to_floats: Convert IDs to float to save memory. Will raise
                              a ValueError if a non-numeric ID is encountered.
        :type ids_to_floats: bool
        :param class_map: Mapping from original class labels to new ones. This
                          is mainly used for collapsing multiple classes into a
                          single class. Anything not in the mapping will be
                          kept the same.
        :type class_map: dict from str to str

        :returns: New instance of the :class:`Reader` sub-class that is
                  appropriate for the given path, or :class:`DictListReader` if
                  given a list of dictionaries.
        """
        if not isinstance(path_or_list, string_types):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .megam, .ndj, or .tsv '
                                  'format. You specified: '
                                  '{}').format(path_or_list))
        return EXT_TO_READER[ext](path_or_list, **kwargs)
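    # Illustrative walk-through (not part of the original source): calling
    # Reader.for_path('train.CSV') lowercases the extension to '.csv' and
    # returns EXT_TO_READER['.csv']('train.CSV', **kwargs); a plain list of
    # dicts skips the extension table entirely and returns a DictListReader.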

    def _sub_read(self, f):
        """
        Does the actual reading of the given file or list.

        :param f: An open file to iterate through
        :type f: file
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Little helper to print out progress numbers in proper format.

        Nothing gets printed if ``self.quiet`` is ``True``.

        :param progress_num: Progress indicator value.  Usually either a line
                             number or a percentage.
        :type progress_num: anything that can be converted to str
        :param end: The string to put at the end of the line.  "\\r" should be
                    used for every update except for the final one.
        :type end: str
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end,
                  file=sys.stderr)
            sys.stderr.flush()

    def read(self):
        """
        Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.libsvm``,
        ``.megam``, ``.ndj``, or ``.tsv`` formats.

        :returns: :class:`~skll.data.featureset.FeatureSet` representing the
                  file we read in.
        """
        # Setup logger
        logger = logging.getLogger(__name__)

        logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        # Get labels and IDs
        ids = []
        labels = []
        ex_num = 0
        with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f),
                                                      start=1):
                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_, self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter;
        # an empty file would otherwise leave ex_num undefined and divide
        # the progress percentage by zero.
        total = ex_num
        if total == 0:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(
                            100 * ((ex_num + 1) / total)))
                self._print_progress("100%")

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(feat_dict_generator())

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list,
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=self.vectorizer)
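A note on the two-pass pattern in ``read`` above: it works because
scikit-learn's ``DictVectorizer.fit_transform`` accepts any iterable of
mappings, so the feature dictionaries can be streamed from disk instead of
being held in memory all at once. A self-contained sketch of the same
pattern with toy rows (the data and helper names are illustrative, not from
the source):

from sklearn.feature_extraction import DictVectorizer

rows = [('ex1', 'a', {'f1': 1.0}),
        ('ex2', 'b', {'f1': 2.0, 'f2': 1.0})]

# First pass: collect the IDs and labels
ids = [row[0] for row in rows]
labels = [row[1] for row in rows]

# Second pass: stream the feature dicts straight into the vectorizer
def feat_dict_generator():
    for _, _, feat_dict in rows:
        yield feat_dict

vectorizer = DictVectorizer(sparse=True)
features = vectorizer.fit_transform(feat_dict_generator())
print(features.shape)  # (2, 2)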
Example no. 15
class Reader(object):
    """
    A helper class to make picklable iterators out of example
    dictionary generators.

    Parameters
    ----------
    path_or_list : str or list of dict
        Path or a list of example dictionaries.
    quiet : bool, optional
        Do not print "Loading..." status message to stderr.
        Defaults to ``True``.
    ids_to_floats : bool, optional
        Convert IDs to float to save memory. Will raise a
        ValueError if a non-numeric ID is encountered.
        Defaults to ``False``.
    label_col : str, optional
        Name of the column which contains the class labels
        for ARFF/CSV/TSV files. If no column with that name
        exists, or ``None`` is specified, the data is
        considered to be unlabelled.
        Defaults to ``'y'``.
    id_col : str, optional
        Name of the column which contains the instance IDs.
        If no column with that name exists, or ``None`` is
        specified, example IDs will be automatically generated.
        Defaults to ``'id'``.
    class_map : dict, optional
        Mapping from original class labels to new ones. This is
        mainly used for collapsing multiple labels into a single
        class. Anything not in the mapping will be kept the same.
        Defaults to ``None``.
    sparse : bool, optional
        Whether or not to store the features in a numpy CSR
        matrix when using a DictVectorizer to vectorize the
        features.
        Defaults to ``True``.
    feature_hasher : bool, optional
        Whether or not a FeatureHasher should be used to
        vectorize the features.
        Defaults to ``False``.
    num_features : int, optional
        If using a FeatureHasher, how many features should the
        resulting matrix have?  You should set this to a power
        of 2 greater than the actual number of features to
        avoid collisions.
        Defaults to ``None``.
    logger : logging.Logger, optional
        A logger instance to use to log messages instead of creating
        a new one by default.
        Defaults to ``None``.
    """

    def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
                 label_col='y', id_col='id', class_map=None, sparse=True,
                 feature_hasher=False, num_features=None,
                 logger=None):
        super(Reader, self).__init__()
        self.path_or_list = path_or_list
        self.quiet = quiet
        self.ids_to_floats = ids_to_floats
        self.label_col = label_col
        self.id_col = id_col
        self.class_map = class_map
        self._progress_msg = ''
        self._use_pandas = False

        if feature_hasher:
            self.vectorizer = FeatureHasher(n_features=num_features)
        else:
            self.vectorizer = DictVectorizer(sparse=sparse)
        self.logger = logger if logger else logging.getLogger(__name__)

    @classmethod
    def for_path(cls, path_or_list, **kwargs):
        """
        Instantiate the appropriate Reader sub-class based on the
        file extension of the given path, or a ``DictListReader``
        if the input is a list of dictionaries.

        Parameters
        ----------
        path_or_list : str or list of dicts
            A path or list of example dictionaries.
        kwargs : dict, optional
            The arguments to the Reader object being instantiated.

        Returns
        -------
        reader : skll.Reader
            A new instance of the Reader sub-class that is
            appropriate for the given path.

        Raises
        ------
        ValueError
            If file does not have a valid extension.
        """
        if not isinstance(path_or_list, string_types):
            return DictListReader(path_or_list)
        else:
            # Get lowercase extension for file extension checking
            ext = '.' + path_or_list.rsplit('.', 1)[-1].lower()
            if ext not in EXT_TO_READER:
                raise ValueError(('Example files must be in either .arff, '
                                  '.csv, .jsonlines, .megam, .ndj, or .tsv '
                                  'format. You specified: '
                                  '{}').format(path_or_list))
        return EXT_TO_READER[ext](path_or_list, **kwargs)

    def _sub_read(self, f):
        """
        Does the actual reading of the given file or list.
        For `Reader` objects that do not rely on `pandas`
        (and therefore read row-by-row), this function will
        be called by `_sub_read_rows()` and will take a file
        buffer rather than a file path. Otherwise, it will
        take a path and will be called directly in the `read()`
        method.

        Parameters
        ----------
        f : file buffer or str
            Either a file buffer, if ``_sub_read_rows()``
            is calling this method, or a path to a file,
            if it is being read with ``pandas``.

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    def _print_progress(self, progress_num, end="\r"):
        """
        Helper method to print out progress numbers in proper format.
        Nothing gets printed if ``self.quiet`` is ``True``.

        Parameters
        ----------
        progress_num
            Progress indicator value. Usually either a line
            number or a percentage. Must be able to convert to string.

        end : str, optional
            The string to put at the end of the line.  "\\r" should be
            used for every update except for the final one.
            Defaults to ``'\r'``.
        """
        # Print out status
        if not self.quiet:
            print("{}{:>15}".format(self._progress_msg, progress_num),
                  end=end, file=sys.stderr)
            sys.stderr.flush()

    def _sub_read_rows(self, path):
        """
        Read the file row-by-row. This method is used by
        `Reader` objects that do not rely on `pandas` and
        instead read the file line-by-line into a `FeatureSet`
        object, unlike pandas-based reader objects, which read
        everything into memory in a data frame before
        converting it to a `FeatureSet`.

        Parameters
        ----------
        path : str
            The path to the file.

        Returns
        -------
        ids : np.array
            The ids array.
        labels : np.array
            The labels array.
        features : generator of dicts
            A generator over the feature dictionaries.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        # Get labels and IDs
        ids = []
        labels = []
        ex_num = 0
        with open(path, 'r' if PY3 else 'rb') as f:
            for ex_num, (id_, class_, _) in enumerate(self._sub_read(f), start=1):

                # Update lists of IDs, classes, and features
                if self.ids_to_floats:
                    try:
                        id_ = float(id_)
                    except ValueError:
                        raise ValueError(('You set ids_to_floats to true,'
                                          ' but ID {} could not be '
                                          'converted to float in '
                                          '{}').format(id_,
                                                       self.path_or_list))
                ids.append(id_)
                labels.append(class_)
                if ex_num % 100 == 0:
                    self._print_progress(ex_num)
            self._print_progress(ex_num)

        # Remember total number of examples for percentage progress meter
        total = ex_num
        if total == 0:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        # Convert everything to numpy arrays
        ids = np.array(ids)
        labels = np.array(labels)

        def feat_dict_generator():
            with open(self.path_or_list, 'r' if PY3 else 'rb') as f:
                for ex_num, (_, _, feat_dict) in enumerate(self._sub_read(f)):
                    yield feat_dict
                    if ex_num % 100 == 0:
                        self._print_progress('{:.8}%'.format(
                            100 * ((ex_num + 1) / total)))
                self._print_progress("100%")

        # create a generator over the feature dictionaries
        features = feat_dict_generator()

        return ids, labels, features

    def _parse_dataframe(self, df, id_col, label_col, features=None):
        """
        Parse the data frame into ids, labels, and features.
        For `Reader` objects that rely on `pandas`, this function
        will be called in the `_sub_read()` method to parse the
        data frame into the expected format. It will not be used
        by `Reader` classes that read row-by-row (and therefore
        use the `_sub_read_rows()` function).

        Parameters
        ----------
        df : pd.DataFrame
            The pandas data frame to parse.
        id_col : str or None
            The id column.
        label_col : str or None
            The label column.
        features : list of dict or None
            The features, if they already exist;
            if not, then they will be extracted
            from the data frame.
            Defaults to None.

        Returns
        -------
        ids : np.array
            The ids for the feature set.
        labels : np.array
            The labels for the feature set.
        features : list of dicts
            The features for the feature set.

        Raises
        ------
        ValueError
            If the data frame is empty.
        """
        if df.empty:
            raise ValueError("No features found in possibly "
                             "empty file '{}'.".format(self.path_or_list))

        # if the id column exists,
        # get them from the data frame and
        # delete the column; otherwise, just
        # set it to None
        if id_col is not None and id_col in df:
            ids = df[id_col]
            del df[id_col]
            # if `ids_to_floats` is True,
            # then convert the ids to floats
            if self.ids_to_floats:
                ids = ids.astype(float)
            ids = ids.values
        else:
            # create ids with the prefix `EXAMPLE_`
            ids = np.array(['EXAMPLE_{}'.format(i) for i in range(df.shape[0])])

        # if the label column exists,
        # get them from the data frame and
        # delete the column; otherwise, just
        # set it to None
        if label_col is not None and label_col in df:
            labels = df[label_col]
            del df[label_col]
            # if `class_map` exists, then
            # map the new classes to the labels;
            # otherwise, just convert them to floats
            if self.class_map is not None:
                labels = labels.apply(safe_float,
                                      replace_dict=self.class_map)
            else:
                labels = labels.apply(safe_float)
            labels = labels.values
        else:
            # create an array of Nones
            labels = np.array([None] * df.shape[0])

        # convert the remaining features to
        # a list of dictionaries, if no
        # features argument was passed
        if features is None:
            features = df.to_dict(orient='records')

        return ids, labels, features

    def read(self):
        """
        Loads examples in the `.arff`, `.csv`, `.jsonlines`, `.libsvm`,
        `.megam`, `.ndj`, or `.tsv` formats.

        Returns
        -------
        feature_set : skll.FeatureSet
            ``FeatureSet`` instance representing the input file.

        Raises
        ------
        ValueError
            If ``ids_to_floats`` is True, but IDs cannot be converted.
        ValueError
            If no features are found.
        ValueError
            If the example IDs are not unique.
        """
        self.logger.debug('Path: %s', self.path_or_list)

        if not self.quiet:
            self._progress_msg = "Loading {}...".format(self.path_or_list)
            print(self._progress_msg, end="\r", file=sys.stderr)
            sys.stderr.flush()

        if self._use_pandas:
            ids, labels, features = self._sub_read(self.path_or_list)
        else:
            ids, labels, features = self._sub_read_rows(self.path_or_list)

        # Convert everything to numpy arrays
        features = self.vectorizer.fit_transform(features)

        # Report that loading is complete
        self._print_progress("done", end="\n")

        # Make sure we have the same number of ids, labels, and features
        assert ids.shape[0] == labels.shape[0] == features.shape[0]

        if ids.shape[0] != len(set(ids)):
            raise ValueError('The example IDs are not unique in %s.' %
                             self.path_or_list)

        return FeatureSet(self.path_or_list, ids, labels=labels,
                          features=features, vectorizer=self.vectorizer)
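A note on ``_parse_dataframe`` above: the frame handling boils down to
popping the ID and label columns and converting the remaining columns with
``DataFrame.to_dict(orient='records')``. A minimal pandas sketch with toy
data (the column names are assumptions for illustration):

import pandas as pd

df = pd.DataFrame({'id': ['ex1', 'ex2'],
                   'y': ['a', 'b'],
                   'f1': [1.0, 2.0]})

ids = df.pop('id').values      # array(['ex1', 'ex2'], dtype=object)
labels = df.pop('y').values    # array(['a', 'b'], dtype=object)
features = df.to_dict(orient='records')
print(features)                # [{'f1': 1.0}, {'f1': 2.0}]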
Example no. 16
class FeatureSet(object):

    """
    Encapsulation of all of the features, values, and metadata about a given
    set of data. This replaces `ExamplesTuple` from older versions of SKLL.

    Parameters
    ----------
    name : str
        The name of this feature set.
    ids : np.array
        Example IDs for this set.
    labels : np.array, optional
        Labels for this set.
        Defaults to ``None``.
    features : list of dict or array-like, optional
        The features for each instance represented as either a
        list of dictionaries or an array-like (if `vectorizer` is
        also specified).
        Defaults to ``None``.
    vectorizer : DictVectorizer or FeatureHasher, optional
        Vectorizer which will be used to generate the feature matrix.
        Defaults to ``None``.

    Warnings
    --------
    FeatureSets can only be equal if the order of the instances is
    identical because these are stored as lists/arrays. Since scikit-learn's
    `DictVectorizer` automatically sorts the underlying feature matrix
    if it is sparse, we do not do any sorting before checking for equality.
    This is not a problem because we _always_ use sparse matrices with
    `DictVectorizer` when creating FeatureSets.

    Notes
    -----
    If ids, labels, and/or features are not None, the number of rows in
    each array must be equal.
    """

    def __init__(self, name, ids, labels=None, features=None,
                 vectorizer=None):
        super(FeatureSet, self).__init__()
        self.name = name
        if isinstance(ids, list):
            ids = np.array(ids)
        self.ids = ids
        if isinstance(labels, list):
            labels = np.array(labels)
        self.labels = labels
        self.features = features
        self.vectorizer = vectorizer
        # Convert list of dicts to numpy array
        if isinstance(self.features, list):
            if self.vectorizer is None:
                self.vectorizer = NewDictVectorizer(sparse=True)
            self.features = self.vectorizer.fit_transform(self.features)
        if self.features is not None:
            num_feats = self.features.shape[0]
            if self.ids is None:
                raise ValueError('A list of IDs is required')
            num_ids = self.ids.shape[0]
            if num_feats != num_ids:
                raise ValueError(('Number of IDs (%s) does not equal '
                                  'number of feature rows (%s)') % (num_ids,
                                                                    num_feats))
            if self.labels is None:
                self.labels = np.empty(num_feats)
                self.labels.fill(None)
            num_labels = self.labels.shape[0]
            if num_feats != num_labels:
                raise ValueError(('Number of labels (%s) does not equal '
                                  'number of feature rows (%s)') % (num_labels,
                                                                    num_feats))

    def __contains__(self, value):
        """
        Check if example ID is in the FeatureSet.

        Parameters
        ----------
        value
            The value to check.
        """
        return value in self.ids

    def __eq__(self, other):
        """
        Check whether two featuresets are the same.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to check equivalence with.

        Note
        ----
        We consider feature values to be equal if any differences are in the
        sixth decimal place or higher.
        """

        return (self.ids.shape == other.ids.shape and
                self.labels.shape == other.labels.shape and
                self.features.shape == other.features.shape and
                (self.ids == other.ids).all() and
                (self.labels == other.labels).all() and
                np.allclose(self.features.data, other.features.data,
                            rtol=1e-6) and
                (self.features.indices == other.features.indices).all() and
                (self.features.indptr == other.features.indptr).all() and
                self.vectorizer == other.vectorizer)

    def __iter__(self):
        """
        Iterate through (ID, label, feature_dict) tuples in feature set.
        """
        if self.features is not None:
            if not isinstance(self.vectorizer, DictVectorizer):
                raise ValueError('FeatureSets can only be iterated through if '
                                 'they use a DictVectorizer for their feature '
                                 'vectorizer.')
            for id_, label_, feats in zip(self.ids, self.labels, self.features):

                # reshape to a 2D matrix if we are not using a sparse matrix
                # to store the features
                feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats

                # When calling inverse_transform we have to add [0] to get the
                # results for the current instance because it always returns a
                # 2D array
                yield (id_, label_, self.vectorizer.inverse_transform(feats)[0])
        else:
            return

    def __len__(self):
        """
        The number of rows in the ``FeatureSet`` instance.
        """
        return self.features.shape[0]

    def __add__(self, other):
        """
        Combine two feature sets to create a new one.  This is done assuming
        they both have the same instances with the same IDs in the same order.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` to add to this one.

        Raises
        ------
        ValueError
            If the two ``FeatureSet`` instances do not contain the same IDs.
        ValueError
            If vectorizers are different between the two ``FeatureSet`` instances.
        ValueError
            If there are duplicate feature names.
        ValueError
            If there are conflicting labels.
        """

        # Check that the sets of IDs are equal
        if set(self.ids) != set(other.ids):
            raise ValueError('FeatureSets do not contain the same IDs and '
                             'therefore cannot be combined.')
        # Compute the relative ordering of IDs for merging the features
        # and labels.
        ids_indices = dict((y, x) for x, y in enumerate(other.ids))
        relative_order = [ids_indices[self_id] for self_id in self.ids]

        # Initialize the new feature set with a name and the IDs.
        new_set = FeatureSet('+'.join(sorted([self.name, other.name])),
                             deepcopy(self.ids))

        # Combine feature matrices and vectorizers.
        if not isinstance(self.vectorizer, type(other.vectorizer)):
            raise ValueError('Cannot combine FeatureSets because they are '
                             'not both using the same type of feature '
                             'vectorizer (e.g., DictVectorizer, '
                             'FeatureHasher)')
        uses_feature_hasher = isinstance(self.vectorizer, FeatureHasher)
        if uses_feature_hasher:
            if (self.vectorizer.n_features !=
                    other.vectorizer.n_features):
                raise ValueError('Cannot combine FeatureSets that use '
                                 'FeatureHashers with different values of '
                                 'the n_features setting.')
        else:
            # Check for duplicate feature names.
            if (set(self.vectorizer.feature_names_) &
                    set(other.vectorizer.feature_names_)):
                raise ValueError('Cannot combine FeatureSets because they '
                                 'have duplicate feature names.')
        num_feats = self.features.shape[1]

        new_set.features = sp.hstack([self.features,
                                      other.features[relative_order]],
                                     'csr')
        new_set.vectorizer = deepcopy(self.vectorizer)
        if not uses_feature_hasher:
            for feat_name, index in other.vectorizer.vocabulary_.items():
                new_set.vectorizer.vocabulary_[feat_name] = (index +
                                                             num_feats)
            other_names = other.vectorizer.feature_names_
            new_set.vectorizer.feature_names_.extend(other_names)

        # If either set has labels, check that they don't conflict.
        if self.has_labels:
            # labels should be the same for each FeatureSet, so store once.
            if other.has_labels and \
                    not np.all(self.labels == other.labels[relative_order]):
                raise ValueError('Feature sets have conflicting labels for '
                                 'examples with the same ID.')
            new_set.labels = deepcopy(self.labels)
        else:
            new_set.labels = deepcopy(other.labels[relative_order])

        return new_set
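    # Illustrative trace (not part of the original source) of the
    # relative-order computation in __add__ above:
    #   self.ids = ['a', 'b'] and other.ids = ['b', 'a'] give
    #   ids_indices = {'b': 0, 'a': 1} and relative_order = [1, 0],
    #   so other.features[[1, 0]] realigns other's rows to self's ID order.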

    def filter(self, ids=None, labels=None, features=None, inverse=False):
        """
        Removes or keeps features and/or examples from the ``FeatureSet``
        depending on the parameters. Filtering is done in-place.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the FeatureSet. If `None`, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If `None`,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the FeatureSet. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the FeatureSet that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in `features`.
            If `None`, no feature filtering takes place.
            Cannot be used if FeatureSet uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Raises
        ------
        ValueError
            If attempting to use features to filter a ``FeatureSet`` that
            uses a ``FeatureHasher`` vectorizer.
        """
        # Construct mask that indicates which examples to keep
        mask = np.ones(len(self), dtype=bool)
        if ids is not None:
            mask = np.logical_and(mask, np.in1d(self.ids, ids))
        if labels is not None:
            mask = np.logical_and(mask, np.in1d(self.labels, labels))

        if inverse and (labels is not None or ids is not None):
            mask = np.logical_not(mask)

        # Remove examples not in mask
        self.ids = self.ids[mask]
        self.labels = self.labels[mask]
        self.features = self.features[mask, :]

        # Filter features
        if features is not None:
            if isinstance(self.vectorizer, FeatureHasher):
                raise ValueError('FeatureSets with FeatureHasher vectorizers'
                                 ' cannot be filtered by feature.')
            columns = np.array(sorted({feat_num for feat_name, feat_num in
                                       iteritems(self.vectorizer.vocabulary_)
                                       if (feat_name in features or
                                           feat_name.split('=', 1)[0] in
                                           features)}))
            if inverse:
                all_columns = np.arange(self.features.shape[1])
                columns = all_columns[np.logical_not(np.in1d(all_columns,
                                                             columns))]
            self.features = self.features[:, columns]
            self.vectorizer.restrict(columns, indices=True)
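    # Illustrative trace (not part of the original source) of the column
    # restriction above: with vocabulary_
    # {'color=blue': 0, 'color=red': 1, 'size': 2} and features=['color'],
    # the prefix check keeps columns [0, 1], and
    # restrict(columns, indices=True) then drops 'size' from both
    # vocabulary_ and feature_names_.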

    def filtered_iter(self, ids=None, labels=None, features=None,
                      inverse=False):
        """
        A version of `__iter__` that retains only the specified features
        and/or examples from the output.

        Parameters
        ----------
        ids : list of str/float, optional
            Examples to keep in the ``FeatureSet``. If ``None``, no ID
            filtering takes place.
            Defaults to ``None``.
        labels : list of str/float, optional
            Labels that we want to retain examples for. If ``None``,
            no label filtering takes place.
            Defaults to ``None``.
        features : list of str, optional
            Features to keep in the ``FeatureSet``. To help with
            filtering string-valued features that were converted
            to sequences of boolean features when read in, any
            features in the ``FeatureSet`` that contain a `=` will be
            split on the first occurrence and the prefix will be
            checked to see if it is in ``features``.
            If `None`, no feature filtering takes place.
            Cannot be used if ``FeatureSet`` uses a FeatureHasher for
            vectorization.
            Defaults to ``None``.
        inverse : bool, optional
            Instead of keeping features and/or examples in lists,
            remove them.
            Defaults to ``False``.

        Yields
        ------
        id_ : str
            The ID of the example.
        label_ : str
            The label of the example.
        feat_dict : dict
            The feature dictionary, with feature name as the key
            and example value as the value.

        Raises
        ------
        ValueError
            If the vectorizer is not a `DictVectorizer`.
        """
        if self.features is not None and not isinstance(self.vectorizer,
                                                        DictVectorizer):
            raise ValueError('FeatureSets can only be iterated through if they'
                             ' use a DictVectorizer for their feature '
                             'vectorizer.')

        for id_, label_, feats in zip(self.ids, self.labels, self.features):
            # Skip instances with IDs not in filter
            if ids is not None and (id_ in ids) == inverse:
                continue
            # Skip instances with labels not in filter
            if labels is not None and (label_ in labels) == inverse:
                continue

            # reshape to a 2D matrix if we are not using a sparse matrix
            # to store the features
            feats = feats.reshape(1, -1) if not sp.issparse(feats) else feats
            feat_dict = self.vectorizer.inverse_transform(feats)[0]
            if features is not None:
                feat_dict = {name: value for name, value in
                             iteritems(feat_dict) if
                             (inverse != (name in features or
                                          name.split('=', 1)[0] in features))}
            elif not inverse:
                feat_dict = {}
            yield id_, label_, feat_dict
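    # Illustrative truth table (not part of the original source) for the
    # XOR-style keep test `inverse != (name in features or ...)` above:
    #   inverse=False, name matches     -> keep
    #   inverse=False, name not matched -> drop
    #   inverse=True,  name matches     -> drop
    #   inverse=True,  name not matched -> keep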

    def __sub__(self, other):
        """
        Subset ``FeatureSet`` instance by removing all the features from the
        other ``FeatureSet`` instance.

        Parameters
        ----------
        other : skll.FeatureSet
            The other ``FeatureSet`` containing the features that should
            be removed from this ``FeatureSet``.

        Returns
        -------
        A copy of `self` with all features in `other` removed.
        """
        new_set = deepcopy(self)
        new_set.filter(features=other.vectorizer.feature_names_,
                       inverse=True)
        return new_set

    @property
    def has_labels(self):
        """
        Check if ``FeatureSet`` has finite labels.

        Returns
        -------
        has_labels : bool
            Whether or not this FeatureSet has any finite labels.
        """
        # make sure that labels is not None or a list of Nones
        if self.labels is not None and not all(label is None for label in self.labels):
            # then check that they are not a list of NaNs
            return not (np.issubdtype(self.labels.dtype, np.floating) and
                        np.isnan(np.min(self.labels)))
        else:
            return False

    def __str__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return str(self.__dict__)

    def __repr__(self):
        """
        Returns
        -------
        A string representation of ``FeatureSet``.
        """
        return repr(self.__dict__)

    def __getitem__(self, value):
        """
        Parameters
        ----------
        value
            The value to retrieve.

        Returns
        -------
        A specific example by row number or, if given a slice,
        a new ``FeatureSet`` instance containing a subset of the data.
        """
        # Check if we're slicing
        if isinstance(value, slice):
            sliced_ids = self.ids[value]
            sliced_feats = (self.features[value] if self.features is not None
                            else None)
            sliced_labels = (self.labels[value] if self.labels is not None
                             else None)
            return FeatureSet('{}_{}'.format(self.name, value), sliced_ids,
                              features=sliced_feats, labels=sliced_labels,
                              vectorizer=self.vectorizer)
        else:
            label = self.labels[value] if self.labels is not None else None
            feats = self.features[value, :]
            features = (self.vectorizer.inverse_transform(feats)[0] if
                        self.features is not None else {})
            return self.ids[value], label, features

    @staticmethod
    def split_by_ids(fs, ids_for_split1, ids_for_split2=None):
        """
        Split the ``FeatureSet`` into two new ``FeatureSet`` instances based on
        the given IDs for the two splits.

        Parameters
        ----------
        fs : skll.FeatureSet
            The ``FeatureSet`` instance to split.
        ids_for_split1 : list of int
            A list of example IDs which will be split out into
            the first ``FeatureSet`` instance. Note that the
            FeatureSet instance will respect the order of the
            specified IDs.
        ids_for_split2 : list of int, optional
            An optional list of example IDs which will be
            split out into the second ``FeatureSet`` instance.
            Note that the ``FeatureSet`` instance will respect
            the order of the specified IDs. If this is
            not specified, then the second ``FeatureSet``
            instance will contain the complement of the
            first set of IDs sorted in ascending order.
            Defaults to ``None``.

        Returns
        -------
        fs1 : skll.FeatureSet
            The first ``FeatureSet``.
        fs2 : skll.FeatureSet
            The second ``FeatureSet``.
        """

        # Note: an alternative way to implement this is to make copies
        # of the given FeatureSet instance and then use the `filter()`
        # method but that wastes too much memory since it requires making
        # two copies of the original FeatureSet which may be huge. With
        # the current implementation, we are creating new objects but
        # they should be much smaller than the original FeatureSet.
        ids1 = fs.ids[ids_for_split1]
        labels1 = fs.labels[ids_for_split1]
        features1 = fs.features[ids_for_split1]
        if ids_for_split2 is None:
            ids2 = fs.ids[~np.in1d(fs.ids, ids_for_split1)]
            labels2 = fs.labels[~np.in1d(fs.ids, ids_for_split1)]
            features2 = fs.features[~np.in1d(fs.ids, ids_for_split1)]
        else:
            ids2 = fs.ids[ids_for_split2]
            labels2 = fs.labels[ids_for_split2]
            features2 = fs.features[ids_for_split2]

        fs1 = FeatureSet('{}_1'.format(fs.name),
                         ids1,
                         labels=labels1,
                         features=features1,
                         vectorizer=fs.vectorizer)
        fs2 = FeatureSet('{}_2'.format(fs.name),
                         ids2,
                         labels=labels2,
                         features=features2,
                         vectorizer=fs.vectorizer)
        return fs1, fs2

    @staticmethod
    def from_data_frame(df, name, labels_column=None, vectorizer=None):
        """
        Helper function to create a ``FeatureSet`` instance from a `pandas.DataFrame`.
        Will raise an Exception if pandas is not installed in your environment.
        The ``ids`` in the ``FeatureSet`` will be the index from the given frame.

        Parameters
        ----------
        df : pd.DataFrame
            The pandas.DataFrame object to use as a ``FeatureSet``.
        name : str
            The name of the output ``FeatureSet`` instance.
        labels_column : str, optional
            The name of the column containing the labels (data to predict).
            Defaults to ``None``.
        vectorizer : DictVectorizer or FeatureHasher, optional
            Vectorizer which will be used to generate the feature matrix.
            Defaults to ``None``.

        Returns
        -------
        feature_set : skll.FeatureSet
            A ``FeatureSet`` instance generated from the given data frame.
        """
        if labels_column:
            feature_columns = [column for column in df.columns if column != labels_column]
            labels = df[labels_column].tolist()
        else:
            feature_columns = df.columns
            labels = None

        features = df[feature_columns].to_dict(orient='records')
        return FeatureSet(name,
                          ids=df.index.tolist(),
                          labels=labels,
                          features=features,
                          vectorizer=vectorizer)
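A short usage sketch for ``from_data_frame`` above; the data frame and its
values are toy assumptions, and the ``FeatureSet`` class defined above is
assumed to be in scope:

import pandas as pd

df = pd.DataFrame({'f1': [1.0, 2.0], 'f2': [0.0, 1.0], 'y': ['a', 'b']},
                  index=['ex1', 'ex2'])

# The frame's index becomes the example IDs; 'y' becomes the labels.
fs = FeatureSet.from_data_frame(df, 'toy', labels_column='y')
print(fs.ids)   # ['ex1' 'ex2']
print(len(fs))  # 2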