# --- Example 1 ---
def load_svmlight_file(f, n_features=None, dtype=np.float64,
                       multilabel=False):
    """Load a dataset in svmlight / libsvm format into a sparse CSR matrix.

    The svmlight format is text-based with one sample per line; zero-valued
    features are omitted, so it suits sparse data. The first element of each
    line may hold a target variable. It is the default format of both the
    svmlight and libsvm command line tools.

    Parsing text is expensive: when loading the same dataset repeatedly,
    wrap this loader with joblib.Memory.cache so subsequent calls read a
    memmapped CSR backup almost instantaneously.

    This implementation is naive (allocates too much memory, pure Python);
    for large datasets prefer an optimized loader such as:

      https://github.com/mblondel/svmlight-loader

    Parameters
    ----------
    f: str or file-like
        (Path to) a file to load.

    n_features: int or None
        The number of features to use. If None, it will be inferred. Useful
        when loading several slices of a bigger dataset: a slice may lack
        examples of some features, so the inferred width could differ from
        one slice to another.

    multilabel: boolean, optional
        Samples may have several labels each (see
        http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multilabel.html)

    Returns
    -------
    (X, y)

    where X is a scipy.sparse matrix of shape (n_samples, n_features),
          y is a ndarray of shape (n_samples,), or, in the multilabel case,
          a list of tuples of length n_samples.
    """
    # An object exposing read() is treated as an already-open stream.
    if hasattr(f, "read"):
        return _load_svmlight_file(f, n_features, dtype, multilabel)
    # Otherwise interpret it as a path and manage the file handle here.
    with open(f) as stream:
        return _load_svmlight_file(stream, n_features, dtype, multilabel)
# --- Example 2 ---
def load_svmlight_file(file_path, buffer_mb=40, query_id=False):
    """Load a dataset in svmlight / libsvm format into a dense matrix.

    The svmlight format is text-based with one sample per line; zero-valued
    features are omitted. The first element of each line may hold a target
    variable. It is the default format of both the svmlight and libsvm
    command line tools.

    Parsing text is expensive: when loading the same dataset repeatedly,
    wrap this loader with joblib.Memory.cache so subsequent calls read a
    memmapped backup almost instantaneously.

    Parameters
    ----------
    file_path: str
        Path to a file to load.
    buffer_mb : integer
        Buffer size to use for low level read
    query_id : bool
        True if the query ids has to be loaded, false otherwise

    Returns
    -------
    (X, y, [query_ids])

    where X is a dense numpy matrix of shape (n_samples, n_features) and
          type np.float32,
          y is a ndarray of shape (n_samples,).
          query_ids is a ndarray of shape (n_samples,) if query_id is True,
          it is not returned otherwise

    Raises
    ------
    ValueError
        If the file contains no samples, or if the flat data array cannot
        be evenly reshaped into (n_samples, n_features).
    """
    data, labels, qids = _load_svmlight_file(file_path, buffer_mb)

    # Reshape the flat value array into an (n_samples, n_features) matrix.
    # divmod keeps the division integral (plain "/" yields a float under
    # Python 3, which would make the shape assignment raise TypeError).
    n_samples = len(labels)
    if n_samples == 0:
        raise ValueError("No samples found in %r" % (file_path,))
    n_features, remainder = divmod(len(data), n_samples)
    if remainder:
        raise ValueError(
            "Data length %d is not a multiple of the number of samples %d"
            % (len(data), n_samples))
    data.shape = (n_samples, n_features)

    # Normalize dtype to float32; drop the old buffer before binding the
    # copy so peak memory stays low on large datasets.
    if data.dtype != np.float32:
        new_data = data.astype(dtype=np.float32)
        del data
        data = new_data

    # Convert infinite values to max_float representation (SVM reader problem)
    # This patch is needed because some dataset have infinite values and because
    # the split condition is <=, while sole software uses <. In order to
    # reconduct the former condition to the latter, we slightly decrease the
    # split. However, slightly decreasing inf does not have any effect.
    data[data == np.inf] = np.finfo(data.dtype).max

    if not query_id:
        return data, labels
    else:
        return data, labels, qids
# --- Example 3 ---
def load_svmlight_file(file_path, other_file_path=None,
                         n_features=None, buffer_mb=40):
    """Load datasets in svmlight / libsvm format as scipy sparse CSR matrices.

    Parameters
    ----------
    file_path: str
        Path to a file to load.

    other_file_path: str or None
        Path to another file to load. scikit-learn will make sure that the
        number of features in the returned matrix is the same as for
        file_path.

    n_features: int or None
        The number of features to use. If None, it will be inferred.

    buffer_mb: int (default: 40)
        The size of the buffer used while loading the dataset in mega-bytes.

    Returns
    -------
        (X, y)

        where X is a scipy.sparse matrix of shape (n_samples, n_features),
              y is a ndarray of shape (n_samples,),

        or, if other_file_path is not None,

        (X1, y1, X2, y2)

        where X1 and X2 are scipy.sparse matrices of shape
                            (n_samples1, n_features) and
                            (n_samples2, n_features),
              y1 and y2 are ndarrays of shape (n_samples1,) and (n_samples2,).

    Note
    ----
    A model fit on X_train can only be evaluated on an X_test with the same
    number of columns (X_train.shape[1] == X_test.shape[1]). Loading the two
    files separately does not guarantee this; pass both files in one call,
    load_svmlight_format(train_file, test_file), or pin the width explicitly
    with load_svmlight_format(test_file, n_features=X_train.shape[1]).
    """
    values, col_indices, row_ptr, targets = _load_svmlight_file(
        file_path, buffer_mb)

    # With an explicit width the shape is fixed up front; otherwise scipy
    # infers it from the column indices.
    if n_features is None:
        first_shape = None
    else:
        first_shape = (row_ptr.shape[0] - 1, n_features)

    X_first = sp.csr_matrix((values, col_indices, row_ptr), first_shape)
    result = [X_first, targets]

    if other_file_path is not None:
        values, col_indices, row_ptr, targets = _load_svmlight_file(
            other_file_path, buffer_mb)

        # The second matrix must match the first one's width exactly.
        width = X_first.shape[1] if n_features is None else n_features
        second_shape = (row_ptr.shape[0] - 1, width)

        result += [sp.csr_matrix((values, col_indices, row_ptr),
                                 second_shape),
                   targets]

    return tuple(result)
# --- Example 4 ---
def load_svmlight_file(file_path,
                       other_file_path=None,
                       n_features=None,
                       buffer_mb=40):
    """Load datasets in svmlight / libsvm format as scipy sparse CSR matrices.

    Parameters
    ----------
    file_path: str
        Path to a file to load.

    other_file_path: str or None
        Path to another file to load. scikit-learn will make sure that the
        number of features in the returned matrix is the same as for
        file_path.

    n_features: int or None
        The number of features to use. If None, it will be inferred.

    buffer_mb: int (default: 40)
        The size of the buffer used while loading the dataset in mega-bytes.

    Returns
    -------
        (X, y)

        where X is a scipy.sparse matrix of shape (n_samples, n_features),
              y is a ndarray of shape (n_samples,),

        or, if other_file_path is not None,

        (X1, y1, X2, y2)

        where X1 and X2 are scipy.sparse matrices of shape
                            (n_samples1, n_features) and
                            (n_samples2, n_features),
              y1 and y2 are ndarrays of shape (n_samples1,) and (n_samples2,).

    Note
    ----
    Train and test matrices must agree on their number of columns
    (X_train.shape[1] == X_test.shape[1]); loading the files in two separate
    calls does not guarantee that. Either load both at once via
    load_svmlight_format(train_file, test_file) or force the width with
    load_svmlight_format(test_file, n_features=X_train.shape[1]).
    """
    def _parse(path, width):
        # Parse one file into a CSR matrix plus its label vector. width=None
        # lets scipy infer the number of columns from the data itself.
        vals, cols, ptr, y = _load_svmlight_file(path, buffer_mb)
        shape = None if width is None else (ptr.shape[0] - 1, width)
        return sp.csr_matrix((vals, cols, ptr), shape), y

    X_main, y_main = _parse(file_path, n_features)
    out = [X_main, y_main]

    if other_file_path is not None:
        # Pin the second matrix to the first one's column count.
        X_other, y_other = _parse(
            other_file_path,
            X_main.shape[1] if n_features is None else n_features)
        out.extend([X_other, y_other])

    return tuple(out)