def __init__(self,
                 cache_dir='/tmp/',
                 parent_id=None,
                 mid=None,
                 load_model=False,
                 mode='r'):
        if parent_id is None and mid is None:
            raise WrongParameter('At least one of parent_id or mid '
                                 'should be provided!')

        # mode only affects LSI; validate it before it is used below
        if mode not in ['r', 'w', 'fw']:
            raise WrongParameter(
                'mode={} must be one of "r", "w", "fw"'.format(mode))
        self.mode = mode

        if self._wrapper_type == 'lsi' and self.mode in ['w', 'fw']:
            # lsi supports explicitly providing mid at creation
            if parent_id is None:
                raise WrongParameter('parent_id must be provided '
                                     'for model creation!')
            else:
                validate_mid(parent_id)
                self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
                if mid is not None:
                    validate_mid(mid)
                self.mid = mid
        else:
            if parent_id is None and mid is not None:
                validate_mid(mid)
                self.pipeline = PipelineFinder.by_id(mid, cache_dir).parent
                self.mid = mid
            elif parent_id is not None:
                validate_mid(parent_id)
                self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
                self.mid = None

        # this is an alias that should be deprecated
        self.fe = FeatureVectorizer(cache_dir=cache_dir,
                                    dsid=self.pipeline['vectorizer'])

        self.model_dir = self.pipeline.get_path() / self._wrapper_type

        if self._wrapper_type == 'search':
            # no data need to be stored on disk
            return

        if not self.model_dir.exists():
            self.model_dir.mkdir()

        if self.mid is not None and self.mode == 'r':
            self._pars = self._load_pars()
        else:
            self._pars = None

        if load_model:
            if self.mid is not None and self.mode == 'r':
                self.cmod = self._load_model()
            else:
                self.cmod = None
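
    # Usage sketch (not from the source): a concrete subclass such as
    # _LSIWrapper below supplies _wrapper_type; the cache_dir and the ids
    # here are hypothetical placeholders.
    #
    #   lsi = _LSIWrapper(cache_dir='/tmp/', parent_id=vect_dsid, mode='w')  # create
    #   lsi = _LSIWrapper(cache_dir='/tmp/', mid=existing_mid, mode='r')     # re-open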
    def fit(self, method='simhash'):
        """
        Precompute all the required values for duplicate detection
        """

        if method not in ['simhash', 'i-match']:
            raise WrongParameter('Dup. detection method {} '
                                 'not implemented!'.format(method))
        pars = {'method': method}
        if method == 'simhash':
            from freediscovery.near_duplicates import SimhashNearDuplicates
            self.model = shash = SimhashNearDuplicates()
        else:
            self.model = None
        self._pars = pars
        mid, mid_dir = setup_model(self.model_dir)

        self.mid = mid

        X = self.pipeline.data
        if method == 'simhash':
            shash.fit(X)

        self._fit_X = X

        joblib.dump(self.model, str(self.model_dir / mid / 'model'))
        joblib.dump(pars, str(self.model_dir / mid / 'pars'))
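
    # A minimal sketch of the simhash branch above (assumption:
    # SimhashNearDuplicates accepts a scipy sparse feature matrix, as in
    # fit() where X comes from self.pipeline.data):
    #
    #   from sklearn.feature_extraction.text import TfidfVectorizer
    #   from freediscovery.near_duplicates import SimhashNearDuplicates
    #
    #   docs = ['apples and oranges', 'apples and oranges!', 'something else']
    #   X = TfidfVectorizer().fit_transform(docs)  # sparse document-term matrix
    #   shash = SimhashNearDuplicates()
    #   shash.fit(X)  # precompute simhash values for near-duplicate lookup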
    def __init__(self, cache_dir='/tmp/', dsid=None, verbose=False, mode='r'):
        self.verbose = verbose

        self._filenames = None
        self._vect = None
        self._tfidf = None
        self._db = None
        self._pars = None

        self.cache_dir = cache_dir = PipelineFinder._normalize_cachedir(cache_dir)
        if not cache_dir.exists():
            cache_dir.mkdir()
        self.dsid = dsid
        if mode not in ['r', 'w', 'fw']:
            raise WrongParameter('mode={} must be one of "r", "w", "fw"'
                                 .format(mode))
        self.mode = mode
        if dsid is not None:
            validate_mid(dsid)
            dsid_dir = self.cache_dir / dsid
            if mode == 'r':
                if not dsid_dir.exists():
                    raise DatasetNotFound('Dataset '
                                          '{} ({}) not found in {}!'.format(
                                           dsid, type(self).__name__, cache_dir))
            else:
                if dsid_dir.exists():
                    if mode == 'w':
                        raise WrongParameter(('dataset identified by dsid={} '
                                              'already exists. Use mode="fw" '
                                              'to overwrite.')
                                             .format(dsid))
                    elif mode == 'fw':
                        shutil.rmtree(dsid_dir)
        else:
            dsid_dir = None
        self.dsid_dir = dsid_dir
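
    # Sketch of the mode semantics above (assuming this is
    # FeatureVectorizer.__init__, per the `self.fe` alias earlier):
    #
    #   FeatureVectorizer(cache_dir, dsid=dsid, mode='r')   # must already exist
    #   FeatureVectorizer(cache_dir, dsid=dsid, mode='w')   # fails if it exists
    #   FeatureVectorizer(cache_dir, dsid=dsid, mode='fw')  # overwrites if it exists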
Example #4
    def from_folder(cls,
                    data_dir,
                    file_pattern=None,
                    dir_pattern=None,
                    internal_id_offset=0,
                    document_id_generator='indexed_file_path'):
        """ Create a DocumentIndex from files in data_dir

        Parameters
        ----------
        data_dir : str
            path to the data directory
        file_pattern : str, optional
            regular expression used to filter file names
        dir_pattern : str, optional
            regular expression used to filter directory names
        internal_id_offset : int, default=0
            offset added to the generated ``internal_id``
        document_id_generator : str, default='indexed_file_path'
            one of 'indexed_file_path', 'infer_file_path'

        Returns
        -------
        result : DocumentIndex
            a DocumentIndex object
        """

        data_dir = os.path.abspath(os.path.normpath(data_dir))

        if not os.path.exists(data_dir):
            raise NotFound('data_dir={} does not exist'.format(data_dir))

        if document_id_generator not in [
                'indexed_file_path', 'infer_file_path'
        ]:
            raise WrongParameter(
                ("document_id_generator={} not supported. It must be "
                 "one of ['indexed_file_path', 'infer_file_path']"
                 ).format(document_id_generator))

        filenames = _list_filenames(data_dir, dir_pattern, file_pattern)

        db = [{
            'file_path': file_path,
            'internal_id': idx + internal_id_offset
        } for idx, file_path in enumerate(filenames)]

        db = pd.DataFrame(db)

        _generate_document_id(db, document_id_generator)

        res = cls(data_dir, db)
        res.filenames_ = filenames
        return res
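
    # A hedged usage sketch (the module path freediscovery.ingestion is an
    # assumption; the corpus directory is made up):
    #
    #   from freediscovery.ingestion import DocumentIndex
    #
    #   index = DocumentIndex.from_folder('/path/to/corpus',
    #                                     file_pattern=r'.*\.txt$')
    #   print(index.filenames_[:5])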
Example #5
    def __init__(self,
                 cache_dir='/tmp/',
                 parent_id=None,
                 mid=None,
                 verbose=False,
                 random_state=None,
                 mode='r'):

        if mode not in ['r', 'w', 'fw']:
            raise WrongParameter(
                'mode={} must be one of "r", "w", "fw"'.format(mode))
        self.mode = mode

        super(_LSIWrapper, self).__init__(cache_dir=cache_dir,
                                          parent_id=parent_id,
                                          mid=mid,
                                          mode=mode)
        self.random_state = random_state
Example #6
    def _build_estimator(Y_train, method, cv, cv_scoring, cv_n_folds, random_state=None, **options):
        if cv:
            #from sklearn.cross_validation import StratifiedKFold
            #cv_obj = StratifiedKFold(n_splits=cv_n_folds, shuffle=False)
            cv_obj = cv_n_folds  # temporary hack (due to pickling issues otherwise; this needs to be fixed)
        else:
            cv_obj = None

        _rename_main_thread()

        if method == 'LinearSVC':
            from sklearn.svm import LinearSVC
            if cv is None:
                cmod = LinearSVC(random_state=random_state, **options)
            else:
                try:
                    from freediscovery_extra import make_linearsvc_cv_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_linearsvc_cv_model(cv_obj, cv_scoring, **options)
        elif method == 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            if cv is None:
                cmod = LogisticRegression(random_state=random_state, **options)
            else:
                try:
                    from freediscovery_extra import make_logregr_cv_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_logregr_cv_model(cv_obj, cv_scoring, **options)
        elif method == 'NearestCentroid':
            cmod = NearestCentroidRanker()
        elif method == 'NearestNeighbor':
            cmod = NearestNeighborRanker()
        elif method == 'xgboost':
            try:
                import xgboost as xgb
            except ImportError:
                raise OptionalDependencyMissing('xgboost')
            if cv is None:
                try:
                    from freediscovery_extra import make_xgboost_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_xgboost_model(cv_obj, cv_scoring, **options)
            else:
                try:
                    from freediscovery_extra import make_xgboost_cv_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_xgboost_cv_model(cv, cv_obj, cv_scoring, **options)
        elif method == 'MLPClassifier':
            if cv is not None:
                raise NotImplementedFD('CV not supported with MLPClassifier')
            from sklearn.neural_network import MLPClassifier
            cmod = MLPClassifier(solver='adam', hidden_layer_sizes=10,
                                 max_iter=200, activation='identity',
                                 verbose=0,
                                 random_state=random_state)
        else:
            raise WrongParameter('Method {} not implemented!'.format(method))
        return cmod
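
    # Hedged sketch of what the method='LinearSVC', cv=None branch returns
    # (plain scikit-learn; the toy data is made up):
    #
    #   import numpy as np
    #   from sklearn.svm import LinearSVC
    #
    #   X = np.array([[0., 1.], [1., 0.], [1., 1.], [0., 0.]])
    #   y = np.array([0, 1, 1, 0])
    #   cmod = LinearSVC(random_state=42)    # same estimator as above
    #   cmod.fit(X, y)
    #   cmod.predict([[0.9, 0.1]])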
Example #7
    def predict(self, chunk_size=5000, ml_output='probability', metric='cosine'):
        """
        Predict the relevance using a previously trained model

        Parameters
        ----------
        chunk_size : int
           chunk size
        ml_output : str
           type of the output in ['decision_function', 'probability'],
           only affects ML methods. default: 'probability'
        metric : str
            The similarity returned by nearest neighbor classifier in
            ['cosine', 'jaccard', 'cosine_norm', 'jaccard_norm'].
            default: 'cosine'

        Returns
        -------
        res : ndarray [n_samples, n_classes]
           the score for each class
        nn_ind : {ndarray [n_samples, n_classes], None}
           the index of the nearest neighbor for each class
           (when the NearestNeighborRanker is used)
        """
        if ml_output not in ['probability', 'decision_function']:
            raise ValueError(("Wrong input value ml_output={}, must be one of "
                              "['probability', 'decision_function']")
                             .format(ml_output))

        if ml_output == 'probability':
            ml_output = 'predict_proba'

        if self.cmod is not None:
            cmod = self.cmod
        else:
            raise WrongParameter('The model must be trained first, or mid '
                                 'must be provided to load a previously '
                                 'trained model!')

        ds = self.pipeline.data

        nn_ind = None
        if isinstance(cmod, NearestNeighborRanker):
            res, nn_ind_orig = cmod.kneighbors(ds)
            res = _scale_cosine_similarity(res, metric=metric)
            nn_ind = self._pars['index'][nn_ind_orig]
        elif hasattr(cmod, ml_output):
            res = getattr(cmod, ml_output)(ds)
        elif hasattr(cmod, 'decision_function'):
            # predict_proba was requested but only decision_function exists
            res = cmod.decision_function(ds)
            res = expit(res)
        elif hasattr(cmod, 'predict_proba'):
            # decision_function was requested but only predict_proba exists
            res = cmod.predict_proba(ds)
            res = logit(res)
        else:
            raise ValueError('Model {} has neither decision_function nor '
                             'predict_proba methods!'.format(cmod))

        # handle the case of binary categorization
        # as two classes categorization
        if res.ndim == 1:
            if ml_output == 'decision_function':
                res_p = res
                res_n = - res
            else:
                res_p = res
                res_n = 1 - res
            res = np.hstack((res_n[:, None], res_p[:, None]))
        return res, nn_ind
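
    # Standalone check of the binary branch above with made-up scores
    # (numpy/scipy only):
    #
    #   import numpy as np
    #   from scipy.special import expit
    #
    #   scores = np.array([-1.2, 0.3, 2.0])  # 1-d decision_function output
    #   proba = expit(scores)                # mapped into [0, 1]
    #   res = np.hstack(((1 - proba)[:, None], proba[:, None]))
    #   res.shape                            # (3, 2): [P(class 0), P(class 1)]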
Example #8
    def fit(self, index, y, method='LinearSVC', cv=None):
        """
        Train the categorization model

        Parameters
        ----------
        index : array-like, shape (n_samples)
           document indices of the training set
        y : array-like, shape (n_samples)
           target class relative to index (string or int)
        method : str
           the ML algorithm to use (one of "LinearSVC", "LogisticRegression",
           "xgboost", "NearestCentroid", "NearestNeighbor")
        cv : str or None
           cross-validation scheme (one of None, 'fast', 'full')

        Returns
        -------
        cmod : sklearn.base.BaseEstimator
           the scikit-learn classifier object
        Y_train : array-like, shape (n_samples)
           the encoded training labels
        """

        valid_methods = ["LinearSVC", "LogisticRegression", "xgboost",
                         "NearestCentroid", "NearestNeighbor"]

        if method in ['MLPClassifier']:
            raise WrongParameter('method={} is implemented but not production-'
                                 'ready; it is currently disabled.'.format(method))

        if method not in valid_methods:
            raise WrongParameter('method={} is not supported, should be one '
                                 'of {}'.format(method, valid_methods))
        if cv is not None and method in ['NearestNeighbor', 'NearestCentroid']:
            raise WrongParameter('Cross validation (cv={}) not supported '
                                 'with {}'.format(cv, method))

        if cv not in [None, 'fast', 'full']:
            raise WrongParameter("cv={} must be one of None, 'fast', 'full'"
                                 .format(cv))

        d_all = self.pipeline.data

        X_train = d_all[index, :]

        Y_labels = y

        self.le = LabelEncoder()
        Y_train = self.le.fit_transform(Y_labels)

        cmod = self._build_estimator(Y_train, method, cv, self.cv_scoring, self.cv_n_folds)

        mid, mid_dir = setup_model(self.model_dir)

        if method == 'xgboost' and not cv:
            cmod.fit(X_train, Y_train, eval_metric='auc')
        else:
            cmod.fit(X_train, Y_train)

        joblib.dump(self.le, str(mid_dir / 'label_encoder'))
        joblib.dump(cmod, str(mid_dir / 'model'))

        pars = {
            'method': method,
            'index': index,
            'y': y,
            'categories': self.le.classes_
            }
        pars['options'] = cmod.get_params()
        self._pars = pars
        joblib.dump(pars, str(mid_dir / 'pars'))

        self.mid = mid
        self.cmod = cmod
        return cmod, Y_train
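
    # The core of fit() without the caching layer, as a hedged sketch
    # (scikit-learn only; the toy data is made up):
    #
    #   import numpy as np
    #   from sklearn.preprocessing import LabelEncoder
    #   from sklearn.svm import LinearSVC
    #
    #   X_train = np.array([[0., 1.], [1., 0.], [1., 1.]])
    #   y = ['relevant', 'irrelevant', 'relevant']
    #   le = LabelEncoder()
    #   Y_train = le.fit_transform(y)        # -> array([1, 0, 1])
    #   cmod = LinearSVC().fit(X_train, Y_train)
    #   le.classes_[cmod.predict([[0.8, 0.9]])]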
    def setup(self, n_features=None, chunk_size=5000, analyzer='word',
              ngram_range=(1, 1), stop_words=None, n_jobs=1,
              use_hashing=False,
              weighting='nnc', norm_alpha=0.75, min_df=0.0, max_df=1.0,
              parse_email_headers=False,
              preprocess=[], column_ids=None, column_separator=','
              ):
        """Initalize the features extraction.

        See sklearn.feature_extraction.text for a detailed description
        of the input parameters

        Parameters
        ----------
        analyzer : string, {'word', 'char'} or callable
            Whether the feature should be made of word or character n-grams.
            If a callable is passed it is used to extract the sequence of
            features out of the raw, unprocessed input.
        ngram_range : tuple (min_n, max_n)
            The lower and upper boundary of the range of n-values for different
            n-grams to be extracted. All values of n such that
            min_n <= n <= max_n will be used.
        stop_words : string {'english'}, list, or None (default)
            If a string, it is passed to _check_stop_list and the appropriate
            stop list is returned. 'english' is currently the only supported
            string value.
            If None, no stop words will be used. max_df can be set to a value
            in the range [0.7, 1.0) to automatically detect and filter stop
            words based on intra corpus document frequency of terms.
        max_df : float in range [0.0, 1.0] or int, default=1.0
            When building the vocabulary ignore terms that have a document
            frequency strictly higher than the given threshold (corpus-specific
            stop words).
            If float, the parameter represents a proportion of documents,
            integer absolute counts.
            This parameter is ignored if vocabulary is not None.
        min_df : float in range [0.0, 1.0] or int, default=0.0
            When building the vocabulary ignore terms that have a document
            frequency strictly lower than the given threshold. This value is
            also called cut-off in the literature.
            If float, the parameter represents a proportion of documents,
            integer absolute counts.
            This parameter is ignored if vocabulary is not None.
        n_features : int or None, default=None
            If not None, build a vocabulary that only considers the top
            n_features terms ordered by term frequency across the corpus.
            If use_hashing is True and n_features is None, it defaults to
            100001 (the size of the hashing table).
        weighting : str
            SMART weighting type
        preprocess : list of str, default: []
            a list of pre-processing steps, including 'emails_ignore_header'
        column_ids : None or list of str
            when provided, the ingested files are assumed to be CSV
        column_separator : str, default=','
            delimiter used for parsing CSV files; only used when
            ``column_ids`` is not None.
        """
        if self.mode not in ['w', 'fw']:
            raise WrongParameter('The vectorizer can be setup only with '
                                 'mode in ["w", "fw"]')

        if analyzer not in ['word', 'char', 'char_wb']:
            raise WrongParameter('analyzer={} not supported!'.format(analyzer))

        if not isinstance(ngram_range, (tuple, list)):
            raise WrongParameter(('not a valid input ngram_range='
                                  '{}: should be a list or a tuple!')
                                 .format(ngram_range))

        if not len(ngram_range) == 2:
            raise WrongParameter('len(ngram_range)={} != 2'
                                 .format(len(ngram_range)))

        if not 0 <= norm_alpha <= 1:
            raise WrongParameter('norm_alpha={} not in [0, 1]'
                                 .format(norm_alpha))

        _, _, weighting_n = _validate_smart_notation(weighting)
        if weighting_n == 'n':
            warnings.warn('You should use either cosine or pivoted normalization '
                          'i.e. weighting should be "**[cp]"',
                          UserWarning)

        for key in preprocess:
            if key not in processing_filters:
                raise WrongParameter(('unknown preprocessing step {}; '
                                      'must be one of: {}')
                                     .format(key, ', '.join(processing_filters.keys())))

        if stop_words in [None, 'english', 'english_alphanumeric']:
            pass
        elif stop_words in _StopWordsWrapper(cache_dir=self.cache_dir):
            pass
        else:
            raise WrongParameter('stop_words={} is not a valid value'
                                 .format(stop_words))

        if not isinstance(column_separator, str):
            raise ValueError('column_separator={} expected string'
                             .format(column_separator))

        if not (column_ids is None or isinstance(column_ids, (list, tuple))):
            raise ValueError('column_ids={} expected None or sequence'
                             .format(column_ids))

        if n_features is None and use_hashing:
            n_features = 100001  # default size of the hashing table

        if self.dsid is None:
            self.dsid = dsid = generate_uuid()
        else:
            dsid = self.dsid
        self.dsid_dir = dsid_dir = self.cache_dir / dsid

        dsid_dir.mkdir()

        pars = {'data_dir': None,
                'n_samples': None, "n_features": n_features,
                'chunk_size': chunk_size, 'stop_words': stop_words,
                'analyzer': analyzer, 'ngram_range': ngram_range,
                'n_jobs': n_jobs, 'use_hashing': use_hashing,
                'weighting': weighting, 'norm_alpha': norm_alpha,
                'min_df': min_df, 'max_df': max_df,
                'parse_email_headers': parse_email_headers,
                'type': type(self).__name__,
                'preprocess': preprocess,
                'freediscovery_version': __version__,
                'column_ids': column_ids,
                'column_separator': column_separator}
        self._pars = pars
        with (dsid_dir / 'pars').open('wb') as fh:
            pickle.dump(self._pars, fh)
        return dsid
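
    # Usage sketch (hedged): setup() returns the dsid that later steps use to
    # locate the dataset; the cache_dir is hypothetical and ingestion is
    # omitted:
    #
    #   fe = FeatureVectorizer(cache_dir='/tmp/fd_cache', mode='w')
    #   dsid = fe.setup(analyzer='word', ngram_range=(1, 2), min_df=2)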