def __init__(self, cache_dir='/tmp/', parent_id=None, mid=None,
             load_model=False, mode='r'):
    if parent_id is None and mid is None:
        raise WrongParameter('At least one of parent_id or mid '
                             'should be provided!')

    if self._wrapper_type == 'lsi' and self.mode in ['w', 'fw']:
        # lsi supports explicitly providing mid at creation
        if parent_id is None:
            raise WrongParameter(('parent_id={} must be provided for '
                                  'model creation!').format(parent_id))
        else:
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            if mid is not None:
                validate_mid(mid)
            self.mid = mid
    else:
        if parent_id is None and mid is not None:
            validate_mid(mid)
            self.pipeline = PipelineFinder.by_id(mid, cache_dir).parent
            self.mid = mid
        elif parent_id is not None:
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            self.mid = None  # this only affects LSI

    if mode not in ['r', 'w', 'fw']:
        raise WrongParameter(
            'mode={} must be one of "r", "w", "fw"'.format(mode))
    self.mode = mode

    # this is an alias that should be deprecated
    self.fe = FeatureVectorizer(cache_dir=cache_dir,
                                dsid=self.pipeline['vectorizer'])

    self.model_dir = self.pipeline.get_path() / self._wrapper_type
    if self._wrapper_type == 'search':
        # no data need to be stored on disk
        return
    if not self.model_dir.exists():
        self.model_dir.mkdir()

    if self.mid is not None and self.mode == 'r':
        self._pars = self._load_pars()
    else:
        self._pars = None

    if load_model:
        if self.mid is not None and self.mode == 'r':
            self.cmod = self._load_model()
        else:
            self.cmod = None
def fit(self, method='simhash'):
    """ Precompute all the required values for duplicate detection """
    pars = {'method': method}
    if method not in ['simhash', 'i-match']:
        raise WrongParameter('Dup. detection method {} not '
                             'implemented!'.format(method))
    if method == 'simhash':
        from freediscovery.near_duplicates import SimhashNearDuplicates
        self.model = shash = SimhashNearDuplicates()
    else:
        self.model = None
    self._pars = pars

    mid, mid_dir = setup_model(self.model_dir)
    self.mid = mid

    X = self.pipeline.data
    if method == 'simhash':
        shash.fit(X)
    self._fit_X = X

    joblib.dump(self.model, str(self.model_dir / mid / 'model'))
    joblib.dump(pars, str(self.model_dir / mid / 'pars'))
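# --- Illustrative sketch (not library code): the general idea behind a simhash
# --- fingerprint used for near-duplicate detection. The `simhash` helper below
# --- is hypothetical and only demonstrates the technique that estimators such
# --- as SimhashNearDuplicates rely on.
import hashlib


def simhash(tokens, n_bits=64):
    """Compute a simhash fingerprint from a sequence of tokens."""
    weights = [0] * n_bits
    for tok in tokens:
        h = int(hashlib.md5(tok.encode('utf-8')).hexdigest(), 16)
        for i in range(n_bits):
            weights[i] += 1 if (h >> i) & 1 else -1
    # bits with a positive aggregate weight are set in the fingerprint
    return sum(1 << i for i, w in enumerate(weights) if w > 0)


# near-duplicate documents have fingerprints with a small Hamming distance
d1 = simhash("the quick brown fox jumps".split())
d2 = simhash("the quick brown fox jumped".split())
hamming_distance = bin(d1 ^ d2).count('1')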
def __init__(self, cache_dir='/tmp/', dsid=None, verbose=False, mode='r'):
    self.verbose = verbose

    self._filenames = None
    self._vect = None
    self._tfidf = None
    self._db = None
    self._pars = None
    self.cache_dir = cache_dir = PipelineFinder._normalize_cachedir(cache_dir)
    if not cache_dir.exists():
        cache_dir.mkdir()
    self.dsid = dsid
    if mode not in ['r', 'w', 'fw']:
        raise WrongParameter('mode={} must be one of "r", "w", "fw"'
                             .format(mode))
    self.mode = mode
    if dsid is not None:
        validate_mid(dsid)
        dsid_dir = self.cache_dir / dsid
        if mode == 'r':
            if not dsid_dir.exists():
                raise DatasetNotFound('Dataset {} ({}) not found in {}!'
                                      .format(dsid, type(self).__name__,
                                              cache_dir))
        else:
            if dsid_dir.exists():
                if mode == 'w':
                    raise WrongParameter(('dataset identified by dsid={} '
                                          'already exists. Use mode="fw" '
                                          'to overwrite.')
                                         .format(dsid))
                elif mode == 'fw':
                    shutil.rmtree(dsid_dir)
    else:
        dsid_dir = None
    self.dsid_dir = dsid_dir
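# --- Usage sketch (illustrative): the three access modes accepted above. ---
# Assumes a dataset with this (hypothetical) dsid already exists under the
# cache directory; the import path is also an assumption and may differ
# between versions.
from freediscovery.engine.vectorizer import FeatureVectorizer

cache_dir = '/tmp/'
dsid = '0123456789abcdef'                                           # hypothetical
fe_r = FeatureVectorizer(cache_dir=cache_dir, dsid=dsid, mode='r')  # read-only
# mode='w' would raise WrongParameter because the dataset already exists;
# mode='fw' removes the existing directory so it can be recreated:
fe_fw = FeatureVectorizer(cache_dir=cache_dir, dsid=dsid, mode='fw')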
@classmethod
def from_folder(cls, data_dir, file_pattern=None, dir_pattern=None,
                internal_id_offset=0,
                document_id_generator='indexed_file_path'):
    """ Create a DocumentIndex from files in data_dir

    Parameters
    ----------
    data_dir : str
        path to the data directory
    file_pattern : str, optional
        pattern used to filter file names
    dir_pattern : str, optional
        pattern used to filter sub-directory names
    internal_id_offset : int
        offset added to the generated internal_id values
    document_id_generator : str
        one of 'indexed_file_path', 'infer_file_path'

    Returns
    -------
    result : DocumentIndex
        a DocumentIndex object
    """
    data_dir = os.path.abspath(os.path.normpath(data_dir))
    if not os.path.exists(data_dir):
        raise NotFound('data_dir={} does not exist'.format(data_dir))

    if document_id_generator not in ['indexed_file_path',
                                     'infer_file_path']:
        raise WrongParameter(
            ("document_id_generator={} not supported. It must be "
             "one of ['indexed_file_path', 'infer_file_path']")
            .format(document_id_generator))

    filenames = _list_filenames(data_dir, dir_pattern, file_pattern)
    db = [{'file_path': file_path,
           'internal_id': idx + internal_id_offset}
          for idx, file_path in enumerate(filenames)]

    db = pd.DataFrame(db)
    _generate_document_id(db, document_id_generator)

    res = cls(data_dir, db)
    res.filenames_ = filenames
    return res
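# --- Usage sketch (illustrative): building an index from a folder. ---
# The import path is an assumption and '/path/to/documents' is a placeholder
# for an existing directory.
from freediscovery.engine.ingestion import DocumentIndex

index = DocumentIndex.from_folder('/path/to/documents')
print(index.filenames_[:5])   # absolute paths of the indexed files
# internally, a pandas DataFrame maps each file_path to an internal_id
# and a generated document_id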
def __init__(self, cache_dir='/tmp/', parent_id=None, mid=None,
             verbose=False, random_state=None, mode='r'):
    if mode not in ['r', 'w', 'fw']:
        raise WrongParameter(
            'mode={} must be one of "r", "w", "fw"'.format(mode))
    self.mode = mode
    super(_LSIWrapper, self).__init__(cache_dir=cache_dir,
                                      parent_id=parent_id,
                                      mid=mid, mode=mode)
    self.random_state = random_state
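# --- Illustrative sketch (not library code): LSI amounts to a truncated SVD ---
# --- of the tf-idf matrix; a standalone scikit-learn equivalent of what an  ---
# --- LSI step computes.                                                     ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs = ['first document about cats',
        'second document about dogs',
        'a third one about both cats and dogs']
X = TfidfVectorizer().fit_transform(docs)     # sparse (n_samples, n_features)
lsi = TruncatedSVD(n_components=2, random_state=42)
X_lsi = lsi.fit_transform(X)                  # dense (n_samples, n_components)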
def _build_estimator(Y_train, method, cv, cv_scoring, cv_n_folds,
                     random_state=None, **options):
    if cv:
        # from sklearn.model_selection import StratifiedKFold
        # cv_obj = StratifiedKFold(n_splits=cv_n_folds, shuffle=False)
        # temporary hack (due to pickling issues otherwise; this needs to be fixed)
        cv_obj = cv_n_folds
    else:
        cv_obj = None

    _rename_main_thread()

    if method == 'LinearSVC':
        from sklearn.svm import LinearSVC
        if cv is None:
            cmod = LinearSVC(random_state=random_state, **options)
        else:
            try:
                from freediscovery_extra import make_linearsvc_cv_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_linearsvc_cv_model(cv_obj, cv_scoring, **options)
    elif method == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        if cv is None:
            cmod = LogisticRegression(random_state=random_state, **options)
        else:
            try:
                from freediscovery_extra import make_logregr_cv_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_logregr_cv_model(cv_obj, cv_scoring, **options)
    elif method == 'NearestCentroid':
        cmod = NearestCentroidRanker()
    elif method == 'NearestNeighbor':
        cmod = NearestNeighborRanker()
    elif method == 'xgboost':
        try:
            import xgboost as xgb
        except ImportError:
            raise OptionalDependencyMissing('xgboost')
        if cv is None:
            try:
                from freediscovery_extra import make_xgboost_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_xgboost_model(cv_obj, cv_scoring, **options)
        else:
            try:
                from freediscovery_extra import make_xgboost_cv_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_xgboost_cv_model(cv, cv_obj, cv_scoring, **options)
    elif method == 'MLPClassifier':
        if cv is not None:
            raise NotImplementedFD('CV not supported with MLPClassifier')
        from sklearn.neural_network import MLPClassifier
        cmod = MLPClassifier(solver='adam', hidden_layer_sizes=10,
                             max_iter=200, activation='identity',
                             verbose=0, random_state=random_state)
    else:
        raise WrongParameter('Method {} not implemented!'.format(method))
    return cmod
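# --- Illustrative sketch (not library code): the `cv_obj = cv_n_folds` hack works ---
# --- because scikit-learn accepts an integer cv (StratifiedKFold for classifiers). ---
# The freediscovery_extra factories are an optional dependency, so this shows a
# comparable cross-validated LinearSVC built directly with scikit-learn, not the
# actual implementation.
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

cv_n_folds = 5
cmod = GridSearchCV(LinearSVC(random_state=42),
                    param_grid={'C': [0.1, 1.0, 10.0]},
                    scoring='roc_auc',   # plays the role of cv_scoring
                    cv=cv_n_folds)       # an int is enough, no CV object needed
# cmod.fit(X_train, Y_train) would then select C by cross-validation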
def predict(self, chunk_size=5000, ml_output='probability',
            metric='cosine'):
    """
    Predict the relevance using a previously trained model

    Parameters
    ----------
    chunk_size : int
        chunk size
    ml_output : str
        type of the output in ['decision_function', 'probability'],
        only affects ML methods. default: 'probability'
    metric : str
        The similarity returned by the nearest neighbor classifier in
        ['cosine', 'jaccard', 'cosine_norm', 'jaccard_norm'].
        default: 'cosine'

    Returns
    -------
    res : ndarray [n_samples, n_classes]
        the score for each class
    nn_ind : {ndarray [n_samples, n_classes], None}
        the index of the nearest neighbor for each class
        (when the NearestNeighborRanker is used)
    """
    if ml_output not in ['probability', 'decision_function']:
        raise ValueError(("Wrong input value ml_output={}, must be one of "
                          "['probability', 'decision_function']")
                         .format(ml_output))

    if ml_output == 'probability':
        ml_output = 'predict_proba'

    if self.cmod is not None:
        cmod = self.cmod
    else:
        raise WrongParameter('The model must be trained first, or mid '
                             'must be provided to load a previously '
                             'trained model!')

    ds = self.pipeline.data

    nn_ind = None
    if isinstance(cmod, NearestNeighborRanker):
        res, nn_ind_orig = cmod.kneighbors(ds)
        res = _scale_cosine_similarity(res, metric=metric)
        nn_ind = self._pars['index'][nn_ind_orig]
    elif hasattr(cmod, ml_output):
        res = getattr(cmod, ml_output)(ds)
    elif hasattr(cmod, 'decision_function'):
        # decision_function is available but predict_proba was requested:
        # map the scores through the logistic function
        res = cmod.decision_function(ds)
        res = expit(res)
    elif hasattr(cmod, 'predict_proba'):
        # predict_proba is available but decision_function was requested:
        # map the probabilities through the logit function
        res = cmod.predict_proba(ds)
        res = logit(res)
    else:
        raise ValueError('Model {} has neither decision_function '
                         'nor predict_proba methods!'.format(cmod))

    # handle binary categorization as a two-class categorization
    if res.ndim == 1:
        if ml_output == 'decision_function':
            res_p = res
            res_n = - res
        else:
            res_p = res
            res_n = 1 - res
        res = np.hstack((res_n[:, None], res_p[:, None]))
    return res, nn_ind
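# --- Illustrative sketch (not library code): how a 1-D binary decision_function ---
# --- output is mapped to two per-class scores, as done at the end of predict(). ---
import numpy as np
from scipy.special import expit

scores = np.array([-2.0, 0.0, 1.5])          # decision_function output, shape (n_samples,)
proba_pos = expit(scores)                    # logistic link -> pseudo-probabilities
res = np.hstack(((1 - proba_pos)[:, None],   # column 0: negative class
                 proba_pos[:, None]))        # column 1: positive class
# res has shape (n_samples, 2) and each row sums to 1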
def fit(self, index, y, method='LinearSVC', cv=None):
    """
    Train the categorization model

    Parameters
    ----------
    index : array-like, shape (n_samples)
        document indices of the training set
    y : array-like, shape (n_samples)
        target class relative to index (string or int)
    method : str
        the ML algorithm to use (one of "LogisticRegression", "LinearSVC",
        "xgboost", "NearestCentroid", "NearestNeighbor")
    cv : str
        cross-validation scheme, one of None, 'fast', 'full'

    Returns
    -------
    cmod : sklearn.BaseEstimator
        the scikit-learn classifier object
    Y_train : array-like, shape (n_samples)
        the label-encoded training classes
    """
    valid_methods = ["LinearSVC", "LogisticRegression", "xgboost",
                     "NearestCentroid", "NearestNeighbor"]

    if method in ['MLPClassifier']:
        raise WrongParameter('method={} is implemented but not production '
                             'ready. It was disabled for now.'.format(method))

    if method not in valid_methods:
        raise WrongParameter('method={} is not supported, should be one of '
                             '{}'.format(method, valid_methods))

    if cv is not None and method in ['NearestNeighbor', 'NearestCentroid']:
        raise WrongParameter('Cross validation (cv={}) not supported with '
                             '{}'.format(cv, method))
    if cv not in [None, 'fast', 'full']:
        raise WrongParameter('cv={} must be one of None, "fast", "full"'
                             .format(cv))

    d_all = self.pipeline.data

    X_train = d_all[index, :]
    Y_labels = y

    self.le = LabelEncoder()
    Y_train = self.le.fit_transform(Y_labels)

    cmod = self._build_estimator(Y_train, method, cv, self.cv_scoring,
                                 self.cv_n_folds)

    mid, mid_dir = setup_model(self.model_dir)

    if method == 'xgboost' and not cv:
        cmod.fit(X_train, Y_train, eval_metric='auc')
    else:
        cmod.fit(X_train, Y_train)

    joblib.dump(self.le, str(mid_dir / 'label_encoder'))
    joblib.dump(cmod, str(mid_dir / 'model'))

    pars = {
        'method': method,
        'index': index,
        'y': y,
        'categories': self.le.classes_
    }
    pars['options'] = cmod.get_params()
    self._pars = pars
    joblib.dump(pars, str(mid_dir / 'pars'))

    self.mid = mid
    self.cmod = cmod
    return cmod, Y_train
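# --- Illustrative sketch (not library code): the core of fit() above expressed ---
# --- with plain scikit-learn: encode string labels, then train the chosen      ---
# --- estimator on the rows of the feature matrix selected by `index`. The      ---
# --- random sparse matrix stands in for self.pipeline.data.                    ---
import numpy as np
from scipy.sparse import random as sparse_random
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC

X = sparse_random(100, 50, density=0.1, random_state=0).tocsr()  # stand-in for pipeline.data
index = np.array([0, 3, 7, 12, 20])                              # training document indices
y = ['relevant', 'irrelevant', 'relevant', 'relevant', 'irrelevant']

le = LabelEncoder()
Y_train = le.fit_transform(y)          # string labels -> integer classes
cmod = LinearSVC().fit(X[index, :], Y_train)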
def setup(self, n_features=None, chunk_size=5000, analyzer='word',
          ngram_range=(1, 1), stop_words=None, n_jobs=1,
          use_hashing=False, weighting='nnc', norm_alpha=0.75,
          min_df=0.0, max_df=1.0, parse_email_headers=False,
          preprocess=[], column_ids=None, column_separator=','):
    """Initialize the feature extraction.

    See sklearn.feature_extraction.text for a detailed description
    of the input parameters

    Parameters
    ----------
    analyzer : string, {'word', 'char'} or callable
        Whether the feature should be made of word or character n-grams.
        If a callable is passed it is used to extract the sequence of
        features out of the raw, unprocessed input.
    ngram_range : tuple (min_n, max_n)
        The lower and upper boundary of the range of n-values for
        different n-grams to be extracted. All values of n such that
        min_n <= n <= max_n will be used.
    stop_words : string {'english'}, list, or None (default)
        If a string, it is passed to _check_stop_list and the appropriate
        stop list is returned. 'english' is currently the only supported
        string value.
        If None, no stop words will be used. max_df can be set to a value
        in the range [0.7, 1.0) to automatically detect and filter stop
        words based on intra corpus document frequency of terms.
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold
        (corpus-specific stop words). If float, the parameter represents
        a proportion of documents, integer absolute counts.
        This parameter is ignored if vocabulary is not None.
    min_df : float in range [0.0, 1.0] or int, default=0.0
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is
        also called cut-off in the literature. If float, the parameter
        represents a proportion of documents, integer absolute counts.
        This parameter is ignored if vocabulary is not None.
    n_features : int or None, default=None
        If not None, build a vocabulary that only considers the top
        n_features ordered by term frequency across the corpus
        (with hashing, the size of the hashing table, 100001 by default).
    weighting : str
        SMART weighting type
    preprocess : list of strings, default: []
        A list of pre-processing steps, including 'emails_ignore_header'
    column_ids : None or List[str]
        when provided the ingested files are assumed to be CSV
    column_separator : str, default=','
        delimiter used for parsing CSV files. Only used when
        ``column_ids`` is not None.
    """
    if self.mode not in ['w', 'fw']:
        raise WrongParameter('The vectorizer can be setup only with '
                             'mode in ["w", "fw"]')

    if analyzer not in ['word', 'char', 'char_wb']:
        raise WrongParameter('analyzer={} not supported!'.format(analyzer))

    if not isinstance(ngram_range, tuple) \
            and not isinstance(ngram_range, list):
        raise WrongParameter(('not a valid input ngram_range='
                              '{}: should be a list or a tuple!')
                             .format(ngram_range))

    if not len(ngram_range) == 2:
        raise WrongParameter('len(ngram_range)={} != 2!'
                             .format(len(ngram_range)))

    if not 0 <= norm_alpha <= 1:
        raise WrongParameter('norm_alpha={} not in [0, 1]'
                             .format(norm_alpha))

    _, _, weighting_n = _validate_smart_notation(weighting)
    if weighting_n == 'n':
        warnings.warn('You should use either cosine or pivoted '
                      'normalization i.e. weighting should be "**[cp]"',
                      UserWarning)

    for key in preprocess:
        if key not in processing_filters:
            raise WrongParameter(('Unknown preprocessing step {}, '
                                  'must be one of {}')
                                 .format(key, ', '.join(
                                     list(processing_filters.keys()))))

    if stop_words in [None, 'english', 'english_alphanumeric']:
        pass
    elif stop_words in _StopWordsWrapper(cache_dir=self.cache_dir):
        pass
    else:
        raise WrongParameter('stop_words = {}'.format(stop_words))

    if not isinstance(column_separator, str):
        raise ValueError('column_separator={} expected string'
                         .format(column_separator))

    if not (column_ids is None or isinstance(column_ids, (list, tuple))):
        raise ValueError('column_ids={} expected None or sequence'
                         .format(column_ids))

    if n_features is None and use_hashing:
        n_features = 100001  # default size of the hashing table

    if self.dsid is None:
        self.dsid = dsid = generate_uuid()
    else:
        dsid = self.dsid
    self.dsid_dir = dsid_dir = self.cache_dir / dsid

    dsid_dir.mkdir()

    pars = {'data_dir': None,
            'n_samples': None, "n_features": n_features,
            'chunk_size': chunk_size, 'stop_words': stop_words,
            'analyzer': analyzer, 'ngram_range': ngram_range,
            'n_jobs': n_jobs, 'use_hashing': use_hashing,
            'weighting': weighting, 'norm_alpha': norm_alpha,
            'min_df': min_df, 'max_df': max_df,
            'parse_email_headers': parse_email_headers,
            'type': type(self).__name__,
            'preprocess': preprocess,
            'freediscovery_version': __version__,
            'column_ids': column_ids,
            'column_separator': column_separator}
    self._pars = pars
    with (dsid_dir / 'pars').open('wb') as fh:
        pickle.dump(self._pars, fh)
    return dsid
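# --- Usage sketch (illustrative): configuring a new vectorizer. The import ---
# --- path is an assumption; ingestion and the actual tf-idf transform are  ---
# --- later steps not shown in this section.                                ---
from freediscovery.engine.vectorizer import FeatureVectorizer

fe = FeatureVectorizer(cache_dir='/tmp/', mode='w')
dsid = fe.setup(analyzer='word', ngram_range=(1, 1), stop_words='english',
                min_df=2, max_df=0.75)
# the parameters are pickled under the (normalized) cache directory as
# <dsid>/pars, and the dataset id is returned for later use, e.g.
# FeatureVectorizer(cache_dir='/tmp/', dsid=dsid, mode='r')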