def __init__(self, cache_dir='/tmp/', parent_id=None, mid=None,
             load_model=False, mode='r'):
    """Load (mode='r') or create (mode='w'/'fw') a wrapped estimator.

    Parameters
    ----------
    cache_dir : str
        directory where models are cached
    parent_id : str, optional
        id of the parent pipeline step (required for model creation)
    mid : str, optional
        id of this model (required for loading an existing model)
    load_model : bool
        if True, also load the fitted model from disk (read mode only)
    mode : {'r', 'w', 'fw'}
        read, write, or force-write (overwrite) mode

    Raises
    ------
    WrongParameter
        if neither parent_id nor mid is given, if mode is invalid, or
        if parent_id is missing at model-creation time
    """
    if parent_id is None and mid is None:
        raise WrongParameter('At least one of parent_id or mid '
                             'should be provided!')

    # Validate and store the mode *first*: the lsi branch below depends
    # on it.  (Previously ``self.mode`` was read here before it had been
    # assigned, raising AttributeError instead of the intended check.)
    if mode not in ['r', 'w', 'fw']:
        raise WrongParameter(
            'mode={} must be one of "r", "w", "fw"'.format(mode))
    self.mode = mode

    if self._wrapper_type == 'lsi' and self.mode in ['w', 'fw']:
        # lsi supports explicitly providing mid at creation
        if parent_id is None:
            raise WrongParameter(('parent_id={} must be provided for '
                                  'model creation!').format(parent_id))
        else:
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            if mid is not None:
                validate_mid(mid)
            # always assign self.mid (even when mid is None) so the
            # attribute checks further down cannot raise AttributeError
            self.mid = mid
    else:
        if parent_id is None and mid is not None:
            validate_mid(mid)
            self.pipeline = PipelineFinder.by_id(mid, cache_dir).parent
            self.mid = mid
        elif parent_id is not None:
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            self.mid = None  # this only affects LSI

    # this is an alias that should be deprecated
    self.fe = FeatureVectorizer(cache_dir=cache_dir,
                                dsid=self.pipeline['vectorizer'])

    self.model_dir = self.pipeline.get_path() / self._wrapper_type

    if self._wrapper_type == 'search':
        # no data need to be stored on disk
        return

    if not self.model_dir.exists():
        self.model_dir.mkdir()

    # parameters and model are only loadable for an existing mid in
    # read mode; otherwise leave them unset until fit time
    if self.mid is not None and self.mode == 'r':
        self._pars = self._load_pars()
    else:
        self._pars = None

    if load_model:
        if self.mid is not None and self.mode == 'r':
            self.cmod = self._load_model()
        else:
            self.cmod = None
def _rm(args):
    """CLI handler: delete one model (``args.mid``) or the whole cache
    (``args.all``) after an interactive confirmation."""
    cache_dir = _parse_cache_dir(args.cache_dir)

    # resolve the folder to delete, or bail out with a usage error
    if args.all:
        fpath = PipelineFinder(cache_dir=cache_dir).cache_dir
    elif args.mid:
        finder = PipelineFinder.by_id(mid=args.mid, cache_dir=cache_dir)
        fpath = finder.get_path()
    else:
        print('Error: either mid or the -a (--all) flag should be provided. '
              'Exiting.')
        return

    confirmed = _query_yes_no('Are you sure you want to delete\n'
                              ' {} ?'.format(fpath),
                              default='no', overwrite=args.yes)
    if not confirmed:
        print('Nothing to be done. Exiting.')
        return

    shutil.rmtree(fpath)
    print('Folder {} deleted.'.format(fpath))
def __init__(self, cache_dir='/tmp/'):
    """Initialize a stop words wrapper.

    Parameters
    ----------
    cache_dir : str
        the cache directory
    """
    self.cache_dir = PipelineFinder._normalize_cachedir(cache_dir)
    stop_words_dir = self.cache_dir / 'stop_words'
    # create the storage folder on first use
    if not stop_words_dir.exists():
        stop_words_dir.mkdir()
    self.model_dir = stop_words_dir
def _show(args):
    """CLI handler: print metadata for the model identified by ``args.mid``,
    and — best effort — its stored fit parameters.

    Parameters
    ----------
    args : argparse.Namespace
        parsed CLI arguments; must provide ``cache_dir`` and ``mid``
    """
    cache_dir = _parse_cache_dir(args.cache_dir)
    p = PipelineFinder.by_id(mid=args.mid, cache_dir=cache_dir)
    print(p)
    print(' * model_id: {}'.format(args.mid))
    # the last key of the pipeline mapping is this model's own step type
    print(' * model_type: {}'.format(list(p.keys())[-1]))
    print(' * file_path: {}'.format(p.get_path()))
    try:
        pars = joblib.load(os.path.join(p.get_path(), 'pars'))
        for key, val in pars.items():
            val_str = str(val)
            # skip overly long scalar representations; dicts are kept
            if len(val_str) > 30 and not isinstance(val, dict):
                continue
            print(' * {}: {}'.format(key, val_str))
    except Exception:
        # parameters are optional: some models never store a 'pars' file.
        # Narrowed from a bare ``except:`` which also swallowed
        # SystemExit/KeyboardInterrupt.
        pass
def test_pipeline(n_steps):
    """ Test a 2 or 3 step pipelines with vectorizer (+ lsi) + classifier """
    if n_steps == 2:
        parent = vect_uuid
    elif n_steps == 3:
        parent = lsi.mid
    else:
        raise ValueError

    cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent,
                              cv_n_folds=2)
    cat.fe.db_.filenames_ = cat.fe.filenames_
    index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

    coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values)
    cat.predict()

    assert len(cat.pipeline) == n_steps - 1

    # additional tests only apply to the 3-step pipeline
    if n_steps != 3:
        return

    pf = PipelineFinder.by_id(cat.mid, cache_dir)

    # walk up the parent chain and check each level's steps
    assert list(pf.keys()) == ['vectorizer', 'lsi', 'categorizer']
    assert list(pf.parent.keys()) == ['vectorizer', 'lsi']
    assert list(pf.parent.parent.keys()) == ['vectorizer']

    assert pf.mid == cat.mid
    assert pf.parent.mid == lsi.mid
    assert pf.parent.parent.mid == vect_uuid

    # the root has no parent
    with pytest.raises(ValueError):
        pf.parent.parent.parent

    # each step's relative path must nest under its parent's path
    expected = {'vectorizer': 'ediscovery_cache.*',
                'lsi': 'ediscovery_cache.*lsi',
                'categorizer': 'ediscovery_cache.*lsi.*categorizer'}
    for estimator_type, mid in pf.items():
        path = str(pf.get_path(mid, absolute=False))
        if estimator_type not in expected:
            raise ValueError
        assert re.match(expected[estimator_type], path)
def __init__(self, cache_dir='/tmp/', dsid=None, verbose=False, mode='r'):
    """Open (mode='r') or create (mode='w'/'fw') a dataset folder under
    the cache directory.

    Parameters
    ----------
    cache_dir : str
        the cache directory
    dsid : str, optional
        dataset id; when None no dataset folder is bound yet
    verbose : bool
        verbosity flag
    mode : {'r', 'w', 'fw'}
        read, write, or force-write (overwrite) mode

    Raises
    ------
    WrongParameter
        invalid mode, or mode='w' on an already existing dataset
    DatasetNotFound
        mode='r' and the dataset folder does not exist
    """
    self.verbose = verbose

    # lazily-populated internals
    self._filenames = None
    self._vect = None
    self._tfidf = None
    self._db = None
    self._pars = None

    cache_dir = PipelineFinder._normalize_cachedir(cache_dir)
    self.cache_dir = cache_dir
    if not cache_dir.exists():
        cache_dir.mkdir()

    self.dsid = dsid

    if mode not in ['r', 'w', 'fw']:
        raise WrongParameter('mode={} must be one of "r", "w", "fw"'
                             .format(mode))
    self.mode = mode

    # no dataset bound yet
    if dsid is None:
        self.dsid_dir = None
        return

    validate_mid(dsid)
    dsid_dir = self.cache_dir / dsid
    if mode == 'r':
        if not dsid_dir.exists():
            raise DatasetNotFound('Dataset '
                                  '{} ({}) not found in {}!'.format(
                                      dsid, type(self).__name__, cache_dir))
    elif dsid_dir.exists():
        if mode == 'w':
            raise WrongParameter(('dataset identified by dsid={} '
                                  'already exists. Use mode="fw" '
                                  'to overwrite.')
                                 .format(dsid))
        elif mode == 'fw':
            # force-write: wipe the previous dataset folder
            shutil.rmtree(dsid_dir)
    self.dsid_dir = dsid_dir