コード例 #1
0
    def __init__(self,
                 cache_dir='/tmp/',
                 parent_id=None,
                 mid=None,
                 load_model=False,
                 mode='r'):
        """Resolve the pipeline of this wrapper and prepare its storage.

        Parameters
        ----------
        cache_dir : str
          the cache directory, passed to ``PipelineFinder`` and
          ``FeatureVectorizer``
        parent_id : str, optional
          id of the parent step of the pipeline
        mid : str, optional
          id of the model handled by this wrapper
        load_model : bool
          when True, ``self.cmod`` is set: to the result of
          ``self._load_model()`` if a ``mid`` is known and ``mode == 'r'``,
          to None otherwise. When False, ``self.cmod`` is not set here.
        mode : str
          one of "r", "w", "fw"

        Raises
        ------
        WrongParameter
          if neither ``parent_id`` nor ``mid`` is given, if ``mode`` is not
          one of the accepted values, or if an 'lsi' wrapper is opened in
          "w" / "fw" mode without a ``parent_id``
        """
        if parent_id is None and mid is None:
            raise WrongParameter('At least one of parent_id or mid '
                                 'should be provided!')

        # The mode must be validated and stored before the branch below,
        # which depends on it (this only affects LSI).
        if mode not in ['r', 'w', 'fw']:
            raise WrongParameter(
                'mode={} must be one of "r", "w", "fw"'.format(mode))
        self.mode = mode

        if self._wrapper_type == 'lsi' and mode in ['w', 'fw']:
            # lsi supports explicitly providing mid at creation
            if parent_id is None:
                raise WrongParameter(('parent_id={} must be provided for '
                                      'model creation!').format(parent_id))
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            if mid is not None:
                validate_mid(mid)
            self.mid = mid
        elif parent_id is not None:
            validate_mid(parent_id)
            self.pipeline = PipelineFinder.by_id(parent_id, cache_dir)
            self.mid = None
        else:
            # parent_id is None, hence mid is set (see the first check);
            # the pipeline is the parent of the pipeline found for mid
            validate_mid(mid)
            self.pipeline = PipelineFinder.by_id(mid, cache_dir).parent
            self.mid = mid

        # this is an alias that should be deprecated
        self.fe = FeatureVectorizer(cache_dir=cache_dir,
                                    dsid=self.pipeline['vectorizer'])

        self.model_dir = self.pipeline.get_path() / self._wrapper_type

        if self._wrapper_type == 'search':
            # no data need to be stored on disk
            return

        if not self.model_dir.exists():
            self.model_dir.mkdir()

        # parameters (and optionally the model) are only read back when an
        # existing model is opened in read mode
        if self.mid is not None and self.mode == 'r':
            self._pars = self._load_pars()
        else:
            self._pars = None

        if load_model:
            if self.mid is not None and self.mode == 'r':
                self.cmod = self._load_model()
            else:
                self.cmod = None
コード例 #2
0
def _rm(args):
    """Delete a cached model folder, or the whole cache, after confirmation.

    Parameters
    ----------
    args : object
      parsed command line arguments; the attributes ``cache_dir``, ``all``,
      ``mid`` and ``yes`` are read (presumably an argparse namespace)
    """
    cache_dir = _parse_cache_dir(args.cache_dir)

    # work out which folder is targeted
    if args.all:
        target = PipelineFinder(cache_dir=cache_dir).cache_dir
    elif args.mid:
        finder = PipelineFinder.by_id(mid=args.mid, cache_dir=cache_dir)
        target = finder.get_path()
    else:
        print('Error: either mid or the -a (--all) flag should be provided. '
              'Exiting.')
        return

    question = ('Are you sure you want to delete\n'
                '        {} ?').format(target)
    confirmed = _query_yes_no(question, default='no', overwrite=args.yes)

    if not confirmed:
        print('Nothing to be done. Exiting.')
        return

    shutil.rmtree(target)
    print('Folder {} deleted.'.format(target))
コード例 #3
0
    def __init__(self, cache_dir='/tmp/'):
        """ Set up the stop words wrapper and its storage folder

        Parameters
        ----------
        cache_dir : str
          the cache directory; it is normalized with
          ``PipelineFinder._normalize_cachedir`` and the stop words are
          kept in its ``stop_words`` sub-folder
        """
        root = PipelineFinder._normalize_cachedir(cache_dir)
        self.cache_dir = root
        self.model_dir = storage = root / 'stop_words'

        # nothing more to do when the folder is already there
        if storage.exists():
            return
        storage.mkdir()
コード例 #4
0
def _show(args):
    """Print a summary of the model identified by ``args.mid``.

    The pipeline, the model id, the model type (last key of the pipeline)
    and the model path are always printed. The parameters stored in the
    ``pars`` file of the model folder are listed on a best-effort basis:
    nothing is printed for them if they cannot be loaded.

    Parameters
    ----------
    args : object
      parsed command line arguments; the attributes ``cache_dir`` and
      ``mid`` are read
    """
    cache_dir = _parse_cache_dir(args.cache_dir)
    p = PipelineFinder.by_id(mid=args.mid, cache_dir=cache_dir)
    print(p)
    print(' * model_id: {}'.format(args.mid))
    print(' * model_type: {}'.format(list(p.keys())[-1]))
    print(' * file_path: {}'.format(p.get_path()))
    try:
        pars = joblib.load(os.path.join(p.get_path(), 'pars'))
        for key, val in pars.items():
            val_str = str(val)
            # long values are skipped to keep the output compact,
            # except for dicts which are always shown
            if len(val_str) > 30 and not isinstance(val, dict):
                continue
            print(' * {}: {}'.format(key, val_str))
    except Exception:
        # Listing the parameters is optional (e.g. no 'pars' file), so
        # failures are ignored; unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
コード例 #5
0
def test_pipeline(n_steps):
    """ Check pipelines made of 2 or 3 steps:
        vectorizer (+ lsi) + classifier """

    if n_steps not in (2, 3):
        raise ValueError

    # the categorizer is attached to the vectorizer in the 2 step case,
    # and to the lsi model in the 3 step case
    parent_uuid = vect_uuid if n_steps == 2 else lsi.mid

    cat = _CategorizerWrapper(cache_dir=cache_dir,
                              parent_id=parent_uuid,
                              cv_n_folds=2)
    cat.fe.db_.filenames_ = cat.fe.filenames_
    file_paths = ground_truth.file_path.values
    index = cat.fe.db_._search_filenames(file_paths)

    # fit is expected to return exactly two values
    _coefs, _y_train = cat.fit(index, ground_truth.is_relevant.values)

    cat.predict()

    assert len(cat.pipeline) == n_steps - 1

    if n_steps != 3:
        return

    # additional tests on the full 3 step pipeline
    pf = PipelineFinder.by_id(cat.mid, cache_dir)
    lsi_level = pf.parent
    vect_level = lsi_level.parent

    assert list(pf.keys()) == ['vectorizer', 'lsi', 'categorizer']
    assert list(lsi_level.keys()) == ['vectorizer', 'lsi']
    assert list(vect_level.keys()) == ['vectorizer']

    assert pf.mid == cat.mid
    assert lsi_level.mid == lsi.mid
    assert vect_level.mid == vect_uuid
    with pytest.raises(ValueError):
        pf.parent.parent.parent

    # relative path expected for each type of estimator
    expected_path = {
        'vectorizer': 'ediscovery_cache.*',
        'lsi': 'ediscovery_cache.*lsi',
        'categorizer': 'ediscovery_cache.*lsi.*categorizer',
    }
    for step_name, step_mid in pf.items():
        rel_path = str(pf.get_path(step_mid, absolute=False))
        if step_name not in expected_path:
            raise ValueError
        assert re.match(expected_path[step_name], rel_path)
コード例 #6
0
    def __init__(self, cache_dir='/tmp/', dsid=None, verbose=False, mode='r'):
        """Set up the vectorizer wrapper for a cache folder and a dataset.

        Parameters
        ----------
        cache_dir : str
          the cache directory; created if it does not exist
        dsid : str, optional
          the dataset id; when None, ``self.dsid_dir`` is None
        verbose : bool
          stored as ``self.verbose``
        mode : str
          one of "r", "w", "fw". With "r" the dataset folder must exist,
          with "w" it must not exist, with "fw" an existing folder is
          removed.

        Raises
        ------
        WrongParameter
          if ``mode`` is invalid, or if ``mode == "w"`` and the dataset
          folder already exists
        DatasetNotFound
          if ``mode == "r"`` and the dataset folder does not exist
        """
        self.verbose = verbose

        # these are filled in later
        self._filenames = self._vect = self._tfidf = None
        self._db = self._pars = None

        root = PipelineFinder._normalize_cachedir(cache_dir)
        self.cache_dir = root
        if not root.exists():
            root.mkdir()

        self.dsid = dsid
        if mode not in ['r', 'w', 'fw']:
            message = 'mode={} must be one of "r", "w", "fw"'.format(mode)
            raise WrongParameter(message)
        self.mode = mode

        if dsid is None:
            self.dsid_dir = None
            return

        validate_mid(dsid)
        dataset_path = root / dsid
        already_there = dataset_path.exists()

        if mode == 'r':
            # reading requires an existing dataset
            if not already_there:
                raise DatasetNotFound(
                    'Dataset {} ({}) not found in {}!'.format(
                        dsid, type(self).__name__, root))
        elif already_there:
            # writing over an existing dataset must be explicitly forced
            if mode == 'w':
                raise WrongParameter(
                    ('dataset identified by dsid={} already exists. '
                     'Use mode="fw" to overwrite.').format(dsid))
            elif mode == 'fw':
                shutil.rmtree(dataset_path)

        self.dsid_dir = dataset_path