    def _load_features(self, dsid=None):
        """ Load the computed features from disk """
        if self.cache_dir is None:
            raise InitException('cache_dir is None: cannot load from cache!')
        if dsid is None:
            dsid = self.dsid
        dsid_dir = self.cache_dir / dsid
        if not dsid_dir.exists():
            raise DatasetNotFound('dsid {} not found!'.format(dsid))
        fset_new = joblib.load(str(dsid_dir / 'features'))
        return fset_new
    @property
    def db_(self):
        """ DocumentIndex for the ingested dataset """
        if not hasattr(self, '_db') or self._db is None:
            dsid = self.dsid
            if self.cache_dir is None:
                raise InitException('cache_dir is None: cannot load from cache!')
            dsid_dir = self.cache_dir / dsid
            if not dsid_dir.exists():
                raise DatasetNotFound('dsid {} not found!'.format(dsid))
            data = pd.read_pickle(str(dsid_dir / 'db'))
            self._db = DocumentIndex(self.pars_['data_dir'], data)
        return self._db
    @property
    def pars_(self):
        """ Feature extraction parameters, loaded from disk on first access """
        if not hasattr(self, '_pars') or self._pars is None:
            dsid = self.dsid
            if self.cache_dir is None:
                raise InitException('cache_dir is None: '
                                    'cannot load from cache!')
            dsid_dir = self.cache_dir / dsid
            if not dsid_dir.exists():
                raise DatasetNotFound('dsid {} not found!'.format(dsid))
            with (dsid_dir / 'pars').open('rb') as fh:
                self._pars = pickle.load(fh)
        return self._pars
    def __init__(self, cache_dir='/tmp/', dsid=None, verbose=False, mode='r'):
        self.verbose = verbose

        self._filenames = None
        self._vect = None
        self._tfidf = None
        self._db = None
        self._pars = None

        self.cache_dir = cache_dir = PipelineFinder._normalize_cachedir(cache_dir)
        if not cache_dir.exists():
            cache_dir.mkdir()
        self.dsid = dsid
        if mode not in ['r', 'w', 'fw']:
            raise WrongParameter('mode={} must be one of "r", "w", "fw"'
                                 .format(mode))
        self.mode = mode
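        # Mode semantics: 'r' requires an existing dataset, 'w' refuses to
        # overwrite an existing one, and 'fw' removes any existing dataset
        # directory so it can be re-created.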
        if dsid is not None:
            validate_mid(dsid)
            dsid_dir = self.cache_dir / dsid
            if mode == 'r':
                if not dsid_dir.exists():
                    raise DatasetNotFound('Dataset '
                                          '{} ({}) not found in {}!'.format(
                                           dsid, type(self).__name__, cache_dir))
            else:
                if dsid_dir.exists():
                    if mode == 'w':
                        raise WrongParameter(('dataset identified by dsid={} '
                                              'already exists. Use mode="fw" '
                                              'to overwrite.')
                                             .format(dsid))
                    elif mode == 'fw':
                        shutil.rmtree(dsid_dir)
        else:
            dsid_dir = None
        self.dsid_dir = dsid_dir
    def transform(self):
        """
        Run the feature extraction
        """
        dsid_dir = self.dsid_dir
        if not dsid_dir.exists():
            raise DatasetNotFound('dsid {} not found!'.format(self.dsid))

        if not (dsid_dir / 'db').exists():
            raise ValueError('Please ingest some files before running '
                             'the vectorizer!')

        pars = self.pars_
        pars['filenames_abs'] = self.filenames_abs_
        chunk_size = pars['chunk_size']
        n_samples = pars['n_samples']
        use_hashing = pars['use_hashing']

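        # Two vectorization paths: with hashing the corpus is split into
        # chunks that are vectorized in parallel and aggregated afterwards;
        # otherwise a single CountVectorizer is fitted on the whole corpus.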
        if use_hashing:
            # make sure that we can initialize the vectorizer
            # (easier to do outside of the parallel loop)
            vect = _vectorize_chunk(dsid_dir, 0, pars, pretend=True)

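        # The 'processing' lock file marks an in-progress extraction; it is
        # removed once this method finishes (successfully or not).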
        processing_lock = (dsid_dir / 'processing')
        _touch(processing_lock)
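        # Resolve stop words: load a custom list previously stored in the
        # cache if one matches the requested name, otherwise generate the
        # list with self._generate_stop_words.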
        custom_sw = _StopWordsWrapper(cache_dir=self.cache_dir)
        if pars['stop_words'] in custom_sw:
            pars['stop_words'] = custom_sw.load(pars['stop_words'])
        else:
            pars['stop_words'] = self._generate_stop_words(pars['stop_words'])

        try:
            if use_hashing:
                # fixed in https://github.com/joblib/joblib/pull/414
                _rename_main_thread()
                Parallel(n_jobs=pars['n_jobs'])(
                            delayed(_vectorize_chunk)(dsid_dir, k, pars)
                            for k in range(n_samples//chunk_size + 1))

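                # Merge the per-chunk feature files written by the parallel
                # workers into a single feature matrix.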
                res = self._aggregate_features()

                self._vect = vect
            else:
                opts_tfidf = {key: val for key, val in pars.items()
                              if key in ['stop_words',
                                         'ngram_range', 'analyzer',
                                         'min_df', 'max_df']}

                vect = CountVectorizer(input='content',
                                       max_features=pars['n_features'],
                                       **opts_tfidf)
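                # When column_ids are given, the input is a delimiter-separated
                # file: read it, keep only the selected columns and vectorize
                # their concatenated text.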
                if pars['column_ids'] is not None:
                    # join again in case there is more than one `:` in the file name
                    file_path = ':'.join(pars["filenames_abs"][0].split(':')[:-1])
                    X = pd.read_csv(file_path, sep=pars['column_separator'],
                                    header=None)
                    X = X.iloc[:, pars['column_ids']]
                    # concatenate all columns together
                    text_gen = X.apply(lambda x: ''.join(str(el) for el in x), axis=1).values
                else:
                    text_gen = (
                        _preprocess_stream(_read_file(fname), pars['preprocess'])
                        for fname in pars['filenames_abs'])

                res = vect.fit_transform(text_gen)
                self._vect = vect
            fname = dsid_dir / 'vectorizer'
            if use_hashing:
                joblib.dump(self._vect, str(fname))
            else:
                # faster for pure python objects
                with fname.open('wb') as fh:
                    pickle.dump(self._vect, fh)
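            # Fit the term weighting (TF-IDF style) transformer on the raw
            # counts and persist it alongside the vectorizer.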
            fname = dsid_dir / 'tfidf_transformer'
            wt = SmartTfidfTransformer(pars['weighting'],
                                       norm_alpha=pars['norm_alpha'])
            self._idf = wt
            res = wt.fit_transform(res)
            joblib.dump(self._idf, str(fname))

            del self.pars_['filenames_abs']

            joblib.dump(res, str(dsid_dir / 'features'))
            # remove the intermediate per-chunk feature files
            if use_hashing:
                for filename in dsid_dir.glob('features-*[0-9]*'):
                    filename.unlink()
        finally:
            # remove the processing lock whether extraction succeeded or failed
            if processing_lock.exists():
                processing_lock.unlink()
        _touch(dsid_dir / 'processing_finished')
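
    # Rough usage sketch; the enclosing class name ("FeatureVectorizer") and the
    # ingestion step are outside this fragment and assumed here:
    #
    #     fe = FeatureVectorizer(cache_dir='/tmp/', mode='w')
    #     # ... ingest documents (creates '<cache_dir>/<dsid>/db') ...
    #     fe.transform()               # vectorize and cache the features
    #     X = fe._load_features()      # reload the cached feature matrix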