def test_custom_mid():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    mid_orig = "sklds"

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mid=mid_orig,
                      mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    lsi._load_features()

    assert lsi.mid == mid_orig

    with pytest.raises(WrongParameter):
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mid=mid_orig,
                          mode='w')
        lsi.fit_transform(n_components=2, alpha=1.0)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mid=mid_orig,
                      mode='fw')
    lsi.fit_transform(n_components=2, alpha=1.0)

    with pytest.raises(WrongParameter):
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mid='?',
                          mode='fw')
        lsi.fit_transform(n_components=2, alpha=1.0)
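

# Illustrative sketch (not one of the original tests; the helper name is
# hypothetical): once an LSI model has been fitted under a custom ``mid``,
# it can be reloaded later by that identifier alone, as is done for existing
# LSI models in ``append`` below. The default (read) mode is assumed.
def _example_reload_by_custom_mid(cache_dir, mid):
    lsi = _LSIWrapper(cache_dir=cache_dir, mid=mid)
    return lsi._load_features()
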
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    dsid = fe.setup(n_features=n_features,
                    use_hashing=False,
                    stop_words='english',
                    min_df=0.1,
                    max_df=0.9)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid, mode='w')
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, fe.filenames_, lsi.mid
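

# fd_setup() is intended to be called from individual tests; a typical
# (illustrative) use mirrors its return statement:
#   cache_dir, dsid, filenames, lsi_id = fd_setup()
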
def test_lsi_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] - 2
@pytest.mark.parametrize('kind', ['regular', 'semantic'])  # assumed values; only 'semantic' is handled specially below
def test_search_wrapper(kind):
    # check for syntax errors etc in the wrapper

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    vect_uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    if kind == 'semantic':
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
        lsi.fit_transform(n_components=20)
        parent_id = lsi.mid
    else:
        parent_id = vect_uuid

    sw = _SearchWrapper(cache_dir=cache_dir, parent_id=parent_id)
    dist = sw.search("so that I can reserve a room")
    assert dist.shape == (fe.n_samples_, )
    # document 1 found by
    # grep -rn "so that I can reserve a room"
    # freediscovery/data/ds_001/raw/
    assert dist.argmax() == 1
def test_lsi():

    cache_dir = check_cache()
    n_components = 2

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=n_components, alpha=1.0)
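    # the wrapper is expected to adjust the effective LSI dimensionality for
    # small datasets, which presumably explains why 5 components are asserted
    # below rather than the 2 that were requested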
    assert lsi_res.components_.shape[0] == 5
    assert lsi_res.components_.shape[1] == fe.n_features_
    assert lsi._load_pars() is not None
    lsi._load_model()
    X_lsi = lsi._load_features()

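    # the LSI features are expected to be L2-normalized row-wise, so
    # normalizing them again should be a no-op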
    assert_allclose(normalize(X_lsi), X_lsi)

    lsi.list_models()
    lsi.delete()
def test_lsi_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0]*2
    assert_equal(X_lsi_new[:n_samples], X_lsi)
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing

    cache_dir = check_cache()

    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    ground_truth = parse_ground_truth_file(os.path.join(data_dir,
                                           "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.fit(
                                    index,
                                    ground_truth.is_relevant.values,
                                    method=method
                                    )
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir, parent_id=parent_id)
            cm = getattr(cat, method)
            labels = cm(2)

            htree = cat._get_htree(cat.pipeline.data)

            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError('unknown method: {}'.format(method))
    def append(self, dataset_definition, data_dir=None):
        """ Add some documents to the dataset

        This is by no means an efficient operation; ingesting all the files
        at once is usually preferable.
        """
        from freediscovery.engine.lsi import _LSIWrapper
        dsid_dir = self.dsid_dir
        db_old = self.db_.data
        internal_id_offset = db_old.internal_id.max()
        db_extra = DocumentIndex.from_list(dataset_definition, data_dir,
                                           internal_id_offset + 1, dsid_dir)
        db_new = db_extra.data
        vect = self.vect_
        tfidf = self.tfidf_

        filenames_new = list(db_new.file_path.values)

        # write down the new features file
        X_new_raw = vect.transform(filenames_new)
        X_new = tfidf.transform(X_new_raw)
        X_old = self._load_features()
        X = scipy.sparse.vstack((X_old, X_new))
        joblib.dump(X, str(dsid_dir / 'features'))

        # write down the new filenames file
        filenames_old = list(self.filenames_)
        filenames = filenames_old + filenames_new

        data_dir = DocumentIndex._detect_data_dir(filenames)
        self._pars['data_dir'] = data_dir

        self._filenames = [os.path.relpath(el, data_dir)
                           for el in filenames]

        with (dsid_dir / 'filenames').open('wb') as fh:
            pickle.dump(self._filenames, fh)
        del db_new['file_path']

        # write down the new pars file
        self._pars = self.pars_
        self._pars['n_samples'] = len(filenames)
        with (dsid_dir / 'pars').open('wb') as fh:
            pickle.dump(self._pars, fh)

        # write down the new database file
        db = pd.concat((db_old, db_new))
        if 'file_path' in db.columns:
            del db['file_path']
        db.to_pickle(str(dsid_dir / 'db'))
        self._db = DocumentIndex(self.pars_['data_dir'], db)

        # find all existing LSI models and update them as well
        if (dsid_dir / 'lsi').exists():
            for lsi_id in os.listdir(str(dsid_dir / 'lsi')):
                lsi_obj = _LSIWrapper(cache_dir=self.cache_dir,
                                      mid=lsi_id)
                lsi_obj.append(X_new)

        # remove all trained models for this dataset
        for model_type in ['categorizer', 'dupdet', 'cluster', 'threading']:
            if (dsid_dir / model_type).exists():
                for mid in os.listdir(str(dsid_dir / model_type)):
                    shutil.rmtree(str(dsid_dir / model_type / mid))
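
    # Usage sketch (illustrative; the paths and ids below are made up,
    # mirroring test_lsi_append_documents above): each entry of
    # ``dataset_definition`` is a dict with a ``file_path`` and, optionally,
    # a ``document_id``, e.g.
    #
    #   fe.append([{'file_path': '/path/to/doc_11.txt', 'document_id': 11},
    #              {'file_path': '/path/to/doc_12.txt', 'document_id': 12}])
    #
    # Existing LSI models are updated in place, while previously trained
    # categorization, duplicate-detection, clustering and threading models
    # are removed.
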
basename = Path(__file__).parent

cache_dir = check_cache()

EPSILON = 1e-4

data_dir = basename / ".." / ".." / "data" / "ds_001" / "raw"

ground_truth = parse_ground_truth_file(
    str(data_dir / ".." / "ground_truth_file.txt"))

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
fe.ingest(str(data_dir), file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
lsi.fit_transform(n_components=6)

_test_cases = itertools.product([False, True], [
    "LinearSVC", "LogisticRegression", 'xgboost', "NearestNeighbor",
    "NearestCentroid"
], [None, 'fast'])

# 'MLPClassifier' and 'ensemble-stacking' are not supported in production at the moment
_test_cases = filter(lambda x: not (x[1].startswith("Nearest") and x[2]),
                     _test_cases)
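
# For illustration, the filtered grid yields tuples such as
# (False, 'LinearSVC', None) or (True, 'LogisticRegression', 'fast'), while
# the "Nearest*" methods are only kept with cv=None.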


@pytest.mark.parametrize('use_lsi, method, cv', _test_cases)
def test_categorization(use_lsi, method, cv):