Example #1
def test_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_.data
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] - 2
    assert fe.db_.data.shape[0] == db.shape[0] - 2
    assert len(fe.filenames_) == len(filenames) - 2

    dbn = fe.db_.data
    assert_equal(db.iloc[[0, 1, 3, 5]]['document_id'].values,
                 dbn['document_id'].values)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()
Example #2
def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path',
                               'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
Example #3
def test_ingestion_pickling():
    import joblib  # sklearn.externals.joblib was removed in recent scikit-learn versions
    db = DocumentIndex.from_folder(data_dir)
    fname = os.path.join(cache_dir, 'document_index')
    # check that db is picklable
    joblib.dump(db, fname)
    db2 = joblib.load(fname)
    os.remove(fname)
Example #4
def test_search_not_found():
    dbi = DocumentIndex.from_folder(data_dir)
    query = pd.DataFrame([{
        'file_path': "DOES_NOT_EXISTS"
    }, {
        'file_path': "0.7.6.28637.txt"
    }])
    with pytest.raises(NotFound):
        sres = dbi.search(query)
Example #5
def test_ingestion_base_dir():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames_, dbi.data
    assert data_dir_res == os.path.normpath(data_dir)
    assert_array_equal(db.columns.values,
                       ['file_path', 'internal_id', 'document_id'])
    assert_array_equal(db.file_path.values, fnames_in)
    assert_array_equal(
        [os.path.normpath(os.path.join(data_dir_res, el)) for el in filenames],
        [os.path.join(data_dir_res, el) for el in db.file_path.values])
Example #6
def test_search_2fields():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()

    query = pd.DataFrame([{
        'internal_id': 3
    }, {
        'internal_id': 1
    }, {
        'internal_id': 2
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [3, 1, 2])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure that if we have some additional field,
    # we still use the internal_id
    query = pd.DataFrame([{
        'internal_id': 1,
        'a': 2
    }, {
        'internal_id': 2,
        'b': 4
    }, {
        'internal_id': 1,
        'a': 3
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    sres = dbi.search(query, drop=False)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(
        sorted(sres.columns),
        sorted(['internal_id', 'file_path', 'document_id', 'a', 'b']))

    query = pd.DataFrame([{
        'file_path': "0.7.6.28637.txt"
    }, {
        'file_path': "0.7.47.117435.txt"
    }])
    del dbi.data['file_path']
    sres = dbi.search(query)
    query_res = [
        dbi.data.file_path.values.tolist().index(el)
        for el in query.file_path.values
    ]
    assert_array_equal(query_res, sres.internal_id)
Example #7
def test_lsi_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] - 2
Example #8
def test_lsi_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] * 2
    # the first n_samples rows of the LSI features should be unchanged after append
    assert_equal(X_lsi_new[:n_samples], X_lsi)
Example #9
    def ingest(self, data_dir=None, file_pattern='.*', dir_pattern='.*',
               dataset_definition=None, vectorize=True,
               document_id_generator='indexed_file_path',
               ):
        """Perform data ingestion

        Parameters
        ----------
        data_dir : str
            path to the data directory (used only if dataset_definition
            is not provided), default: None
        dataset_definition : list of dicts
            a list of dictionaries with keys
            ['file_path', 'document_id', 'rendition_id']
            describing the data ingestion (this takes precedence over data_dir)
        vectorize : bool (default: True)
            whether to vectorize the ingested documents immediately
        """
        dsid_dir = self.cache_dir / self.dsid
        if (dsid_dir / 'db').exists():
            raise ValueError('Dataset {} already vectorized!'
                             .format(self.dsid))
        db_list = list(sorted(dsid_dir.glob('db*')))
        if len(db_list) == 0:
            internal_id_offset = -1
        elif len(db_list) >= 1:
            internal_id_offset = int(db_list[-1].name[3:])

        pars = self.pars_

        if pars.get('column_ids', None) is not None:
            if dataset_definition is None:
                raise ValueError("CSV files can only be provided using "
                                 "the `dataset_definition` parameter")
            else:
                if len(dataset_definition) > 1:
                    raise ValueError(
                            "Only one CSV can be provided at a time"
                    )
                file_path = dataset_definition[0]['file_path']
                X = pd.read_csv(
                        file_path, sep=pars['column_separator'], header=None)
                dataset_definition = [
                        {'file_path': f"{file_path}:{idx}", 'document_id': idx}
                        for idx in range(len(X))]

                db = DocumentIndex.from_list(
                        dataset_definition, data_dir,
                        internal_id_offset + 1, dsid_dir,
                        document_id_generator=document_id_generator)
        elif dataset_definition is not None:
            db = DocumentIndex.from_list(
                    dataset_definition, data_dir,
                    internal_id_offset + 1, dsid_dir,
                    document_id_generator=document_id_generator)
        elif data_dir is not None:
            db = DocumentIndex.from_folder(
                    data_dir, file_pattern, dir_pattern,
                    internal_id_offset + 1,
                    document_id_generator=document_id_generator)
        else:
            db = None

        if db is not None:
            data_dir = db.data_dir

            batch_suffix = '.{:09}'.format(db.data.internal_id.iloc[-1])

            self._filenames = db.data.file_path.values.tolist()
            del db.data['file_path']
            db.data.to_pickle(str(dsid_dir / ('db' + batch_suffix)))
            with (dsid_dir / ('filenames' + batch_suffix)).open('wb') as fh:
                pickle.dump(self._filenames, fh)
            self._db = db

        if vectorize:
            db_list = list(sorted(dsid_dir.glob('db*')))
            filenames_list = list(sorted(dsid_dir.glob('filenames*')))
            if len(db_list) == 0:
                raise ValueError('No ingested files found!')

            if len(db_list) == 1:
                with filenames_list[0].open('rb') as fh:
                    filenames_concat = pickle.load(fh)
            elif len(db_list) >= 2:
                # accumulate different batches into a single file
                # filename file
                filenames_concat = []
                for fname in filenames_list:
                    with fname.open('rb') as fh:
                        filenames_concat += pickle.load(fh)

            if self.pars_['data_dir'] is None:
                data_dir = DocumentIndex._detect_data_dir(filenames_concat)
                self._pars['data_dir'] = data_dir
            else:
                data_dir = self._pars['data_dir']

            self._filenames = [os.path.relpath(el, data_dir)
                               for el in filenames_concat]

            with (dsid_dir / 'filenames').open('wb') as fh:
                pickle.dump(self._filenames, fh)

            for fname in filenames_list:
                fname.unlink()

            # save databases
            if len(db_list) == 1:
                db_list[0].rename(dsid_dir / 'db')
                self.db_.filenames_ = self._filenames
                self.db_.data['file_path'] = self._filenames
            elif len(db_list) >= 2:

                db_concat = []
                for fname in db_list:
                    db_concat.append(pd.read_pickle(str(fname)))
                db_new = pd.concat(db_concat, axis=0)
                db_new.filenames_ = self._filenames
                db_new.set_index('internal_id', drop=False, inplace=True)
                self._db = DocumentIndex(data_dir, db_new)
                if 'file_path' in db_new.columns:
                    del db_new['file_path']
                db_new.to_pickle(str(dsid_dir / 'db'))

            # save parameters
            self._pars['n_samples'] = len(self._filenames)
            self._pars['data_dir'] = data_dir

            with (dsid_dir / 'pars').open('wb') as fh:
                pickle.dump(self._pars, fh)

            self.transform()

            if (dsid_dir / 'raw').exists():
                shutil.rmtree(str(dsid_dir / 'raw'))

        if db is None and not vectorize:
            raise ValueError('At least one of data_dir, dataset_definition, '
                             'vectorize parameters must be provided!')
        return
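
The docstring above describes two mutually exclusive entry points for ingestion: scanning a data_dir, or passing an explicit dataset_definition. The sketch below mirrors the calls made in the tests above; the import path, cache location, and file names are assumptions for illustration, not taken from the source.

import os

# import path assumed from the freediscovery.engine layout seen in Example #10
from freediscovery.engine.vectorizer import FeatureVectorizer

cache_dir = '/tmp/fd_cache'       # hypothetical cache location
data_dir = '/path/to/documents'   # hypothetical corpus folder

# Path 1: scan data_dir and ingest every matching file, then vectorize
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
uuid = fe.setup()
fe.ingest(data_dir=data_dir, vectorize=True)

# Path 2: ingest an explicit list of documents instead of scanning a folder
fe2 = FeatureVectorizer(cache_dir=cache_dir, mode='w')
fe2.setup()
dataset_definition = [
    {'file_path': os.path.join(data_dir, '0.txt'), 'document_id': 10},
    {'file_path': os.path.join(data_dir, '1.txt'), 'document_id': 11},
]
fe2.ingest(dataset_definition=dataset_definition, vectorize=True)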
Example #10
def load_dataset(name='20_newsgroups_3categories',
                 cache_dir='/tmp',
                 verbose=False,
                 verify_checksum=False,
                 document_id_generation='squared',
                 categories=None):
    """Download a benchmark dataset.

    The currently supported datasets are listed below,

    1. TREC 2009 legal collection

       - `treclegal09_2k_subset`  :   2 400 documents,   2 MB
       - `treclegal09_20k_subset` :  20 000 documents,  30 MB
       - `treclegal09_37k_subset` :  37 000 documents,  55 MB
       - `treclegal09`            : 700 000 documents, 1.2 GB

       The ground truth files for categorization are adapted from TAR Toolkit.

    2. Fedora mailing list (2009-2009)
       - `fedora_ml_3k_subset`

    3. The 20 newsgroups dataset
       - `20_newsgroups_3categories`: only the ['comp.graphics',
       'rec.sport.baseball', 'sci.space'] categories

    If you encounter any issues for downloads with this function,
    you can also manually download and extract the required dataset to
    ``cache_dir`` (the download url is ``http://r0h.eu/d/<name>.tar.gz``),
    then re-run this function to get the required metadata.

    Parameters
    ----------
    name : str, default='20_newsgroups_3categories'
       the name of the dataset file to load
    cache_dir : str, default='/tmp/'
       root directory where to save the download
    verbose : bool, default=False
       print download progress
    verify_checksum : bool, default=False
       verify the checksum of the downloaded archive
    document_id_generation : str, default='squared'
       specifies how the document_id is computed from internal_id;
       must be one of ``['identity', 'squared']``
       (``'identity'`` means ``document_id = internal_id``)
    categories : list of str, default=None
       select a subset of the dataset categories (all are kept by default)

    Returns
    -------

    metadata : dict
       a dictionary containing metadata corresponding to the dataset
    training_set : {list of dicts, None}
       a list of dictionaries for the training set
       (None when the dataset has no categories)
    test_set : list of dicts
       a list of dictionaries for the test set
    """
    from freediscovery.engine.ingestion import DocumentIndex
    from freediscovery.io import parse_ground_truth_file

    if name not in IR_DATASETS:
        raise ValueError('Dataset name {} not known!'.format(name))

    valid_fields = ['document_id', 'internal_id', 'file_path', 'category']

    has_categories = '20_newsgroups_' in name or 'treclegal09' in name

    # make sure we don't have "ediscovery_cache" in the path
    cache_dir = _normalize_cachedir(cache_dir)
    cache_dir = os.path.dirname(cache_dir)

    outdir = os.path.join(cache_dir, name)
    fname = outdir

    db = IR_DATASETS[name]

    if '20_newsgroups_' in name:
        if db['url'].endswith('.pkl.xz'):
            import lzma
            fname = name + '.pkl.xz'
            opener = lzma.open
        else:
            fname = name + '.pkl'
            opener = open

        with opener(os.path.join(INTERNAL_DATA_DIR, fname), 'rb') as fh:
            twenty_news = pickle.load(fh)

    # Download the dataset if it doesn't exist
    if not os.path.exists(outdir):
        if '20_newsgroups_' in name:
            os.mkdir(outdir)
            for idx, doc in enumerate(twenty_news.data):
                with open(os.path.join(outdir, '{:05}.txt'.format(idx)),
                          'wt') as fh:  # noqa
                    fh.write(doc)
        else:
            outdir = _get_file(str(fname),
                               db['url'],
                               extract=True,
                               cache_dir=str(cache_dir))
            print('Downloaded {} dataset to {}'.format(name, outdir))

    if 'treclegal09' in name or 'fedora_ml' in name:
        data_dir = os.path.join(outdir, 'data')
    else:
        data_dir = outdir
    md = {'data_dir': str(data_dir), 'name': name}

    di = DocumentIndex.from_folder(str(data_dir))
    di._make_relative_paths()

    training_set = None

    if 'treclegal09' in name:
        negative_files, positive_files = _load_erdm_ground_truth(outdir)

        ground_truth_file = os.path.join(outdir, "ground_truth_file.txt")
        gt = parse_ground_truth_file(str(ground_truth_file))

        res = di.search(gt, drop=False)
        di.data['category'] = res.is_relevant
        di.data['category'] = di.data['category'].apply(
            lambda x: 'positive' if x == 1 else 'negative')
        di.data['is_train'] = False
        res = di.search(
            pd.DataFrame({'file_path': positive_files + negative_files}))
        di.data.loc[res.internal_id.values, 'is_train'] = True
    elif '20_newsgroups_' in name:
        di.data['category'] = np.array(
            twenty_news.target_names)[twenty_news.target]  # noqa
        di.data['is_train'] = ['-train' in el for el in twenty_news.filenames]

    if categories is not None and has_categories:
        mask = di.data.category.isin(categories)
        di.data = di.data[mask]
        di.data['internal_id'] = np.arange(len(di.data['internal_id']))

    di.data.set_index('internal_id', drop=False, inplace=True)

    di.data['document_id'] = _compute_document_id(di.data['internal_id'],
                                                  document_id_generation)
    di.data = di.data.astype('object')

    if has_categories:
        mask = di.data['is_train']
        training_set = di.render_dict(di.data[mask], return_file_path=True)
        training_set = filter_dict(training_set, valid_fields)
        if name == '20_newsgroups_3categories':
            # make a smaller training set
            random.seed(999998)
            training_set = random.sample(
                training_set, min(len(training_set), di.data.shape[0] // 5))

    dataset = di.render_dict(return_file_path=True)

    dataset = filter_dict(dataset, valid_fields)

    return md, training_set, dataset
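
A minimal usage sketch for load_dataset, following the Returns section above; the import path is assumed and the printed fields are only illustrative.

# import path assumed; adjust to wherever load_dataset lives in the installed package
from freediscovery.datasets import load_dataset

# download (or reuse) the benchmark dataset and unpack the three return values
md, training_set, test_set = load_dataset('20_newsgroups_3categories',
                                          cache_dir='/tmp')

print(md['data_dir'])           # folder containing the extracted documents
print(len(test_set))            # one dict per document in the dataset
print(sorted(training_set[0]))  # keys such as category, document_id, file_path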