Example #1
def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path',
                               'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples is consistent (force pars to be reloaded from disk)
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()
Example #2
def test_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_.data
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] - 2
    assert fe.db_.data.shape[0] == db.shape[0] - 2
    assert len(fe.filenames_) == len(filenames) - 2

    dbn = fe.db_.data
    assert_equal(db.iloc[[0, 1, 3, 5]]['document_id'].values,
                 dbn['document_id'].values)
    # check that internal id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples is consistent (force pars to be reloaded from disk)
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()
Example #3
def test_ingestion_pickling():
    from sklearn.externals import joblib
    db = DocumentIndex.from_folder(data_dir)
    fname = os.path.join(cache_dir, 'document_index')
    # check that db is picklable
    joblib.dump(db, fname)
    db2 = joblib.load(fname)
    os.remove(fname)
Example #4
def test_search_document_id():
    md = [{
        'file_path': '/test',
        'document_id': 2
    }, {
        'file_path': '/test2',
        'document_id': 1
    }, {
        'file_path': '/test3',
        'document_id': 7
    }, {
        'file_path': '/test8',
        'document_id': 9
    }, {
        'file_path': '/test9',
        'document_id': 4
    }]

    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()
    query = pd.DataFrame([{
        'internal_id': 1
    }, {
        'internal_id': 2
    }, {
        'internal_id': 1
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure we use internal id first
    query = pd.DataFrame([{
        'internal_id': 1,
        'document_id': 2
    }, {
        'internal_id': 2,
        'document_id': 2
    }, {
        'internal_id': 1,
        'document_id': 2
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])

    query = pd.DataFrame([{
        'document_id': 4
    }, {
        'document_id': 9
    }, {
        'document_id': 2
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [4, 3, 0])
Example #5
def test_search_not_found():
    dbi = DocumentIndex.from_folder(data_dir)
    query = pd.DataFrame([{
        'file_path': "DOES_NOT_EXISTS"
    }, {
        'file_path': "0.7.6.28637.txt"
    }])
    with pytest.raises(NotFound):
        sres = dbi.search(query)
    def remove(self, dataset_definition):
        """ Remove some documents from the dataset

        This is by no means an efficient operation; processing all the files
        at once may be more suitable in most situations.
        """
        from freediscovery.engine.lsi import _LSIWrapper
        dsid_dir = self.dsid_dir
        db_old = self.db_.data
        query = pd.DataFrame(dataset_definition)
        res = self.db_.search(query, drop=False)
        del_internal_id = res.internal_id.values
        internal_id_mask = ~np.in1d(db_old.internal_id.values, del_internal_id)

        # write down the new features file
        X_old = self._load_features()
        X = X_old[internal_id_mask, :]
        joblib.dump(X, str(dsid_dir / 'features'))

        # write down the new filenames file
        filenames = list(np.array(self.filenames_)[internal_id_mask])
        with (dsid_dir / 'filenames').open('wb') as fh:
            pickle.dump(filenames, fh)
        self._filenames = filenames

        # write down the new database file
        db = db_old.iloc[internal_id_mask].copy()
        # create a new contiguous internal_id
        db['internal_id'] = np.arange(db.shape[0], dtype='int')
        self._db = DocumentIndex(self.pars_['data_dir'], db)
        if 'file_path' in db.columns:
            del db['file_path']
        db.to_pickle(str(dsid_dir / 'db'))

        # write down the new pars file
        self._pars = self.pars_
        self._pars['n_samples'] = len(filenames)
        with (dsid_dir / 'pars').open('wb') as fh:
            pickle.dump(self._pars, fh)

        # find all existing LSI models and update them as well
        if (dsid_dir / 'lsi').exists():
            for lsi_id in os.listdir(str(dsid_dir / 'lsi')):
                _fname = dsid_dir / 'lsi' / lsi_id / 'data'
                if _fname.exists():
                    X_lsi_old = joblib.load(str(_fname))
                    X_lsi = X_lsi_old[internal_id_mask]
                    joblib.dump(X_lsi, str(_fname))

        # remove all trained models for this dataset
        for model_type in ['categorizer', 'dupdet', 'cluster', 'threading']:
            if (dsid_dir / model_type).exists():
                for mid in os.listdir(str(dsid_dir / model_type)):
                    shutil.rmtree(str(dsid_dir / model_type / mid))
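For orientation, here is a minimal usage sketch of `remove`, modelled on `test_remove_documents` above. The `FeatureVectorizer` import path and the two placeholder directories are assumptions; they are not shown in the snippets on this page.

# Hypothetical sketch, assuming FeatureVectorizer is importable from
# freediscovery.engine.vectorizer (module path not shown above).
from freediscovery.engine.vectorizer import FeatureVectorizer

cache_dir = '/tmp/fd_cache'        # placeholder cache directory
data_dir = '/tmp/fd_example_data'  # placeholder folder containing text files

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
fe.setup()
fe.ingest(data_dir)

# remove the first two ingested documents, identified by their document_id
docs = fe.db_.data
fe.remove(docs[['document_id']].iloc[:2].to_dict(orient='records'))
# the feature matrix, filenames_ and db_ now all have two rows fewer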
Example #7
def test_ingestion_base_dir():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames_, dbi.data
    assert data_dir_res == os.path.normpath(data_dir)
    assert_array_equal(db.columns.values,
                       ['file_path', 'internal_id', 'document_id'])
    assert_array_equal(db.file_path.values, fnames_in)
    assert_array_equal(
        [os.path.normpath(os.path.join(data_dir_res, el)) for el in filenames],
        [os.path.join(data_dir_res, el) for el in db.file_path.values])
    @property
    def db_(self):
        """ DocumentIndex """
        if not hasattr(self, '_db') or self._db is None:
            dsid = self.dsid
            if self.cache_dir is None:
                raise InitException('cache_dir is None: cannot load from cache!')
            dsid_dir = self.cache_dir / dsid
            if not dsid_dir.exists():
                raise DatasetNotFound('dsid {} not found!'.format(dsid))
            data = pd.read_pickle(str(dsid_dir / 'db'))
            self._db = DocumentIndex(self.pars_['data_dir'], data)
        return self._db
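A short sketch of the lazy-loading behaviour this property implements, continuing from a `FeatureVectorizer` instance `fe` that was created and ingested as in the tests above (the printed columns are those shown elsewhere on this page):

# Sketch of the caching behaviour, assuming fe has already ingested data.
index = fe.db_            # first access unpickles <cache_dir>/<dsid>/db
assert index is fe.db_    # later accesses return the cached self._db
print(index.data[['internal_id', 'document_id']].head())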
Example #9
def test_search_2fields():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()

    query = pd.DataFrame([{
        'internal_id': 3
    }, {
        'internal_id': 1
    }, {
        'internal_id': 2
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [3, 1, 2])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure that if we have some additional field,
    # we still use the internal_id
    query = pd.DataFrame([{
        'internal_id': 1,
        'a': 2
    }, {
        'internal_id': 2,
        'b': 4
    }, {
        'internal_id': 1,
        'a': 3
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    sres = dbi.search(query, drop=False)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(
        sorted(sres.columns),
        sorted(['internal_id', 'file_path', 'document_id', 'a', 'b']))

    query = pd.DataFrame([{
        'file_path': "0.7.6.28637.txt"
    }, {
        'file_path': "0.7.47.117435.txt"
    }])
    del dbi.data['file_path']
    sres = dbi.search(query)
    query_res = [
        dbi.data.file_path.values.tolist().index(el)
        for el in query.file_path.values
    ]
    assert_array_equal(query_res, sres.internal_id)
def test_lsi_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] - 2
def test_lsi_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0]*2
    assert_equal(X_lsi_new[:n_samples], X_lsi)
Example #12
def test_ingestion_metadata(n_fields):
    metadata = []
    for idx, fname in enumerate(fnames_in_abs):
        el = {'file_path': fname}
        if n_fields >= 2:
            el['document_id'] = 'a' + str(idx + 100)
        if n_fields >= 3:
            el['rendition_id'] = 1
        metadata.append(el)

    dbi = DocumentIndex.from_list(metadata)
    dbi._make_relative_paths()
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames_, dbi.data

    if n_fields in [1, 2]:
        columns_ref = sorted(['file_path', 'document_id', 'internal_id'])
    elif n_fields == 3:
        columns_ref = sorted(
            ['file_path', 'document_id', 'rendition_id', 'internal_id'])

    assert_array_equal(sorted(db.columns.values), columns_ref)
    assert_array_equal(
        [os.path.normpath(os.path.join(data_dir_res, el)) for el in filenames],
        [os.path.join(data_dir_res, el) for el in db.file_path.values])
Example #13
def test_bad_search_document_rendition_id():
    md = [{
        'file_path': '/test',
        'document_id': 0,
        'rendition_id': 0
    }, {
        'file_path': '/test2',
        'document_id': 0,
        'rendition_id': 1
    }, {
        'file_path': '/test3',
        'document_id': 1,
        'rendition_id': 0
    }, {
        'file_path': '/test8',
        'document_id': 2,
        'rendition_id': 0
    }, {
        'file_path': '/test9',
        'document_id': 3,
        'rendition_id': 0
    }]
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    # a query mixing internal_id and document_id records cannot be resolved
    dbi = DocumentIndex.from_list(md)
    query = pd.DataFrame([{
        'internal_id': 1
    }, {
        'internal_id': 2
    }, {
        'document_id': 1
    }])
    with pytest.raises(NotFound):
        sres = dbi.search(query)
    def append(self, dataset_definition, data_dir=None):
        """ Add some documents to the dataset

        This is by no means an efficient operation; processing all the files
        at once may be more suitable in most situations.
        """
        from freediscovery.engine.lsi import _LSIWrapper
        dsid_dir = self.dsid_dir
        db_old = self.db_.data
        internal_id_offset = db_old.internal_id.max()
        db_extra = DocumentIndex.from_list(dataset_definition, data_dir,
                                           internal_id_offset + 1, dsid_dir)
        db_new = db_extra.data
        vect = self.vect_
        tfidf = self.tfidf_

        filenames_new = list(db_new.file_path.values)

        # write down the new features file
        X_new_raw = vect.transform(filenames_new)
        X_new = tfidf.transform(X_new_raw)
        X_old = self._load_features()
        X = scipy.sparse.vstack((X_new, X_old))
        joblib.dump(X, str(dsid_dir / 'features'))

        # write down the new filenames file
        filenames_old = list(self.filenames_)
        filenames = filenames_old + filenames_new

        data_dir = DocumentIndex._detect_data_dir(filenames)
        self._pars['data_dir'] = data_dir

        self._filenames = [os.path.relpath(el, data_dir)
                           for el in filenames]

        with (dsid_dir / 'filenames').open('wb') as fh:
            pickle.dump(self._filenames, fh)
        del db_new['file_path']

        # write down the new pars file
        self._pars = self.pars_
        self._pars['n_samples'] = len(filenames)
        with (dsid_dir / 'pars').open('wb') as fh:
            pickle.dump(self._pars, fh)

        # write down the new database file
        db = pd.concat((db_old, db_new))
        if 'file_path' in db.columns:
            del db['file_path']
        db.to_pickle(str(dsid_dir / 'db'))
        self._db = DocumentIndex(self.pars_['data_dir'], db)

        # find all existing LSI models and update them as well
        if (dsid_dir / 'lsi').exists():
            for lsi_id in os.listdir(str(dsid_dir / 'lsi')):
                lsi_obj = _LSIWrapper(cache_dir=self.cache_dir,
                                      mid=lsi_id)
                lsi_obj.append(X_new)

        # remove all trained models for this dataset
        for model_type in ['categorizer', 'dupdet', 'cluster', 'threading']:
            if (dsid_dir / model_type).exists():
                for mid in os.listdir(str(dsid_dir / model_type)):
                    shutil.rmtree(str(dsid_dir / model_type / mid))
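And a minimal usage sketch of `append`, mirroring `test_append_documents` at the top of the page. Here `fe` is an already-ingested FeatureVectorizer and `data_dir` the original data folder (as in the sketch after the `remove` method); the `document_id` values are placeholders.

# Hypothetical sketch based on test_append_documents above.
import os

new_docs = [
    {'file_path': os.path.join(data_dir, '0.7.6.28637.txt'),
     'document_id': 100},     # placeholder document_id
    {'file_path': os.path.join(data_dir, '0.7.47.117435.txt'),
     'document_id': 101},     # placeholder document_id
]
fe.append(new_docs)
# features, filenames_, db_ and any fitted LSI models now include the two
# appended documents; downstream models (categorizer, cluster, ...) are
# invalidated and removed.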
    def ingest(self, data_dir=None, file_pattern='.*', dir_pattern='.*',
               dataset_definition=None, vectorize=True,
               document_id_generator='indexed_file_path',
               ):
        """Perform data ingestion

        Parameters
        ----------
        data_dir : str
            path to the data directory (used only if dataset_definition
            is not provided), default: None
        dataset_definition : list of dicts
            a list of dictionaries with keys
            ['file_path', 'document_id', 'rendition_id']
            describing the data ingestion (this overrides data_dir)
        vectorize : bool (default: True)
            whether to vectorize the ingested documents immediately
        """
        dsid_dir = self.cache_dir / self.dsid
        if (dsid_dir / 'db').exists():
            raise ValueError('Dataset {} already vectorized!'
                             .format(self.dsid))
        db_list = list(sorted(dsid_dir.glob('db*')))
        if len(db_list) == 0:
            internal_id_offset = -1
        elif len(db_list) >= 1:
            internal_id_offset = int(db_list[-1].name[3:])

        pars = self.pars_

        if pars.get('column_ids', None) is not None:
            if dataset_definition is None:
                raise ValueError("CSV files can only be privided using "
                                 "`dataset_definition` parameter")
            else:
                if len(dataset_definition) > 1:
                    raise ValueError(
                            "Only one CSV can be provided at a time"
                    )
                file_path = dataset_definition[0]['file_path']
                X = pd.read_csv(
                        file_path, sep=pars['column_separator'], header=None)
                dataset_definition = [
                        {'file_path': f"{file_path}:{idx}", 'document_id': idx}
                        for idx in range(len(X))]

                db = DocumentIndex.from_list(
                        dataset_definition, data_dir,
                        internal_id_offset + 1, dsid_dir,
                        document_id_generator=document_id_generator)
        elif dataset_definition is not None:
            db = DocumentIndex.from_list(
                    dataset_definition, data_dir,
                    internal_id_offset + 1, dsid_dir,
                    document_id_generator=document_id_generator)
        elif data_dir is not None:
            db = DocumentIndex.from_folder(
                    data_dir, file_pattern, dir_pattern,
                    internal_id_offset + 1,
                    document_id_generator=document_id_generator)
        else:
            db = None

        if db is not None:
            data_dir = db.data_dir

            batch_suffix = '.{:09}'.format(db.data.internal_id.iloc[-1])

            self._filenames = db.data.file_path.values.tolist()
            del db.data['file_path']
            db.data.to_pickle(str(dsid_dir / ('db' + batch_suffix)))
            with (dsid_dir / ('filenames' + batch_suffix)).open('wb') as fh:
                pickle.dump(self._filenames, fh)
            self._db = db

        if vectorize:
            db_list = list(sorted(dsid_dir.glob('db*')))
            filenames_list = list(sorted(dsid_dir.glob('filenames*')))
            if len(db_list) == 0:
                raise ValueError('No ingested files found!')

            if len(db_list) == 1:
                with filenames_list[0].open('rb') as fh:
                    filenames_concat = pickle.load(fh)
            elif len(db_list) >= 2:
                # accumulate the different batches into a single
                # filenames file
                filenames_concat = []
                for fname in filenames_list:
                    with fname.open('rb') as fh:
                        filenames_concat += pickle.load(fh)

            if self.pars_['data_dir'] is None:
                data_dir = DocumentIndex._detect_data_dir(filenames_concat)
                self._pars['data_dir'] = data_dir
            else:
                data_dir = self._pars['data_dir']

            self._filenames = [os.path.relpath(el, data_dir)
                               for el in filenames_concat]

            with (dsid_dir / 'filenames').open('wb') as fh:
                pickle.dump(self._filenames, fh)

            for fname in filenames_list:
                fname.unlink()

            # save databases
            if len(db_list) == 1:
                db_list[0].rename(dsid_dir / 'db')
                self.db_.filenames_ = self._filenames
                self.db_.data['file_path'] = self._filenames
            elif len(db_list) >= 2:

                db_concat = []
                for fname in db_list:
                    db_concat.append(pd.read_pickle(str(fname)))
                db_new = pd.concat(db_concat, axis=0)
                db_new.filenames_ = self._filenames
                db_new.set_index('internal_id', drop=False, inplace=True)
                self._db = DocumentIndex(data_dir, db_new)
                if 'file_path' in db_new.columns:
                    del db_new['file_path']
                db_new.to_pickle(str(dsid_dir / 'db'))

            # save parameters
            self._pars['n_samples'] = len(self._filenames)
            self._pars['data_dir'] = data_dir

            with (dsid_dir / 'pars').open('wb') as fh:
                pickle.dump(self._pars, fh)

            self.transform()

            if (dsid_dir / 'raw').exists():
                shutil.rmtree(str(dsid_dir / 'raw'))

        if db is None and not vectorize:
            raise ValueError('At least one of data_dir, dataset_definition, '
                             'vectorize parameters must be provided!')
        return
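For context, a minimal sketch of the two ingestion paths the method above handles. The import path and the two directories are placeholders/assumptions, not taken from the snippets on this page.

# Hypothetical sketch of the two ingestion modes.
import os
from freediscovery.engine.vectorizer import FeatureVectorizer  # assumed import path

cache_dir = '/tmp/fd_cache'        # placeholder
data_dir = '/tmp/fd_example_data'  # placeholder folder containing text files

# 1) ingest a whole folder and vectorize in one step
fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
fe.setup()
fe.ingest(data_dir=data_dir)

# 2) ingest explicit batches first, then vectorize once at the end
fe2 = FeatureVectorizer(cache_dir=cache_dir, mode='w')
fe2.setup()
fe2.ingest(dataset_definition=[
               {'file_path': os.path.join(data_dir, '0.7.6.28637.txt'),
                'document_id': 1}],
           vectorize=False)
fe2.ingest(vectorize=True)   # merges the stored db*/filenames* batches,
                             # computes the features and cleans up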
Example #16
def test_search_document_rendition_id():
    md = [{
        'file_path': '/test',
        'document_id': 0,
        'rendition_id': 0
    }, {
        'file_path': '/test2',
        'document_id': 0,
        'rendition_id': 1
    }, {
        'file_path': '/test3',
        'document_id': 1,
        'rendition_id': 0
    }, {
        'file_path': '/test8',
        'document_id': 2,
        'rendition_id': 0
    }, {
        'file_path': '/test9',
        'document_id': 3,
        'rendition_id': 0
    }]

    for idx, el in enumerate(md):
        el['internal_id'] = idx

    # can always index with internal_id
    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()

    query = pd.DataFrame([{
        'internal_id': 1
    }, {
        'internal_id': 2
    }, {
        'internal_id': 1
    }])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(
        sorted(sres.columns),
        sorted(['internal_id', 'file_path', 'document_id', 'rendition_id']))

    # the document_id alone is not sufficient to uniquely identify documents in this case
    query = pd.DataFrame([{
        'document_id': 0
    }, {
        'document_id': 1
    }, {
        'document_id': 2
    }])
    with pytest.raises(ValueError):
        sres = dbi.search(query)

    query = pd.DataFrame([{
        'document_id': 0,
        'rendition_id': 0
    }, {
        'document_id': 1,
        'rendition_id': 0
    }, {
        'document_id': 2,
        'rendition_id': 0
    }])

    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [0, 2, 3])
Example #17
def load_dataset(name='20_newsgroups_3categories',
                 cache_dir='/tmp',
                 verbose=False,
                 verify_checksum=False,
                 document_id_generation='squared',
                 categories=None):
    """Download a benchmark dataset.

    The currently supported datasets are listed below,

    1. TREC 2009 legal collection

       - `treclegal09_2k_subset`  :   2 400 documents,   2 MB
       - `treclegal09_20k_subset` :  20 000 documents,  30 MB
       - `treclegal09_37k_subset` :  37 000 documents,  55 MB
       - `treclegal09`            : 700 000 documents, 1.2 GB

       The ground truth files for categorization are adapted from TAR Toolkit.

    2. Fedora mailing list (2009-2009)
       - `fedora_ml_3k_subset`

    3. The 20 newsgroups dataset
       - `20_newsgroups_3categories`: only the ['comp.graphics',
       'rec.sport.baseball', 'sci.space'] categories

    If you encounter any issues downloading with this function,
    you can also manually download and extract the required dataset to
    ``cache_dir`` (the download url is ``http://r0h.eu/d/<name>.tar.gz``),
    then re-run this function to get the required metadata.

    Parameters
    ----------
    name : str, default='20_newsgroups_3categories'
       the name of the dataset file to load
    cache_dir : str, default='/tmp/'
       root directory where to save the download
    verbose : bool, default=False
       print download progress
    verify_checksum : bool, default=False
       verify the checksum of the downloaded archive
    document_id_generation : str, default='squared'
       specifies how the document_id is computed from the internal_id;
       must be one of ``['identity', 'squared']``
       (``'identity'`` means ``document_id = internal_id``)
    categories : list of str, default=None
       select a subset of the dataset categories (by default all are kept)

    Returns
    -------

    metadata: dict
       a dictionary containing metadata corresponding to the dataset
    training_set : {list of dicts, None}
       a list of dictionaries for the training set
       (None if the dataset has no training labels)
    test_set : list of dicts
       a list of dictionaries for the test set
    """
    from freediscovery.engine.ingestion import DocumentIndex
    from freediscovery.io import parse_ground_truth_file

    if name not in IR_DATASETS:
        raise ValueError('Dataset name {} not known!'.format(name))

    valid_fields = ['document_id', 'internal_id', 'file_path', 'category']

    has_categories = '20_newsgroups_' in name or 'treclegal09' in name

    # make sure we don't have "ediscovery_cache" in the path
    cache_dir = _normalize_cachedir(cache_dir)
    cache_dir = os.path.dirname(cache_dir)

    outdir = os.path.join(cache_dir, name)
    fname = outdir

    db = IR_DATASETS[name]

    if '20_newsgroups_' in name:
        if db['url'].endswith('.pkl.xz'):
            import lzma
            fname = name + '.pkl.xz'
            opener = lzma.open
        else:
            fname = name + '.pkl'
            opener = open

        with opener(os.path.join(INTERNAL_DATA_DIR, fname), 'rb') as fh:
            twenty_news = pickle.load(fh)

    # Download the dataset if it doesn't exist
    if not os.path.exists(outdir):
        if '20_newsgroups_' in name:
            os.mkdir(outdir)
            for idx, doc in enumerate(twenty_news.data):
                with open(os.path.join(outdir, '{:05}.txt'.format(idx)),
                          'wt') as fh:  # noqa
                    fh.write(doc)
        else:
            outdir = _get_file(str(fname),
                               db['url'],
                               extract=True,
                               cache_dir=str(cache_dir))
            print('Downloaded {} dataset to {}'.format(name, outdir))

    if 'treclegal09' in name or 'fedora_ml' in name:
        data_dir = os.path.join(outdir, 'data')
    else:
        data_dir = outdir
    md = {'data_dir': str(data_dir), 'name': name}

    di = DocumentIndex.from_folder(str(data_dir))
    di._make_relative_paths()

    training_set = None

    if 'treclegal09' in name:
        negative_files, positive_files = _load_erdm_ground_truth(outdir)

        ground_truth_file = os.path.join(outdir, "ground_truth_file.txt")
        gt = parse_ground_truth_file(str(ground_truth_file))

        res = di.search(gt, drop=False)
        di.data['category'] = res.is_relevant
        di.data['category'] = di.data['category'].apply(
            lambda x: 'positive' if x == 1 else 'negative')
        di.data['is_train'] = False
        res = di.search(
            pd.DataFrame({'file_path': positive_files + negative_files}))
        di.data.loc[res.internal_id.values, 'is_train'] = True
    elif '20_newsgroups_' in name:
        di.data['category'] = np.array(
            twenty_news.target_names)[twenty_news.target]  # noqa
        di.data['is_train'] = ['-train' in el for el in twenty_news.filenames]

    if categories is not None and has_categories:
        mask = di.data.category.isin(categories)
        di.data = di.data[mask]
        di.data['internal_id'] = np.arange(len(di.data['internal_id']))

    di.data.set_index('internal_id', drop=False, inplace=True)

    di.data['document_id'] = _compute_document_id(di.data['internal_id'],
                                                  document_id_generation)
    di.data = di.data.astype('object')

    if has_categories:
        mask = di.data['is_train']
        training_set = di.render_dict(di.data[mask], return_file_path=True)
        training_set = filter_dict(training_set, valid_fields)
        if name == '20_newsgroups_3categories':
            # make a smaller training set
            random.seed(999998)
            training_set = random.sample(
                training_set, min(len(training_set), di.data.shape[0] // 5))

    dataset = di.render_dict(return_file_path=True)

    dataset = filter_dict(dataset, valid_fields)

    return md, training_set, dataset
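A minimal usage sketch for `load_dataset`; the import path is an assumption and `/tmp` is just the default download location. The dataset name and category come from the docstring above.

# Hypothetical sketch; load_dataset's module path is assumed.
from freediscovery.datasets import load_dataset  # assumed import path

md, training_set, test_set = load_dataset('20_newsgroups_3categories',
                                          cache_dir='/tmp',
                                          categories=['sci.space'])
print(md['data_dir'])
print(len(training_set), len(test_set))
# each entry holds document_id, internal_id, file_path and category
print(test_set[0])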
Example #18
def test_ingestion_render(return_file_path):
    def _process_results(rd):
        rd = pd.DataFrame(rd)
        if return_file_path:
            assert 'file_path' in rd.columns
            del rd['file_path']
        return rd

    # make it a binary variable
    return_file_path = (return_file_path == 'return_file_path')

    md = [{
        'file_path': '/test',
        'document_id': 2
    }, {
        'file_path': '/test2',
        'document_id': 1
    }, {
        'file_path': '/test3',
        'document_id': 7
    }, {
        'file_path': '/test8',
        'document_id': 9
    }, {
        'file_path': '/test9',
        'document_id': 4
    }]

    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)
    query = pd.DataFrame([{
        'a': 2,
        'internal_id': 3
    }, {
        'a': 4,
        'internal_id': 1
    }])
    res = pd.DataFrame([{
        'a': 2,
        'internal_id': 3,
        'document_id': 9
    }, {
        'a': 4,
        'internal_id': 1,
        'document_id': 1
    }])

    rd = dbi.render_dict(query, return_file_path=return_file_path)
    rd = _process_results(rd)
    assert_frame_equal(rd, res)
    rd = dbi.render_dict(return_file_path=return_file_path)
    rd = _process_results(rd)
    assert_frame_equal(rd.loc[[0]],
                       pd.DataFrame([{
                           'internal_id': 0,
                           'document_id': 2
                       }]))
    assert len(rd) == len(md)

    rd = dbi.render_list(res, return_file_path=return_file_path)
    rd = _process_results(rd)
    assert sorted(rd.keys()) == sorted(['internal_id', 'document_id', 'a'])
    assert_frame_equal(pd.DataFrame(rd),
                       pd.DataFrame([{
                           'a': 2,
                           'internal_id': 3,
                           'document_id': 9
                       }, {
                           'a': 4,
                           'internal_id': 1,
                           'document_id': 1
                       }]),
                       check_like=True)

    rd = dbi.render_list()
    assert sorted(rd.keys()) == sorted(['internal_id', 'document_id'])