Example #1
    def analyse_document(self, project_id, document_file):
        """
        Analyse a document by comparing it to a project model.

        """
        project = self.get_project(project_id)

        # Create temporary index for the new document
        analyser = DefaultAnalyser(stopword_list=project.stopwords)
        doc_index = Index.create(Schema(text=TEXT(analyser=analyser)),
                                 storage_cls=SqliteMemoryStorage)
        text = document_file.read()
        doc_index.add_document(text=text,
                               frame_size=0,
                               encoding_errors='replace')

        index = Index.open(os.path.join(ProjectStorage.DATA_DIR, project_id),
                           storage_cls=SqliteStorage)
        lsi_plugin = LSIPlugin(index)
        results = {}
        for facet_name in project.facets:
            facet_results = lsi_plugin.compare_document(
                doc_index,
                model_filter_query=QSQ('facet={}'.format(facet_name)))
            # Replace frame ids with document names (iterate over a copy of
            # the keys, since the dict is mutated inside the loop)
            for frame_id in list(facet_results.keys()):
                doc_name = index.get_frame(frame_id)['document_name']
                facet_results[doc_name] = facet_results.pop(frame_id)
            results[facet_name] = facet_results

        return results
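
For context, a caller might use the method as in the sketch below. This is a sketch only: the `ProjectService` class name, the `report.txt` path, and the placeholder project id are illustrative assumptions, not part of the source.

# Hypothetical caller; 'ProjectService' and 'report.txt' are assumed names.
service = ProjectService()
project_id = '...'  # id of an existing project
with open('report.txt', 'rb') as document_file:
    results = service.analyse_document(project_id, document_file)
for facet_name, facet_results in results.items():
    # facet_results maps document names to similarity scores
    print(facet_name, facet_results)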
Example #2
    def __init__(self):

        # Load/initialise projects index
        index_path = os.path.join(ProjectStorage.DATA_DIR, 'projects')
        try:
            self._projects_index = Index.open(path=index_path,
                                              storage_cls=SqliteStorage)
            logger.info(
                'Loaded projects index ({} project(s) available)'.format(
                    self._projects_index.get_document_count()))
        except IndexNotFoundError:
            os.mkdir(index_path)
            self._projects_index = Index.create(
                Schema(id=ID(indexed=True),
                       name=CATEGORICAL_TEXT(indexed=True),
                       facets_json=CATEGORICAL_TEXT,
                       stopwords_json=CATEGORICAL_TEXT,
                       status=CATEGORICAL_TEXT,
                       info=CATEGORICAL_TEXT),
                path=index_path,
                storage_cls=SqliteStorage)
            logger.info('Created projects index')
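
A small helper distilled from this pattern might look like the following sketch. The helper itself is an assumption; the `searcher().count()` call and the QSQ query-string helper both appear in the create_project example later in this listing.

# Sketch: check whether a project record with the given name already exists.
def project_name_taken(projects_index, project_name):
    return projects_index.searcher().count(
        QSQ('name={}'.format(project_name))) > 0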
Example #3
def test_lsi_plugin(index_dir):
    with open(os.path.abspath('caterpillar_lsi/test_resources/csg_data.csv'),
              'r') as f:
        analyser = DefaultAnalyser(stopword_list=stopwords.ENGLISH_TEST)
        with IndexWriter(
                index_dir,
                IndexConfig(
                    SqliteStorage,
                    Schema(text=TEXT(analyser=analyser),
                           id=ID(stored=True, indexed=True)))) as writer:
            csv_reader = csv.reader(f)
            i = 0
            doc_ids = []
            for row in csv_reader:
                if row[0] == 'AGEE':  # The Age
                    writer.add_document(frame_size=0,
                                        update_index=False,
                                        text=row[4],
                                        id="doc-{}".format(i))
                    doc_ids.append(i)
                i += 1

    with IndexReader(index_dir) as reader:
        with pytest.raises(RuntimeError):
            # Plugin not yet run
            LSIPlugin(reader).compare_index_with_model(
                reader, QueryStringQuery("id=doc-{}".format(doc_ids[0])))
    with IndexReader(index_dir) as reader:
        with pytest.raises(RuntimeError):
            # Plugin not yet run
            LSIPlugin(reader).compare_index_using_model(reader)

    with IndexWriter(index_dir) as writer:
        writer.run_plugin(LSIPlugin,
                          normalise_frequencies=True,
                          calculate_document_similarities=False)

    with IndexReader(index_dir) as reader:
        lsi_plugin = LSIPlugin(reader)
        with pytest.raises(RuntimeError):
            # Document similarities not available
            lsi_plugin.get_document_similarities()

    with IndexWriter(index_dir) as writer:
        lsi_plugin = writer.run_plugin(LSIPlugin,
                                       normalise_frequencies=True,
                                       calculate_document_similarities=True)

    with IndexReader(index_dir) as reader:
        info = LSIPlugin(reader).get_info()

        # Verify that we can produce the same results on the original input matrix
        for d_i in range(info.model.num_documents):
            nv = info.model._normalise_term_vector(lsi_plugin._C.T[d_i])
            fv = info.model._compute_document_feature_vector(nv)
            for fv_i in range(len(fv)):
                assert numpy.isclose(fv[fv_i], info.model.S_Vt_T[d_i][fv_i])

        # Verify documents match each other exactly
        r = lsi_plugin.compare_index_with_model(reader)
        for d_id in r.keys():
            assert_almost_equal(r[d_id][d_id], 1)
        similarities, sim_frame_ids = lsi_plugin.get_document_similarities()
        for i, d_similarities in enumerate(similarities):
            assert_almost_equal(d_similarities[i], 1)

        # Check compare document with model filter
        q = QueryStringQuery('environment')
        r = LSIPlugin(reader).compare_index_with_model(reader, q)
        count = reader.searcher().count(q)
        for d_id in r.keys():
            assert len(r[d_id]) == count

        # Check document similarities with modified order
        f_ids = list(lsi_plugin._frame_ids)
        f_ids.reverse()
        assert lsi_plugin.get_document_similarities(f_ids)[1] == f_ids
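
        # One more feature than the vocabulary size; used at the end of the
        # test to provoke the "Too many features" error.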
        num_features = reader.get_vocab_size() + 1

        # Classify text with the model
        sims = LSIPlugin(reader).compare_index_using_model(reader)
        for f_id in sims.keys():
            assert_almost_equal(
                sims[f_id][f_id],
                1)  # Documents should have a similarity of 1 with themselves

    with IndexWriter(index_dir) as writer:
        with pytest.raises(RuntimeError):
            # Too many features
            writer.run_plugin(LSIPlugin, num_features=num_features)
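
Stripped of the error-path assertions, the happy path this test exercises reduces to the sketch below. The document texts are illustrative; every call appears in the test above.

# Build a small index, fit the LSI model, then compare the index to itself.
analyser = DefaultAnalyser(stopword_list=stopwords.ENGLISH_TEST)
with IndexWriter(index_dir,
                 IndexConfig(SqliteStorage,
                             Schema(text=TEXT(analyser=analyser)))) as writer:
    for text in ('the cat sat on the mat',
                 'the dog sat on the log',
                 'cats and dogs are not alike'):
        writer.add_document(frame_size=0, text=text)

with IndexWriter(index_dir) as writer:
    writer.run_plugin(LSIPlugin,
                      normalise_frequencies=True,
                      calculate_document_similarities=True)

with IndexReader(index_dir) as reader:
    # Each document should match itself with a similarity of ~1.
    sims = LSIPlugin(reader).compare_index_using_model(reader)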
Example #4
def test_schema():
    simple_schema = Schema(test=TEXT, user=ID)
    names = simple_schema.names()
    items = simple_schema.items()

    assert len(simple_schema) == 2
    assert len(names) == 2
    assert 'test' in names
    assert 'user' in names
    assert len(items) == 2

    assert isinstance(simple_schema['test'], TEXT)
    assert isinstance(simple_schema['user'], ID)
    with pytest.raises(KeyError):
        simple_schema['no_item']

    for field in simple_schema:
        assert isinstance(field, FieldType)

    assert 'test' in simple_schema
    assert 'text' not in simple_schema

    with pytest.raises(FieldConfigurationError):
        simple_schema.add("_test", TEXT)
    with pytest.raises(FieldConfigurationError):
        simple_schema.add("test", TEXT)
    with pytest.raises(FieldConfigurationError):
        simple_schema.add("text", object)
    with pytest.raises(FieldConfigurationError):
        simple_schema.add("text", str)
    with pytest.raises(FieldConfigurationError):
        simple_schema.add("text", IndexWriter)

    with pytest.raises(ValueError):
        NUMERIC(num_type=str)
    with pytest.raises(NotImplementedError):
        FieldType().equals('a', 'b')
    with pytest.raises(ValueError):
        list(NUMERIC().analyse('notanumber'))

    f = NUMERIC(num_type=float)
    assert f.equals('1', '1.0')

    dt = DATETIME(analyser=DateTimeAnalyser(datetime_formats=['HH:mm DD/MM/YYYY']))
    assert dt.value_of('10:05 01/12/2016') == '2016-12-01T10:05:00z'
    assert dt.equals('10:05 01/12/2016', '10:05 01/12/2016')
    assert dt.gt('10:06 01/12/2016', '10:05 01/12/2016')
    assert dt.gte('10:05 01/12/2016', '10:05 01/12/2016')
    assert dt.gte('10:05 02/12/2016', '10:05 01/12/2016')
    assert dt.lt('01:05 01/12/2016', '10:05 01/12/2016')
    assert dt.lte('10:05 01/12/2016', '10:05 01/12/2016')
    assert dt.lte('10:05 01/12/2015', '10:05 01/12/2016')

    assert list(BOOLEAN().analyse('1'))[0].value is True
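
Taken together, the field types exercised above compose into a single schema, as in the sketch below (the field names are illustrative):

schema = Schema(body=TEXT,
                author=ID(stored=True, indexed=True),
                score=NUMERIC(num_type=float),
                published=DATETIME(analyser=DateTimeAnalyser(
                    datetime_formats=['HH:mm DD/MM/YYYY'])),
                flagged=BOOLEAN)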
Example #5
    def create_project(self,
                       project_name,
                       data_zip_file,
                       num_features,
                       normalise_frequencies,
                       stopwords_file=None):
        """
        Create a project from a zip file.

        Expected structure is:

            facetA/document1.txt
            facetA/document2.txt
            facetB/document1.txt
            ...

        """
        if self._projects_index.searcher().count(
                QSQ('name={}'.format(project_name))) > 0:
            raise DuplicateProjectNameError(
                "A project with that name already exists")

        # Create project record
        if stopwords_file is not None:
            stopword_list = stopwords.parse_stopwords(stopwords_file)
        else:
            stopword_list = stopwords.ENGLISH
        project_id = str(uuid4())
        info = {
            'num_documents': 0,
            'vocab_size': 0,
            'num_features': num_features,
            'normalise_frequencies': normalise_frequencies
        }
        doc_id = self._projects_index.add_document(
            frame_size=0,
            id=project_id,
            name=project_name,
            facets_json='[]',
            stopwords_json=json.dumps(stopword_list),
            status='Running',
            info=json.dumps(info))

        # Create index
        logger.info('Creating new project {}'.format(project_name))
        index_path = os.path.join(ProjectStorage.DATA_DIR, project_id)
        os.mkdir(index_path)
        analyser = DefaultAnalyser(stopword_list=stopword_list)
        index = Index.create(
            Schema(text=TEXT(analyser=analyser),
                   facet=CATEGORICAL_TEXT(indexed=True),
                   document_name=CATEGORICAL_TEXT(indexed=True)),
            path=index_path,
            storage_cls=SqliteStorage)

        try:
            # Process zip and add documents
            facets = defaultdict(int)
            with zipfile.ZipFile(data_zip_file) as data:
                for name in data.namelist():
                    if not name.endswith('/'):
                        facet, doc_name = name.split('/')
                        # Replace non-alphanumeric characters with spaces
                        doc_name = re.sub(r'([^\s\w\.]|_)+', ' ', doc_name)
                        facets[facet] += 1
                        index.add_document(frame_size=0,
                                           update_index=False,
                                           encoding_errors='replace',
                                           text=data.open(name).read(),
                                           facet=facet,
                                           document_name=doc_name)
                        logger.info('Added document {} to facet {}'.format(
                            doc_name, facet))

            # Perform indexing and LSI generation
            index.reindex()
            num_features = min(num_features, index.get_vocab_size())
            index.run_plugin(LSIPlugin,
                             num_features=num_features,
                             normalise_frequencies=normalise_frequencies,
                             calculate_document_similarities=True)
        except Exception as e:
            # Update project record (error)
            self._projects_index.delete_document(doc_id)
            self._projects_index.add_document(
                frame_size=0,
                id=project_id,
                name=project_name,
                info=json.dumps(info),
                facets_json=json.dumps(facets),
                stopwords_json=json.dumps(stopword_list),
                status='Error ({})'.format(e))
            logger.error('Error creating project {}'.format(project_name))
            raise
        else:
            # Update project record (finished)
            self._projects_index.delete_document(doc_id)
            info = {
                'num_documents': index.get_document_count(),
                'vocab_size': index.get_vocab_size(),
                'num_features': num_features,
                'normalise_frequencies': normalise_frequencies
            }
            self._projects_index.add_document(
                frame_size=0,
                id=project_id,
                name=project_name,
                facets_json=json.dumps(facets),
                stopwords_json=json.dumps(stopword_list),
                status='Finished',
                info=json.dumps(info))

            logger.info('Created project {}'.format(project_name))
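
A hypothetical caller, with the `ProjectService` class name and the zip path assumed rather than taken from the source:

service = ProjectService()
with open('corpus.zip', 'rb') as data_zip_file:
    service.create_project('Sample project',
                           data_zip_file,
                           num_features=100,
                           normalise_frequencies=True)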