def analyse_document(self, project_id, document_file):
    """
    Analyse a document by comparing it to a project model.

    The document is indexed into a temporary in-memory index, then compared
    against the stored project index via the LSI plugin, once per facet.

    Args:
        project_id: Id of an existing project (used as the index directory
            name under ``ProjectStorage.DATA_DIR``).
        document_file: File-like object whose ``read()`` yields the document
            text.

    Returns:
        Dict mapping facet name -> {document_name: similarity result} for
        every facet recorded on the project.
    """
    project = self.get_project(project_id)

    # Create temporary index for the new document
    analyser = DefaultAnalyser(stopword_list=project.stopwords)
    doc_index = Index.create(Schema(text=TEXT(analyser=analyser)),
                             storage_cls=SqliteMemoryStorage)
    text = document_file.read()
    doc_index.add_document(text=text, frame_size=0, encoding_errors='replace')

    index = Index.open(os.path.join(ProjectStorage.DATA_DIR, project_id),
                       storage_cls=SqliteStorage)
    lsi_plugin = LSIPlugin(index)
    results = {}
    for facet_name in project.facets:
        facet_results = lsi_plugin.compare_document(
            doc_index,
            model_filter_query=QSQ('facet={}'.format(facet_name)))
        # Replace frame ids with document names.  Build a fresh dict instead
        # of popping/inserting keys while iterating facet_results.keys(),
        # which raises "RuntimeError: dictionary changed size during
        # iteration" on Python 3.
        results[facet_name] = {
            index.get_frame(frame_id)['document_name']: score
            for frame_id, score in facet_results.items()
        }
    return results
def __init__(self):
    """Open the persistent projects index, creating it on first run."""
    index_path = os.path.join(ProjectStorage.DATA_DIR, 'projects')
    try:
        # Happy path: an index already exists on disk from a previous run.
        self._projects_index = Index.open(path=index_path,
                                          storage_cls=SqliteStorage)
        logger.info(
            'Loaded projects index ({} project(s) available)'.format(
                self._projects_index.get_document_count()))
    except IndexNotFoundError:
        # First run: build the directory and an empty index with the
        # project-record schema (all fields stored as JSON strings except
        # the indexed id/name).
        os.mkdir(index_path)
        project_schema = Schema(
            id=ID(indexed=True),
            name=CATEGORICAL_TEXT(indexed=True),
            facets_json=CATEGORICAL_TEXT,
            stopwords_json=CATEGORICAL_TEXT,
            status=CATEGORICAL_TEXT,
            info=CATEGORICAL_TEXT)
        self._projects_index = Index.create(project_schema,
                                            path=index_path,
                                            storage_cls=SqliteStorage)
        logger.info('Created projects index')
def test_lsi_plugin(index_dir):
    """End-to-end exercise of LSIPlugin: pre-run errors, model comparison,
    document similarities, and the too-many-features failure mode.

    ``index_dir`` is presumably a pytest fixture providing a fresh index
    directory — TODO confirm against conftest.
    """
    # Index only 'The Age' rows from the CSV fixture, one document per row.
    with open(os.path.abspath('caterpillar_lsi/test_resources/csg_data.csv'), 'r') as f:
        analyser = DefaultAnalyser(stopword_list=stopwords.ENGLISH_TEST)
        with IndexWriter(
                index_dir,
                IndexConfig(
                    SqliteStorage,
                    Schema(text=TEXT(analyser=analyser),
                           id=ID(stored=True, indexed=True)))) as writer:
            csv_reader = csv.reader(f)
            i = 0
            doc_ids = []
            for row in csv_reader:
                if row[0] == 'AGEE':  # The Age
                    writer.add_document(frame_size=0, update_index=False,
                                        text=row[4], id="doc-{}".format(i))
                    doc_ids.append(i)
                    i += 1
    # Both comparison entry points must refuse to run before the plugin has.
    with IndexReader(index_dir) as reader:
        with pytest.raises(RuntimeError):
            # Plugin not yet run
            LSIPlugin(reader).compare_index_with_model(
                reader, QueryStringQuery("id=doc-{}".format(doc_ids[0])))
    with IndexReader(index_dir) as reader:
        with pytest.raises(RuntimeError):
            # Plugin not yet run
            LSIPlugin(reader).compare_index_using_model(reader)
    # Run the plugin WITHOUT document similarities first ...
    with IndexWriter(index_dir) as writer:
        writer.run_plugin(LSIPlugin, normalise_frequencies=True,
                          calculate_document_similarities=False)
    with IndexReader(index_dir) as reader:
        lsi_plugin = LSIPlugin(reader)
        with pytest.raises(RuntimeError):
            # Document similarities not available
            lsi_plugin.get_document_similarities()
    # ... then again WITH similarities enabled.
    with IndexWriter(index_dir) as writer:
        lsi_plugin = writer.run_plugin(LSIPlugin, normalise_frequencies=True,
                                       calculate_document_similarities=True)
    with IndexReader(index_dir) as reader:
        info = LSIPlugin(reader).get_info()
        # Verify that we can produce the same results on the original input matrix
        for d_i in range(info.model.num_documents):
            nv = info.model._normalise_term_vector(lsi_plugin._C.T[d_i])
            fv = info.model._compute_document_feature_vector(nv)
            for fv_i in range(len(fv)):
                assert numpy.isclose(fv[fv_i], info.model.S_Vt_T[d_i][fv_i])
        # Verify documents match each other exactly
        r = lsi_plugin.compare_index_with_model(reader)
        for d_id in r.keys():
            assert_almost_equal(r[d_id][d_id], 1)
        similarities, sim_frame_ids = lsi_plugin.get_document_similarities()
        for i, d_similarities in enumerate(similarities):
            assert_almost_equal(d_similarities[i], 1)
        # Check compare document with model filter
        q = QueryStringQuery('environment')
        r = LSIPlugin(reader).compare_index_with_model(reader, q)
        count = reader.searcher().count(q)
        for d_id in r.keys():
            assert len(r[d_id]) == count
        # Check document similarities with modified order
        f_ids = list(lsi_plugin._frame_ids)
        f_ids.reverse()
        assert lsi_plugin.get_document_similarities(f_ids)[1] == f_ids
        # One more feature than the vocabulary allows — used below to
        # trigger the failure mode.
        num_features = reader.get_vocab_size() + 1
        # Classify text with the model
        sims = LSIPlugin(reader).compare_index_using_model(reader)
        for f_id in sims.keys():
            assert_almost_equal(
                sims[f_id][f_id],
                1)  # Documents should have a similarity of 1 with themselves
    with IndexWriter(index_dir) as writer:
        with pytest.raises(RuntimeError):
            # Too many features
            writer.run_plugin(LSIPlugin, num_features=num_features)
def test_schema():
    """Exercise Schema container behaviour, invalid field additions, and the
    NUMERIC/DATETIME/BOOLEAN field-type helpers."""
    schema = Schema(test=TEXT, user=ID)
    field_names = schema.names()
    field_items = schema.items()

    # Container protocol: sizes, membership, lookup and iteration.
    assert len(schema) == 2
    assert len(field_names) == 2
    assert 'test' in field_names
    assert 'user' in field_names
    assert len(field_items) == 2
    assert isinstance(schema['test'], TEXT)
    assert isinstance(schema['user'], ID)
    with pytest.raises(KeyError):
        schema['no_item']
    for schema_field in schema:
        assert isinstance(schema_field, FieldType)
    assert 'test' in schema
    assert 'text' not in schema

    # Invalid additions: underscore-prefixed name, duplicate name, and
    # field types that are not FieldType subclasses.
    bad_additions = [("_test", TEXT), ("test", TEXT), ("text", object),
                     ("text", str), ("text", IndexWriter)]
    for bad_name, bad_type in bad_additions:
        with pytest.raises(FieldConfigurationError):
            schema.add(bad_name, bad_type)

    # Field-type error paths.
    with pytest.raises(ValueError):
        NUMERIC(num_type=str)
    with pytest.raises(NotImplementedError):
        FieldType().equals('a', 'b')
    with pytest.raises(ValueError):
        list(NUMERIC().analyse('notanumber'))

    # NUMERIC equality coerces via the configured num_type.
    float_field = NUMERIC(num_type=float)
    assert float_field.equals('1', '1.0')

    # DATETIME parsing plus the full set of ordering comparators.
    dt = DATETIME(analyser=DateTimeAnalyser(datetime_formats=['HH:mm DD/MM/YYYY']))
    assert dt.value_of('10:05 01/12/2016') == '2016-12-01T10:05:00z'
    assert dt.equals('10:05 01/12/2016', '10:05 01/12/2016')
    assert dt.gt('10:06 01/12/2016', '10:05 01/12/2016')
    assert dt.gte('10:05 01/12/2016', '10:05 01/12/2016')
    assert dt.gte('10:05 02/12/2016', '10:05 01/12/2016')
    assert dt.lt('01:05 01/12/2016', '10:05 01/12/2016')
    assert dt.lte('10:05 01/12/2016', '10:05 01/12/2016')
    assert dt.lte('10:05 01/12/2015', '10:05 01/12/2016')

    assert list(BOOLEAN().analyse('1'))[0].value is True
def test_schema():
    """Exercise Schema container behaviour, invalid field additions, and the
    NUMERIC/BOOLEAN field-type helpers."""
    schema = Schema(test=TEXT, user=ID)
    field_names = schema.names()
    field_items = schema.items()

    # Container protocol: sizes, membership, lookup and iteration.
    assert len(schema) == 2
    assert len(field_names) == 2
    assert 'test' in field_names
    assert 'user' in field_names
    assert len(field_items) == 2
    assert isinstance(schema['test'], TEXT)
    assert isinstance(schema['user'], ID)
    with pytest.raises(KeyError):
        schema['no_item']
    for schema_field in schema:
        assert isinstance(schema_field, FieldType)
    assert 'test' in schema
    assert 'text' not in schema

    # Invalid additions: underscore-prefixed name, duplicate name, and
    # field types that are not FieldType subclasses.
    bad_additions = [("_test", TEXT), ("test", TEXT), ("text", object),
                     ("text", str), ("text", IndexWriter)]
    for bad_name, bad_type in bad_additions:
        with pytest.raises(FieldConfigurationError):
            schema.add(bad_name, bad_type)

    # Field-type error paths.
    with pytest.raises(ValueError):
        NUMERIC(num_type=str)
    with pytest.raises(NotImplementedError):
        FieldType().equals('a', 'b')
    with pytest.raises(ValueError):
        list(NUMERIC().analyse('notanumber'))

    # NUMERIC equality coerces via the configured num_type.
    float_field = NUMERIC(num_type=float)
    assert float_field.equals('1', '1.0')

    assert list(BOOLEAN().analyse('1'))[0].value is True
def test_schema():
    """Exercise Schema container behaviour, invalid field additions, and the
    NUMERIC/BOOLEAN field-type helpers."""
    schema = Schema(test=TEXT, user=ID)
    field_names = schema.names()
    field_items = schema.items()

    # Container protocol: sizes, membership, lookup and iteration.
    assert len(schema) == 2
    assert len(field_names) == 2
    assert "test" in field_names
    assert "user" in field_names
    assert len(field_items) == 2
    assert isinstance(schema["test"], TEXT)
    assert isinstance(schema["user"], ID)
    with pytest.raises(KeyError):
        schema["no_item"]
    for schema_field in schema:
        assert isinstance(schema_field, FieldType)
    assert "test" in schema
    assert "text" not in schema

    # Invalid additions: underscore-prefixed name, duplicate name, and
    # field types that are not FieldType subclasses.
    bad_additions = [("_test", TEXT), ("test", TEXT), ("text", object),
                     ("text", str), ("text", IndexWriter)]
    for bad_name, bad_type in bad_additions:
        with pytest.raises(FieldConfigurationError):
            schema.add(bad_name, bad_type)

    # Field-type error paths.
    with pytest.raises(ValueError):
        NUMERIC(num_type=str)
    with pytest.raises(NotImplementedError):
        FieldType().equals("a", "b")
    with pytest.raises(ValueError):
        list(NUMERIC().analyse("notanumber"))

    # NUMERIC equality coerces via the configured num_type.
    float_field = NUMERIC(num_type=float)
    assert float_field.equals("1", "1.0")

    assert list(BOOLEAN().analyse("1"))[0].value is True
def create_project(self, project_name, data_zip_file, num_features,
                   normalise_frequencies, stopwords_file=None):
    """
    Create a project from a zip file.

    Expected structure is:
        facetA/document1.txt
        facetA/document2.txt
        facetB/document1.txt
        ...

    A 'Running' project record is written up-front, then replaced with a
    'Finished' or 'Error (...)' record once indexing and LSI generation
    completes or fails.

    Raises:
        DuplicateProjectNameError: If a project with project_name exists.
        Exception: Any error during zip processing / indexing is re-raised
            after the project record has been updated to the error state.
    """
    # Reject duplicate project names (indexed name field on the record).
    if self._projects_index.searcher().count(
            QSQ('name={}'.format(project_name))) > 0:
        raise DuplicateProjectNameError(
            "A project with that name already exists")

    # Create project record
    stopword_list = stopwords.parse_stopwords(
        stopwords_file
    ) if stopwords_file is not None else stopwords.ENGLISH
    project_id = str(uuid4())
    # Placeholder counts; real values are filled in on success.
    info = {
        'num_documents': 0,
        'vocab_size': 0,
        'num_features': num_features,
        'normalise_frequencies': normalise_frequencies
    }
    # doc_id is kept so the 'Running' record can be deleted and replaced.
    doc_id = self._projects_index.add_document(
        frame_size=0, id=project_id, name=project_name, facets_json='[]',
        stopwords_json=json.dumps(stopword_list), status='Running',
        info=json.dumps(info))

    # Create index
    logger.info('Creating new project {}'.format(project_name))
    index_path = os.path.join(ProjectStorage.DATA_DIR, project_id)
    os.mkdir(index_path)
    analyser = DefaultAnalyser(stopword_list=stopword_list)
    index = Index.create(Schema(
        text=TEXT(analyser=analyser),
        facet=CATEGORICAL_TEXT(indexed=True),
        document_name=CATEGORICAL_TEXT(indexed=True)),
        path=index_path, storage_cls=SqliteStorage)
    try:
        # Process zip and add documents.  facets counts documents per facet
        # and is serialised into the project record.
        facets = defaultdict(int)
        with zipfile.ZipFile(data_zip_file) as data:
            for name in data.namelist():
                # Skip directory entries; each file is facet/doc_name
                # (a nested path would raise ValueError on the unpack).
                if not name.endswith('/'):
                    facet, doc_name = name.split('/')
                    doc_name = re.sub(
                        r'([^\s\w\.]|_)+', ' ',
                        doc_name)  # Replace non alphanumeric characters
                    facets[facet] += 1
                    index.add_document(frame_size=0, update_index=False,
                                       encoding_errors='replace',
                                       text=data.open(name).read(),
                                       facet=facet, document_name=doc_name)
                    logger.info('Added document {} to facet {}'.format(
                        doc_name, facet))
        # Perform indexing and LSI generation
        index.reindex()
        # Cap the feature count at the vocabulary size.
        num_features = min(num_features, index.get_vocab_size())
        index.run_plugin(LSIPlugin, num_features=num_features,
                         normalise_frequencies=normalise_frequencies,
                         calculate_document_similarities=True)
    except Exception as e:
        # Update project record (error): swap the 'Running' record for an
        # error one, then re-raise the original exception for the caller.
        self._projects_index.delete_document(doc_id)
        self._projects_index.add_document(
            frame_size=0, id=project_id, name=project_name,
            info=json.dumps(info), facets_json=json.dumps(facets),
            stopwords_json=json.dumps(stopword_list),
            status='Error ({})'.format(e))
        logger.info('Error creating project {}'.format(project_name))
        raise
    else:
        # Update project record (finished) with the final index statistics.
        self._projects_index.delete_document(doc_id)
        info = {
            'num_documents': index.get_document_count(),
            'vocab_size': index.get_vocab_size(),
            'num_features': num_features,
            'normalise_frequencies': normalise_frequencies
        }
        self._projects_index.add_document(
            frame_size=0, id=project_id, name=project_name,
            facets_json=json.dumps(facets),
            stopwords_json=json.dumps(stopword_list), status='Finished',
            info=json.dumps(info))
        logger.info('Created project {}'.format(project_name))