def import_site(xml_root, site_name, dump_date, site_desc, site_key,
                site_base_url, answer_yes=False):
    """Check the XML dump path and connect to the database and to Solr.

    ``xml_root`` is the directory that is checked for existence.  The other
    parameters (``site_name``, ``dump_date``, ``site_desc``, ``site_key``,
    ``site_base_url``, ``answer_yes``) are accepted but not referenced in
    this body.

    Exits the process with status 1 when ``xml_root`` does not exist, and
    with status 2 when the Solr ping request raises ``socket.error``.
    """
    print('Using the XML root path: ' + xml_root + '\n')

    if not os.path.exists(xml_root):
        print('The given XML root path does not exist.')
        sys.exit(1)

    # connect to the database
    print('Connecting to the Stackdump database...')
    conn_str = settings.DATABASE_CONN_STR
    sqlhub.processConnection = connectionForURI(conn_str)
    print('Connected.\n')

    # connect to solr
    print('Connecting to solr...')
    solr = Solr(settings.SOLR_URL, assume_clean=True)
    # pysolr doesn't try to connect until a request is made, so we'll make a
    # ping request
    try:
        solr._send_request('GET', 'admin/ping')
    except socket.error as e:
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # Python-2-only "except X, e" form.
        print('Failed to connect to solr - error was: %s' % str(e))
        print('Aborting.')
        sys.exit(2)
def delete_site(site_key):
    """Connect to the database and to Solr.

    ``site_key`` is accepted but not referenced in this body.

    Exits the process with status 2 when the Solr ping request raises
    ``socket.error``.
    """
    # connect to the data sources
    # connect to the database
    print('Connecting to the database...')
    conn_str = settings.DATABASE_CONN_STR
    sqlhub.processConnection = connectionForURI(conn_str)
    print('Connected.\n')

    # connect to solr
    print('Connecting to solr...')
    solr = Solr(settings.SOLR_URL)
    # pysolr doesn't try to connect until a request is made, so we'll make a
    # ping request
    try:
        solr._send_request('GET', '%s/admin/ping' % solr.path)
    except socket.error as e:
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # Python-2-only "except X, e" form.
        print('Failed to connect to solr - error was: %s' % str(e))
        print('Aborting.')
        sys.exit(2)
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', **kwargs):
        """Create the Solr client and load the schema's field names.

        ``auto_commit_interval`` is multiplied by 1000 before being stored
        (``None`` is stored unchanged).  Extra keyword arguments are
        accepted and ignored.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.field_list = []
        self._build_fields()

    def _parse_fields(self, result, field_name):
        """Return the keys of ``result['schema'][field_name]`` as a list.

        A missing ``'schema'`` or ``field_name`` entry yields an empty list.
        """
        # Mapping keys are already unique, so no de-duplication is needed.
        return list(result.get('schema', {}).get(field_name, {}).keys())

    def _build_fields(self):
        """Populate ``field_list`` and the dynamic-field regex list."""
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            # The literal part is escaped so that characters such as "."
            # only match themselves; startswith/endswith also tolerate an
            # empty name instead of raising IndexError.
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % re.escape(wc_pattern[:-1])))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and
            every value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        Note that ``doc`` itself is modified: its ``"_id"`` value is copied
        to ``doc[self.unique_key]`` (``KeyError`` if ``"_id"`` is absent).
        """

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        def flattened(doc):
            def flattened_kernel(doc, path):
                for k, v in doc.items():
                    path.append(k)
                    if isinstance(v, dict):
                        for inner_k, inner_v in flattened_kernel(v, path):
                            yield inner_k, inner_v
                    elif isinstance(v, list):
                        for li, lv in enumerate(v):
                            path.append(str(li))
                            if isinstance(lv, dict):
                                for dk, dv in flattened_kernel(lv, path):
                                    yield dk, dv
                            else:
                                yield ".".join(path), lv
                            path.pop()
                    else:
                        yield ".".join(path), v
                    path.pop()
            return dict(flattened_kernel(doc, []))

        # Translate the _id field to whatever unique key we're using
        doc[self.unique_key] = doc["_id"]
        flat_doc = flattened(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if self.field_list or self._dynamic_field_regexes:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)
            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.

        Raises ``errors.OperationFailed`` when pysolr raises ``SolrError``.
        """
        try:
            if self.auto_commit_interval is not None:
                self.solr.add([self._clean_doc(doc)],
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=str(self.auto_commit_interval))
            else:
                self.solr.add([self._clean_doc(doc)], commit=False)
        except SolrError:
            raise errors.OperationFailed(
                "Could not insert %r into Solr" % bsjson.dumps(doc))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable

        Raises ``errors.OperationFailed`` when pysolr raises ``SolrError``.
        """
        try:
            # Documents are cleaned lazily as pysolr consumes the generator.
            cleaned = (self._clean_doc(d) for d in docs)
            if self.auto_commit_interval is not None:
                self.solr.add(cleaned,
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=str(self.auto_commit_interval))
            else:
                self.solr.add(cleaned, commit=False)
        except SolrError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Solr")

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]),
                         commit=(self.auto_commit_interval == 0))

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns ``None`` when the search raises ``ValueError`` or finds
        nothing.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
class SolrTestCase(unittest.TestCase):
    """Tests for the ``Solr`` client.

    ``setUp`` talks to ``http://localhost:8983/solr/core0``: it clears the
    index and adds five fixture documents; ``tearDown`` clears it again.
    """

    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url,
                         'http://localhost:8983/solr/core0')
        self.assertIsInstance(self.default_solr.decoder, json.JSONDecoder)
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertIsInstance(self.solr.decoder, json.JSONDecoder)
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''),
                         'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'),
                         'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(
                path='/pysolr_tests/select/?whatever=/'),
            'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertIn('"numFound":3', resp_body)

        # Test a request that carries a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request(
            'POST', 'update/?commit=true', body=xml_body,
            headers={
                'Content-type': 'text/xml; charset=utf-8',
            })
        self.assertIn('<int name="status">0</int>', resp_body)

        # Test a non-existent URL (with a lowercase method).
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        try:
            self.assertRaises(SolrError, self.solr._send_request, 'get',
                              'select/?q=doc&wt=json')
        finally:
            # Restore the URL even when the assertion fails.
            self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']),
                         3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        # NOTE(review): this goes through _select rather than
        # _suggest_terms -- confirm whether that is intentional.
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertIn('<int name="status">0</int>', resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            """Minimal stand-in exposing content, headers and json()."""

            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.",
                                 {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3),
                         "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}',
                                 {'server': 'tomcat'})
        self.assertEqual(self.solr._extract_error(resp_4),
                         "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_5),
                         '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_1, ('Something is broke.', u''))

        # Other.
        resp_2 = self.solr._scrape_response(
            {
                'server': 'crapzilla'
            },
            '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>'
        )
        self.assertEqual(resp_2, ('Wow. Seriously weird.', u''))

    @unittest.skipUnless(HAS_LXML,
                         "Cannot test Tomcat error extraction without lxml")
    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses, which currently require lxml.html to parse"""

        # Tomcat.
        resp_1 = self.solr._scrape_response(
            {
                'server': 'coyote'
            },
            '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>'
        )
        self.assertEqual(resp_1, ('messed up.', ''))

        # Broken Tomcat.
        resp_2 = self.solr._scrape_response(
            {
                'server': 'coyote'
            },
            '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>'
        )
        self.assertEqual(resp_2, (
            None,
            u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'
        ))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)),
                         '2013-01-18T00:00:00Z')
        self.assertEqual(
            self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)),
            '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'),
                         datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'),
                         datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'),
                         'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            'doc', **{
                'debug': 'true',
                'hl': 'true',
                'hl.fragsize': 8,
                'facet': 'on',
                'facet.field': 'popularity',
                'spellcheck': 'true',
                'spellcheck.collate': 'true',
                'spellcheck.count': 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            })
        self.assertEqual(len(results), 3)
        self.assertIn('explain', results.debug)
        self.assertEqual(results.highlighting, {
            u'doc_4': {},
            u'doc_2': {},
            u'doc_1': {}
        })
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertIsNotNone(results.qtime)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results, {
                'title': [('doc', 3), ('another', 2), ('example', 2),
                          ('1', 1), ('2', 1), ('boring', 1), ('rock', 1),
                          ('thing', 1)]
            })

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(
            ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertIn('<field name="title">Example doc ☃ 1</field>', doc_xml)
        self.assertIn('<field name="id">doc_1</field>', doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{
            'id': 'doc_6',
            'title': 'Important doc'
        }], boost={'title': 10.0})

        self.solr.add([{
            'id': 'doc_7',
            'title': 'Spam doc doc'
        }], boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃☃</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m,
                      "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Create the Solr client and load the schema's field names.

        ``kwargs['clientOptions']``, when present, is passed to ``Solr`` as
        keyword arguments.  ``auto_commit_interval`` is multiplied by 1000
        before being stored (``None`` is stored unchanged).
        """
        self.url = url
        self.solr = Solr(url, **kwargs.get('clientOptions', {}))
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """Return the keys of ``result['schema'][field_name]`` as a list.

        A missing ``'schema'`` or ``field_name`` entry yields an empty list.
        """
        # Mapping keys are already unique, so no de-duplication is needed.
        return list(result.get('schema', {}).get(field_name, {}).keys())

    @wrap_exceptions
    def _build_fields(self):
        """Populate ``field_list`` and the dynamic-field regex list."""
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            # The literal part is escaped so that characters such as "."
            # only match themselves; startswith/endswith also tolerate an
            # empty name instead of raising IndexError.
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % re.escape(wc_pattern[:-1])))

    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and
            every value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in
            order to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        ``doc`` itself is modified.  Raises ``errors.OperationFailed`` when
        ``doc`` already contains an ``'ns'`` or ``'_ts'`` key.
        """

        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if self.field_list or self._dynamic_field_regexes:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)
            return dict((k, v) for k, v in flat_doc.items()
                        if include_field(k))
        return flat_doc

    def stop(self):
        """ Stops the instance
        """
        pass

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a database command document to Solr.

        ``dropDatabase`` and ``drop`` delete by an ``ns:`` query,
        ``create`` is a no-op, and ``renameCollection`` raises
        ``errors.OperationFailed``.
        """
        db, _ = namespace.split('.', 1)
        if doc.get('dropDatabase'):
            for new_db in self.command_helper.map_db(db):
                self.solr.delete(q="ns:%s.*" % new_db,
                                 commit=(self.auto_commit_interval == 0))

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "solr_doc_manager does not support replication of "
                " renameCollection")

        if doc.get('create'):
            # nothing to do
            pass

        if doc.get('drop'):
            new_db, coll = self.command_helper.map_collection(db, doc['drop'])
            if new_db:
                self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                                 commit=(self.auto_commit_interval == 0))

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents."""
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
            # Update the key in Solr based on the unique_key mentioned as
            # parameter
            update_spec['_id'] = doc[self.unique_key]
            return update_spec

        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key.  Keys are collected first so that doc is not
            # modified while it is being iterated.
            keys_to_pop = [key for key in doc
                           if key == to_set or key.startswith(to_set + '.')]
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value

        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist
            # within the document being updated.
            keys_to_pop = [key for key in doc
                           if key == to_unset
                           or key.startswith(to_unset + '.')]
            for key in keys_to_pop:
                doc.pop(key)
        return doc

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document, or ``None`` when the search yields no
        result.
        """
        # Commit outstanding changes so that the document to be updated is
        # the same version to which the changes apply.
        self.commit()
        # Need to escape special characters in the document_id.
        document_id = ''.join(
            '\\' + c if c in ESCAPE_CHARACTERS else c
            for c in u(document_id))

        query = "%s:%s" % (self.unique_key, document_id)
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            doc.pop('ns')
            doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=u(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=False)

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Update or insert multiple documents into Solr

        docs may be any iterable.  When ``chunk_size`` is positive the
        documents are sent in batches of at most that many; otherwise they
        are sent in a single ``add`` call.
        """
        from itertools import islice

        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d, namespace, timestamp) for d in docs)
        if self.chunk_size > 0:
            # islice stops cleanly when the input is exhausted; calling
            # next() inside a generator expression would turn the final
            # StopIteration into RuntimeError on Python 3.7+ (PEP 479).
            batch = list(islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Send the file ``f`` to the ``update/extract`` endpoint.

        The flattened result of ``f.get_metadata()``, plus the unique key,
        ``ns`` and ``_ts``, are passed as ``literal.*`` query parameters.
        """
        params = self._formatter.format_document(f.get_metadata())
        params[self.unique_key] = params.pop('_id')
        params['ns'] = namespace
        params['_ts'] = timestamp
        params = dict(('literal.' + k, v) for k, v in params.items())

        if self.auto_commit_interval == 0:
            params['commit'] = 'true'

        # URLs always use "/", so join explicitly rather than with
        # os.path.join, which would insert a backslash on Windows.
        base_url = self.url
        if base_url and not base_url.endswith('/'):
            base_url += '/'
        request = Request(
            base_url + "update/extract?%s" % urlencode(params))

        request.add_header("Content-type", "application/octet-stream")
        request.data = f
        response = urlopen(request)
        logging.debug(response.read())

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes the document with the given id from Solr.

        ``namespace`` and ``timestamp`` are accepted but not used.
        """
        self.solr.delete(id=u(document_id),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results."""
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns ``None`` when the search raises ``ValueError`` or yields no
        result.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
class SolrTestCase(unittest.TestCase):
    """Integration tests for the ``Solr`` client.

    Every test talks to the core at http://localhost:8983/solr/core0.
    ``setUp`` empties that core and indexes the five documents held in
    ``self.docs``; ``tearDown`` empties it again.
    """

    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr("http://localhost:8983/solr/core0")
        # Short timeouts.
        self.solr = Solr("http://localhost:8983/solr/core0", timeout=2)
        self.docs = [
            {"id": "doc_1", "title": "Example doc 1", "price": 12.59, "popularity": 10},
            {"id": "doc_2", "title": "Another example ☃ doc 2", "price": 13.69, "popularity": 7},
            {"id": "doc_3", "title": "Another thing", "price": 2.35, "popularity": 8},
            {"id": "doc_4", "title": "doc rock", "price": 99.99, "popularity": 10},
            {"id": "doc_5", "title": "Boring", "price": 1.12, "popularity": 2},
        ]

        # Clear it.
        self.solr.delete(q="*:*")

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q="*:*")
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url, "http://localhost:8983/solr/core0")
        self.assertIsInstance(self.default_solr.decoder, json.JSONDecoder)
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, "http://localhost:8983/solr/core0")
        self.assertIsInstance(self.solr.decoder, json.JSONDecoder)
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=""), "http://localhost:8983/solr/core0")
        # Basic path.
        self.assertEqual(
            self.solr._create_full_url(path="pysolr_tests"), "http://localhost:8983/solr/core0/pysolr_tests"
        )
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(path="/pysolr_tests/select/?whatever=/"),
            "http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/",
        )

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request("GET", "select/?q=doc&wt=json")
        self.assertIn('"numFound":3', resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request(
            "POST", "update/?commit=true", body=xml_body, headers={"Content-type": "text/xml; charset=utf-8"}
        )
        self.assertIn('<int name="status">0</int>', resp_body)

        # Test a non-existent URL. The URL is restored in ``finally`` so a
        # failing assertion cannot leave the client pointed at the bad URL.
        old_url = self.solr.url
        self.solr.url = "http://127.0.0.1:567898/wahtever"
        try:
            self.assertRaises(SolrError, self.solr._send_request, "get", "select/?q=doc&wt=json")
        finally:
            self.solr.url = old_url

        # Test bad core as well
        self.solr.url = "http://localhost:8983/solr/bad_core"
        try:
            self.assertRaises(SolrError, self.solr._send_request, "get", "select/?q=doc&wt=json")
        finally:
            self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({"q": "doc"})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 3)

        # Long params.
        resp_body = self.solr._select({"q": "doc" * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 0)
        self.assertEqual(len(resp_data["responseHeader"]["params"]["q"]), 3 * 1024)

        # Test Deep Pagination CursorMark
        resp_body = self.solr._select({"q": "*", "cursorMark": "*", "sort": "id desc", "start": 0, "rows": 2})
        resp_data = json.loads(resp_body)
        self.assertEqual(len(resp_data["response"]["docs"]), 2)
        self.assertIn("nextCursorMark", resp_data)

    def test__mlt(self):
        resp_body = self.solr._mlt({"q": "id:doc_1", "mlt.fl": "title"})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 0)

    def test__suggest_terms(self):
        # NOTE(review): despite the test name this goes through ``_select``,
        # not a terms-specific helper -- confirm that is intended.
        resp_body = self.solr._select({"terms.fl": "title"})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertIn('<int name="status">0</int>', resp_body)

    def test__soft_commit(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body, softCommit=True)
        self.assertIn('<int name="status">0</int>', resp_body)

    def test__extract_error(self):
        # Minimal stand-in exposing only ``content``, ``headers`` and
        # ``json()``, which are the members this test supplies to
        # ``_extract_error``.
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode("utf-8")
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.", {"reason": "Something went wrong."})
        self.assertEqual(self.solr._extract_error(resp_1), "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {"reason": None})
        self.assertEqual(self.solr._extract_error(resp_2), "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse("<html><body><pre>Something is broke.</pre></body></html>", {"server": "jetty"})
        self.assertEqual(self.solr._extract_error(resp_3), "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}', {"server": "tomcat"})
        self.assertEqual(self.solr._extract_error(resp_4), "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {"server": "jetty"})
        self.assertEqual(self.solr._extract_error(resp_5), '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {"server": "jetty"}, "<html><body><pre>Something is broke.</pre></body></html>"
        )
        self.assertEqual(resp_1, ("Something is broke.", ""))

        # Other.
        resp_2 = self.solr._scrape_response(
            {"server": "crapzilla"},
            "<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>",
        )
        self.assertEqual(resp_2, ("Wow. Seriously weird.", ""))

    @unittest.skipIf(
        sys.version_info < (2, 7),
        reason="Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing",
    )
    def test__scrape_response_coyote_xml(self):
        resp_3 = self.solr._scrape_response(
            {"server": "coyote"},
            '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n',
        )
        self.assertEqual(
            resp_3, ("Invalid Date String:'2015-03-23 10:43:33'", "Invalid Date String:'2015-03-23 10:43:33'")
        )

        # Valid XML with a traceback
        resp_4 = self.solr._scrape_response(
            {"server": "coyote"},
            """<?xml version="1.0"?>
<response>
<lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst>
</response>""",
        )
        self.assertEqual(
            resp_4,
            (
                "Internal Server Error",
                "org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)",
            ),
        )

    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses"""

        resp_0 = self.solr._scrape_response(
            {"server": "coyote"}, "<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>"
        )
        self.assertEqual(resp_0, ("Something broke!", ""))

        # Invalid XML
        bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>'
        reason, full_html = self.solr._scrape_response({"server": "coyote"}, bogus_xml)
        self.assertIsNone(reason)
        self.assertEqual(full_html, bogus_xml.replace("\n", ""))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), "2013-01-18T00:00:00Z")
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), "2013-01-18T00:30:28Z")
        self.assertEqual(self.solr._from_python(True), "true")
        self.assertEqual(self.solr._from_python(False), "false")
        self.assertEqual(self.solr._from_python(1), "1")
        self.assertEqual(self.solr._from_python(1.2), "1.2")
        self.assertEqual(self.solr._from_python(b"hello"), "hello")
        self.assertEqual(self.solr._from_python("hello ☃"), "hello ☃")
        self.assertEqual(self.solr._from_python("\x01test\x02"), "test")

    def test__to_python(self):
        self.assertEqual(self.solr._to_python("2013-01-18T00:00:00Z"), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python("2013-01-18T00:30:28Z"), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python("true"), True)
        self.assertEqual(self.solr._to_python("false"), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b"hello"), "hello")
        self.assertEqual(self.solr._to_python("hello ☃"), "hello ☃")
        self.assertEqual(self.solr._to_python(["foo", "bar"]), "foo")
        self.assertEqual(self.solr._to_python(("foo", "bar")), "foo")
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(""))

        self.assertFalse(self.solr._is_null_value("Hello"))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search("doc")
        self.assertEqual(len(results), 3)

        results = self.solr.search("example")
        self.assertEqual(len(results), 2)

        results = self.solr.search("nothing")
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            "doc",
            **{
                "debug": "true",
                "hl": "true",
                "hl.fragsize": 8,
                "facet": "on",
                "facet.field": "popularity",
                "spellcheck": "true",
                "spellcheck.collate": "true",
                "spellcheck.count": 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            }
        )
        self.assertEqual(len(results), 3)
        self.assertIn("explain", results.debug)
        self.assertEqual(results.highlighting, {"doc_4": {}, "doc_2": {}, "doc_1": {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets["facet_fields"]["popularity"], ["10", 2, "7", 1, "2", 0, "8", 0])
        self.assertIsNotNone(results.qtime)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this("id:doc_1", "text")
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms("title", "")
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results,
            {
                "title": [
                    ("doc", 3),
                    ("another", 2),
                    ("example", 2),
                    ("1", 1),
                    ("2", 1),
                    ("boring", 1),
                    ("rock", 1),
                    ("thing", 1),
                ]
            },
        )

    def test__build_doc(self):
        doc = {"id": "doc_1", "title": "Example doc ☃ 1", "price": 12.59, "popularity": 10}
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding="utf-8"))
        self.assertIn('<field name="title">Example doc ☃ 1</field>', doc_xml)
        self.assertIn('<field name="id">doc_1</field>', doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.assertEqual(len(self.solr.search("example")), 2)

        self.solr.add([{"id": "doc_6", "title": "Newly added doc"}, {"id": "doc_7", "title": "Another example doc"}])

        self.assertEqual(len(self.solr.search("doc")), 5)
        self.assertEqual(len(self.solr.search("example")), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search("doc")), 3)

        self.solr.add([{"id": "doc_6", "title": "Important doc"}], boost={"title": 10.0})
        self.solr.add([{"id": "doc_7", "title": "Spam doc doc"}], boost={"title": 0})

        res = self.solr.search("doc")
        self.assertEqual(len(res), 5)
        self.assertEqual("doc_6", res.docs[0]["id"])

    def test_field_update(self):
        # Numeric field: "inc" adds the supplied value to the stored one.
        original_docs = self.solr.search("doc")
        self.assertEqual(len(original_docs), 3)
        update_list = [{"id": doc["id"], "popularity": 5} for doc in original_docs]
        self.solr.add(update_list, fieldUpdates={"popularity": "inc"})

        updated_docs = self.solr.search("doc")
        self.assertEqual(len(updated_docs), 3)
        # Documents are paired by result position, as before.
        for original_doc, updated_doc in zip(original_docs, updated_docs):
            self.assertEqual(len(updated_doc.keys()), len(original_doc.keys()))
            self.assertEqual(updated_doc["popularity"], original_doc["popularity"] + 5)
            self.assertTrue(
                all(
                    updated_doc[k] == original_doc[k]
                    for k in updated_doc.keys()
                    if k not in ["_version_", "popularity"]
                )
            )

        # Multi-valued field: "add" appends the supplied values.
        self.solr.add(
            [
                {"id": "multivalued_1", "title": "Multivalued doc 1", "word_ss": ["alpha", "beta"]},
                {"id": "multivalued_2", "title": "Multivalued doc 2", "word_ss": ["charlie", "delta"]},
            ]
        )

        original_docs = self.solr.search("multivalued")
        self.assertEqual(len(original_docs), 2)
        update_list = [{"id": doc["id"], "word_ss": ["epsilon", "gamma"]} for doc in original_docs]
        self.solr.add(update_list, fieldUpdates={"word_ss": "add"})

        updated_docs = self.solr.search("multivalued")
        self.assertEqual(len(updated_docs), 2)
        for original_doc, updated_doc in zip(original_docs, updated_docs):
            self.assertEqual(len(updated_doc.keys()), len(original_doc.keys()))
            self.assertEqual(updated_doc["word_ss"], original_doc["word_ss"] + ["epsilon", "gamma"])
            self.assertTrue(
                all(
                    updated_doc[k] == original_doc[k]
                    for k in updated_doc.keys()
                    if k not in ["_version_", "word_ss"]
                )
            )

    def test_delete(self):
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.delete(id="doc_1")
        self.assertEqual(len(self.solr.search("doc")), 2)
        self.solr.delete(q="price:[0 TO 15]")
        self.assertEqual(len(self.solr.search("doc")), 1)

        self.assertEqual(len(self.solr.search("*:*")), 1)
        self.solr.delete(q="*:*")
        self.assertEqual(len(self.solr.search("*:*")), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id="foo", q="bar")

    def test_commit(self):
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.add([{"id": "doc_6", "title": "Newly added doc"}], commit=False)
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search("doc")), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.add([{"id": "doc_6", "title": "Newly added doc"}], commit=False)
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search("doc")), 4)

    def test_extract(self):
        fake_f = StringIO(
            """
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃☃</title>
                </head>
                    <body>foobar</body>
            </html>
        """
        )
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn("contents", extracted)
        self.assertIn("metadata", extracted)

        self.assertIn("foobar", extracted["contents"])

        m = extracted["metadata"]

        self.assertEqual([fake_f.name], m["stream_name"])

        self.assertIn("haystack-test", m, "HTML metadata should have been extracted!")
        self.assertEqual(["test 1234"], m["haystack-test"])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(["Test Title ☃☃"], m["title"])

    def test_full_url(self):
        self.solr.url = "http://localhost:8983/solr/core0"
        full_url = self.solr._create_full_url(path="/update")

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, "http://localhost:8983/solr/core0/update")
class DocManager():
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit=False, unique_key='_id', **kwargs):
        """Verify Solr URL and establish a connection.

        url -- passed straight to ``Solr``.
        auto_commit -- when true, ``run_auto_commit`` is started right away.
        unique_key -- name of the Solr field that receives the document's
            ``_id`` value.
        Extra keyword arguments are accepted and ignored.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        self.auto_commit = auto_commit
        self.field_list = []
        self.dynamic_field_list = []
        self.build_fields()

        if auto_commit:
            self.run_auto_commit()

    def _parse_fields(self, result, field_name):
        """Return the names listed under ``result['schema'][field_name]``.

        Returns an empty list when either level is missing.
        """
        return list(result.get('schema', {}).get(field_name, {}).keys())

    def build_fields(self):
        """Load the declared and dynamic field names from the Solr schema."""
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        # Plain list: a stray trailing comma used to wrap this in a 1-tuple,
        # which is always truthy and so disabled the "no schema" shortcut in
        # clean_doc.
        self.field_list = self._parse_fields(result, 'fields')
        self.dynamic_field_list = self._parse_fields(result, 'dynamicFields')

    def _matches_dynamic_field(self, key):
        """Tell whether ``key`` fits one of the dynamic field patterns.

        A pattern carries its wildcard either first ("*_s") or last
        ("attr_*"); a pattern without a wildcard must match exactly.
        """
        for pattern in self.dynamic_field_list:
            if pattern.startswith('*'):
                if key.endswith(pattern[1:]):
                    return True
            elif pattern.endswith('*'):
                if key.startswith(pattern[:-1]):
                    return True
            elif key == pattern:
                return True
        return False

    def clean_doc(self, doc):
        """Return a copy of ``doc`` restricted to fields the schema accepts.

        The value of ``doc["_id"]`` is stored under ``self.unique_key``.
        Keys that are neither declared fields nor dynamic field matches are
        dropped, so the result may hold fewer fields than the input. The
        input dictionary itself is left untouched.

        When no declared fields are known, ``doc`` is returned as is.
        Raises KeyError when ``doc`` has no ``_id``.
        """
        if not self.field_list:
            return doc

        # Work on a copy so the caller's document is not modified.
        source = dict(doc)
        source[self.unique_key] = doc["_id"]

        fixed_doc = {}
        for key, value in source.items():
            if key in self.field_list or self._matches_dynamic_field(key):
                fixed_doc[key] = value
        return fixed_doc

    def stop(self):
        """Stops the instance by clearing the auto-commit flag, which ends
        the ``run_auto_commit`` rescheduling.
        """
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.

        Raises errors.OperationFailed when Solr rejects the document.
        """
        try:
            self.solr.add([self.clean_doc(doc)], commit=True)
        except SolrError:
            raise errors.OperationFailed(
                "Could not insert %r into Solr" % bsjson.dumps(doc))

    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable

        Raises errors.OperationFailed when Solr rejects the batch.
        """
        try:
            cleaned = (self.clean_doc(d) for d in docs)
            self.solr.add(cleaned, commit=True)
        except SolrError:
            raise errors.OperationFailed(
                "Could not bulk-insert documents into Solr")

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        Its id is read from ``self.unique_key`` when present and from
        ``_id`` otherwise.
        """
        # clean_doc no longer writes the unique key into the caller's
        # document, so fall back to the original "_id" field.
        if self.unique_key in doc:
            doc_id = doc[self.unique_key]
        else:
            doc_id = doc['_id']
        self.solr.delete(id=str(doc_id), commit=True)

    def _remove(self):
        """Removes everything
        """
        self.solr.delete(q='*:*')

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range.
        """
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
            Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    def run_auto_commit(self):
        """Periodically commits to the Solr server.

        Commits once, then re-schedules itself through ``Timer(1, ...)``
        for as long as ``self.auto_commit`` stays true.
        """
        self.solr.commit()
        if self.auto_commit:
            # presumably threading.Timer, i.e. a one second delay -- confirm
            # against the file's imports.
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns None when the search raises ValueError or finds nothing.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
class SolrTestCase(unittest.TestCase):
    """Integration tests for the ``Solr`` client.

    Every test talks to the core at http://localhost:8983/solr/core0.
    ``setUp`` empties that core and indexes the five documents held in
    ``self.docs``; ``tearDown`` empties it again.
    """

    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url,
                         'http://localhost:8983/solr/core0')
        self.assertIsInstance(self.default_solr.decoder, json.JSONDecoder)
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertIsInstance(self.solr.decoder, json.JSONDecoder)
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''),
                         'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'),
                         'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(
                path='/pysolr_tests/select/?whatever=/'),
            'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertIn('"numFound":3', resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request('POST', 'update/?commit=true',
                                            body=xml_body, headers={
                                                'Content-type': 'text/xml; charset=utf-8',
                                            })
        self.assertIn('<int name="status">0</int>', resp_body)

        # Test a non-existent URL. The URL is restored in ``finally`` so a
        # failing assertion cannot leave the client pointed at the bad URL.
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        try:
            self.assertRaises(SolrError, self.solr._send_request, 'get',
                              'select/?q=doc&wt=json')
        finally:
            self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']),
                         3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        # NOTE(review): despite the test name this goes through ``_select``,
        # not a terms-specific helper -- confirm that is intended.
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertIn('<int name="status">0</int>', resp_body)

    def test__soft_commit(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body, softCommit=True)
        self.assertIn('<int name="status">0</int>', resp_body)

    def test__extract_error(self):
        # Minimal stand-in exposing only ``content``, ``headers`` and
        # ``json()``, which are the members this test supplies to
        # ``_extract_error``.
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.",
                                 {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3),
                         "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}',
                                 {'server': 'tomcat'})
        self.assertEqual(self.solr._extract_error(resp_4),
                         "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_5),
                         '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_1, ('Something is broke.', u''))

        # Other.
        resp_2 = self.solr._scrape_response({
            'server': 'crapzilla'
        }, '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>'
        )
        self.assertEqual(resp_2, ('Wow. Seriously weird.', u''))

    @unittest.skipIf(
        sys.version_info < (2, 7),
        reason=
        u'Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing'
    )
    def test__scrape_response_coyote_xml(self):
        resp_3 = self.solr._scrape_response({
            'server': 'coyote'
        }, '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n'
        )
        self.assertEqual(resp_3,
                         ("Invalid Date String:'2015-03-23 10:43:33'",
                          "Invalid Date String:'2015-03-23 10:43:33'"))

        # Valid XML with a traceback
        resp_4 = self.solr._scrape_response({'server': 'coyote'},
                                            """<?xml version="1.0"?>
<response>
<lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst>
</response>""")
        self.assertEqual(resp_4, (
            u"Internal Server Error",
            u"org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)"
        ))

    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses"""

        resp_0 = self.solr._scrape_response({
            'server': 'coyote'
        }, '<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>'
        )
        self.assertEqual(resp_0, ('Something broke!', ''))

        # Invalid XML
        bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>'
        reason, full_html = self.solr._scrape_response({'server': 'coyote'},
                                                       bogus_xml)
        self.assertIsNone(reason)
        self.assertEqual(full_html, bogus_xml.replace("\n", ""))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)),
                         '2013-01-18T00:00:00Z')
        self.assertEqual(
            self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)),
            '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'),
                         datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'),
                         datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'),
                         'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            'doc', **{
                'debug': 'true',
                'hl': 'true',
                'hl.fragsize': 8,
                'facet': 'on',
                'facet.field': 'popularity',
                'spellcheck': 'true',
                'spellcheck.collate': 'true',
                'spellcheck.count': 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            })
        self.assertEqual(len(results), 3)
        self.assertIn('explain', results.debug)
        self.assertEqual(results.highlighting, {
            u'doc_4': {},
            u'doc_2': {},
            u'doc_1': {}
        })
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertIsNotNone(results.qtime)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results, {
                'title': [('doc', 3), ('another', 2), ('example', 2),
                          ('1', 1), ('2', 1), ('boring', 1), ('rock', 1),
                          ('thing', 1)]
            })

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(
            ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertIn('<field name="title">Example doc ☃ 1</field>', doc_xml)
        self.assertIn('<field name="id">doc_1</field>', doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{
            'id': 'doc_6',
            'title': 'Important doc'
        }], boost={'title': 10.0})

        self.solr.add([{
            'id': 'doc_7',
            'title': 'Spam doc doc'
        }], boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        # Numeric field: 'inc' adds the supplied value to the stored one.
        original_docs = self.solr.search('doc')
        self.assertEqual(len(original_docs), 3)
        update_list = [{'id': doc['id'], 'popularity': 5}
                       for doc in original_docs]
        self.solr.add(update_list, fieldUpdates={'popularity': 'inc'})

        updated_docs = self.solr.search('doc')
        self.assertEqual(len(updated_docs), 3)
        # Documents are paired by result position, as before.
        for original_doc, updated_doc in zip(original_docs, updated_docs):
            self.assertEqual(len(updated_doc.keys()),
                             len(original_doc.keys()))
            self.assertEqual(updated_doc['popularity'],
                             original_doc['popularity'] + 5)
            self.assertTrue(
                all(updated_doc[k] == original_doc[k]
                    for k in updated_doc.keys()
                    if k not in ['_version_', 'popularity']))

        # Multi-valued field: 'add' appends the supplied values.
        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        original_docs = self.solr.search('multivalued')
        self.assertEqual(len(original_docs), 2)
        update_list = [{'id': doc['id'], 'word_ss': ['epsilon', 'gamma']}
                       for doc in original_docs]
        self.solr.add(update_list, fieldUpdates={'word_ss': 'add'})

        updated_docs = self.solr.search('multivalued')
        self.assertEqual(len(updated_docs), 2)
        for original_doc, updated_doc in zip(original_docs, updated_docs):
            self.assertEqual(len(updated_doc.keys()),
                             len(original_doc.keys()))
            self.assertEqual(updated_doc['word_ss'],
                             original_doc['word_ss'] + ['epsilon', 'gamma'])
            self.assertTrue(
                all(updated_doc[k] == original_doc[k]
                    for k in updated_doc.keys()
                    if k not in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃☃</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m,
                      "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
class SolrTestCase(unittest.TestCase):
    """Integration tests for the ``Solr`` client.

    Every test talks to a Solr core at ``http://localhost:8983/solr/core0``.
    ``setUp`` wipes the core and indexes a fixed set of documents;
    ``tearDown`` wipes it again.
    """

    def setUp(self):
        """Create the clients, then reset the core to the fixture documents."""
        super(SolrTestCase, self).setUp()
        core_url = 'http://localhost:8983/solr/core0'
        self.default_solr = Solr(core_url)
        # Second client with a short timeout.
        self.solr = Solr(core_url, timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
            {
                "id": "sn1",
                "cat": "pony",
                "comments": "blue",
                "description": "black",
                "store": "50.03131,10.12135"
            },
            {
                "id": "sn2",
                "cat": "pony",
                "name": "fake unicorn",
                "comments": "yellow",
                "description": "blue",
                "store": "54.23131,10.12135"
            },
            {
                "id": "sn3",
                "cat": "pony",
                "comments": "yellow",
                "description": "red",
                "store": "54.33131,10.12135"
            },
            {
                "id": "sn4",
                "cat": "unicorn",
                "comments": "yellow",
                "description": "blue"
            },
            {
                "id": "sn5",
                "cat": "unicorn",
                "comments": "steel",
                "description": "steel",
                "store": "54.43131,10.12135"
            },
            {
                "id": "sn6",
                "name": "blue pony",
                "cat": "unicorn",
                "comments": "blue",
                "description": "blue",
                "store": "54.33131,10.22135"
            },
        ]

        # Start from an empty core.
        self.solr.delete(q='*:*')

        # Index the fixtures. This relies on ``add``, which is itself under
        # test below; if it is broken, every test here fails.
        self.solr.add(self.docs)

    def tearDown(self):
        """Empty the core again."""
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        """Both clients expose url, decoder and their respective timeout."""
        for client, expected_timeout in ((self.default_solr, 60),
                                         (self.solr, 2)):
            self.assertEqual(client.url, 'http://localhost:8983/solr/core0')
            self.assertIsInstance(client.decoder, json.JSONDecoder)
            self.assertEqual(client.timeout, expected_timeout)

    def assertSameIDs(self, docs, expected_ids):
        """Assert that the ``id`` values of ``docs`` equal ``expected_ids`` as sets."""
        found = frozenset(entry['id'] for entry in docs)
        self.assertEqual(found, frozenset(expected_ids))

    def test__create_full_url(self):
        """Path joining: empty path, bare path, and a leading slash."""
        cases = (
            # Nada.
            ('', 'http://localhost:8983/solr/core0'),
            # Basic path.
            ('pysolr_tests', 'http://localhost:8983/solr/core0/pysolr_tests'),
            # Leading slash (& making sure we don't touch the trailing slash).
            ('/pysolr_tests/select/?whatever=/',
             'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/'),
        )
        for path, expected in cases:
            self.assertEqual(self.solr._create_full_url(path=path), expected)

    def test__send_request(self):
        """Raw requests: a GET, a POST with a body, and an unreachable URL."""
        # A valid request.
        select_response = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertIn('"numFound":3', select_response)

        # A POST carrying an XML body.
        payload = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        update_response = self.solr._send_request(
            'POST',
            'update/?commit=true',
            body=payload,
            headers={
                'Content-type': 'text/xml; charset=utf-8',
            })
        self.assertIn('<int name="status">0</int>', update_response)

        # A non-existent URL.
        saved_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        self.assertRaises(SolrError, self.solr._send_request, 'get', 'select/?q=doc&wt=json')
        self.solr.url = saved_url

    def test__select(self):
        """``_select`` works with both short and very long query parameters."""
        # Short params.
        short = json.loads(self.solr._select({'q': 'doc'}))
        self.assertEqual(short['response']['numFound'], 3)

        # Long params.
        long_ = json.loads(self.solr._select({'q': 'doc' * 1024}))
        self.assertEqual(long_['response']['numFound'], 0)
        self.assertEqual(len(long_['responseHeader']['params']['q']), 3 * 1024)

    def test__mlt(self):
        parsed = json.loads(self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'}))
        self.assertEqual(parsed['response']['numFound'], 0)

    def test__suggest_terms(self):
        parsed = json.loads(self.solr._select({'terms.fl': 'title'}))
        self.assertEqual(parsed['response']['numFound'], 0)

    def test__update(self):
        payload = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        self.assertIn('<int name="status">0</int>', self.solr._update(payload))

    def test__extract_error(self):
        """Error text comes from the ``reason`` header or is scraped from the body."""

        class RubbishResponse(object):
            """Minimal stand-in exposing only ``content`` and ``headers``."""

            def __init__(self, content, headers=None):
                self.content = content
                self.headers = {} if headers is None else headers

        # Just the reason.
        with_reason = RubbishResponse("We don't care.", {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(with_reason),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        empty_reason = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(empty_reason),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        no_reason = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(no_reason),
                         "[Reason: Something is broke.]")

    def test__scrape_response(self):
        """Scraping of HTML error pages for several ``server`` header values."""
        # Tomcat.
        tomcat = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>')
        self.assertEqual(tomcat, ('messed up.', ''))

        # Jetty.
        jetty = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(jetty, ('Something is broke.', u''))

        # Broken Tomcat.
        broken = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>')
        self.assertEqual(broken, (None, u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'))

        # Other.
        other = self.solr._scrape_response(
            {'server': 'crapzilla'},
            '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(other, ('Wow. Seriously weird.', u''))

    def test__from_python(self):
        """Python values are rendered to the strings Solr expects."""
        cases = (
            (datetime.date(2013, 1, 18), '2013-01-18T00:00:00Z'),
            (datetime.datetime(2013, 1, 18, 0, 30, 28), '2013-01-18T00:30:28Z'),
            (True, 'true'),
            (False, 'false'),
            (1, '1'),
            (1.2, '1.2'),
            (b'hello', 'hello'),
            ('hello ☃', 'hello ☃'),
        )
        for value, expected in cases:
            self.assertEqual(self.solr._from_python(value), expected)

    def test__to_python(self):
        """Solr values are converted back to Python values."""
        cases = (
            ('2013-01-18T00:00:00Z', datetime.datetime(2013, 1, 18)),
            ('2013-01-18T00:30:28Z', datetime.datetime(2013, 1, 18, 0, 30, 28)),
            ('true', True),
            ('false', False),
            (1, 1),
            (1.2, 1.2),
            (b'hello', 'hello'),
            ('hello ☃', 'hello ☃'),
            (['foo', 'bar'], 'foo'),
            (('foo', 'bar'), 'foo'),
        )
        for value, expected in cases:
            self.assertEqual(self.solr._to_python(value), expected)

    def test__is_null_value(self):
        for empty in (None, ''):
            self.assertTrue(self.solr._is_null_value(empty))
        for filled in ('Hello', 1):
            self.assertFalse(self.solr._is_null_value(filled))

    def test_create_nested_q(self):
        local_params = {
            'pf': 'myfield',
            'qf': 'myfield2',
        }
        nested = self.solr.create_nested_q("dismax", "how now brown cow", **local_params)
        self.assertEqual(nested, '_query_:"{!dismax pf=\'myfield\' qf=\'myfield2\'}how now brown cow"')

    def test_search(self):
        """Plain searches, then one search with extra request options."""
        for term, hits in (('doc', 3), ('example', 2), ('nothing', 0)):
            self.assertEqual(len(self.solr.search(term)), hits)

        # Advanced options.
        options = {
            'debug': 'true',
            'hl': 'true',
            'hl.fragsize': 8,
            'facet': 'on',
            'facet.field': 'popularity',
            'spellcheck': 'true',
            'spellcheck.collate': 'true',
            'spellcheck.count': 1,
            # TODO: Can't get these working in my test setup.
            # 'group': 'true',
            # 'group.field': 'id',
        }
        found = self.solr.search('doc', **options)
        self.assertEqual(len(found), 3)
        self.assertIn('explain', found.debug)
        self.assertEqual(found.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}})
        self.assertEqual(found.spellcheck, {})
        self.assertEqual(found.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertIsNotNone(found.qtime)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_search_with_nested_q(self):
        blue_q = self.solr.create_nested_q('edismax', 'blue', **{
            'qf': 'description comments'
        })
        matches = self.solr.search('pony AND {}'.format(blue_q))
        self.assertSameIDs(matches, ['sn6', 'sn2', 'sn1'])

    def test_disjunction_max(self):
        matches = self.solr.disjunction_max('blue', 'description comments')
        self.assertSameIDs(matches, ['sn6', 'sn4', 'sn2', 'sn1'])

    def test_disjunction_max_with_nested_q(self):
        blue_q = self.solr.create_nested_q('edismax', 'blue', **{
            'qf': 'description comments'
        })
        matches = self.solr.disjunction_max('unicorn AND {}'.format(blue_q), 'cat name')
        self.assertSameIDs(matches, ['sn6', 'sn4', 'sn2'])

    def test_spatial_search(self):
        matches = self.solr.spatial_search('pony', 'store', '54.33131,10.12135', '100')
        self.assertSameIDs(matches, ['sn6', 'sn3', 'sn2'])

    def test_more_like_this(self):
        similar = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(similar), 0)

    def test_suggest_terms(self):
        suggestions = self.solr.suggest_terms('title', '')
        self.assertEqual(len(suggestions), 1)
        self.assertEqual(suggestions, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]})

    def test__build_doc(self):
        source = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        element = self.solr._build_doc(source)
        rendered = force_unicode(ET.tostring(element, encoding='utf-8'))
        self.assertIn('<field name="title">Example doc ☃ 1</field>', rendered)
        self.assertIn('<field name="id">doc_1</field>', rendered)
        self.assertEqual(len(rendered), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        extra_docs = [
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ]
        self.solr.add(extra_docs)

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})
        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        ranked = self.solr.search('doc')
        self.assertEqual(len(ranked), 5)
        self.assertEqual('doc_6', ranked.docs[0]['id'])

    def test_delete(self):
        """Delete by id, by query, everything; then the argument validation."""
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 7)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        """An uncommitted add is only searchable after ``commit``."""
        self.assertEqual(len(self.solr.search('doc')), 3)
        pending = [
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ]
        self.solr.add(pending, commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        """An uncommitted add is searchable after ``optimize``."""
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        pending = [
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ]
        self.solr.add(pending, commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        """Rich-document extraction returns the contents and the metadata."""
        upload = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃☃</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        upload.name = "test.html"
        extracted = self.solr.extract(upload)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        metadata = extracted['metadata']

        self.assertEqual([upload.name], metadata['stream_name'])

        self.assertIn('haystack-test', metadata, "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], metadata['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], metadata['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/'
        joined = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(joined, 'http://localhost:8983/solr/update')
class DocManager(DocManagerBase):
    """The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that multiple
    updates to the same doc reflect the most up to date version as opposed to
    multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Verify Solr URL and establish a connection.

        ``kwargs`` may carry ``clientOptions`` (a dict forwarded to ``Solr``)
        and ``content_type`` (``None`` or a dict; any other type raises
        ``errors.InvalidConfiguration``).
        """
        self.url = url
        self.solr = Solr(url, **kwargs.get('clientOptions', {}))
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

        self._content_type = kwargs.get("content_type", None)
        logging.info("begin to init content_type args ,value is %s"
                     % str(self._content_type))
        if self._content_type is None:
            logging.info("content_type args is none, will receive all type")
            self._receive_all_type = True
        else:
            logging.debug("begin to check content_type args")
            self._receive_all_type = False
            if isinstance(self._content_type, dict):
                self._content_type_list = dict(self._content_type).keys()
                logging.debug("the support type list is %s"
                              % str(self._content_type_list))
            else:
                raise errors.InvalidConfiguration(
                    "args content type is not is dict")

    def _parse_fields(self, result, field_name):
        """Return the keys found under ``result['schema'][field_name]``.

        Missing levels yield an empty list.
        """
        # Dict keys are already unique, so no de-duplication is needed.
        return list(result.get('schema', {}).get(field_name, {}).keys())

    @wrap_exceptions
    def _build_fields(self):
        """Fetch the schema and build the list of valid field names.

        Fills ``self.field_list`` with the declared fields and
        ``self._dynamic_field_regexes`` with one compiled pattern per
        dynamic field.
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            # The literal part is escaped so that characters such as "."
            # only match themselves; startswith/endswith are also safe on
            # an empty name.
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % re.escape(wc_pattern[:-1])))

    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and every
            value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in order
            to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        Returns a single Solr-ready dict, a list of such dicts, or ``None``
        when nothing should be sent to Solr. Raises
        ``errors.OperationFailed`` if ``doc`` already has ``ns`` or ``_ts``.
        """
        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # Flatten the document up front.
        doc = self._formatter.format_document(doc)

        # Cap the length of string values stored under "tag*" keys.
        for k, v in doc.items():
            if (k[0:3] == "tag" and v and isinstance(v, basestring)):
                doc[k] = v[0:9000]

        # Name of the MongoDB collection this document comes from.
        collecion_name = self._get_collection_name(namespace)

        # User-behaviour collection.
        if ("b_dynamic" == collecion_name):
            logging.info("to process doc from b_dynamic ,the doc is %s"
                         % str(doc[self.unique_key]))
            return self._parse_user_dynamic_collection(doc)

        # User collection.
        if ("T_USER" == collecion_name):
            logging.info("to process doc from T_USER ,the doc is %s"
                         % str(doc[self.unique_key]))
            return self._parse_t_user_collection(doc)

        # Everything else is treated as content data.
        logging.info("begin to process b_content ,the doc is %s"
                     % str(doc[self.unique_key]))
        doctemp = self._parse_content_doc(doc)

        if doctemp is None:
            logging.info("don't send doc to solr ,the doc is %s" % str(doc))
            return None

        if (isinstance(doctemp, list) and len(doctemp) == 0):
            logging.info("don't send doc to solr ,the doc is %s" % str(doc))
            return None

        if (isinstance(doctemp, list) and len(doctemp) > 1):
            logging.info(
                "to process doc from b_content after it is a list,the doc is %s"
                % str(doc[self.unique_key]))
            return [self._parse_doc_to_solr_doc(docvalue)
                    for docvalue in doctemp]

        if (isinstance(doctemp, list)):
            logging.info(
                "to process doc from b_content after it is a one-value list,the doc is %s"
                % str(doc[self.unique_key]))
            return self._parse_doc_to_solr_doc(doctemp[0])

        logging.info(
            "to process doc from b_content after it is a object,the doc is %s"
            % str(doc[self.unique_key]))
        return self._parse_doc_to_solr_doc(doctemp)

    def _get_collection_name(self, namespace):
        """Return the collection part of a ``db.collection`` namespace."""
        coll = namespace.split('.', 1)[1]
        return coll

    def _parse_user_dynamic_collection(self, doc):
        """Convert a user-behaviour document into the search-engine structure."""
        if doc.get("content"):
            doc["detail"] = doc.pop("content")
        # Copy the creating user into the author fields.
        if doc.get("createUser.userId"):
            doc["author.id"] = doc.get("createUser.userId")
        if doc.get("createUser.userName"):
            doc["author.name"] = doc.get("createUser.userName")
        if doc.get("target"):
            doc["fkTag.0"] = doc.pop("target")
        # Mark the content as not searchable.
        doc["op"] = "LDEL"
        return self._parse_doc_to_solr_doc(doc)

    def _parse_t_user_collection(self, doc):
        """Convert a user document into the search-engine structure."""
        # Nickname.
        nickName = doc.pop("nickName", None)
        if nickName:
            doc["title.0.name"] = nickName
            doc["tag.0.name"] = nickName

        # User description.
        description = doc.pop("description", None)
        if description:
            doc["title.1.name"] = description
            doc["tag.1.name"] = description

        figureurl40 = doc.pop("figureurl40", None)
        if figureurl40:
            doc["imgurl"] = figureurl40

        website = doc.pop("website", None)
        if website:
            doc["resurl"] = u"/u/" + str(website)
            doc["title.2.name"] = website
            doc["tag.2.name"] = website

        # A locked user must not be searchable.
        isLocked = doc.pop("isLocked", None)
        if isLocked == "N":
            doc["status"] = u"released"
        elif isLocked == "Y":
            doc["status"] = u"draft"

        # Drop fields that must not be indexed.
        for private_key in ("password", "salt", "phoneNum", "userName"):
            doc.pop(private_key, None)

        # Required metadata.
        doc["type"] = u"user"

        return self._parse_doc_to_solr_doc(doc)

    def _parse_doc_to_solr_doc(self, doc):
        """Flatten ``doc`` and keep only the fields the schema knows about."""
        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)
            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def _parse_content_doc(self, doc):
        """Prepare a content document; always returns a list of documents."""
        doc_type = doc.get("type")
        if doc.get("releaseTime"):
            doc["createTime"] = doc.get("releaseTime")
        if (doc_type == "product"):
            return self._parse_product(doc)
        # The explain, video and picture types no longer get special handling.
        return [doc]

    def _parse_product(self, doc):
        """Build the ``detail`` text of a product (project) document."""
        resultlist = []
        flat_doc = self._formatter.format_document(doc)

        # Collect the non-empty address components, in display order.
        adlist = []
        for address_key in ("address.country.name",
                            "address.province.name",
                            "address.city.name",
                            "address.area.name",
                            "address.detail.name"):
            self._add_list_with_not_empty_string(
                adlist, flat_doc.get(address_key))

        # Join them into the full address.
        address_str = "".join(adlist)
        if address_str:
            resultlist.append("项目地址:" + address_str)

        # Developers / builders.
        dev_str = self._get_flat_array(flat_doc, "devBuilder.", ".name")
        if dev_str:
            resultlist.append("开发建设方:" + dev_str)

        # Main designers.
        design_str = self._get_flat_array(flat_doc,
                                          "buildingMainDesigner.", ".name")
        if design_str:
            resultlist.append("建筑主创设计师:" + design_str)

        # Building area.
        buildingArea = doc.get("buildingArea")
        if buildingArea:
            resultlist.append("建筑面积:" + str(buildingArea) + "㎡")

        doc["detail"] = " / ".join(resultlist)
        return [doc]

    def _add_list_with_not_empty_string(self, v_list, value):
        """Append ``str(value)`` to ``v_list`` unless ``value`` is falsy."""
        if value:
            v_list.append(str(value))

    def _get_flat_array(self, doc, prefix, suffix):
        """Join the values of a flattened array with commas.

        Reads ``prefix + "0" + suffix``, ``prefix + "1" + suffix``, ... and
        stops at the first missing or falsy value.
        """
        r = []
        i = 0
        while True:
            value = doc.get(prefix + str(i) + suffix)
            if not value:
                break
            r.append(str(value))
            i = i + 1
        return ",".join(r)

    def _parse_explain(self, doc):
        """Return ``[doc]`` unchanged.

        Explain (review) documents no longer get special handling.
        """
        return [doc]

    def _parse_video(self, doc):
        """Split the ``video`` list of ``doc`` into sub-documents."""
        return self._parse_content_list_to_serval(doc, "video", "video")

    def _paser_picture(self, doc):
        """Split the ``picture`` list of ``doc`` into sub-documents."""
        return self._parse_content_list_to_serval(doc, "picture", "picture")

    def _parse_content_list_to_serval(self, doc, fieldName, type):
        """Expand the list stored under ``fieldName`` into sub-documents.

        Returns ``[doc]`` followed by one copy of ``doc`` per list entry.
        Each copy gets ``s_<fieldName>_id`` / ``s_<fieldName>_name`` from the
        entry, an ``_id`` of ``<parent id>_<index>``, ``s_parent_id`` set to
        the parent's id and ``type`` set to ``"s_" + type``.
        """
        doclist = [doc]
        logging.debug("parse %s ,the raw doc is %s:" % (fieldName, str(doc)))
        picture = doc.get(fieldName)
        if (isinstance(picture, list) and len(picture) > 0):
            s_field_id = "s_" + fieldName + "_id"
            s_field_name = "s_" + fieldName + "_name"
            new_type = "s_" + type
            # Capture the parent's id once: the child's "_id" is overwritten
            # below, so reading it back afterwards would yield the child id.
            parent_id = doc.get("_id")
            for index, value in enumerate(picture):
                doctemp = doc.copy()
                doctemp[s_field_id] = u(value.get("id"))
                doctemp[s_field_name] = u(value.get("name"))
                doctemp["_id"] = u(parent_id + "_" + str(index))
                doctemp["s_parent_id"] = u(parent_id)
                doctemp["type"] = new_type
                doclist.append(doctemp)
            # Only when the list exists: mirror it under the "s_" attribute.
            doc["s_" + fieldName] = picture
            # NOTE(review): the original author flagged a known bug here
            # when the picture status is updated.
        return doclist

    def stop(self):
        """ Stops the instance
        """
        pass

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a database command document to Solr.

        ``dropDatabase`` and ``drop`` delete the matching ``ns`` entries,
        ``create`` is a no-op and ``renameCollection`` raises
        ``errors.OperationFailed``.
        """
        db, _ = namespace.split('.', 1)
        if doc.get('dropDatabase'):
            for new_db in self.command_helper.map_db(db):
                self.solr.delete(q="ns:%s.*" % new_db,
                                 commit=(self.auto_commit_interval == 0))

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "solr_doc_manager does not support replication of "
                " renameCollection")

        if doc.get('create'):
            # nothing to do
            pass

        if doc.get('drop'):
            new_db, coll = self.command_helper.map_collection(db, doc['drop'])
            if new_db:
                self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                                 commit=(self.auto_commit_interval == 0))

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents."""
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update_spec contains the new document.
            # Update the key in Solr based on the unique_key mentioned as
            # parameter.
            update_spec['_id'] = doc[self.unique_key]
            return update_spec
        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = []
            for key in doc:
                if key.startswith(to_set):
                    if key == to_set or key[len(to_set)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value
        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist within
            # the document being updated.
            keys_to_pop = []
            # The "s_"-prefixed variant of the field is removed as well.
            tmp_to_unset = "s_" + to_unset
            for key in doc:
                if key.startswith(to_unset):
                    if key == to_unset or key[len(to_unset)] == '.':
                        keys_to_pop.append(key)
                if key.startswith(tmp_to_unset):
                    if key == tmp_to_unset or key[len(tmp_to_unset)] == '.':
                        keys_to_pop.append(key)
            for key in keys_to_pop:
                doc.pop(key)
        return doc

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document, or ``None`` when no document with
        that id was found.
        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        # Need to escape special characters in the document_id.
        document_id = ''.join(
            map(lambda c: '\\' + c if c in ESCAPE_CHARACTERS else c,
                u(document_id)))

        query = "%s:%s" % (self.unique_key, document_id)
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            doc.pop('ns')
            doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        logging.debug("before insert the raw doc is :(%s)" % str(doc))
        docs = self._clean_doc(doc, namespace, timestamp)
        # Log the cleaned result, not the input document.
        logging.debug("before insert the processed doc is :(%s)" % str(docs))
        if docs is None:
            return None
        if not isinstance(docs, list):
            docs = [docs]
        # TODO: delete previously indexed child documents ("<id>_*") of this
        # document before re-adding it.
        try:
            if self.auto_commit_interval is not None:
                self.solr.add(docs,
                              commit=(self.auto_commit_interval == 0),
                              commitWithin=u(self.auto_commit_interval))
            else:
                self.solr.add(docs, commit=False)
            logging.debug("insert into solr docs:(%s)" % str(docs))
        except UnicodeDecodeError:
            logging.exception(
                "Unable to process processed document for UnicodeDecodeError, %r "
                % str(docs))

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Update or insert multiple documents into Solr

        docs may be any iterable
        """
        from itertools import islice

        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        def iter_cleaned():
            # _clean_doc may return None (skip), one dict or a list of
            # dicts; yield a flat stream of dicts, as upsert() does.
            for d in docs:
                result = self._clean_doc(d, namespace, timestamp)
                if result is None:
                    continue
                if isinstance(result, list):
                    for item in result:
                        yield item
                else:
                    yield result

        cleaned = iter_cleaned()
        if self.chunk_size > 0:
            # islice stops cleanly at exhaustion; calling next() inside a
            # generator expression would leak StopIteration (an error
            # under PEP 479).
            batch = list(islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Send file ``f`` to the ``update/extract`` handler.

        The flattened file metadata, ``ns`` and ``_ts`` are passed as
        ``literal.*`` request parameters.
        """
        params = self._formatter.format_document(f.get_metadata())
        params[self.unique_key] = params.pop('_id')
        params['ns'] = namespace
        params['_ts'] = timestamp
        params = dict(('literal.' + k, v) for k, v in params.items())

        if self.auto_commit_interval == 0:
            params['commit'] = 'true'

        # Join with "/" explicitly: os.path.join uses the platform's path
        # separator, which is wrong for URLs on Windows.
        request = Request(
            self.url.rstrip('/') + "/update/extract?%s" % urlencode(params))

        request.add_header("Content-type", "application/octet-stream")
        request.data = f
        response = urlopen(request)
        logging.debug(response.read())

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Removes documents from Solr

        Deletes the document with the given id and every document whose
        ``_id`` matches ``<document_id>_*``. Raises
        ``errors.OperationFailed`` when ``document_id`` is falsy.
        """
        if document_id:
            # Convert once: the id may not be a string, and concatenating
            # the raw value below would raise TypeError.
            doc_id = u(document_id)
            self.solr.delete(id=doc_id,
                             commit=(self.auto_commit_interval == 0))
            # NOTE(review): this query hard-codes the "_id" field while
            # update() uses self.unique_key -- confirm which is intended.
            self.solr.delete(q=u("_id:" + doc_id + "_*"),
                             commit=(self.auto_commit_interval == 0))
        else:
            raise errors.OperationFailed(
                "delete solr document error for the id(%s) is not valid"
                % str(document_id))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results."""
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit.
        """
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns ``None`` when the search raises ``ValueError`` or yields
        no result.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
class SolrTestCase(unittest.TestCase): def setUp(self): super(SolrTestCase, self).setUp() self.default_solr = Solr('http://*****:*****@unittest.skipUnless(HAS_LXML, "Cannot test Tomcat error extraction without lxml") def test__scrape_response_tomcat(self): """Tests for Tomcat error responses, which currently require lxml.html to parse""" # Tomcat. resp_1 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>') self.assertEqual(resp_1, ('messed up.', '')) # Broken Tomcat. resp_2 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>') self.assertEqual(resp_2, (None, u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>')) def test__from_python(self): self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z') self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z') self.assertEqual(self.solr._from_python(True), 'true') self.assertEqual(self.solr._from_python(False), 'false') self.assertEqual(self.solr._from_python(1), '1') self.assertEqual(self.solr._from_python(1.2), '1.2') self.assertEqual(self.solr._from_python(b'hello'), 'hello') self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃') self.assertEqual(self.solr._from_python('\x01test\x02'), 'test') def test__to_python(self): self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18)) self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28)) self.assertEqual(self.solr._to_python('true'), True) self.assertEqual(self.solr._to_python('false'), False) self.assertEqual(self.solr._to_python(1), 1) self.assertEqual(self.solr._to_python(1.2), 1.2) self.assertEqual(self.solr._to_python(b'hello'), 'hello') self.assertEqual(self.solr._to_python('hello ☃'), 'hello 
☃') self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo') self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo') self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")') def test__is_null_value(self): self.assertTrue(self.solr._is_null_value(None)) self.assertTrue(self.solr._is_null_value('')) self.assertFalse(self.solr._is_null_value('Hello')) self.assertFalse(self.solr._is_null_value(1)) def test_search(self): results = self.solr.search('doc') self.assertEqual(len(results), 3) results = self.solr.search('example') self.assertEqual(len(results), 2) results = self.solr.search('nothing') self.assertEqual(len(results), 0) # Advanced options. results = self.solr.search('doc', **{ 'debug': 'true', 'hl': 'true', 'hl.fragsize': 8, 'facet': 'on', 'facet.field': 'popularity', 'spellcheck': 'true', 'spellcheck.collate': 'true', 'spellcheck.count': 1, # TODO: Can't get these working in my test setup. # 'group': 'true', # 'group.field': 'id', }) self.assertEqual(len(results), 3) self.assertTrue('explain' in results.debug) self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}}) self.assertEqual(results.spellcheck, {}) self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0]) self.assertTrue(results.qtime is not None) # TODO: Can't get these working in my test setup. 
# self.assertEqual(results.grouped, '') def test_more_like_this(self): results = self.solr.more_like_this('id:doc_1', 'text') self.assertEqual(len(results), 0) def test_suggest_terms(self): results = self.solr.suggest_terms('title', '') self.assertEqual(len(results), 1) self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]}) def test__build_doc(self): doc = { 'id': 'doc_1', 'title': 'Example doc ☃ 1', 'price': 12.59, 'popularity': 10, } doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8')) self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml) self.assertTrue('<field name="id">doc_1</field>' in doc_xml) self.assertEqual(len(doc_xml), 152) def test_add(self): self.assertEqual(len(self.solr.search('doc')), 3) self.assertEqual(len(self.solr.search('example')), 2) self.solr.add([ { 'id': 'doc_6', 'title': 'Newly added doc', }, { 'id': 'doc_7', 'title': 'Another example doc', }, ]) self.assertEqual(len(self.solr.search('doc')), 5) self.assertEqual(len(self.solr.search('example')), 3) def test_add_with_boost(self): self.assertEqual(len(self.solr.search('doc')), 3) self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}], boost={'title': 10.0}) self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}], boost={'title': 0}) res = self.solr.search('doc') self.assertEqual(len(res), 5) self.assertEqual('doc_6', res.docs[0]['id']) def test_field_update(self): originalDocs = self.solr.search('doc') self.assertEqual(len(originalDocs), 3) updateList = [] for i, doc in enumerate(originalDocs): updateList.append( {'id': doc['id'], 'popularity': 5} ) self.solr.add(updateList, fieldUpdates={'popularity': 'inc'}) updatedDocs = self.solr.search('doc') self.assertEqual(len(updatedDocs), 3) for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)): self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys())) 
self.assertEqual(updatedDoc['popularity'], originalDoc['popularity'] + 5) self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'popularity'])) self.solr.add([ { 'id': 'multivalued_1', 'title': 'Multivalued doc 1', 'word_ss': ['alpha', 'beta'], }, { 'id': 'multivalued_2', 'title': 'Multivalued doc 2', 'word_ss': ['charlie', 'delta'], }, ]) originalDocs = self.solr.search('multivalued') self.assertEqual(len(originalDocs), 2) updateList = [] for i, doc in enumerate(originalDocs): updateList.append( {'id': doc['id'], 'word_ss': ['epsilon', 'gamma']} ) self.solr.add(updateList, fieldUpdates={'word_ss': 'add'}) updatedDocs = self.solr.search('multivalued') self.assertEqual(len(updatedDocs), 2) for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)): self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys())) self.assertEqual(updatedDoc['word_ss'], originalDoc['word_ss'] + ['epsilon', 'gamma']) self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'word_ss'])) def test_delete(self): self.assertEqual(len(self.solr.search('doc')), 3) self.solr.delete(id='doc_1') self.assertEqual(len(self.solr.search('doc')), 2) self.solr.delete(q='price:[0 TO 15]') self.assertEqual(len(self.solr.search('doc')), 1) self.assertEqual(len(self.solr.search('*:*')), 1) self.solr.delete(q='*:*') self.assertEqual(len(self.solr.search('*:*')), 0) # Need at least one. self.assertRaises(ValueError, self.solr.delete) # Can't have both. self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar') def test_commit(self): self.assertEqual(len(self.solr.search('doc')), 3) self.solr.add([ { 'id': 'doc_6', 'title': 'Newly added doc', } ], commit=False) self.assertEqual(len(self.solr.search('doc')), 3) self.solr.commit() self.assertEqual(len(self.solr.search('doc')), 4) def test_optimize(self): # Make sure it doesn't blow up. Side effects are hard to measure. 
:/ self.assertEqual(len(self.solr.search('doc')), 3) self.solr.add([ { 'id': 'doc_6', 'title': 'Newly added doc', } ], commit=False) self.assertEqual(len(self.solr.search('doc')), 3) self.solr.optimize() self.assertEqual(len(self.solr.search('doc')), 4) def test_extract(self): fake_f = StringIO(""" <html> <head> <meta charset="utf-8"> <meta name="haystack-test" content="test 1234"> <title>Test Title ☃☃</title> </head> <body>foobar</body> </html> """) fake_f.name = "test.html" extracted = self.solr.extract(fake_f) # Verify documented response structure: self.assertIn('contents', extracted) self.assertIn('metadata', extracted) self.assertIn('foobar', extracted['contents']) m = extracted['metadata'] self.assertEqual([fake_f.name], m['stream_name']) self.assertIn('haystack-test', m, "HTML metadata should have been extracted!") self.assertEqual(['test 1234'], m['haystack-test']) # Note the underhanded use of a double snowman to verify both that Tika # correctly decoded entities and that our UTF-8 characters survived the # round-trip: self.assertEqual(['Test Title ☃☃'], m['title']) def test_full_url(self): self.solr.url = 'http://localhost:8983/solr/core0' full_url = self.solr._create_full_url(path='/update') # Make sure trailing and leading slashes do not collide: self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
class DocManager(DocManagerBase):
    """Solr backend: adds, updates, removes and searches documents.

    The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Connect to Solr at `url` and load the schema's field names.

        `auto_commit_interval` is multiplied by 1000 before being stored
        (pysolr works in milliseconds); None disables commitWithin entirely.
        `unique_key` is the Solr field that stores the document's `_id`.
        `chunk_size` is the batch size used by bulk_upsert (<= 0 sends all
        documents in a single request). Extra keyword arguments are ignored.
        """
        self.url = url
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """Return the names listed under result['schema'][field_name].

        Missing 'schema' or `field_name` entries yield an empty list.
        """
        return list(result.get('schema', {}).get(field_name, {}).keys())

    @wrap_exceptions
    def _build_fields(self):
        """Fetch the schema and record declared and dynamic field names.

        Sets `self.field_list` to the declared field names and
        `self._dynamic_field_regexes` to one compiled pattern per dynamic
        field wildcard.
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            # The literal part is escaped so that characters such as '.' in
            # a field name are not interpreted as regex syntax.
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % re.escape(wc_pattern[:-1])))

    def _clean_doc(self, doc, namespace, timestamp):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and
            every value is associated with its dot-separated path of keys
          - inserts namespace and timestamp metadata into the document in
            order to handle rollbacks

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        Note that `doc` itself is modified (its `_id` is moved to the unique
        key and `ns`/`_ts` are added) before the flattened copy is built.

        Raises errors.OperationFailed if `doc` already has 'ns' or '_ts'.
        """
        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = u(doc.pop("_id"))

        # Update namespace and timestamp metadata
        if 'ns' in doc or '_ts' in doc:
            raise errors.OperationFailed(
                'Need to set "ns" and "_ts" fields, but these fields already '
                'exist in the document %r!' % doc)
        doc['ns'] = namespace
        doc['_ts'] = timestamp

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes)
            return dict(
                (k, v) for k, v in flat_doc.items() if include_field(k))
        return flat_doc

    def stop(self):
        """Stops the instance (nothing to release for this backend)."""
        pass

    @wrap_exceptions
    def handle_command(self, doc, namespace, timestamp):
        """Apply a database command document to Solr.

        'dropDatabase' and 'drop' delete the documents whose `ns` field
        matches the mapped database/collection; 'create' is a no-op.

        Raises errors.OperationFailed for 'renameCollection'.
        """
        db, _ = namespace.split('.', 1)
        if doc.get('dropDatabase'):
            for new_db in self.command_helper.map_db(db):
                self.solr.delete(q="ns:%s.*" % new_db,
                                 commit=(self.auto_commit_interval == 0))

        if doc.get('renameCollection'):
            raise errors.OperationFailed(
                "solr_doc_manager does not support replication of "
                " renameCollection")

        if doc.get('create'):
            # nothing to do
            pass

        if doc.get('drop'):
            new_db, coll = self.command_helper.map_collection(db, doc['drop'])
            if new_db:
                self.solr.delete(q="ns:%s.%s" % (new_db, coll),
                                 commit=(self.auto_commit_interval == 0))

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents.

        If `update_spec` has neither '$set' nor '$unset' it is treated as the
        replacement document: it receives `doc`'s '_id' and is returned.
        Otherwise `doc` is modified in place and returned.
        """
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
            update_spec['_id'] = doc['_id']
            return update_spec

        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = [key for key in doc
                           if key == to_set or key.startswith(to_set + '.')]
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value

        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist within
            # the document being updated.
            keys_to_pop = [key for key in doc
                           if key == to_unset
                           or key.startswith(to_unset + '.')]
            for key in keys_to_pop:
                doc.pop(key)
        return doc

    @wrap_exceptions
    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document, or None when no document with that id
        is found in Solr.
        """
        # Commit outstanding changes so that the document to be updated is the
        # same version to which the changes apply.
        self.commit()
        query = "%s:%s" % (self.unique_key, u(document_id))
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            # Remove metadata previously stored by Mongo Connector.
            doc.pop('ns')
            doc.pop('_ts')
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated, namespace, timestamp)
            return updated

    @wrap_exceptions
    def upsert(self, doc, namespace, timestamp):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=u(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc, namespace, timestamp)],
                          commit=False)

    @wrap_exceptions
    def bulk_upsert(self, docs, namespace, timestamp):
        """Update or insert multiple documents into Solr

        docs may be any iterable. When `self.chunk_size` is positive the
        documents are sent in batches of at most that many; otherwise they
        are all passed to a single add call.
        """
        from itertools import islice

        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d, namespace, timestamp) for d in docs)
        if self.chunk_size > 0:
            # islice stops cleanly when `cleaned` is exhausted. Calling
            # next() inside a generator expression instead leaks
            # StopIteration, which PEP 479 turns into a RuntimeError.
            batch = list(islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def insert_file(self, f, namespace, timestamp):
        """Send the file `f` to Solr's update/extract handler.

        The flattened metadata from `f.get_metadata()`, plus `ns` and `_ts`,
        is passed as 'literal.*' query parameters; `f` is the request body.
        """
        params = self._formatter.format_document(f.get_metadata())
        params[self.unique_key] = params.pop('_id')
        params['ns'] = namespace
        params['_ts'] = timestamp
        params = dict(('literal.' + k, v) for k, v in params.items())

        if self.auto_commit_interval == 0:
            params['commit'] = 'true'

        # URLs always use '/', so join by hand rather than with os.path.join
        # (which would insert a backslash on Windows).
        request = Request(
            "%s/update/extract?%s" % (self.url.rstrip('/'),
                                      urlencode(params)))

        request.add_header("Content-type", "application/octet-stream")
        request.data = f
        response = urlopen(request)
        logging.debug(response.read())

    @wrap_exceptions
    def remove(self, document_id, namespace, timestamp):
        """Delete the Solr document whose id is `document_id`.

        `namespace` and `timestamp` are accepted but not used here.
        """
        self.solr.delete(id=u(document_id),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results.

        Each yielded document has its unique key renamed back to '_id'.
        """
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit."""
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns None when the search raises ValueError or finds nothing.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
class DocManager(DocManagerBase):
    """Solr backend: adds, updates, removes and searches documents.

    The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Connect to Solr at `url` and load the schema's field names.

        `auto_commit_interval` is multiplied by 1000 before being stored
        (pysolr works in milliseconds); None disables commitWithin entirely.
        `unique_key` is the Solr field that stores the document's `_id`.
        `chunk_size` is the batch size used by bulk_upsert (<= 0 sends all
        documents in a single request). Extra keyword arguments are ignored.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """Return the names listed under result['schema'][field_name].

        Missing 'schema' or `field_name` entries yield an empty list.
        """
        return list(result.get('schema', {}).get(field_name, {}).keys())

    @wrap_exceptions
    def _build_fields(self):
        """Fetch the schema and record declared and dynamic field names.

        Sets `self.field_list` to the declared field names and
        `self._dynamic_field_regexes` to one compiled pattern per dynamic
        field wildcard.
        """
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            # The literal part is escaped so that characters such as '.' in
            # a field name are not interpreted as regex syntax.
            if wc_pattern.startswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r".*%s\Z" % re.escape(wc_pattern[1:])))
            elif wc_pattern.endswith("*"):
                self._dynamic_field_regexes.append(
                    re.compile(r"\A%s.*" % re.escape(wc_pattern[:-1])))

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and
            every value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        Note that `doc` itself is modified (its `_id` is moved to the unique
        key) before the flattened copy is built.
        """
        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = doc.pop("_id")

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes
                )
            return dict((k, v) for k, v in flat_doc.items()
                        if include_field(k))
        return flat_doc

    def stop(self):
        """Stops the instance (nothing to release for this backend)."""
        pass

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents.

        If `update_spec` has neither '$set' nor '$unset' it is treated as the
        replacement document: it receives `doc`'s '_ts' and 'ns' and is
        returned. Otherwise `doc` is modified in place and returned.
        """
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
            update_spec['_ts'] = doc['_ts']
            update_spec['ns'] = doc['ns']
            return update_spec

        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            keys_to_pop = [key for key in doc
                           if key == to_set or key.startswith(to_set + '.')]
            for key in keys_to_pop:
                doc.pop(key)
            doc[to_set] = value

        for to_unset in update_spec.get("$unset", []):
            # The document is flat, so unsetting a field must also drop its
            # dotted children. A field that is absent is skipped instead of
            # raising KeyError, since $unset may name a missing field.
            keys_to_pop = [key for key in doc
                           if key == to_unset
                           or key.startswith(to_unset + '.')]
            for key in keys_to_pop:
                doc.pop(key)
        return doc

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document, or None when no document with that id
        is found in Solr.
        """
        query = "%s:%s" % (self.unique_key, str(doc['_id']))
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated)
            return updated

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.
        """
        if self.auto_commit_interval is not None:
            self.solr.add([self._clean_doc(doc)],
                          commit=(self.auto_commit_interval == 0),
                          commitWithin=str(self.auto_commit_interval))
        else:
            self.solr.add([self._clean_doc(doc)], commit=False)

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr

        docs may be any iterable. When `self.chunk_size` is positive the
        documents are sent in batches of at most that many; otherwise they
        are all passed to a single add call.
        """
        from itertools import islice

        if self.auto_commit_interval is not None:
            add_kwargs = {
                "commit": (self.auto_commit_interval == 0),
                # Passed as a string, exactly as upsert() does.
                "commitWithin": str(self.auto_commit_interval)
            }
        else:
            add_kwargs = {"commit": False}

        cleaned = (self._clean_doc(d) for d in docs)
        if self.chunk_size > 0:
            # islice stops cleanly when `cleaned` is exhausted. Calling
            # next() inside a generator expression instead leaks
            # StopIteration, which PEP 479 turns into a RuntimeError.
            batch = list(islice(cleaned, self.chunk_size))
            while batch:
                self.solr.add(batch, **add_kwargs)
                batch = list(islice(cleaned, self.chunk_size))
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc["_id"]),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _remove(self):
        """Removes everything"""
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results.

        Each yielded document has its unique key renamed back to '_id'.
        """
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    @wrap_exceptions
    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
        Does not have to be implemented.
        """
        return self._stream_search(query)

    def commit(self):
        """This function is used to force a commit."""
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns None when the search raises ValueError or finds nothing.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
class DocManager():
    """Solr backend: adds, removes and searches documents.

    The DocManager class creates a connection to the backend engine and
    adds/removes documents, and in the case of rollback, searches for them.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit=False, unique_key='_id'):
        """Verify Solr URL and establish a connection.

        Raises SystemError when verify_url(url) returns False. When
        `auto_commit` is true a periodic commit is started immediately.
        """
        if verify_url(url) is False:
            raise SystemError
        self.solr = Solr(url)
        self.unique_key = unique_key
        self.auto_commit = auto_commit
        self.field_list = []
        self.dynamic_field_list = []
        self.build_fields()

        if auto_commit:
            self.run_auto_commit()

    def _parse_fields(self, result, field_name):
        """Return the names listed under result['schema'][field_name].

        Missing 'schema' or `field_name` entries yield an empty list.
        """
        return list(result.get('schema', {}).get(field_name, {}).keys())

    def build_fields(self):
        """Builds the lists of declared and dynamic schema field names.

        If the schema request raises SolrError both lists are left as they
        are (empty after __init__), which makes clean_doc pass documents
        through unfiltered.
        """
        try:
            declared_fields = self.solr._send_request('get', ADMIN_URL)
        except SolrError:
            # Without a schema there is nothing to decode; previously the
            # code carried on and failed on the unbound response variable.
            logging.warning("Could not retrieve the Solr schema; "
                            "documents will not be filtered")
            return
        result = decoder.decode(declared_fields)
        # Plain lists: a stray trailing comma used to wrap field_list in a
        # one-element tuple.
        self.field_list = self._parse_fields(result, 'fields')
        self.dynamic_field_list = self._parse_fields(result, 'dynamicFields')

    def _compile_dynamic_fields(self):
        """Return one compiled regex per dynamic-field wildcard pattern.

        A pattern may carry a single '*' at its beginning or at its end; the
        remaining text is matched literally.
        """
        regexes = []
        for field in self.dynamic_field_list:
            if field.startswith('*'):
                regexes.append(re.compile(r'.*%s\Z' % re.escape(field[1:])))
            elif field.endswith('*'):
                regexes.append(re.compile(r'\A%s.*' % re.escape(field[:-1])))
        return regexes

    def clean_doc(self, doc):
        """Return a copy of `doc` restricted to fields known to the schema.

        This WILL remove fields that aren't in the schema, so the document
        may actually get altered. A field is kept when it is a declared
        field or matches a dynamic-field pattern. When no schema information
        is available at all, `doc` itself is returned unchanged.
        """
        if not self.field_list and not self.dynamic_field_list:
            return doc

        declared = set(self.field_list)
        # Compiled once per document rather than once per key and pattern.
        dynamic = self._compile_dynamic_fields()
        return dict(
            (key, value) for key, value in doc.items()
            if key in declared or any(rx.match(key) for rx in dynamic))

    def stop(self):
        """Stops the instance by ending the periodic auto-commit."""
        self.auto_commit = False

    def upsert(self, doc):
        """Update or insert a document into Solr

        This method should call whatever add/insert/update method exists for
        the backend engine and add the document in there. The input will
        always be one mongo document, represented as a Python dictionary.

        A SolrError is logged rather than raised.
        """
        try:
            self.solr.add([self.clean_doc(doc)], commit=True)
        except SolrError:
            logging.error("Could not insert %r into Solr" % (doc, ))

    def remove(self, doc):
        """Removes documents from Solr

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc[self.unique_key]), commit=True)

    def _remove(self):
        """Removes everything"""
        self.solr.delete(q='*:*')

    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self.solr.search(query, rows=100000000)

    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query
        Does not have to be implemented.
        """
        return self.solr.search(query, rows=200)

    def commit(self):
        """This function is used to force a commit."""
        retry_until_ok(self.solr.commit)

    def run_auto_commit(self):
        """Periodically commits to the Solr server.

        Commits now and, while `self.auto_commit` is true, schedules itself
        to run again after one second.
        """
        self.solr.commit()
        if self.auto_commit:
            Timer(1, self.run_auto_commit).start()

    def get_last_doc(self):
        """Returns the last document stored in the Solr engine.

        Returns None when the search raises ValueError or finds nothing.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        if len(result) == 0:
            return None

        return result.docs[0]
class DocManager(DocManagerBase):
    """Connects to a Solr backend and adds, removes and searches documents.

    The reason for storing id/doc pairs as opposed to doc's is so that
    multiple updates to the same doc reflect the most up to date version as
    opposed to multiple, slightly different versions of a doc.
    """

    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
        """Establish a Solr connection and load the schema field names.

        auto_commit_interval is multiplied by 1000 before being stored (see
        the comment below); None disables commitWithin and explicit commits
        on write.  unique_key is the Solr field that receives the document's
        _id.  chunk_size is the batch size used by bulk_upsert; a value <= 0
        sends everything in one request.  Extra keyword arguments are
        accepted and ignored.
        """
        self.solr = Solr(url)
        self.unique_key = unique_key
        # pysolr does things in milliseconds
        if auto_commit_interval is not None:
            self.auto_commit_interval = auto_commit_interval * 1000
        else:
            self.auto_commit_interval = None
        self.chunk_size = chunk_size
        self.field_list = []
        self._build_fields()
        self._formatter = DocumentFlattener()

    def _parse_fields(self, result, field_name):
        """Return the names listed under result['schema'][field_name].

        Missing 'schema' or field_name entries yield an empty list.
        """
        return list(result.get('schema', {}).get(field_name, {}).keys())

    @staticmethod
    def _wildcard_to_regex(wc_pattern):
        """Compile a dynamic field name with one leading or trailing '*'.

        The literal part is escaped so that characters such as '.' in a
        field name are not treated as regex syntax.  Returns None when the
        name has no wildcard at either end.
        """
        if wc_pattern.startswith("*"):
            return re.compile(r".*%s\Z" % re.escape(wc_pattern[1:]))
        if wc_pattern.endswith("*"):
            return re.compile(r"\A%s.*" % re.escape(wc_pattern[:-1]))
        return None

    @wrap_exceptions
    def _build_fields(self):
        """Load declared field names and dynamic field patterns from Solr."""
        declared_fields = self.solr._send_request('get', ADMIN_URL)
        result = decoder.decode(declared_fields)
        self.field_list = self._parse_fields(result, 'fields')

        # Build regular expressions to match dynamic fields.
        # dynamic field names may have exactly one wildcard, either at
        # the beginning or the end of the name
        self._dynamic_field_regexes = []
        for wc_pattern in self._parse_fields(result, 'dynamicFields'):
            regex = self._wildcard_to_regex(wc_pattern)
            if regex is not None:
                self._dynamic_field_regexes.append(regex)

    def _clean_doc(self, doc):
        """Reformats the given document before insertion into Solr.

        This method reformats the document in the following ways:
          - removes extraneous fields that aren't defined in schema.xml
          - unwinds arrays in order to find and later flatten sub-documents
          - flattens the document so that there are no sub-documents, and
            every value is associated with its dot-separated path of keys

        An example:
          {"a": 2,
           "b": {
             "c": {
               "d": 5
             }
           },
           "e": [6, 7, 8]
          }

        becomes:
          {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8}

        Note that doc is modified in place: its '_id' entry is moved to
        the unique key.
        """
        # Translate the _id field to whatever unique key we're using.
        # _id may not exist in the doc, if we retrieved it from Solr
        # as part of update.
        if '_id' in doc:
            doc[self.unique_key] = doc.pop("_id")

        # SOLR cannot index fields within sub-documents, so flatten documents
        # with the dot-separated path to each value as the respective key
        flat_doc = self._formatter.format_document(doc)

        # Only include fields that are explicitly provided in the
        # schema or match one of the dynamic field patterns, if
        # we were able to retrieve the schema
        if len(self.field_list) + len(self._dynamic_field_regexes) > 0:
            def include_field(field):
                return field in self.field_list or any(
                    regex.match(field)
                    for regex in self._dynamic_field_regexes
                )
            return dict((k, v) for k, v in flat_doc.items()
                        if include_field(k))
        return flat_doc

    def stop(self):
        """Stops the instance (nothing to do for this manager)."""
        pass

    @staticmethod
    def _drop_path(doc, path):
        """Remove path and every dotted descendant of it from the flat doc."""
        prefix = path + '.'
        # Collect first: the dict must not change size while iterating.
        doomed = [key for key in doc
                  if key == path or key.startswith(prefix)]
        for key in doomed:
            doc.pop(key)

    def apply_update(self, doc, update_spec):
        """Override DocManagerBase.apply_update to have flat documents.

        Without '$set'/'$unset', update_spec is the replacement document: it
        receives doc's '_ts', 'ns' and '_id' and is returned.  Otherwise doc
        is modified in place and returned.
        """
        # Replace a whole document
        if '$set' not in update_spec and '$unset' not in update_spec:
            # update spec contains the new document
            update_spec['_ts'] = doc['_ts']
            update_spec['ns'] = doc['ns']
            update_spec['_id'] = doc['_id']
            return update_spec

        for to_set in update_spec.get("$set", []):
            value = update_spec['$set'][to_set]
            # Find dotted-path to the value, remove that key from doc, then
            # put value at key:
            self._drop_path(doc, to_set)
            doc[to_set] = value

        for to_unset in update_spec.get("$unset", []):
            # MongoDB < 2.5.2 reports $unset for fields that don't exist
            # within the document being updated.
            self._drop_path(doc, to_unset)
        return doc

    @wrap_exceptions
    def update(self, doc, update_spec):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.

        Returns the updated document, or None if no stored document was
        found.
        """
        # Commit outstanding changes so that the document to be updated is
        # the same version to which the changes apply.
        self.commit()
        query = "%s:%s" % (self.unique_key, str(doc['_id']))
        results = self.solr.search(query)
        if not len(results):
            # Document may not be retrievable yet
            self.commit()
            results = self.solr.search(query)
        # Results is an iterable containing only 1 result
        for doc in results:
            updated = self.apply_update(doc, update_spec)
            # A _version_ of 0 will always apply the update
            updated['_version_'] = 0
            self.upsert(updated)
            return updated

    def _add_kwargs(self):
        """Return the commit-related keyword arguments for solr.add."""
        if self.auto_commit_interval is None:
            return {"commit": False}
        return {
            "commit": (self.auto_commit_interval == 0),
            "commitWithin": str(self.auto_commit_interval)
        }

    @wrap_exceptions
    def upsert(self, doc):
        """Update or insert a document into Solr.

        The input is one mongo document, represented as a Python dictionary.
        """
        self.solr.add([self._clean_doc(doc)], **self._add_kwargs())

    @wrap_exceptions
    def bulk_upsert(self, docs):
        """Update or insert multiple documents into Solr.

        docs may be any iterable.  With a positive chunk_size the documents
        are sent in batches of at most that many; otherwise they are sent in
        a single request.
        """
        add_kwargs = self._add_kwargs()
        cleaned = (self._clean_doc(d) for d in docs)

        if self.chunk_size > 0:
            # Plain loop instead of next() inside a generator expression:
            # under PEP 479 a StopIteration raised there becomes a
            # RuntimeError once the input is exhausted.
            batch = []
            for cleaned_doc in cleaned:
                batch.append(cleaned_doc)
                if len(batch) >= self.chunk_size:
                    self.solr.add(batch, **add_kwargs)
                    batch = []
            if batch:
                self.solr.add(batch, **add_kwargs)
        else:
            self.solr.add(cleaned, **add_kwargs)

    @wrap_exceptions
    def remove(self, doc):
        """Removes documents from Solr.

        The input is a python dictionary that represents a mongo document.
        """
        self.solr.delete(id=str(doc["_id"]),
                         commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _remove(self):
        """Removes everything."""
        self.solr.delete(q='*:*', commit=(self.auto_commit_interval == 0))

    @wrap_exceptions
    def _stream_search(self, query):
        """Helper method for iterating over Solr search results.

        Each yielded document has its unique key moved back to '_id'.
        """
        for doc in self.solr.search(query, rows=100000000):
            if self.unique_key != "_id":
                doc["_id"] = doc.pop(self.unique_key)
            yield doc

    @wrap_exceptions
    def search(self, start_ts, end_ts):
        """Called to query Solr for documents in a time range."""
        query = '_ts: [%s TO %s]' % (start_ts, end_ts)
        return self._stream_search(query)

    @wrap_exceptions
    def _search(self, query):
        """For test purposes only. Performs search on Solr with given query.

        Does not have to be implemented.
        """
        return self._stream_search(query)

    def commit(self):
        """Force a commit, retrying through retry_until_ok."""
        retry_until_ok(self.solr.commit)

    @wrap_exceptions
    def get_last_doc(self):
        """Return the document with the highest _ts, or None if there is none.

        None is also returned when the search raises ValueError.
        """
        # search everything, sort by descending timestamp, return 1 row
        try:
            result = self.solr.search('*:*', sort='_ts desc', rows=1)
        except ValueError:
            return None

        for r in result:
            r['_id'] = r.pop(self.unique_key)
            return r
        return None
class DocManager: """The DocManager class creates a connection to the backend engine and adds/removes documents, and in the case of rollback, searches for them. The reason for storing id/doc pairs as opposed to doc's is so that multiple updates to the same doc reflect the most up to date version as opposed to multiple, slightly different versions of a doc. """ def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL, unique_key="_id", **kwargs): """Verify Solr URL and establish a connection. """ self.solr = Solr(url) self.unique_key = unique_key # pysolr does things in milliseconds if auto_commit_interval is not None: self.auto_commit_interval = auto_commit_interval * 1000 else: self.auto_commit_interval = None self.field_list = [] self._build_fields() def _parse_fields(self, result, field_name): """ If Schema access, parse fields and build respective lists """ field_list = [] for key, value in result.get("schema", {}).get(field_name, {}).items(): if key not in field_list: field_list.append(key) return field_list def _build_fields(self): """ Builds a list of valid fields """ declared_fields = self.solr._send_request("get", ADMIN_URL) result = decoder.decode(declared_fields) self.field_list = self._parse_fields(result, "fields") # Build regular expressions to match dynamic fields. # dynamic field names may have exactly one wildcard, either at # the beginning or the end of the name self._dynamic_field_regexes = [] for wc_pattern in self._parse_fields(result, "dynamicFields"): if wc_pattern[0] == "*": self._dynamic_field_regexes.append(re.compile("\w%s\Z" % wc_pattern)) elif wc_pattern[-1] == "*": self._dynamic_field_regexes.append(re.compile("\A%s\w*" % wc_pattern[:-1])) def _clean_doc(self, doc): """Reformats the given document before insertion into Solr. 
This method reformats the document in the following ways: - removes extraneous fields that aren't defined in schema.xml - unwinds arrays in order to find and later flatten sub-documents - flattens the document so that there are no sub-documents, and every value is associated with its dot-separated path of keys An example: {"a": 2, "b": { "c": { "d": 5 } }, "e": [6, 7, 8] } becomes: {"a": 2, "b.c.d": 5, "e.0": 6, "e.1": 7, "e.2": 8} """ # SOLR cannot index fields within sub-documents, so flatten documents # with the dot-separated path to each value as the respective key def flattened(doc): def flattened_kernel(doc, path): for k, v in doc.items(): path.append(k) if isinstance(v, dict): for inner_k, inner_v in flattened_kernel(v, path): yield inner_k, inner_v elif isinstance(v, list): for li, lv in enumerate(v): path.append(str(li)) if isinstance(lv, dict): for dk, dv in flattened_kernel(lv, path): yield dk, dv else: yield ".".join(path), lv path.pop() else: yield ".".join(path), v path.pop() return dict(flattened_kernel(doc, [])) # Translate the _id field to whatever unique key we're using doc[self.unique_key] = doc["_id"] flat_doc = flattened(doc) # Only include fields that are explicitly provided in the # schema or match one of the dynamic field patterns, if # we were able to retrieve the schema if len(self.field_list) + len(self._dynamic_field_regexes) > 0: def include_field(field): return field in self.field_list or any(regex.match(field) for regex in self._dynamic_field_regexes) return dict((k, v) for k, v in flat_doc.items() if include_field(k)) return flat_doc def stop(self): """ Stops the instance """ pass def upsert(self, doc): """Update or insert a document into Solr This method should call whatever add/insert/update method exists for the backend engine and add the document in there. The input will always be one mongo document, represented as a Python dictionary. 
""" try: if self.auto_commit_interval is not None: self.solr.add( [self._clean_doc(doc)], commit=(self.auto_commit_interval == 0), commitWithin=str(self.auto_commit_interval), ) else: self.solr.add([self._clean_doc(doc)], commit=False) except SolrError: raise errors.OperationFailed("Could not insert %r into Solr" % bsjson.dumps(doc)) def bulk_upsert(self, docs): """Update or insert multiple documents into Solr docs may be any iterable """ try: cleaned = (self._clean_doc(d) for d in docs) if self.auto_commit_interval is not None: self.solr.add( cleaned, commit=(self.auto_commit_interval == 0), commitWithin=str(self.auto_commit_interval) ) else: self.solr.add(cleaned, commit=False) except SolrError: raise errors.OperationFailed("Could not bulk-insert documents into Solr") def remove(self, doc): """Removes documents from Solr The input is a python dictionary that represents a mongo document. """ self.solr.delete(id=str(doc[self.unique_key]), commit=(self.auto_commit_interval == 0)) def _remove(self): """Removes everything """ self.solr.delete(q="*:*", commit=(self.auto_commit_interval == 0)) def search(self, start_ts, end_ts): """Called to query Solr for documents in a time range. """ query = "_ts: [%s TO %s]" % (start_ts, end_ts) return self.solr.search(query, rows=100000000) def _search(self, query): """For test purposes only. Performs search on Solr with given query Does not have to be implemented. """ return self.solr.search(query, rows=200) def commit(self): """This function is used to force a commit. """ retry_until_ok(self.solr.commit) def get_last_doc(self): """Returns the last document stored in the Solr engine. """ # search everything, sort by descending timestamp, return 1 row try: result = self.solr.search("*:*", sort="_ts desc", rows=1) except ValueError: return None if len(result) == 0: return None return result.docs[0]
class SolrTestCase(unittest.TestCase): def setUp(self): super(SolrTestCase, self).setUp() self.default_solr = Solr('http://*****:*****@unittest.skipIf(sys.version_info < (2, 7), reason=u'Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing') def test__scrape_response_coyote_xml(self): resp_3 = self.solr._scrape_response({'server': 'coyote'}, '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n') self.assertEqual(resp_3, ("Invalid Date String:'2015-03-23 10:43:33'", "Invalid Date String:'2015-03-23 10:43:33'")) # Valid XML with a traceback resp_4 = self.solr._scrape_response({'server': 'coyote'}, """<?xml version="1.0"?> <response> <lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst> </response>""") self.assertEqual(resp_4, (u"Internal Server Error", u"org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)")) def test__scrape_response_tomcat(self): """Tests for Tomcat error responses""" resp_0 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>') self.assertEqual(resp_0, ('Something broke!', '')) # Invalid XML bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>' reason, full_html = self.solr._scrape_response({'server': 'coyote'}, bogus_xml) 
self.assertEqual(reason, None) self.assertEqual(full_html, bogus_xml.replace("\n", "")) def test__from_python(self): self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z') self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z') self.assertEqual(self.solr._from_python(True), 'true') self.assertEqual(self.solr._from_python(False), 'false') self.assertEqual(self.solr._from_python(1), '1') self.assertEqual(self.solr._from_python(1.2), '1.2') self.assertEqual(self.solr._from_python(b'hello'), 'hello') self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃') self.assertEqual(self.solr._from_python('\x01test\x02'), 'test') def test__to_python(self): self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18)) self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28)) self.assertEqual(self.solr._to_python('true'), True) self.assertEqual(self.solr._to_python('false'), False) self.assertEqual(self.solr._to_python(1), 1) self.assertEqual(self.solr._to_python(1.2), 1.2) self.assertEqual(self.solr._to_python(b'hello'), 'hello') self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃') self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo') self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo') self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")') def test__is_null_value(self): self.assertTrue(self.solr._is_null_value(None)) self.assertTrue(self.solr._is_null_value('')) self.assertFalse(self.solr._is_null_value('Hello')) self.assertFalse(self.solr._is_null_value(1)) def test_search(self): results = self.solr.search('doc') self.assertEqual(len(results), 3) # search should default to 'select' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('select/?')) results = self.solr.search('example') 
self.assertEqual(len(results), 2) results = self.solr.search('nothing') self.assertEqual(len(results), 0) # Advanced options. results = self.solr.search('doc', **{ 'debug': 'true', 'hl': 'true', 'hl.fragsize': 8, 'facet': 'on', 'facet.field': 'popularity', 'spellcheck': 'true', 'spellcheck.collate': 'true', 'spellcheck.count': 1, # TODO: Can't get these working in my test setup. # 'group': 'true', # 'group.field': 'id', }) self.assertEqual(len(results), 3) self.assertTrue('explain' in results.debug) self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}}) self.assertEqual(results.spellcheck, {}) self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0]) self.assertTrue(results.qtime is not None) # TODO: Can't get these working in my test setup. # self.assertEqual(results.grouped, '') # search should support custom handlers with self.assertRaises(SolrError): self.solr.search('doc', handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) def test_more_like_this(self): results = self.solr.more_like_this('id:doc_1', 'text') self.assertEqual(len(results), 0) # more_like_this should default to 'mlt' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('mlt/?')) # more_like_this should support custom handlers with self.assertRaises(SolrError): self.solr.more_like_this('id:doc_1', 'text', handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) def test_suggest_terms(self): results = self.solr.suggest_terms('title', '') self.assertEqual(len(results), 1) self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]}) # suggest_terms should default to 'mlt' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('terms/?')) # 
suggest_terms should support custom handlers with self.assertRaises(SolrError): self.solr.suggest_terms('title', '', handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) def test__build_doc(self): doc = { 'id': 'doc_1', 'title': 'Example doc ☃ 1', 'price': 12.59, 'popularity': 10, } doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8')) self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml) self.assertTrue('<field name="id">doc_1</field>' in doc_xml) self.assertEqual(len(doc_xml), 152) def test_add(self): self.assertEqual(len(self.solr.search('doc')), 3) self.assertEqual(len(self.solr.search('example')), 2) self.solr.add([ { 'id': 'doc_6', 'title': 'Newly added doc', }, { 'id': 'doc_7', 'title': 'Another example doc', }, ]) # add should default to 'update' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('update/?')) self.assertEqual(len(self.solr.search('doc')), 5) self.assertEqual(len(self.solr.search('example')), 3) # add should support custom handlers with self.assertRaises(SolrError): self.solr.add([], handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) def test_add_with_boost(self): self.assertEqual(len(self.solr.search('doc')), 3) self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}], boost={'title': 10.0}) self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}], boost={'title': 0}) res = self.solr.search('doc') self.assertEqual(len(res), 5) self.assertEqual('doc_6', res.docs[0]['id']) def test_field_update(self): originalDocs = self.solr.search('doc') self.assertEqual(len(originalDocs), 3) updateList = [] for i, doc in enumerate(originalDocs): updateList.append( {'id': doc['id'], 'popularity': 5} ) self.solr.add(updateList, fieldUpdates={'popularity': 'inc'}) updatedDocs = self.solr.search('doc') 
self.assertEqual(len(updatedDocs), 3) for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)): self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys())) self.assertEqual(updatedDoc['popularity'], originalDoc['popularity'] + 5) self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'popularity'])) self.solr.add([ { 'id': 'multivalued_1', 'title': 'Multivalued doc 1', 'word_ss': ['alpha', 'beta'], }, { 'id': 'multivalued_2', 'title': 'Multivalued doc 2', 'word_ss': ['charlie', 'delta'], }, ]) originalDocs = self.solr.search('multivalued') self.assertEqual(len(originalDocs), 2) updateList = [] for i, doc in enumerate(originalDocs): updateList.append( {'id': doc['id'], 'word_ss': ['epsilon', 'gamma']} ) self.solr.add(updateList, fieldUpdates={'word_ss': 'add'}) updatedDocs = self.solr.search('multivalued') self.assertEqual(len(updatedDocs), 2) for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)): self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys())) self.assertEqual(updatedDoc['word_ss'], originalDoc['word_ss'] + ['epsilon', 'gamma']) self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'word_ss'])) def test_delete(self): self.assertEqual(len(self.solr.search('doc')), 3) self.solr.delete(id='doc_1') # delete should default to 'update' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('update/?')) self.assertEqual(len(self.solr.search('doc')), 2) self.solr.delete(q='price:[0 TO 15]') self.assertEqual(len(self.solr.search('doc')), 1) self.assertEqual(len(self.solr.search('*:*')), 1) self.solr.delete(q='*:*') self.assertEqual(len(self.solr.search('*:*')), 0) # Need at least one. self.assertRaises(ValueError, self.solr.delete) # Can't have both. 
self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar') # delete should support custom handlers with self.assertRaises(SolrError): self.solr.delete(id='doc_1', handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) def test_commit(self): self.assertEqual(len(self.solr.search('doc')), 3) self.solr.add([ { 'id': 'doc_6', 'title': 'Newly added doc', } ], commit=False) self.assertEqual(len(self.solr.search('doc')), 3) self.solr.commit() # commit should default to 'update' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('update/?')) self.assertEqual(len(self.solr.search('doc')), 4) # commit should support custom handlers with self.assertRaises(SolrError): self.solr.commit(handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) def test_optimize(self): # Make sure it doesn't blow up. Side effects are hard to measure. 
:/ self.assertEqual(len(self.solr.search('doc')), 3) self.solr.add([ { 'id': 'doc_6', 'title': 'Newly added doc', } ], commit=False) self.assertEqual(len(self.solr.search('doc')), 3) self.solr.optimize() # optimize should default to 'update' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('update/?')) self.assertEqual(len(self.solr.search('doc')), 4) # optimize should support custom handlers with self.assertRaises(SolrError): self.solr.optimize(handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) def test_extract(self): fake_f = StringIO(""" <html> <head> <meta charset="utf-8"> <meta name="haystack-test" content="test 1234"> <title>Test Title ☃☃</title> </head> <body>foobar</body> </html> """) fake_f.name = "test.html" extracted = self.solr.extract(fake_f) # extract should default to 'update/extract' handler args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('update/extract')) # extract should support custom handlers with self.assertRaises(SolrError): self.solr.extract(fake_f, handler='fakehandler') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('fakehandler')) # Verify documented response structure: self.assertIn('contents', extracted) self.assertIn('metadata', extracted) self.assertIn('foobar', extracted['contents']) m = extracted['metadata'] self.assertEqual([fake_f.name], m['stream_name']) self.assertIn('haystack-test', m, "HTML metadata should have been extracted!") self.assertEqual(['test 1234'], m['haystack-test']) # Note the underhanded use of a double snowman to verify both that Tika # correctly decoded entities and that our UTF-8 characters survived the # round-trip: self.assertEqual(['Test Title ☃☃'], m['title']) def test_full_url(self): self.solr.url = 'http://localhost:8983/solr/core0' full_url = self.solr._create_full_url(path='/update') # Make sure trailing and leading 
slashes do not collide: self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update') def test_request_handler(self): before_test_use_qt_param = self.solr.use_qt_param before_test_search_handler = self.solr.search_handler self.solr.use_qt_param = True response = self.solr.search('my query') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('select')) response = self.solr.search('my', handler='/autocomplete') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('select')) self.assertTrue(args[1].find("qt=%2Fautocomplete") > -1) self.solr.search_handler = '/autocomplete' response = self.solr.search('my') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('select')) self.assertTrue(args[1].find("qt=%2Fautocomplete") > -1) self.solr.use_qt_param = False # will change the path, so expect a 404 with self.assertRaises(SolrError): response = self.solr.search('my') args, kwargs = self.solr._send_request.call_args self.assertTrue(args[1].startswith('/autocomplete')) self.assertTrue(args[1].find("qt=%2Fautocomplete") < 0) # reset the values to what they were before the test self.solr.use_qt_param = before_test_use_qt_param self.solr.search_handler = before_test_search_handler