import json

from SolrClient import SolrClient


def index_data():
    # get_data() is assumed to return a list of dicts ready for indexing
    docs = get_data()
    client = SolrClient('http://localhost:8983/solr')
    client.index_json('stocks', json.dumps(docs))
    client.commit('stocks')
def index_json():
    client = SolrClient('http://localhost:8983/solr')
    docs = [
        {'id': '8', 'field8': 'value8'},
    ]
    client.index_json('test', json.dumps(docs))
    client.commit('test')
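# A minimal sketch (not part of the original snippets) of how the document
# indexed above could be verified with SolrClient's query API, assuming the
# same local Solr instance and 'test' collection:
def verify_index_json():
    client = SolrClient('http://localhost:8983/solr')
    res = client.query('test', {'q': 'id:8'})
    # get_results_count() returns the number of docs in this response
    assert res.get_results_count() == 1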
def test_index_multiproc(self):
    index = IndexQ(test_config['indexqbase'], 'testq')
    solr = SolrClient(test_config['SOLR_SERVER'],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
    buff = []
    files = []
    for doc in self.docs:
        files.append(index.add(doc, finalize=True))
    index.index(solr, test_config['SOLR_COLLECTION'], threads=10)
    solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
    for doc in self.docs:
        res = solr.query(test_config['SOLR_COLLECTION'],
                         {'q': 'id:{}'.format(doc['id'])})
        self.assertTrue(res.get_results_count() == 1)
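# A minimal sketch of the IndexQ workflow exercised by the test above: queue
# documents to disk, then drain the queue into Solr with multiple threads.
# The paths, host, and collection name are placeholders.
from SolrClient import SolrClient, IndexQ

index = IndexQ('/tmp/indexq', 'testq')
for doc in [{'id': '1'}, {'id': '2'}]:
    index.add(doc, finalize=True)  # finalize writes the queued file out immediately
solr = SolrClient('http://localhost:8983/solr')
index.index(solr, 'my_collection', threads=10)
solr.commit('my_collection', openSearcher=True)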
class ClientTestQuery(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.collections.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.collections.create_field(
                    test_config['SOLR_COLLECTION'], field)
            except:
                pass
        # Index some data
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)

    def test_basic_query(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))

    def test_facet(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': 'true',
            'facet.field': 'facet_test',
        })
        local_facets = {}
        for doc in self.docs:
            try:
                local_facets[doc['facet_test']] += 1
            except:
                local_facets[doc['facet_test']] = 1
        try:
            self.assertDictEqual(local_facets, r.get_facets()['facet_test'])
        except Exception as e:
            logging.info("local")
            logging.info(local_facets)
            logging.info("facets")
            logging.info(r.get_facets())
            raise

    def test_facet_with_fq(self):
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'facet': True,
            'facet.field': 'facet_test',
        })
        first_facet_field = list(r.get_facets()['facet_test'].keys())[0]
        first_facet_field_count = r.get_facets()['facet_test'][
            first_facet_field]
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.field': 'facet_test',
                'fq': 'facet_test:{}'.format(first_facet_field)
            })
        self.assertEqual(r.get_num_found(), first_facet_field_count)

    def test_facet_range(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.range': 'price',
                'facet.range.start': 0,
                'facet.range.end': 100,
                'facet.range.gap': 10
            })
        prices = [doc['price'] for doc in self.docs]
        div = lambda x: str(x // 10 * 10)
        out = {}
        for k, g in itertools.groupby(sorted(prices), div):
            out[k] = len(list(g)) or 0
        self.assertDictEqual(out, res.get_facets_ranges()['price'])

    def test_facet_pivot(self):
        res = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': True,
                'facet.pivot': ['facet_test,price', 'facet_test,id']
            })
        out = {}
        for doc in self.docs:
            if doc['facet_test'] not in out:
                out[doc['facet_test']] = {}
            if doc['price'] not in out[doc['facet_test']]:
                out[doc['facet_test']][doc['price']] = 1
            else:
                out[doc['facet_test']][doc['price']] += 1
        self.assertDictEqual(out, res.get_facet_pivot()['facet_test,price'])

    def test_get_field_values_as_list(self):
        res = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
        })
        results = res.get_field_values_as_list('product_name_exact')
        docs = res.docs
        temp = []
        for doc in docs:
            if 'product_name_exact' in doc:
                temp.append(doc['product_name_exact'])
        self.assertEqual(results, temp)

    def test_get_facet_values_as_list(self):
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'facet': 'true',
                'facet.limit': -1,
                'facet.field': 'facet_test',
            })
        self.assertEqual(
            sorted(r.data['facet_counts']['facet_fields']['facet_test'][1::2]),
            sorted(r.get_facet_values_as_list('facet_test')))

    def test_grouped_count_1(self):
        ''' Get the number of groups '''
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'group': True,
                'group.field': 'id',
                'group.ngroups': True,
            })
        self.assertEqual(r.get_ngroups(), 50)
        self.assertEqual(r.get_ngroups('id'), 50)

    def test_grouped_docs(self):
        ''' Get a dict of grouped docs '''
        r = self.solr.query(
            test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'group': True,
                'group.field': 'id',
                'group.ngroups': True,
            })
        self.assertEqual(len(r.docs), 10)
        self.assertTrue('doclist' in r.docs[0])

    def test_flat_groups(self):
        ''' Get a flat list of grouped docs '''
        r = self.solr.query(test_config['SOLR_COLLECTION'], {
            'q': '*:*',
            'group': True,
            'group.field': 'id'
        })
        flats = r.get_flat_groups()
        self.assertEqual(len(flats), 10)
        self.assertTrue('date' in flats[0])

    def test_json_facet(self):
        ''' Turn nested JSON facet buckets into a dict '''
        # Just lazy getting a new response object
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        a = r.get_jsonfacet_counts_as_dict(
            'test', {
                'count': 50,
                'test': {
                    'buckets': [{
                        'count': 10,
                        'pr': {
                            'buckets': [{
                                'count': 2,
                                'unique': 1,
                                'val': 79
                            }, {
                                'count': 1,
                                'unique': 1,
                                'val': 9
                            }]
                        },
                        'pr_sum': 639.0,
                        'val': 'consectetur'
                    }, {
                        'count': 8,
                        'pr': {
                            'buckets': [
                                {'count': 1, 'unique': 1, 'val': 9},
                                {'count': 1, 'unique': 1, 'val': 31},
                                {'count': 1, 'unique': 1, 'val': 33},
                            ]
                        },
                        'pr_sum': 420.0,
                        'val': 'auctor'
                    }, {
                        'count': 8,
                        'pr': {
                            'buckets': [
                                {'count': 2, 'unique': 1, 'val': 94},
                                {'count': 1, 'unique': 1, 'val': 25},
                            ]
                        },
                        'pr_sum': 501.0,
                        'val': 'nulla'
                    }]
                }
            })
        b = {
            'test': {
                'auctor': {
                    'count': 8,
                    'pr': {
                        9: {'count': 1, 'unique': 1},
                        31: {'count': 1, 'unique': 1},
                        33: {'count': 1, 'unique': 1}
                    },
                    'pr_sum': 420.0
                },
                'consectetur': {
                    'count': 10,
                    'pr': {
                        9: {'count': 1, 'unique': 1},
                        79: {'count': 2, 'unique': 1}
                    },
                    'pr_sum': 639.0
                },
                'nulla': {
                    'count': 8,
                    'pr': {
                        25: {'count': 1, 'unique': 1},
                        94: {'count': 2, 'unique': 1}
                    },
                    'pr_sum': 501.0
                }
            }
        }
        self.assertEqual(a, b)
class ReindexerTests(unittest.TestCase):

    # Methods to create the schema in the collections
    def create_fields(self):
        for coll in self.colls:
            logging.debug("Creating fields for {}".format(coll))
            for field in test_config['collections']['fields']:
                try:
                    self.solr.schema.create_field(coll, field)
                except ValueError:
                    # Field already exists, probably
                    pass

    def create_copy_fields(self):
        for coll in self.colls:
            logging.debug("Creating copy fields for {}".format(coll))
            for field in test_config['collections']['copy_fields']:
                try:
                    self.solr.schema.create_copy_field(coll, field)
                except ValueError:
                    # Field already exists, probably
                    pass

    def setUp(self):
        [self.solr.delete_doc_by_id(coll, '*') for coll in self.colls]
        [self.solr.commit(coll, openSearcher=True) for coll in self.colls]

    def _index_docs(self, numDocs, coll):
        '''
        Generates and indexes random data while maintaining counts of items
        in various date ranges. These counts in self.date_counts are used
        later to validate some reindexing methods. Brace yourself or have a
        drink.....
        '''
        self.docs = self.rand_docs.get_docs(numDocs)
        sdate = datetime.datetime.now() - datetime.timedelta(days=180)
        edate = datetime.datetime.now() + datetime.timedelta(days=30)
        self._start_date = sdate
        self._end_date = edate
        import random
        # Assign random times to the generated documents to spread them out
        # over multiple time ranges
        hours = (edate - sdate).days * 24
        hour_range = [x for x in range(int(hours))]
        self.date_counts = {}
        # Save the newest and oldest timestamps as well as assign them to the
        # first and second doc
        self.docs[0]['date'] = sdate.isoformat() + 'Z'
        self.date_counts[sdate.date().isoformat()] = 1
        self.docs[1]['date'] = edate.isoformat() + 'Z'
        self.date_counts[edate.date().isoformat()] = 1
        for doc in self.docs[2:]:
            # Make a new date and store a count of it so I can compare later
            new_date = (sdate +
                        datetime.timedelta(hours=random.choice(hour_range)))
            new_date_s = new_date.date().isoformat()
            if new_date_s in self.date_counts:
                self.date_counts[new_date_s] += 1
            else:
                self.date_counts[new_date_s] = 1
            doc['date'] = new_date.isoformat() + 'Z'
        self.solr.index_json(coll, json.dumps(self.docs))
        self.solr.commit(coll, openSearcher=True)
        time.sleep(10)

    def get_all_json_from_indexq(self, index):
        files = index.get_all_as_list()
        out = []
        for efile in files:
            if efile.endswith('.gz'):
                f = gzip.open(efile, 'rt', encoding='utf-8')
            else:
                f = open(efile)
            f_data = json.load(f)
            f.close()
            out.extend(f_data)
        return out

    @classmethod
    def setUpClass(self):
        logging.debug("Starting to run Reindexer Tests")
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.colls = [
            test_config['SOLR_REINDEXER_COLLECTION_S'],
            test_config['SOLR_REINDEXER_COLLECTION_D']
        ]
        self.rand_docs = RandomTestData()

    def test_solr_to_indexq(self):
        '''
        Will export documents from Solr and put them into an IndexQ.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        self._index_docs(5000, self.colls[0])
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        from_solr = self.solr.query('source_coll', {
            'q': '*:*',
            'rows': 5000
        }).docs
        from_solr = reindexer._trim_fields(from_solr)
        self.assertEqual(sorted(from_files, key=lambda x: x['id']),
                         sorted(from_solr, key=lambda x: x['id']))

    def test_ignore_fields(self):
        '''
        Checks the default ignored fields.
        '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        for field in ['_version_', 'product_name_exact']:
            self.assertTrue(field in reindexer._ignore_fields)

    def test_ignore_fields_disable(self):
        ''' Checks to make sure ignore_fields can be disabled '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index,
                              ignore_fields=False)
        self.assertEqual(reindexer._ignore_fields, False)

    def test_ignore_fields_override(self):
        ''' Checks to make sure the ignore_fields override works '''
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index,
                              ignore_fields=['_text_', '_any_other_field'])
        self.assertEqual(reindexer._ignore_fields,
                         ['_text_', '_any_other_field'])

    def test_get_copy_fields(self):
        ''' Tests the method to get copy fields from Solr. '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter')
        self.assertEqual(reindexer._get_copy_fields(), [
            field['dest']
            for field in self.solr.schema.get_schema_copyfields(self.colls[0])
        ])

    def test_query_gen(self):
        ''' Tests the query generation method. '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter')
        self.assertEqual(
            reindexer._get_query('cursor'), {
                'cursorMark': 'cursor',
                'rows': reindexer._rows,
                'q': '*:*',
                'sort': 'id desc'
            })

    def test_query_gen_pershard_distrib(self):
        ''' Tests query generation with per-shard distrib=false. '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter',
                              per_shard=True)
        q = reindexer._get_query('cursor')
        self.assertTrue('distrib' in q and q['distrib'] == 'false')

    def test_query_gen_date(self):
        ''' Tests query generation with a date field. '''
        reindexer = Reindexer(source=self.solr,
                              source_coll=self.colls[0],
                              dest=self.solr,
                              dest_coll='doesntmatter',
                              date_field='ddddd')
        self.assertEqual(
            reindexer._get_query('cursor'), {
                'cursorMark': 'cursor',
                'rows': reindexer._rows,
                'q': '*:*',
                'sort': 'ddddd asc, id desc'
            })

    def test_remove_copy_fields_from_data(self):
        index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
        for dir in ['_todo_dir', '_done_dir']:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        excluded_fields = reindexer._ignore_fields
        for doc in from_files:
            for field in excluded_fields:
                if field in doc:
                    print(doc)
                    #self.assertTrue(field not in doc)

    def test_solr_to_solr(self):
        self._index_docs(50000, self.colls[0])
        reindexer = Reindexer(source=self.solr,
                              source_coll='source_coll',
                              dest=self.solr,
                              dest_coll='dest_coll')
        reindexer.reindex()
        self.assertEqual(
            sorted(self.solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs,
                   key=lambda x: x['id']),
            sorted(self.solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs,
                   key=lambda x: x['id']),
        )

    def test_solr_to_solr_with_date(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        reindexer.reindex()
        try:
            self.assertTrue(
                solr.transport._action_log[1]['params']['params']['sort'] ==
                'index_date asc, id desc')
        except KeyError:
            self.assertTrue(
                solr.transport._action_log[2]['params']['params']['sort'] ==
                'index_date asc, id desc')
        self.assertEqual(
            sorted(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs,
                   key=lambda x: x['id']),
            sorted(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs,
                   key=lambda x: x['id']),
        )

    def test_get_edge_date(self):
        '''
        Checks to make sure _get_edge_date returns the correct start and end
        dates.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        solr_end_date_string = reindexer._get_edge_date('date', 'desc')
        solr_start_date_string = reindexer._get_edge_date('date', 'asc')
        self.assertEqual(
            self._start_date.date(),
            datetime.datetime.strptime(solr_start_date_string,
                                       '%Y-%m-%dT%H:%M:%S.%fZ').date())
        self.assertEqual(
            self._end_date.date(),
            datetime.datetime.strptime(solr_end_date_string,
                                       '%Y-%m-%dT%H:%M:%S.%fZ').date())

    def test_get_date_range_query(self):
        '''
        Checks the date_range_query generation function. Since it's pretty
        simple, running all the tests as one.
        '''
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='index_date')
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10', '2015-12-11'), {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'index_date',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1DAY'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            date_field='date123'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'date123',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1DAY'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            date_field='date123',
                                            timespan='MONTH'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'date123',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1MONTH'
            })
        self.assertEqual(
            reindexer._get_date_range_query('2015-11-10',
                                            '2015-12-11',
                                            timespan='MONTH'),
            {
                'rows': 0,
                'facet.range.end': '2015-12-11',
                'facet': 'true',
                'facet.range': 'index_date',
                'facet.range.start': '2015-11-10',
                'q': '*:*',
                'facet.range.include': 'all',
                'facet.range.gap': '+1MONTH'
            })

    def test_get_date_facet_counts(self):
        '''
        Checks the date facet counts and makes sure the date ranges returned
        match what got indexed.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            'DAY', 'date', start_date=self._start_date.date().isoformat())
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(
                dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range],
                                                   self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_without_start_date(self):
        '''
        Same check as above, but without passing an explicit start date.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            'DAY', 'date')
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(
                dt_range, '%Y-%m-%dT%H:%M:%SZ').date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range],
                                                   self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_not_day(self):
        '''
        Makes sure a timespan other than DAY raises a ValueError.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Testing this one
        with self.assertRaises(ValueError):
            source_facet, dest_facet = reindexer._get_date_facet_counts(
                'MONTH', 'date')

    # These tests are focused on methods related to resuming re-indexing

    def test_solr_to_solr_resume_checkonly(self):
        '''
        Runs resume in check-only mode and makes sure nothing gets indexed.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          devel=True,
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Make sure only the source has data
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 50000)
        self.assertEqual(
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 0)
        reindexer.resume(check=True)
        # Make sure nothing got indexed
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 50000)
        self.assertEqual(
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 0)

    def test_solr_to_solr_resume_basic(self):
        '''
        Resumes a reindex and makes sure the counts match up afterwards.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Make sure only the source has data
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 50000)
        self.assertEqual(
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 0)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs),
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs))

    def test_solr_to_solr_reindex_and_resume(self):
        '''
        Only reindexes half of the collection on the first pass, then goes
        back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Make sure only the source has data
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 50000)
        self.assertEqual(
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now() - datetime.timedelta(
            days=((self._end_date - self._start_date).days / 2)))
        # Reindex approximately half of the data by restricting the fq
        reindexer.reindex(
            fq=['date:[* TO {}]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        s_count = len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs),
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs))

    def test_solr_to_solr_reindex_and_resume_reverse(self):
        '''
        Only reindexes half of the collection on the first pass, then goes
        back and does a resume to make sure it works.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll',
                              dest=solr,
                              dest_coll='dest_coll',
                              date_field='date')
        # Make sure only the source has data
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 50000)
        self.assertEqual(
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 0)
        # This gets somewhat of a midpoint date in the range.
        midpoint = (datetime.datetime.now() - datetime.timedelta(
            days=((self._end_date - self._start_date).days / 2)))
        # Reindex approximately half of the data by restricting the fq
        reindexer.reindex(
            fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        s_count = len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs)
        self.assertTrue(s_count > dest_count > s_count * .20)
        reindexer.resume()
        sleep(10)
        # Make sure counts match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs),
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs))

    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0],
                          auth=test_config['SOLR_CREDENTIALS'])
        # Make sure only the source has data
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 50000)
        self.assertEqual(
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 0)
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard1_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()
        reindexer = Reindexer(source=solr,
                              source_coll='source_coll_shard2_replica1',
                              dest=solr,
                              dest_coll=self.colls[1],
                              per_shard=True,
                              date_field='date')
        reindexer.reindex()
        self.solr.commit(self.colls[1], openSearcher=True)
        # Sloppy check over here, will improve later
        self.assertEqual(
            len(solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs),
            len(solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs))
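# A minimal usage sketch (not from the test suite above) of the Reindexer API
# these tests exercise: copy one collection into another, then resume if the
# first pass was interrupted. The host, collection names, and the import path
# from the SolrClient helpers package are assumptions.
from SolrClient import SolrClient
from SolrClient.helpers import Reindexer

solr = SolrClient('http://localhost:8983/solr')
reindexer = Reindexer(source=solr, source_coll='source_coll',
                      dest=solr, dest_coll='dest_coll',
                      date_field='date')
reindexer.reindex()  # full cursorMark-paged copy
reindexer.resume()   # compare per-day facet counts and fill in the gaps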
class ClientTestIndexing(unittest.TestCase):  # High level client tests

    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.schema.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.schema.create_field(test_config['SOLR_COLLECTION'],
                                              field)
            except:
                pass

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
        sleep(5)

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError) as cm:
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.assertTrue(1000 / 50 == queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50,
                                          max_start=502):
            self.assertTrue(len(res.docs) == 50)
            queries += 1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]
        for item in docs:
            self.assertTrue(item['id'] in ids)
        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
def handle(self, *args, **options):
    total = 0
    cycle = 0
    try:
        # Retrieve the Search and Field models from the database
        solr = SolrClient(settings.SOLR_SERVER_URL)
        try:
            self.search_target = Search.objects.get(
                search_id=options['search'])
            self.solr_core = self.search_target.solr_core_name
            self.all_fields = Field.objects.filter(
                search_id=self.search_target)
            if options['nothing_to_report']:
                self.search_fields = Field.objects.filter(
                    search_id=self.search_target,
                    alt_format='ALL') | Field.objects.filter(
                        search_id=self.search_target, alt_format='NTR')
            else:
                self.search_fields = Field.objects.filter(
                    search_id=self.search_target,
                    alt_format='ALL') | Field.objects.filter(
                        search_id=self.search_target, alt_format='')
            for search_field in self.search_fields:
                self.csv_fields[search_field.field_id] = search_field
                codes = Code.objects.filter(field_id=search_field)
                # Most csv_fields will not have codes, so the queryset will
                # be zero length
                if len(codes) > 0:
                    code_dict = {}
                    for code in codes:
                        code_dict[code.code_id.lower()] = code
                    self.field_codes[search_field.field_id] = code_dict
        except Search.DoesNotExist as x:
            self.logger.error('Search not found: "{0}"'.format(x))
            exit(-1)
        except Field.DoesNotExist as x1:
            self.logger.error(
                'Fields not found for search: "{0}"'.format(x1))

        # Process the records in the CSV file one at a time
        with open(options['csv'], 'r', encoding='utf-8-sig',
                  errors="ignore") as csv_file:
            csv_reader = csv.DictReader(csv_file, dialect='excel')
            solr_items = []
            for csv_record in csv_reader:
                # Clear out the Solr core on the first line
                if total == 0 and not options['nothing_to_report']:
                    solr.delete_doc_by_query(self.solr_core, "*:*")
                    print("Purging all records")
                elif total == 0 and options['nothing_to_report']:
                    solr.delete_doc_by_query(self.solr_core, "format:NTR")
                    solr.commit(self.solr_core, softCommit=True)
                    print("Purging NTR records")
                total += 1
                cycle += 1

                # Call plugins if they exist for this search type. This is
                # where a developer can introduce code to customize the data
                # that is loaded into Solr for a particular search.
                search_type_plugin = 'search.plugins.{0}'.format(
                    options['search'])
                if search_type_plugin in self.discovered_plugins:
                    include, filtered_record = self.discovered_plugins[
                        search_type_plugin].filter_csv_record(
                            csv_record, self.search_target, self.csv_fields,
                            self.field_codes,
                            'NTR' if options['nothing_to_report'] else '')
                    if not include:
                        continue
                    else:
                        csv_record = filtered_record

                # Create a dictionary for each record loaded into Solr
                solr_record = {
                    'format':
                    'NTR' if options['nothing_to_report'] else 'DEFAULT'
                }
                for csv_field in csv_reader.fieldnames:
                    # Verify that it is a known field
                    if csv_field not in self.csv_fields and csv_field not in (
                            'owner_org_title', 'owner_org'):
                        self.logger.error(
                            "CSV file contains unknown field: {0}".format(
                                csv_field))
                        exit(-1)
                    if csv_field == 'owner_org_title':
                        continue

                    # Handle multi-valued fields here
                    if self.csv_fields[csv_field].solr_field_multivalued:
                        solr_record[csv_field] = csv_record[csv_field].split(
                            ',')
                        # Copy fields for reports cannot use multi-values, so
                        # directly populate them with the original string
                        if self.csv_fields[csv_field].solr_field_export:
                            for extra_field in self.csv_fields[
                                    csv_field].solr_field_export.split(','):
                                solr_record[extra_field] = csv_record[
                                    csv_field]
                    else:
                        solr_record[csv_field] = csv_record[csv_field]

                    # Automatically expand out dates and numbers for use with
                    # the Solr export handler
                    if self.csv_fields[csv_field].solr_field_type == 'pdate':
                        try:
                            if csv_record[csv_field]:
                                csv_date = datetime.strptime(
                                    csv_record[csv_field], '%Y-%m-%d')
                                solr_record[csv_field + '_en'] = format_date(
                                    csv_date, locale='en')
                                solr_record[csv_field + '_fr'] = format_date(
                                    csv_date, locale='fr')
                                if self.csv_fields[csv_field].is_default_year:
                                    solr_record['year'] = csv_date.year
                                if self.csv_fields[
                                        csv_field].is_default_month:
                                    solr_record['month'] = csv_date.month
                            else:
                                solr_record[csv_field + '_en'] = ''
                                solr_record[csv_field + '_fr'] = ''
                        except ValueError as x2:
                            self.logger.error(
                                'Invalid date: "{0}"'.format(x2))
                            solr_record[csv_field] = ''
                            continue
                    elif self.csv_fields[csv_field].solr_field_type in [
                            'pint', 'pfloat'
                    ]:
                        if solr_record[csv_field]:
                            if solr_record[csv_field] == '.':
                                solr_record[csv_field] = "0"
                            csv_decimal = parse_decimal(
                                solr_record[csv_field], locale='en_US')
                            if self.csv_fields[
                                    csv_field].solr_field_is_currency:
                                solr_record[csv_field +
                                            '_en'] = format_currency(
                                                csv_decimal,
                                                'CAD',
                                                locale='en_CA')
                                solr_record[csv_field +
                                            '_fr'] = format_currency(
                                                csv_decimal,
                                                'CAD',
                                                locale='fr_CA')
                            else:
                                solr_record[csv_field +
                                            '_en'] = format_decimal(
                                                csv_decimal, locale='en_CA')
                                solr_record[csv_field +
                                            '_fr'] = format_decimal(
                                                csv_decimal, locale='fr_CA')
                        else:
                            solr_record[csv_field + '_en'] = ''
                            solr_record[csv_field + '_fr'] = ''

                    # Look up the expanded code value from the codes dict of
                    # dicts
                    if csv_field in self.field_codes:
                        if csv_record[csv_field]:
                            if self.csv_fields[
                                    csv_field].solr_field_multivalued:
                                codes_en = []
                                codes_fr = []
                                for code_value in csv_record[
                                        csv_field].split(","):
                                    if code_value.lower(
                                    ) in self.field_codes[csv_field]:
                                        codes_en.append(
                                            self.field_codes[csv_field][
                                                code_value.lower()].label_en)
                                        codes_fr.append(
                                            self.field_codes[csv_field][
                                                code_value.lower()].label_fr)
                                    else:
                                        self.logger.info(
                                            "Unknown code value: {0} for field: {1}"
                                            .format(code_value, csv_field))
                                solr_record[csv_field + '_en'] = codes_en
                                solr_record[csv_field + '_fr'] = codes_fr
                            else:
                                if csv_record[csv_field].lower(
                                ) in self.field_codes[csv_field]:
                                    solr_record[
                                        csv_field +
                                        '_en'] = self.field_codes[csv_field][
                                            csv_record[csv_field].lower()].label_en
                                    solr_record[
                                        csv_field +
                                        '_fr'] = self.field_codes[csv_field][
                                            csv_record[csv_field].lower()].label_fr
                                else:
                                    self.logger.info(
                                        "Unknown code value: {0} for field: {1}"
                                        .format(csv_record[csv_field],
                                                csv_field))

                solr_record = self.set_empty_fields(solr_record)

                # Set the Solr ID field (Nothing To Report records are
                # excluded)
                if not options['nothing_to_report']:
                    if self.search_target.id_fields:
                        id_values = []
                        for id_field in self.search_target.id_fields.split(
                                ","):
                            id_values.append(csv_record[id_field])
                        solr_record['id'] = ",".join(id_values)
                else:
                    if 'month' in solr_record:
                        solr_record['id'] = "{0}-{1}-{2}".format(
                            solr_record['owner_org'], solr_record['year'],
                            solr_record['month'])
                    elif 'quarter' in solr_record:
                        solr_record['id'] = "{0}-{1}-{2}".format(
                            solr_record['owner_org'], solr_record['year'],
                            solr_record['quarter'])

                # Call plugins if they exist for this search type. This is
                # where a developer can introduce code to customize the data
                # that is loaded into Solr for a particular search.
                if search_type_plugin in self.discovered_plugins:
                    solr_record = self.discovered_plugins[
                        search_type_plugin].load_csv_record(
                            csv_record, solr_record, self.search_target,
                            self.csv_fields, self.field_codes,
                            'NTR' if options['nothing_to_report'] else '')

                solr_items.append(solr_record)

                # Write to Solr whenever the cycle threshold is reached
                if cycle >= self.cycle_on:
                    # Try to connect to Solr up to 10 times
                    for countdown in reversed(range(10)):
                        try:
                            solr.index(self.solr_core, solr_items)
                            print("{0} rows processed".format(total))
                            cycle = 0
                            solr_items.clear()
                            break
                        except ConnectionError as cex:
                            if not countdown:
                                raise
                            print(
                                "Solr error: {0}. Waiting to try again ... {1}"
                                .format(cex, countdown))
                            time.sleep((10 - countdown) * 5)

            # Write any remaining records to Solr and commit
            if cycle > 0:
                # Try to connect to Solr up to 10 times
                for countdown in reversed(range(10)):
                    try:
                        solr.index(self.solr_core, solr_items)
                        total += len(solr_items)
                        print("{0} rows processed".format(cycle))
                        cycle = 0
                        solr_items.clear()
                        break
                    except ConnectionError as cex:
                        if not countdown:
                            raise
                        print(
                            "Solr error: {0}. Waiting to try again ... {1}"
                            .format(cex, countdown))
                        time.sleep((10 - countdown) * 5)

            solr.commit(self.solr_core, softCommit=True, waitSearcher=True)
            print("Total rows processed: {0}".format(total))
    except Exception as x:
        self.logger.error('Unexpected Error "{0}"'.format(x))
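# The two retry loops above share the same pattern. A sketch (hypothetical
# helper, not part of the original command) of how it could be factored out;
# 'ConnectionError' stands in for whatever connection exception the Solr
# client raises.
import time


def index_with_retries(solr, core, items, attempts=10):
    # Try to index a batch, waiting progressively longer between attempts
    for countdown in reversed(range(attempts)):
        try:
            solr.index(core, items)
            return
        except ConnectionError as cex:
            if not countdown:
                raise  # out of attempts; surface the error to the caller
            print("Solr error: {0}. Waiting to try again ... {1}".format(
                cex, countdown))
            time.sleep((attempts - countdown) * 5)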
import os
import json

import requests

from SolrClient import SolrClient

CC_LINKS_FILES_DIRECTORIES = []
SOLR_INSTANCE_URL = ""
SOLR_CORE = ""

solr_client = SolrClient(SOLR_INSTANCE_URL)


def get_url_content(url):
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36')
    headers = {'User-Agent': user_agent}
    resp = requests.get(url, headers=headers)
    return resp.text


for directory in CC_LINKS_FILES_DIRECTORIES:
    files = [
        f for f in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, f))
    ]
    for file in files:
        docs = []
        with open(os.path.join(directory, file), 'r') as f:
            # Each line of the file is a JSON object containing a "url" key
            for line in f:
                json_obj = json.loads(line)
                url = json_obj["url"]
                text = get_url_content(url)
                docs.append({"file_name": file, "html": text})
        solr_client.index(SOLR_CORE, docs)
        solr_client.commit(SOLR_CORE, openSearcher=True)
class SOLRDocumentManager(IDocumentManager):

    def __init__(self, server_address: str, index_name: str) -> None:
        self.client = SolrClient(server_address)
        self.index = index_name
        self._serializer = json_serializer()

    def add(self, document: IndexDocument) -> BaseResponse:
        response = BaseResponse()
        try:
            document.id = document.unique_id
            doc_body = self._serializer.serialize([document])
            solr_response = self.client.index_json(self.index, doc_body)
            if not solr_response:
                return response.set_error(
                    Error("IntegrationError", 500,
                          "Index failed to add index!"))
            self.client.commit(self.index,
                               openSearcher=True,
                               waitSearcher=False)
            response = BaseResponse(True)
        except BasicException as e:
            response.set_error(Error("InternalServerError", 500, e.message))
        except Exception as e:
            response.set_error(
                Error("InternalServerError", 500, 'Unknown error occurred!'))
        return response

    def delete(self, unique_id: str) -> BaseResponse:
        response = BaseResponse()
        try:
            solr_response = self.client.delete_doc_by_id(
                self.index, unique_id)
            if not solr_response:
                return response.set_error(
                    Error("IntegrationError", 500,
                          "Index failed to delete index!"))
            self.client.commit(self.index,
                               openSearcher=True,
                               waitSearcher=False)
            response = BaseResponse(True)
        except BasicException as e:
            response.set_error(Error("InternalServerError", 500, e.message))
        except Exception as e:
            print(e)
            response.set_error(
                Error("InternalServerError", 500, 'Unknown error occurred!'))
        return response

    def get(self, unique_id: str) -> DocumentResponse:
        pass

    def search(self, query: SearchQuery) -> SearchResult:
        solr_query = ""
        solr_field_query = ""
        solr_range_query = []
        for criteria in query.searchCriteria:
            solr_field_query += criteria.field + '^' + str(
                criteria.weight) + " "
            words = criteria.term.split(" ")
            for word in words:
                word = word.lower()
                solr_query += " " + word
        for range_criteria in query.rangeCriteria:
            solr_range_query.append(range_criteria.field + ":[" +
                                    str(range_criteria.minimum) + " TO " +
                                    str(range_criteria.maximum) + "]")
        data = {
            "q": solr_query.strip(),
            "offset": query.page * query.items,
            "limit": query.items,
            "filter": solr_range_query,
            "defType": "edismax",
            "qf": solr_field_query
        }
        result = SearchResult(0, False)
        try:
            response = self.client.query_raw(self.index, data)
            result = SearchResult(response['response']['numFound'], True)
            for document in response['response']['docs']:
                result.add_result(
                    self._serializer.deserialize(document,
                                                 self.index_object_type))
        except Exception as e:
            result.set_error(
                Error("InternalServerError", 500, 'Unknown error occurred!'))
        return result
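# For illustration only (the values below are made up): a search for the
# term "solar panel" weighted on a 'title' field, with one 10..100 range
# criterion, first page of 10 items, would yield a request body shaped like
# the one search() builds above:
example_params = {
    "q": "solar panel",               # lower-cased terms joined together
    "offset": 0,                      # page * items
    "limit": 10,                      # items per page
    "filter": ["price:[10 TO 100]"],  # one entry per range criterion
    "defType": "edismax",
    "qf": "title^2.0 "                # field^weight pairs
}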
class ClientTestIndexing(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                               devel=True,
                               auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.schema.delete_copy_field(
                    test_config['SOLR_COLLECTION'], field)
            except Exception as e:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.schema.create_field(test_config['SOLR_COLLECTION'],
                                              field)
            except Exception as e:
                pass

    def setUp(self):
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
        self.commit()

    def commit(self):
        # softCommit because we don't care about data on disk
        self.solr.commit(test_config['SOLR_COLLECTION'],
                         openSearcher=True,
                         softCommit=True)

    def test_down_solr_exception(self):
        # connect to a "down" Solr host (credentials are masked in the
        # source and the rest of this test body is missing)
        s = SolrClient('http://*****:*****')

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'], devel=True)
        with self.assertRaises(ConnectionError) as cm:
            solr.query('SolrClient_unittest', {'q': 'not_gonna_happen'})

    def test_indexing_json(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        self.delete_docs()
        self.commit()

    def test_get(self):
        doc_id = '1'
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{'id': doc_id}]))
        # this returns the doc!
        self.solr.get(test_config['SOLR_COLLECTION'], doc_id)
        with self.assertRaises(NotFoundError):
            self.solr.get(test_config['SOLR_COLLECTION'], '5')

    def test_mget(self):
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{'id': '1'}]))
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps([{'id': '5'}]))
        docs = self.solr.mget(test_config['SOLR_COLLECTION'], ('5', '1'))
        self.assertEqual(len(docs), 2)

    def test_indexing_conn_log(self):
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],
                             json.dumps(self.docs))
        self.commit()
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(
                self.solr.query(test_config['SOLR_COLLECTION'], {
                    'q': 'id:{}'.format(doc['id'])
                }).get_num_found(), 1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    def test_index_json_file(self):
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_stream_file_gzip_file(self):
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_local_index_json_file(self):
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json', 'w') as f:
            json.dump(self.docs, f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],
                                  'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'], {'q': '*:*'})
        self.assertEqual(r.get_num_found(), len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_rows(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries += 1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.assertTrue(1000 / 50 == queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries += 1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key=lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key=lambda x: x['id'])])
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_max(self):
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],
                                          {'q': '*:*'},
                                          rows=50,
                                          max_start=502):
            self.assertTrue(len(res.docs) == 50)
            queries += 1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]
        for item in docs:
            self.assertTrue(item['id'] in ids)
        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_cursor_query(self):
        self.docs = self.rand_docs.get_docs(2000)
        with gzip.open('temp_file.json.gz', 'wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],
                                  'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.cursor_query(test_config['SOLR_COLLECTION'], {
                'q': '*:*',
                'rows': 100
        }):
            self.assertTrue(len(res.docs) == 100)
            queries += 1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]
        for item in docs:
            self.assertTrue(item['id'] in ids)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
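# For contrast with paging_query above, a minimal cursor_query sketch
# (host and collection name are placeholders): cursorMark-based paging
# streams the full result set in fixed-size chunks without deep-paging cost.
solr = SolrClient('http://localhost:8983/solr')
all_docs = []
for res in solr.cursor_query('my_collection', {'q': '*:*', 'rows': 100}):
    all_docs.extend(res.docs)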