def test_solr_to_solr_reindex_and_resume_reverse(self):
    """
    Only reindexes half of the collection on the first time. Then goes
    back and does a resume to make sure it works.
    """
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])
    reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                          dest_coll='dest_coll', date_field='date')
    # Make sure only source has data
    self.assertEqual(len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs), 0)
    # This gets somewhat of a mid point date in the range.
    midpoint = (datetime.datetime.now()
                - datetime.timedelta(days=((self._end_date - self._start_date).days / 2)))
    # Reindex approximately half of the data by restricting FQ
    reindexer.reindex(fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
    # Give Solr time to commit/open a searcher before counting; without this
    # the count checks below are flaky (the sibling variant of this test
    # already pauses here).
    sleep(10)
    # Make sure we have at least 20% of the data.
    dest_count = len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs)
    s_count = len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs)
    self.assertTrue(s_count > dest_count > s_count * .20)
    reindexer.resume()
    sleep(10)
    # Make sure counts match up after reindex
    self.assertEqual(
        len(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs),
        len(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs))
def test_solr_to_solr_with_date(self):
    """Reindex with a date_field and verify the cursor sort order and the data."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
    reindexer = Reindexer(source=solr, source_coll='source_coll', dest=solr,
                          dest_coll='dest_coll', date_field='index_date')
    reindexer.reindex()
    # The sort param may appear in a different slot of the action log
    # depending on how many requests preceded it; check both.
    try:
        self.assertTrue(solr.transport._action_log[1]['params']['params']['sort'] == 'index_date asc, id desc')
    except KeyError:
        self.assertTrue(solr.transport._action_log[2]['params']['params']['sort'] == 'index_date asc, id desc')
    # BUG FIX: list.sort() returns None, so the original assertion compared
    # None == None and could never fail. Use sorted() to compare real lists.
    self.assertEqual(
        sorted(solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs, key=lambda x: x['id']),
        sorted(solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs, key=lambda x: x['id']),
    )
def test_solr_to_solr_reindex_and_resume_reverse(self):
    """
    Reindex only the newer half of the source collection, then resume()
    and verify the destination catches up to the full document count.
    """
    self._index_docs(50000, self.colls[0])
    client = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(
        source=client, source_coll="source_coll", dest=client, dest_coll="dest_coll", date_field="date"
    )

    def doc_count(coll):
        # Total docs currently visible in a collection.
        return len(client.query(coll, {"q": "*:*", "rows": 10000000}).docs)

    # Precondition: all data in source, destination empty.
    self.assertEqual(doc_count(self.colls[0]), 50000)
    self.assertEqual(doc_count(self.colls[1]), 0)
    # Pick a date roughly in the middle of the indexed range.
    half_span_days = (self._end_date - self._start_date).days / 2
    midpoint = datetime.datetime.now() - datetime.timedelta(days=half_span_days)
    # First pass: restrict by FQ to docs on or after the midpoint.
    reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")])
    sleep(10)
    dest_total = doc_count(self.colls[1])
    src_total = doc_count(self.colls[0])
    # The partial pass should have moved a meaningful fraction, but not everything.
    self.assertTrue(src_total > dest_total > src_total * 0.20)
    reindexer.resume()
    sleep(10)
    # After resume the two collections should hold the same number of docs.
    self.assertEqual(doc_count(self.colls[0]), doc_count(self.colls[1]))
def test_solr_to_solr(self):
    """Basic Solr-to-Solr reindex: destination must end up with the same docs."""
    self._index_docs(50000, self.colls[0])
    reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=self.solr, dest_coll="dest_coll")
    reindexer.reindex()
    # BUG FIX: the original used deprecated assertEquals and compared the
    # results of list.sort(), which returns None — so it always passed.
    # sorted() returns the sorted lists so the comparison is meaningful.
    self.assertEqual(
        sorted(self.solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
        sorted(self.solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
    )
def test_solr_to_solr(self):
    """Basic Solr-to-Solr reindex: destination must end up with the same docs."""
    self._index_docs(50000, self.colls[0])
    reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=self.solr, dest_coll='dest_coll')
    reindexer.reindex()
    # BUG FIX: list.sort() returns None, so the original assertion compared
    # None == None and could never fail. sorted() returns the sorted lists.
    self.assertEqual(
        sorted(self.solr.query(self.colls[0], {'q': '*:*', 'rows': 10000000}).docs, key=lambda x: x['id']),
        sorted(self.solr.query(self.colls[1], {'q': '*:*', 'rows': 10000000}).docs, key=lambda x: x['id']),
    )
def test_remove_copy_fields_from_data(self):
    """Docs exported to an IndexQ must not contain the reindexer's ignored (copy) fields."""
    index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
    # Clear any leftover queue files from earlier runs. ('dir' renamed to
    # avoid shadowing the builtin; side-effect loop instead of a throwaway
    # list comprehension.)
    for queue_dir in ["_todo_dir", "_done_dir"]:
        for path in index.get_all_as_list(dir=queue_dir):
            os.remove(path)
    reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
    reindexer.reindex()
    from_files = self.get_all_json_from_indexq(index)
    excluded_fields = reindexer._ignore_fields
    for doc in from_files:
        for field in excluded_fields:
            # BUG FIX: the original only print()ed offending docs, so this
            # test could never fail. Assert so regressions are caught.
            self.assertNotIn(field, doc)
def test_remove_copy_fields_from_data(self):
    """Docs exported to an IndexQ must not contain the reindexer's ignored (copy) fields."""
    index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
    # Clear any leftover queue files from earlier runs. ('dir' renamed to
    # avoid shadowing the builtin; side-effect loop instead of a throwaway
    # list comprehension.)
    for queue_dir in ['_todo_dir', '_done_dir']:
        for path in index.get_all_as_list(dir=queue_dir):
            os.remove(path)
    reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index)
    reindexer.reindex()
    from_files = self.get_all_json_from_indexq(index)
    excluded_fields = reindexer._ignore_fields
    for doc in from_files:
        for field in excluded_fields:
            # BUG FIX: the original only print()ed offending docs, so this
            # test could never fail. Assert so regressions are caught.
            self.assertNotIn(field, doc)
def test_solr_to_indexq(self):
    """
    Export documents from Solr into an IndexQ and verify the queued files
    match what Solr returns (after trimming ignored fields).
    """
    index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
    # Start from an empty queue.
    for queue_dir in ["_todo_dir", "_done_dir"]:
        for path in index.get_all_as_list(dir=queue_dir):
            os.remove(path)
    self._index_docs(5000, self.colls[0])
    reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
    reindexer.reindex()
    queued_docs = self.get_all_json_from_indexq(index)
    solr_docs = reindexer._trim_fields(
        self.solr.query("source_coll", {"q": "*:*", "rows": 5000}).docs
    )
    self.assertEqual(
        sorted(queued_docs, key=lambda doc: doc["id"]),
        sorted(solr_docs, key=lambda doc: doc["id"]),
    )
def test_solr_to_indexq(self):
    """
    Export documents from Solr into an IndexQ and verify the queued files
    match what Solr returns (after trimming ignored fields).
    """
    index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
    # Start from an empty queue.
    for queue_dir in ['_todo_dir', '_done_dir']:
        for path in index.get_all_as_list(dir=queue_dir):
            os.remove(path)
    self._index_docs(5000, self.colls[0])
    reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index)
    reindexer.reindex()
    queued_docs = self.get_all_json_from_indexq(index)
    solr_docs = reindexer._trim_fields(
        self.solr.query('source_coll', {'q': '*:*', 'rows': 5000}).docs
    )
    self.assertEqual(
        sorted(queued_docs, key=lambda doc: doc['id']),
        sorted(solr_docs, key=lambda doc: doc['id']),
    )
def test_solr_to_solr_reindexer_per_shard(self):
    """Reindex with per_shard=True and verify destination doc counts match the source."""
    self._index_docs(50000, self.colls[0])
    client = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(
        source=client, source_coll="source_coll", dest=client,
        dest_coll="dest_coll", per_shard=True, date_field="date"
    )

    def doc_count(coll):
        # Total docs currently visible in a collection.
        return len(client.query(coll, {"q": "*:*", "rows": 10000000}).docs)

    # Precondition: source is fully populated, destination is empty.
    self.assertEqual(doc_count(self.colls[0]), 50000)
    self.assertEqual(doc_count(self.colls[1]), 0)
    reindexer.reindex()
    # sloppy check over here, will improve later
    self.assertEqual(doc_count(self.colls[0]), doc_count(self.colls[1]))
def test_solr_to_solr_with_date(self):
    """Reindex with a date_field and verify the cursor sort order and the data."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(
        source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
    )
    reindexer.reindex()
    # The sort param may appear in a different slot of the action log
    # depending on how many requests preceded it; check both.
    try:
        self.assertTrue(solr.transport._action_log[1]["params"]["params"]["sort"] == "index_date asc, id desc")
    except KeyError:
        self.assertTrue(solr.transport._action_log[2]["params"]["params"]["sort"] == "index_date asc, id desc")
    # BUG FIX: list.sort() returns None, so the original assertion compared
    # None == None and could never fail. Use sorted() to compare real lists.
    self.assertEqual(
        sorted(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
        sorted(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
    )
def test_solr_to_solr_reindexer_per_shard(self):
    """Reindex each shard replica individually, then verify the totals match."""
    self._index_docs(50000, self.colls[0])
    client = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])

    def doc_count(coll):
        # Total docs currently visible in a collection.
        return len(client.query(coll, {'q': '*:*', 'rows': 10000000}).docs)

    # Make sure only source has data
    self.assertEqual(doc_count(self.colls[0]), 50000)
    self.assertEqual(doc_count(self.colls[1]), 0)
    # Run one per-shard reindexer against each replica core in turn.
    for shard_coll in ('source_coll_shard1_replica1', 'source_coll_shard2_replica1'):
        shard_reindexer = Reindexer(source=client, source_coll=shard_coll, dest=client,
                                    dest_coll=self.colls[1], per_shard=True, date_field='date')
        shard_reindexer.reindex()
    self.solr.commit(self.colls[1], openSearcher=True)
    # sloppy check over here, will improve later
    self.assertEqual(doc_count(self.colls[0]), doc_count(self.colls[1]))