Example #1
0
 def test_solr_to_solr_reindex_and_resume_reverse(self):
     '''
     Reindex only part of the source collection by restricting the first run with a
     date filter query, then call resume() and check that the destination collection
     ends up holding as many documents as the source.
     '''
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])

     def doc_count(collection):
         # Ask for every document and count what comes back.
         return len(solr.query(collection, {'q': '*:*', 'rows': 10000000}).docs)

     reindexer = Reindexer(
         source=solr,
         source_coll='source_coll',
         dest=solr,
         dest_coll='dest_coll',
         date_field='date',
     )

     # Before any reindexing, every document lives in the source collection.
     self.assertEqual(doc_count(self.colls[0]), 50000)
     self.assertEqual(doc_count(self.colls[1]), 0)

     # Approximate midpoint: now minus half the number of days between
     # self._start_date and self._end_date.
     full_range = self._end_date - self._start_date
     midpoint = datetime.datetime.now() - datetime.timedelta(days=(full_range.days / 2))

     # First pass: only documents dated at or after the midpoint.
     date_filter = 'date:[{} TO *]'.format(midpoint.isoformat() + 'Z')
     reindexer.reindex(fq=[date_filter])

     # The destination must hold more than 20% of the source, but not all of it.
     copied = doc_count(self.colls[1])
     total = doc_count(self.colls[0])
     self.assertTrue(total > copied > total * .20)

     # After resuming, both collections must report the same document count.
     reindexer.resume()
     self.assertEqual(doc_count(self.colls[0]), doc_count(self.colls[1]))
Example #2
0
 def test_solr_to_solr_with_date(self):
     '''
     Reindex with date_field='index_date' and verify both the sort clause sent to
     Solr and that the destination collection holds the same document ids as the
     source collection.
     '''
     self._index_docs(50000, self.colls[0])
     # NOTE(review): devel=True presumably enables transport._action_log, which
     # the sort check below reads - confirm against SolrClient.
     solr = SolrClient(test_config['SOLR_SERVER'][0],
                       devel=True,
                       auth=test_config['SOLR_CREDENTIALS'])
     reindexer = Reindexer(source=solr,
                           source_coll='source_coll',
                           dest=solr,
                           dest_coll='dest_coll',
                           date_field='index_date')
     reindexer.reindex()

     # The sorted query is expected in action-log entry 1; if that entry has no
     # such parameters (KeyError), entry 2 is checked instead.
     expected_sort = 'index_date asc, id desc'
     try:
         self.assertEqual(
             solr.transport._action_log[1]['params']['params']['sort'],
             expected_sort)
     except KeyError:
         self.assertEqual(
             solr.transport._action_log[2]['params']['params']['sort'],
             expected_sort)

     def sorted_ids(collection):
         # list.sort() returns None, so comparing `docs.sort(...)` results would
         # compare None with None and always pass; build real sorted values.
         docs = solr.query(collection, {'q': '*:*', 'rows': 10000000}).docs
         return sorted(doc['id'] for doc in docs)

     # NOTE(review): ids (the key the original sort used) are compared rather
     # than whole documents, because whole-document equality may be too strict
     # if Solr adds per-collection fields such as _version_ - confirm before
     # tightening this check.
     self.assertEqual(sorted_ids(self.colls[0]), sorted_ids(self.colls[1]))
Example #3
0
 def test_solr_to_solr_reindex_and_resume_reverse(self):
     """
     Reindex only part of the source collection by restricting the first run with a
     date filter query, then call resume() and check that the destination collection
     ends up holding as many documents as the source.
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])

     def doc_count(collection):
         # Ask for every document and count what comes back.
         return len(solr.query(collection, {"q": "*:*", "rows": 10000000}).docs)

     reindexer = Reindexer(
         source=solr,
         source_coll="source_coll",
         dest=solr,
         dest_coll="dest_coll",
         date_field="date",
     )

     # Before any reindexing, every document lives in the source collection.
     self.assertEqual(doc_count(self.colls[0]), 50000)
     self.assertEqual(doc_count(self.colls[1]), 0)

     # Approximate midpoint: now minus half the number of days between
     # self._start_date and self._end_date.
     full_range = self._end_date - self._start_date
     midpoint = datetime.datetime.now() - datetime.timedelta(days=(full_range.days / 2))

     # First pass: only documents dated at or after the midpoint.
     date_filter = "date:[{} TO *]".format(midpoint.isoformat() + "Z")
     reindexer.reindex(fq=[date_filter])
     # NOTE(review): the pause presumably lets the destination become searchable
     # before it is counted - confirm.
     sleep(10)

     # The destination must hold more than 20% of the source, but not all of it.
     copied = doc_count(self.colls[1])
     total = doc_count(self.colls[0])
     self.assertTrue(total > copied > total * 0.20)

     # After resuming (and pausing again), both collections must report the
     # same document count.
     reindexer.resume()
     sleep(10)
     self.assertEqual(doc_count(self.colls[0]), doc_count(self.colls[1]))
Example #4
0
 def test_solr_to_solr(self):
     """
     Reindex the source collection into the destination collection and verify
     that both collections hold the same document ids.
     """
     self._index_docs(50000, self.colls[0])
     reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=self.solr, dest_coll="dest_coll")
     reindexer.reindex()

     def sorted_ids(collection):
         # list.sort() returns None, so comparing `docs.sort(...)` results would
         # compare None with None and always pass; build real sorted values.
         docs = self.solr.query(collection, {"q": "*:*", "rows": 10000000}).docs
         return sorted(doc["id"] for doc in docs)

     # assertEqual replaces the deprecated assertEquals alias.
     # NOTE(review): ids (the key the original sort used) are compared rather
     # than whole documents, because whole-document equality may be too strict
     # if Solr adds per-collection fields such as _version_ - confirm before
     # tightening this check.
     self.assertEqual(sorted_ids(self.colls[0]), sorted_ids(self.colls[1]))
Example #5
0
 def test_solr_to_solr(self):
     '''
     Reindex the source collection into the destination collection and verify
     that both collections hold the same document ids.
     '''
     self._index_docs(50000, self.colls[0])
     reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=self.solr, dest_coll='dest_coll')
     reindexer.reindex()

     def sorted_ids(collection):
         # list.sort() returns None, so comparing `docs.sort(...)` results would
         # compare None with None and always pass; build real sorted values.
         docs = self.solr.query(collection, {'q': '*:*', 'rows': 10000000}).docs
         return sorted(doc['id'] for doc in docs)

     # NOTE(review): ids (the key the original sort used) are compared rather
     # than whole documents, because whole-document equality may be too strict
     # if Solr adds per-collection fields such as _version_ - confirm before
     # tightening this check.
     self.assertEqual(sorted_ids(self.colls[0]), sorted_ids(self.colls[1]))
Example #6
0
 def test_remove_copy_fields_from_data(self):
     """
     Reindex the source collection into an IndexQ and verify that none of the
     reindexer's ignored fields appear in the documents written to the queue.
     """
     index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
     # Remove every file the queue lists for its todo and done directories so
     # the run starts from an empty queue.
     for queue_dir in ["_todo_dir", "_done_dir"]:
         for path in index.get_all_as_list(dir=queue_dir):
             os.remove(path)
     reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
     reindexer.reindex()
     from_files = self.get_all_json_from_indexq(index)
     excluded_fields = reindexer._ignore_fields
     # Fail on the first ignored field that is still present; the original
     # only printed the offending document, so the test could never fail.
     for doc in from_files:
         for field in excluded_fields:
             self.assertNotIn(field, doc)
Example #7
0
 def test_remove_copy_fields_from_data(self):
     '''
     Reindex the source collection into an IndexQ and verify that none of the
     reindexer's ignored fields appear in the documents written to the queue.
     '''
     index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
     # Remove every file the queue lists for its todo and done directories so
     # the run starts from an empty queue.
     for queue_dir in ['_todo_dir', '_done_dir']:
         for path in index.get_all_as_list(dir=queue_dir):
             os.remove(path)
     reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index)
     reindexer.reindex()
     from_files = self.get_all_json_from_indexq(index)
     excluded_fields = reindexer._ignore_fields
     # Fail on the first ignored field that is still present; the original
     # only printed the offending document, so the test could never fail.
     for doc in from_files:
         for field in excluded_fields:
             self.assertNotIn(field, doc)
Example #8
0
 def test_solr_to_indexq(self):
     """
     Reindex a Solr collection into an IndexQ and check that the queued documents
     equal the field-trimmed documents queried back from Solr.
     """
     index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
     # Remove every file the queue lists for its todo and done directories.
     for queue_dir in ("_todo_dir", "_done_dir"):
         for stale_path in index.get_all_as_list(dir=queue_dir):
             os.remove(stale_path)

     self._index_docs(5000, self.colls[0])
     reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
     reindexer.reindex()

     def by_id(doc):
         # Sort key so both sides are compared in the same order.
         return doc["id"]

     from_files = self.get_all_json_from_indexq(index)
     solr_docs = self.solr.query("source_coll", {"q": "*:*", "rows": 5000}).docs
     solr_docs = reindexer._trim_fields(solr_docs)
     self.assertEqual(sorted(from_files, key=by_id), sorted(solr_docs, key=by_id))
Example #9
0
 def test_solr_to_indexq(self):
     '''
     Reindex a Solr collection into an IndexQ and check that the queued documents
     equal the field-trimmed documents queried back from Solr.
     '''
     index = IndexQ(test_config['indexqbase'], 'test_reindexer', size=0)
     # Remove every file the queue lists for its todo and done directories.
     for queue_dir in ('_todo_dir', '_done_dir'):
         for stale_path in index.get_all_as_list(dir=queue_dir):
             os.remove(stale_path)

     self._index_docs(5000, self.colls[0])
     reindexer = Reindexer(source=self.solr, source_coll='source_coll', dest=index)
     reindexer.reindex()

     def by_id(doc):
         # Sort key so both sides are compared in the same order.
         return doc['id']

     from_files = self.get_all_json_from_indexq(index)
     solr_docs = self.solr.query('source_coll', {'q': '*:*', 'rows': 5000}).docs
     solr_docs = reindexer._trim_fields(solr_docs)
     self.assertEqual(sorted(from_files, key=by_id), sorted(solr_docs, key=by_id))
Example #10
0
 def test_solr_to_solr_reindexer_per_shard(self):
     """
     Reindex with per_shard=True and check that the destination collection ends
     up with the same number of documents as the source collection.
     """
     self._index_docs(50000, self.colls[0])
     solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])

     def doc_count(collection):
         # Ask for every document and count what comes back.
         return len(solr.query(collection, {"q": "*:*", "rows": 10000000}).docs)

     reindexer = Reindexer(
         source=solr,
         source_coll="source_coll",
         dest=solr,
         dest_coll="dest_coll",
         per_shard=True,
         date_field="date",
     )

     # Before reindexing, every document lives in the source collection.
     self.assertEqual(doc_count(self.colls[0]), 50000)
     self.assertEqual(doc_count(self.colls[1]), 0)

     reindexer.reindex()

     # Count-only comparison; document contents are not checked here.
     self.assertEqual(doc_count(self.colls[0]), doc_count(self.colls[1]))
Example #11
0
 def test_solr_to_solr_with_date(self):
     """
     Reindex with date_field="index_date" and verify both the sort clause sent to
     Solr and that the destination collection holds the same document ids as the
     source collection.
     """
     self._index_docs(50000, self.colls[0])
     # NOTE(review): devel=True presumably enables transport._action_log, which
     # the sort check below reads - confirm against SolrClient.
     solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
     reindexer = Reindexer(
         source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
     )
     reindexer.reindex()

     # The sorted query is expected in action-log entry 1; if that entry has no
     # such parameters (KeyError), entry 2 is checked instead.
     expected_sort = "index_date asc, id desc"
     try:
         self.assertEqual(solr.transport._action_log[1]["params"]["params"]["sort"], expected_sort)
     except KeyError:
         self.assertEqual(solr.transport._action_log[2]["params"]["params"]["sort"], expected_sort)

     def sorted_ids(collection):
         # list.sort() returns None, so comparing `docs.sort(...)` results would
         # compare None with None and always pass; build real sorted values.
         docs = solr.query(collection, {"q": "*:*", "rows": 10000000}).docs
         return sorted(doc["id"] for doc in docs)

     # NOTE(review): ids (the key the original sort used) are compared rather
     # than whole documents, because whole-document equality may be too strict
     # if Solr adds per-collection fields such as _version_ - confirm before
     # tightening this check.
     self.assertEqual(sorted_ids(self.colls[0]), sorted_ids(self.colls[1]))
Example #12
0
    def test_solr_to_solr_reindexer_per_shard(self):
        '''
        Reindex two named source shard replicas, one after the other, into the
        destination collection with per_shard=True, commit, and check that source
        and destination hold the same number of documents.
        '''
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config['SOLR_SERVER'][0], auth=test_config['SOLR_CREDENTIALS'])

        def doc_count(collection):
            # Ask for every document and count what comes back.
            return len(solr.query(collection, {'q': '*:*', 'rows': 10000000}).docs)

        # Before reindexing, every document lives in the source collection.
        self.assertEqual(doc_count(self.colls[0]), 50000)
        self.assertEqual(doc_count(self.colls[1]), 0)

        # One Reindexer per shard replica, run in the same order as listed.
        for shard_core in ('source_coll_shard1_replica1', 'source_coll_shard2_replica1'):
            reindexer = Reindexer(
                source=solr,
                source_coll=shard_core,
                dest=solr,
                dest_coll=self.colls[1],
                per_shard=True,
                date_field='date',
            )
            reindexer.reindex()

        # Commit on the destination before counting its documents.
        self.solr.commit(self.colls[1], openSearcher=True)

        # Count-only comparison; document contents are not checked here.
        self.assertEqual(doc_count(self.colls[0]), doc_count(self.colls[1]))