def _sync_records_by_id(self, core, query, timestamp_query):
    '''
    Method that executes synchronization of all cores based on the
    dataset id (within a given time interval).

    :param core: unused here; kept for signature compatibility with
                 _sync_records_by_time (this method always spans the
                 datasets, files and aggregations cores)
    :param query: Solr query selecting the records to synchronize
    :param timestamp_query: Solr query restricting the time interval
    :return: tuple (numDatasets, numFiles, numAggregations) of records
             copied from the source Solr to the target Solr
    '''

    # number of records copied from source Solr --> target Solr
    numDatasets = 0
    numFiles = 0
    numAggregations = 0

    # query for dataset ids from source, target Solrs
    # (presumably maps dataset id -> timestamp; verify in _query_dataset_ids)
    print("Querying source")
    source_dataset_ids = self._query_dataset_ids(
        self.source_solr_base_url, CORE_DATASETS, query, timestamp_query)
    print("Querying target")
    target_dataset_ids = self._query_dataset_ids(
        self.target_solr_base_url, CORE_DATASETS, query, timestamp_query)

    # synchronize source Solr --> target Solr
    # commit after every core query
    for source_dataset_id in source_dataset_ids.keys():
        # copy when the dataset is missing from the target, or its stored
        # value (timestamp) differs; the 'or' short-circuits so the second
        # lookup is only evaluated when the key exists in both maps
        if (source_dataset_id not in target_dataset_ids
                or source_dataset_ids[source_dataset_id] != target_dataset_ids[source_dataset_id]):
            # lazy %-args: formatting is deferred until the record is emitted
            logging.info("\t\t\t\tCopying source dataset=%s", source_dataset_id)
            numDatasets += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_DATASETS,
                                   query='id:%s' % source_dataset_id, commit=True, optimize=False)
            numFiles += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_FILES,
                                query='dataset_id:%s' % source_dataset_id, commit=True, optimize=False)
            numAggregations += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_AGGREGATIONS,
                                       query='dataset_id:%s' % source_dataset_id, commit=True, optimize=False)

    # synchronize target Solr <-- source Solr
    # must delete datasets that do NOT exist at the source
    for target_dataset_id in target_dataset_ids.keys():
        if target_dataset_id not in source_dataset_ids:
            # check whether dataset still exists at the source: if yes, it
            # will be updated; if not, it (and its files/aggregations)
            # must be deleted from the target
            exists = self._check_record(self.source_solr_base_url, CORE_DATASETS, target_dataset_id)
            if not exists:
                logging.info("\t\t\t\tDeleting dataset=%s", target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_DATASETS,
                                          query='id:%s' % target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_FILES,
                                          query='dataset_id:%s' % target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_AGGREGATIONS,
                                          query='dataset_id:%s' % target_dataset_id)

    return (numDatasets, numFiles, numAggregations)
# NOTE(review): this method is defined twice in this module; Python keeps
# only the later definition, so this copy is dead code — remove one of them.
def _sync_records_by_time(self, core, query, timestamp_query):
    '''
    Method that executes synchronization of all records for a given
    core within a given time interval.

    :param core: name of the Solr core to synchronize
    :param query: Solr query selecting the records to synchronize
    :param timestamp_query: Solr query restricting the time interval
    :return: number of records migrated from source to target Solr
    '''

    # first delete all records in timestamp bin from target solr
    # will NOT commit the changes yet
    # NOTE(review): no spaces around AND — assumed the Solr query parser
    # still tokenizes "(q)AND(fq)" correctly; confirm before reformatting
    delete_query = "(%s)AND(%s)" % (query, timestamp_query)
    self._delete_solr_records(self.target_solr_base_url, core, delete_query)

    # then migrate records from source solr
    # commit but do NOT optimize the index yet
    numRecords = migrate(self.source_solr_base_url, self.target_solr_base_url, core,
                         query=query, fq=timestamp_query, commit=True, optimize=False)
    # fixed log-message typo ("Number or" -> "Number of"); lazy %-args
    logging.info("\t\t\tNumber of records migrated=%s", numRecords)
    return numRecords
def _sync_records_by_time(self, core, query, timestamp_query):
    '''
    Method that executes synchronization of all records for a given
    core within a given time interval.

    :param core: name of the Solr core to synchronize
    :param query: Solr query selecting the records to synchronize
    :param timestamp_query: Solr query restricting the time interval
    :return: number of records migrated from source to target Solr
    '''

    # first delete all records in timestamp bin from target solr
    # will NOT commit the changes yet
    delete_query = "(%s)AND(%s)" % (query, timestamp_query)
    self._delete_solr_records(self.target_solr_base_url, core, delete_query)

    # then migrate records from source solr
    # commit but do NOT optimize the index yet
    numRecords = migrate(self.source_solr_base_url, self.target_solr_base_url, core,
                         query=query, fq=timestamp_query, commit=True, optimize=False)
    # fixed log-message typo ("Number or" -> "Number of"); lazy %-args
    logging.info("\t\t\tNumber of records migrated=%s", numRecords)
    return numRecords
def _sync_all_cores_by_dataset_id(self, query, timestamp_query):
    '''
    Method that executes synchronization of all cores based on the
    dataset id (within a given time interval).

    :param query: Solr query selecting the records to synchronize
    :param timestamp_query: Solr query restricting the time interval
    :return: tuple (numDatasets, numFiles, numAggregations) of records
             copied from the source Solr to the target Solr
    '''

    # number of records copied from source Solr --> target Solr
    numDatasets = 0
    numFiles = 0
    numAggregations = 0

    # query for dataset ids from source, target Solrs
    source_dataset_ids = self._query_dataset_ids(
        self.source_solr_base_url, CORE_DATASETS, query, timestamp_query)
    target_dataset_ids = self._query_dataset_ids(
        self.target_solr_base_url, CORE_DATASETS, query, timestamp_query)

    # synchronize source Solr --> target Solr
    # commit after every core query
    for source_dataset_id in source_dataset_ids.keys():
        # compare dataset ids and their _timestamps; 'or' short-circuits so
        # the timestamp lookup only happens when the id exists in both maps
        if ((source_dataset_id not in target_dataset_ids)
                or (source_dataset_ids[source_dataset_id] != target_dataset_ids[source_dataset_id])):
            # lazy %-args: formatting is deferred until the record is emitted
            logging.info("\t\t\t\tCopying source dataset=%s", source_dataset_id)
            numDatasets += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_DATASETS,
                                   query='id:%s' % source_dataset_id, commit=True, optimize=False)
            numFiles += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_FILES,
                                query='dataset_id:%s' % source_dataset_id, commit=True, optimize=False)
            numAggregations += migrate(self.source_solr_base_url, self.target_solr_base_url, CORE_AGGREGATIONS,
                                       query='dataset_id:%s' % source_dataset_id, commit=True, optimize=False)

    # synchronize target Solr <-- source Solr
    # must delete datasets that no longer exist at the source
    for target_dataset_id in target_dataset_ids.keys():
        if target_dataset_id not in source_dataset_ids:
            # check whether dataset still exists at the source:
            # if yes, it has been updated in the previous loop; if not, delete it
            exists = self._check_record(self.source_solr_base_url, CORE_DATASETS, target_dataset_id)
            if not exists:
                logging.info("\t\t\t\tDeleting dataset=%s", target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_DATASETS,
                                          query='id:%s' % target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_FILES,
                                          query='dataset_id:%s' % target_dataset_id)
                self._delete_solr_records(self.target_solr_base_url, core=CORE_AGGREGATIONS,
                                          query='dataset_id:%s' % target_dataset_id)

    return (numDatasets, numFiles, numAggregations)
# Driver script: repeatedly migrates records from a source Solr to a target
# Solr, rewriting the index_node value via the 'replace' mapping.
sourceSolrUrl = "http://localhost:8983/solr"
targetSolrUrl = "http://localhost:7000/solr"
core = "datasets"

# total number of records indexed = maxRecords * numIterations * replacements
maxRecords = 10000   # maximum number of records per migration
numIterations = 1000  # number of migrations per replacement

# each entry is an "old:new" index_node replacement spec
replacements = ["pcmdi9.llnl.gov:esgf-node.jpl.nasa.gov",
                "pcmdi9.llnl.gov:pcmdi9.llnl.gov",
                "pcmdi9.llnl.gov:esgf-data.dkrz.de",
                "pcmdi9.llnl.gov:esgf-node.ipsl.fr",
                "pcmdi9.llnl.gov:esgf.nccs.nasa.gov",
                "pcmdi9.llnl.gov:esg2.nci.org.au",
                "pcmdi9.llnl.gov:esgf-index1.ceda.ac.uk",
                "pcmdi9.llnl.gov:esgdata.gfdl.nooa.gov",
                "pcmdi9.llnl.gov:hydra.fsl.noaa.gov",
                "pcmdi9.llnl.gov:others"]

for replace in replacements:
    for i in range(1, 1 + numIterations):
        # fixed: Python 2 'print' statement -> print() function, matching
        # the print(...) calls used elsewhere in this file
        print("Executing iteration #: %s for replacement=%s" % (i, replace))
        # suffix makes the migrated record ids unique per iteration
        suffix = ".%s" % i
        migrate(sourceSolrUrl, targetSolrUrl, core,
                maxRecords=maxRecords, suffix=suffix, replace=replace,
                query='index_node:pcmdi9.llnl.gov')