def reindex(self, solr_docs, solr_urls, commit=False):
    """Sends documents to solr and to Metrics DB.

    :param: solr_docs - list of json objects (solr documents)
    :param: solr_urls - list of strings, solr servers.
    :return: list of bibcodes that failed to index (empty on full success)
    """
    self.logger.debug('Updating solr: num_docs=%s solr_urls=%s', len(solr_docs), solr_urls)
    # First try one batch update; `out` holds one HTTP status per request.
    out = solr_updater.update_solr(solr_docs, solr_urls, ignore_errors=True)
    failed_bibcodes = []
    errs = [x for x in out if x != 200]
    if len(errs) == 0:
        self._mark_processed(solr_docs)
    else:
        self.logger.error('%s docs failed indexing', len(errs))
        # recover from errors by inserting docs one by one
        for doc in solr_docs:
            try:
                solr_updater.update_solr([doc], solr_urls, ignore_errors=False, commit=commit)
                self.update_processed_timestamp(doc['bibcode'])
                self.logger.debug('%s success', doc['bibcode'])
            except Exception:
                # FIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt and discarded the traceback;
                # narrowed to Exception and log with the traceback attached.
                failed_bibcode = doc['bibcode']
                self.logger.exception('Failed posting data to %s\noffending payload: %s', solr_urls, doc)
                failed_bibcodes.append(failed_bibcode)
    return failed_bibcodes
def index_solr(self, solr_docs, solr_docs_checksum, solr_urls, commit=False, update_processed=True):
    """Sends documents to solr.

    It will update the solr_processed timestamp for every document
    which succeeded.

    :param: solr_docs - list of json objects (solr documents)
    :param: solr_docs_checksum - checksums, one per document, recorded on success
    :param: solr_urls - list of strings, solr servers.
    :param: commit - passed through to per-document retry updates
    :param: update_processed - when True, record success/failure in postgres
    :return: list of bibcodes that failed to index (empty on full success)
    """
    self.logger.debug('Updating solr: num_docs=%s solr_urls=%s', len(solr_docs), solr_urls)
    # FIX: initialize up front so the function can always return the list
    # (previously it was created only in the error branch and the function
    # implicitly returned None, unlike the sibling reindex() implementations).
    failed_bibcodes = []
    # batch send solr update
    out = solr_updater.update_solr(solr_docs, solr_urls, ignore_errors=True)
    errs = [x for x in out if x != 200]
    if len(errs) == 0:
        if update_processed:
            self.mark_processed([x['bibcode'] for x in solr_docs], 'solr',
                                checksums=solr_docs_checksum, status='success')
    else:
        self.logger.error('%s docs failed indexing', len(errs))
        # recover from errors by sending docs one by one
        for doc, checksum in zip(solr_docs, solr_docs_checksum):
            try:
                self.logger.error('trying individual update_solr %s', doc)
                solr_updater.update_solr([doc], solr_urls, ignore_errors=False, commit=commit)
                if update_processed:
                    self.mark_processed((doc['bibcode'], ), 'solr', checksums=(checksum, ), status='success')
                self.logger.debug('%s success', doc['bibcode'])
            except Exception as e:
                # If the individual insert fails and the exception mentions
                # 'body' (or looks like a log-formatting failure), we assume
                # Solr choked on the fulltext field and retry once without it.
                # Such bibcodes need to be investigated as to why
                # fulltext/body is failing.
                failed_bibcode = doc['bibcode']
                if 'body' in str(e) or 'not all arguments converted during string formatting' in str(e):
                    tmp_doc = dict(doc)
                    tmp_doc.pop('body', None)
                    try:
                        solr_updater.update_solr([tmp_doc], solr_urls, ignore_errors=False, commit=commit)
                        if update_processed:
                            self.mark_processed((doc['bibcode'], ), 'solr', checksums=(checksum, ), status='success')
                        self.logger.debug('%s success without body', doc['bibcode'])
                    except Exception:
                        # FIX: no longer rebinds `e` (the outer exception),
                        # which was confusing; logger.exception attaches the
                        # retry failure's traceback instead.
                        self.logger.exception(
                            'Failed posting bibcode %s to Solr even without fulltext (urls: %s)',
                            failed_bibcode, solr_urls)
                        failed_bibcodes.append(failed_bibcode)
                else:
                    # body not in error message: do not retry, just note as a fail
                    self.logger.error(
                        'Failed posting individual bibcode %s to Solr\nurls: %s, offending payload %s, error is %s',
                        failed_bibcode, solr_urls, doc, e)
                    failed_bibcodes.append(failed_bibcode)
        # finally update postgres record
        if failed_bibcodes and update_processed:
            self.mark_processed(failed_bibcodes, 'solr', checksums=None, status='solr-failed')
    return failed_bibcodes
def reindex(self, solr_docs, solr_urls, commit=False):
    """Sends documents to solr.

    It will update the solr_processed timestamp for every document
    which succeeded.

    :param: solr_docs - list of json objects (solr documents)
    :param: solr_urls - list of strings, solr servers.
    :return: list of bibcodes that failed to index (empty on full success)
    """
    self.logger.debug('Updating solr: num_docs=%s solr_urls=%s', len(solr_docs), solr_urls)

    # apply tweaks that add/override values
    for d in solr_docs:
        if d['bibcode'] in self.tweaks:
            d.update(self.tweaks[d['bibcode']])

    # batch send solr updates
    statuses = solr_updater.update_solr(solr_docs, solr_urls, ignore_errors=True)
    failed_bibcodes = []
    error_count = len([s for s in statuses if s != 200])
    if error_count == 0:
        self.mark_processed([d['bibcode'] for d in solr_docs], type='solr')
        return failed_bibcodes

    self.logger.error('%s docs failed indexing', error_count)
    # recover from errors by inserting docs one by one
    for doc in solr_docs:
        bibcode = doc['bibcode']
        try:
            # NOTE: the log call stays inside the try on purpose — the
            # string-formatting error matched below can originate here
            self.logger.error('trying individual update_solr %s', doc)
            solr_updater.update_solr([doc], solr_urls, ignore_errors=False, commit=commit)
            self.update_processed_timestamp(bibcode, type='solr')
            self.logger.debug('%s success', bibcode)
            continue
        except Exception as err:
            message = str(err)

        # If the individual insert failed and the exception mentions 'body'
        # we assume Solr choked on the fulltext field, so we retry once
        # without it; such bibcodes need investigation as to why the
        # fulltext/body is failing.
        if 'body' not in message and 'not all arguments converted during string formatting' not in message:
            # body not mentioned in the error: do not retry, just note as a fail
            self.logger.error(
                'Failed posting individual bibcode %s to Solr\nurls: %s, offending payload %s, error is %s',
                bibcode, solr_urls, doc, err)
            failed_bibcodes.append(bibcode)
            continue

        stripped = dict(doc)
        stripped.pop('body', None)
        try:
            solr_updater.update_solr([stripped], solr_urls, ignore_errors=False, commit=commit)
            self.update_processed_timestamp(bibcode, type='solr')
            self.logger.debug('%s success without body', bibcode)
        except Exception as retry_err:
            self.logger.error(
                'Failed posting bibcode %s to Solr even without fulltext\nurls: %s, offending payload %s, error is %s',
                bibcode, solr_urls, doc, retry_err)
            failed_bibcodes.append(bibcode)

    return failed_bibcodes