예제 #1
0
    def reindex(self, solr_docs, solr_urls, commit=False):
        """Send documents to solr, retrying one by one if the batch fails.

        The whole batch is posted first. If any element of the result
        returned by ``solr_updater.update_solr`` differs from 200
        (presumably HTTP status codes - confirm against solr_updater), every
        document is posted again individually so the offending ones can be
        singled out.

        :param: solr_docs - list of json objects (solr documents); each one
                is read through its 'bibcode' key during the retry
        :param: solr_urls - list of strings, solr servers.
        :param: commit - forwarded to solr_updater.update_solr for the
                individual retries only; the initial batch call is made
                without it
        :return: list of bibcodes whose individual update raised an error;
                 empty when the batch went through cleanly
        """
        self.logger.debug('Updating solr: num_docs=%s solr_urls=%s', len(solr_docs), solr_urls)

        out = solr_updater.update_solr(solr_docs, solr_urls, ignore_errors=True)
        errs = [x for x in out if x != 200]

        if not errs:
            self._mark_processed(solr_docs)
            return []

        self.logger.error('%s docs failed indexing', len(errs))
        failed_bibcodes = []
        # recover from errors by inserting docs one by one
        for doc in solr_docs:
            try:
                solr_updater.update_solr([doc], solr_urls, ignore_errors=False, commit=commit)
                self.update_processed_timestamp(doc['bibcode'])
                self.logger.debug('%s success', doc['bibcode'])
            except Exception:
                # Catch Exception (not a bare except) so that KeyboardInterrupt
                # and SystemExit still stop the run; exc_info keeps the cause
                # of the failure in the log next to the payload.
                self.logger.error('Failed posting data to %s\noffending payload: %s', solr_urls, doc,
                                  exc_info=True)
                failed_bibcodes.append(doc['bibcode'])

        return failed_bibcodes
예제 #2
0
    def index_solr(self,
                   solr_docs,
                   solr_docs_checksum,
                   solr_urls,
                   commit=False,
                   update_processed=True):
        """Post documents to solr: one batch first, then per-document on failure.

        When the batch comes back with nothing but 200s, all bibcodes are
        marked as processed with status 'success' (if ``update_processed``).
        Otherwise each document is posted on its own. A document whose
        individual post fails with an error message mentioning 'body' (or the
        string-formatting error text) gets one more attempt with its 'body'
        field removed. Bibcodes that still fail are marked 'solr-failed' at
        the end (again only if ``update_processed``).

        :param: solr_docs - list of json objects (solr documents)
        :param: solr_docs_checksum - checksums paired positionally with
                solr_docs (the two are zipped together for the retries)
        :param: solr_urls - list of strings, solr servers.
        :param: commit - forwarded to the per-document update calls only
        :param: update_processed - when false, no mark_processed call is made
        """
        self.logger.debug('Updating solr: num_docs=%s solr_urls=%s',
                          len(solr_docs), solr_urls)
        # first attempt: everything in a single update call
        batch_result = solr_updater.update_solr(solr_docs,
                                                solr_urls,
                                                ignore_errors=True)
        bad_statuses = [status for status in batch_result if status != 200]

        if not bad_statuses:
            if update_processed:
                self.mark_processed([x['bibcode'] for x in solr_docs],
                                    'solr',
                                    checksums=solr_docs_checksum,
                                    status='success')
            return

        self.logger.error('%s docs failed indexing', len(bad_statuses))
        rejected = []
        # fall back to one request per document to isolate the bad ones
        for doc, checksum in zip(solr_docs, solr_docs_checksum):
            try:
                self.logger.error('trying individual update_solr %s', doc)
                solr_updater.update_solr([doc],
                                         solr_urls,
                                         ignore_errors=False,
                                         commit=commit)
                if update_processed:
                    self.mark_processed((doc['bibcode'], ),
                                        'solr',
                                        checksums=(checksum, ),
                                        status='success')
                self.logger.debug('%s success', doc['bibcode'])
            except Exception as first_error:
                bibcode = doc['bibcode']
                message = str(first_error)
                body_suspected = (
                    'body' in message
                    or 'not all arguments converted during string formatting' in message)

                if not body_suspected:
                    # nothing points at the fulltext: no retry, record the failure
                    self.logger.error(
                        'Failed posting individual bibcode %s to Solr\nurls: %s, offending payload %s, error is %s',
                        bibcode, solr_urls, doc, first_error)
                    rejected.append(bibcode)
                    continue

                # The error text hints that the body field is the culprit, so
                # try once more without fulltext. Such bibcodes still need a
                # follow-up to find out why the body is rejected.
                stripped = dict(doc)
                stripped.pop('body', None)
                try:
                    solr_updater.update_solr([stripped],
                                             solr_urls,
                                             ignore_errors=False,
                                             commit=commit)
                    if update_processed:
                        self.mark_processed((doc['bibcode'], ),
                                            'solr',
                                            checksums=(checksum, ),
                                            status='success')
                    self.logger.debug('%s success without body',
                                      doc['bibcode'])
                except Exception:
                    self.logger.exception(
                        'Failed posting bibcode %s to Solr even without fulltext (urls: %s)',
                        bibcode, solr_urls)
                    rejected.append(bibcode)

        # last step: record the failures through mark_processed
        if rejected and update_processed:
            self.mark_processed(rejected,
                                'solr',
                                checksums=None,
                                status='solr-failed')
예제 #3
0
    def reindex(self, solr_docs, solr_urls, commit=False):
        """Apply per-bibcode tweaks, then post the documents to solr.

        The documents are sent as one batch. If every returned element is 200
        the bibcodes are marked as processed; otherwise each document is
        posted individually and its processed timestamp is updated on
        success. An individual failure whose error message mentions 'body'
        (or the string-formatting error text) is retried once without the
        'body' field.

        Note that documents with an entry in ``self.tweaks`` are updated in
        place before anything is sent.

        :param: solr_docs - list of json objects (solr documents)
        :param: solr_urls - list of strings, solr servers.
        :param: commit - forwarded to the per-document update calls only
        :return: list of bibcodes that could not be posted (empty when the
                 batch succeeded)
        """
        self.logger.debug('Updating solr: num_docs=%s solr_urls=%s',
                          len(solr_docs), solr_urls)
        # tweaks add or override values on the matching documents
        for doc in solr_docs:
            if doc['bibcode'] in self.tweaks:
                overrides = self.tweaks[doc['bibcode']]
                doc.update(overrides)

        # first attempt: everything in a single update call
        batch_result = solr_updater.update_solr(solr_docs,
                                                solr_urls,
                                                ignore_errors=True)
        rejected = []
        bad_statuses = [status for status in batch_result if status != 200]

        if not bad_statuses:
            self.mark_processed([x['bibcode'] for x in solr_docs], type='solr')
            return rejected

        self.logger.error('%s docs failed indexing', len(bad_statuses))
        # fall back to one request per document to isolate the bad ones
        for doc in solr_docs:
            try:
                self.logger.error('trying individual update_solr %s', doc)
                solr_updater.update_solr([doc],
                                         solr_urls,
                                         ignore_errors=False,
                                         commit=commit)
                self.update_processed_timestamp(doc['bibcode'],
                                                type='solr')
                self.logger.debug('%s success', doc['bibcode'])
            except Exception as first_error:
                bibcode = doc['bibcode']
                message = str(first_error)
                body_suspected = (
                    'body' in message
                    or 'not all arguments converted during string formatting' in message)

                if not body_suspected:
                    # nothing points at the fulltext: no retry, record the failure
                    self.logger.error(
                        'Failed posting individual bibcode %s to Solr\nurls: %s, offending payload %s, error is %s',
                        bibcode, solr_urls, doc, first_error)
                    rejected.append(bibcode)
                    continue

                # The error text hints that the body field is the culprit, so
                # try once more without fulltext. Such bibcodes still need a
                # follow-up to find out why the body is rejected.
                stripped = dict(doc)
                stripped.pop('body', None)
                try:
                    solr_updater.update_solr([stripped],
                                             solr_urls,
                                             ignore_errors=False,
                                             commit=commit)
                    self.update_processed_timestamp(doc['bibcode'],
                                                    type='solr')
                    self.logger.debug('%s success without body',
                                      doc['bibcode'])
                except Exception as retry_error:
                    self.logger.error(
                        'Failed posting bibcode %s to Solr even without fulltext\nurls: %s, offending payload %s, error is  %s',
                        bibcode, solr_urls, doc, retry_error)
                    rejected.append(bibcode)
        return rejected