class TestSolr(unittest.TestCase):
    """Tests for the ``Solr`` HTTP client wrapper.

    ``SolrAPI.requests`` is patched in every test, so no real HTTP request
    is issued; each test only verifies that ``select`` hands back the
    mocked response body verbatim.
    """

    def setUp(self):
        # Fresh client per test; the URL is never actually contacted
        # because the requests module is mocked.
        self.solr = Solr('http://some.url')

    def tearDown(self):
        pass

    @patch('SolrAPI.requests')
    def test_select_method(self, mock_requests):
        # select() must return the raw response body untouched.  The fixture
        # string is compared character-for-character (note it intentionally
        # contains the malformed '"{response"' key as captured).
        mock_requests.get.return_value = response = Mock()
        response.text = '{"responseHeader":{"status":0,"QTime":1,"params":{"q":"pickles","wt":"json"}},"{response": {"numFound": 1, "start": 0,"docs": []}}'
        response.status_code = 200
        self.assertEqual(self.solr.select(params={'q': 'pickles'}), '{"responseHeader":{"status":0,"QTime":1,"params":{"q":"pickles","wt":"json"}},"{response": {"numFound": 1, "start": 0,"docs": []}}')

    @patch('SolrAPI.requests')
    def test_select_method_without_params(self, mock_requests):
        # An empty params dict must still yield the backend's default
        # (wt=json) response, unchanged.
        mock_requests.get.return_value = response = Mock()
        response.text = '{"responseHeader":{"status":0,"QTime":1,"params":{"wt":"json"}},"response":{"numFound":0,"start":0,"docs":[]}}}'
        response.status_code = 200
        self.assertEqual(self.solr.select({}), '{"responseHeader":{"status":0,"QTime":1,"params":{"wt":"json"}},"response":{"numFound":0,"start":0,"docs":[]}}}')

    @patch('SolrAPI.requests')
    def test_select_method_change_return_format(self, mock_requests):
        # format='xml' should be forwarded and the raw XML body returned
        # as-is (fixture reproduced exactly as captured, including the
        # unclosed '</str' fragment).
        mock_requests.get.return_value = response = Mock()
        response.text = '<?xml version="1.0" encoding="UTF-8"?><response><lst name="responseHeader"><int name="status">0</int><int name="QTime">1</int><lst name="params"><str name="q">pickles</str<str name="wt">xml</str></lst></lst><result name="response" numFound="0" start="0"></result></lst></response>'
        response.status_code = 200
        self.assertEqual(self.solr.select({'q': 'pickles'}, format='xml'), '<?xml version="1.0" encoding="UTF-8"?><response><lst name="responseHeader"><int name="status">0</int><int name="QTime">1</int><lst name="params"><str name="q">pickles</str<str name="wt">xml</str></lst></lst><result name="response" numFound="0" start="0"></result></lst></response>')
def __init__(self, period=None, from_date=None, until_date=None, collection=None, issn=None, delete=False, differential=False, load_indicators=False):
    """Store the run configuration and open the Solr connection.

    ``period`` is a number of days back from now; when given it overrides
    an explicitly supplied ``from_date``.
    """
    self.collection = collection
    self.issn = issn
    self.delete = delete
    self.differential = differential
    self.load_indicators = load_indicators
    self.until_date = until_date
    # A relative period takes precedence over an absolute start date.
    self.from_date = datetime.now() - timedelta(days=period) if period else from_date
    self.solr = Solr(SOLR_URL, timeout=10)
def __init__(self):
    """Parse command-line arguments and build the Solr client.

    The Solr endpoint is taken from the ``SOLR_URL`` environment variable
    when set, otherwise from the ``--url`` argument.
    """
    self.args = self.parser.parse_args()
    solr_url = os.environ.get('SOLR_URL')
    if not solr_url and not self.args.solr_url:
        raise argparse.ArgumentTypeError('--url or ``SOLR_URL`` enviroment variable must be the set, use --help.')
    # The environment variable wins over the command-line flag.
    endpoint = solr_url if solr_url else self.args.solr_url
    self.solr = Solr(endpoint, timeout=10)
    if self.args.period:
        self.args.from_date = datetime.now() - timedelta(days=self.args.period)
def __init__(self, period=None, from_date=None, until_date=None, collection=None, issn=None, delete=False, sanitization=False):
    """Keep the run options and open the Solr connection.

    ``period`` (days back from now) overrides ``from_date`` when supplied.
    """
    self.collection = collection
    self.issn = issn
    self.delete = delete
    self.sanitization = sanitization
    self.until_date = until_date
    # Prefer a relative period over an explicit start date.
    self.from_date = datetime.now() - timedelta(days=period) if period else from_date
    self.solr = Solr(SOLR_URL, timeout=10)
def __init__(self):
    """Resolve the Solr endpoint and parse the command line.

    ``SOLR_URL`` from the environment takes precedence over ``--url``.
    """
    self.args = self.parser.parse_args()
    solr_url = os.environ.get('SOLR_URL')
    if not solr_url and not self.args.solr_url:
        raise argparse.ArgumentTypeError(
            '--url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
        )
    # Fall back to the --url flag only when the env var is unset.
    self.solr = Solr(solr_url or self.args.solr_url, timeout=10)
    if self.args.period:
        self.args.from_date = datetime.now() - timedelta(days=self.args.period)
def main():
    # Bulk-load fixed-width records from DATA_FILE into a local Solr core,
    # committing in batches of 10000 documents.
    # NOTE(review): Python 2 code (print statements).  The handle returned
    # by open(DATA_FILE) is never closed, and any trailing partial batch
    # (< 10000 docs) is never sent before optimize() — confirm intended.
    solr = Solr('http://localhost:8080/solr/nem', timeout=30)
    # Wipe the whole core before reloading.
    solr.delete('*:*')
    data_list = []
    for count, line in enumerate(open(DATA_FILE)):
        # Fixed-width layout: id in cols 0-11, text in cols 536-544
        # (presumably — verify against the data file spec).
        d = {'id': line[0:12], 'text': line[536: 545]}
        data_list.append(d)
        print count
        if len(data_list) == 10000:
            print "Sending..."
            solr.update(json.dumps(data_list), headers={'Content-Type': 'text/json'})
            print "Commiting..."
            solr.commit()
            #Clean data_list
            data_list = []
    solr.optimize()
def __init__(self, collection=None, issn=None):
    """Record the collection/issn filters and open the Solr connection."""
    self.collection, self.issn = collection, issn
    self.solr = Solr(SOLR_URL, timeout=10)
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.

    This variant only refreshes the ``total_access`` counter of documents
    that are already present in the search index, using atomic updates.
    """

    def __init__(self, collection=None, issn=None):
        # Optional filters restricting which documents are processed.
        self.collection = collection
        self.issn = issn
        self.solr = Solr(SOLR_URL, timeout=10)

    def set_accesses(self, document_id, accesses):
        """Build a Solr atomic-update XML document for one article.

        :param document_id: Solr document id (``<code>-<collection>``)
        :param accesses: total access count to store
        :returns: bytes with an ``<add>`` XML payload using update="set"
        """
        xml = ET.Element('add')
        doc = ET.Element('doc')
        identifier = ET.Element('field')
        identifier.set('name', 'id')
        identifier.text = document_id
        total_accesses = ET.Element('field')
        total_accesses.set('name', 'total_access')
        total_accesses.text = str(accesses)
        # update="set" makes Solr replace the field atomically instead of
        # re-indexing the whole document.
        total_accesses.set('update', 'set')
        doc.append(identifier)
        doc.append(total_accesses)
        xml.append(doc)
        return ET.tostring(xml, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.
        """
        art_meta = ArticleMetaThriftClient()
        art_accesses = AccessThriftClient(domain="ratchet.scielo.org:11660")

        logger.info("Loading Solr available document ids")
        itens_query = []
        if self.collection:
            itens_query.append('in:%s' % self.collection)
        if self.issn:
            itens_query.append('issn:%s' % self.issn)
        query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)

        # Snapshot of every id currently in the index (up to 1M rows) so
        # only existing documents get their counters updated.
        available_ids = set([
            i['id'] for i in json.loads(
                self.solr.select({
                    'q': query,
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']
        ])

        logger.info("Recording accesses for documents in {0}".format(
            self.solr.url))
        for document in art_meta.documents(collection=self.collection,
                                           issn=self.issn):
            solr_id = '-'.join(
                [document.publisher_id, document.collection_acronym])
            # Skip articles that are not indexed yet.
            if solr_id not in available_ids:
                continue
            logger.debug("Loading accesses for document %s" % solr_id)
            total_accesses = int(
                art_accesses.document(document.publisher_id,
                                      document.collection_acronym).get(
                                          'access_total',
                                          {'value': 0})['value'])
            xml = self.set_accesses(solr_id, total_accesses)
            try:
                # commit=False: a single commit is issued after the loop.
                result = self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue
        # optimize the index
        self.solr.commit()
        self.solr.optimize()
def main(settings, *args, **xargs):
    """Index articles into Solr page by page, driven by CLI arguments.

    Fetches article identifiers in windows of ``limit_offset``, retrieves
    each article's Solr XML from the article endpoint and posts it to the
    index; a commit is issued per window.
    """
    solr = Solr(settings['endpoints']['solr'], timeout=int(settings['request']['timeout']))

    # Defaults: index "today" unless overridden below.
    from_date = datetime.now()
    until_date = datetime.now()

    parser = argparse.ArgumentParser(description='Script to update Solr')
    parser.add_argument('-p', '--period', type=int, help='index articles from specific period, use number of days.')
    parser.add_argument('-f', '--from', dest='from_date', type=lambda x: datetime.strptime(x, '%Y-%m-%d'), nargs='?', help='index articles from specific date. YYYY-MM-DD')
    parser.add_argument('-u', '--until', dest='until_date', type=lambda x: datetime.strptime(x, '%Y-%m-%d'), nargs='?', help='index articles until this specific date. YYYY-MM-DD (default today)', default=datetime.now())
    parser.add_argument('-c', '--collection', dest='collection', default=None, help='use the acronym of the collection eg.: spa, scl, col.')
    parser.add_argument('-d', '--debug', action='store_true', help='execute the script in DEBUG mode (don\'t update the index)')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.1')
    args = parser.parse_args()

    if args.from_date:
        from_date = args.from_date
    if args.until_date:
        until_date = args.until_date
    # --period rewinds the start date relative to whatever from_date is now.
    if args.period:
        from_date -= timedelta(days=args.period)

    from_date = from_date.strftime("%Y-%m-%d")
    until_date = until_date.strftime("%Y-%m-%d")

    if args.debug:
        log.setLevel(logging.DEBUG)

    log.info('Start update solr index script with params from={0} and until={1}'.format(
        from_date,until_date))

    total = 0
    offset = 0
    fail_list = []
    sum_processed = 0
    # NOTE(review): if get_identifiers keeps raising, this loop never
    # breaks (the except branch does not advance offset) — confirm intended.
    while True:
        try:
            total, article_lst = get_identifiers(from_date, until_date, args.collection, offset)
            if len(article_lst) == 0:
                break;
            sum_processed += len(article_lst)
            log.info('Indexing {0} of {1} articles'.format(sum_processed, total))
            offset += int(settings['params']['limit_offset'])
            for article in article_lst:
                article_code = str(article['code']);
                code_url = '{0}?code={1}&format=xmliahx'.format(
                    settings['endpoints']['article'],
                    article_code)
                log.debug('URL used for retrieve solr xml of article {0}'.format(code_url))
                solr_xml = _fetch_data(code_url).text
                log.info('Indexing article {0}'.format(article_code))
                if not args.debug:
                    status = solr.update(solr_xml)
                    if status != 0:
                        log.error('Unable to index article {0}, code:{1}'.format(
                            article_code, status))
                        fail_list.append(article_code)
            #commit on any offset cycle
            commit(solr, debug=args.debug)
        except Exception as e:
            log.critical('Unexpected error: {0}'.format(e))
    summary(total, fail_list, args.debug)
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.
    """

    usage = """\
Process to index article to SciELO Solr.

This process collects articles in the Article meta using thrift and index
in SciELO Solr.

With this process it is possible to process all the article or some specific
by collection, issn from date to until another date and a period like 7 days.
"""

    parser = argparse.ArgumentParser(textwrap.dedent(usage))

    parser.add_argument(
        '-p', '--period',
        type=int,
        help='index articles from specific period, use number of days.')

    parser.add_argument(
        '-f', '--from',
        dest='from_date',
        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
        nargs='?',
        help='index articles from specific date. YYYY-MM-DD.')

    parser.add_argument(
        '-u', '--until',
        dest='until_date',
        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
        nargs='?',
        help='index articles until this specific date. YYYY-MM-DD (default today).',
        default=datetime.now())

    parser.add_argument(
        '-c', '--collection',
        dest='collection',
        default=None,
        help='use the acronym of the collection eg.: spa, scl, col.')

    parser.add_argument(
        '-i', '--issn',
        dest='issn',
        default=None,
        help='journal issn.')

    parser.add_argument(
        '-d', '--delete',
        dest='delete',
        default=None,
        help='delete query ex.: q=*:* (Lucene Syntax).')

    parser.add_argument(
        '-s', '--sanitization',
        dest='sanitization',
        default=False,
        action='store_true',
        help='Remove objects from the index that are no longer present in the database.')

    parser.add_argument(
        '-url', '--url',
        dest='solr_url',
        help='Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --url to set the url(preferable).')

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='version: 0.2')

    def __init__(self):
        """Parse the CLI arguments and build the Solr client.

        The Solr endpoint comes from the ``SOLR_URL`` environment variable
        when set, otherwise from ``--url``; ``--period`` rewrites the
        parsed ``from_date`` relative to now.
        """
        self.args = self.parser.parse_args()
        solr_url = os.environ.get('SOLR_URL')

        if not solr_url and not self.args.solr_url:
            raise argparse.ArgumentTypeError(
                '--url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
            )

        if not solr_url:
            self.solr = Solr(self.args.solr_url, timeout=10)
        else:
            self.solr = Solr(solr_url, timeout=10)

        if self.args.period:
            self.args.from_date = datetime.now() - timedelta(
                days=self.args.period)

    def format_date(self, date):
        """
        Convert datetime.datetime to str return: ``2000-05-12``.

        :param date: built-in datetime object (or None)
        :returns: ``YYYY-MM-DD`` string, or None when no date is given
        """
        if not date:
            return None
        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Pipeline to tranform a dictionary to XML format

        :param article: article object to serialize
        :returns: bytes with the ``<add>`` XML document for Solr
        """
        ppl = plumber.Pipeline(pipeline_xml.SetupDocument(),
                               pipeline_xml.DocumentID(),
                               pipeline_xml.DOI(),
                               pipeline_xml.Collection(),
                               pipeline_xml.DocumentType(),
                               pipeline_xml.URL(),
                               pipeline_xml.Authors(),
                               pipeline_xml.Titles(),
                               pipeline_xml.OriginalTitle(),
                               pipeline_xml.Pages(),
                               pipeline_xml.WOKCI(),
                               pipeline_xml.WOKSC(),
                               pipeline_xml.JournalAbbrevTitle(),
                               pipeline_xml.Languages(),
                               pipeline_xml.AvailableLanguages(),
                               pipeline_xml.Fulltexts(),
                               pipeline_xml.PublicationDate(),
                               pipeline_xml.SciELOPublicationDate(),
                               pipeline_xml.SciELOProcessingDate(),
                               pipeline_xml.Abstract(),
                               pipeline_xml.AffiliationCountry(),
                               pipeline_xml.AffiliationInstitution(),
                               pipeline_xml.Sponsor(),
                               pipeline_xml.Volume(),
                               pipeline_xml.SupplementVolume(),
                               pipeline_xml.Issue(),
                               pipeline_xml.SupplementIssue(),
                               pipeline_xml.ElocationPage(),
                               pipeline_xml.StartPage(),
                               pipeline_xml.EndPage(),
                               pipeline_xml.JournalTitle(),
                               pipeline_xml.IsCitable(),
                               pipeline_xml.Permission(),
                               pipeline_xml.Keywords(),
                               pipeline_xml.JournalISSNs(),
                               pipeline_xml.SubjectAreas(),
                               pipeline_xml.ReceivedCitations(),
                               pipeline_xml.TearDown())

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.
        """
        art_meta = ThriftClient()

        if self.args.delete:
            # Plain delete-by-query mode.
            self.solr.delete(self.args.delete, commit=True)
        elif self.args.sanitization:
            # set of index ids
            ind_ids = set()
            # set of articlemeta ids
            art_ids = set()

            # all ids in index
            list_ids = json.loads(
                self.solr.select({
                    'q': '*:*',
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for id in remove_ids:
                self.solr.delete('id:%s' % id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)
        else:
            # Get article identifiers
            logger.info("Indexing in {0}".format(self.solr.url))
            for document in art_meta.documents(
                    collection=self.args.collection,
                    issn=self.args.issn,
                    from_date=self.format_date(self.args.from_date),
                    until_date=self.format_date(self.args.until_date)):
                try:
                    xml = self.pipeline_to_xml(document)
                    # BUG FIX: the original called pipeline_to_xml(document)
                    # a second time here, running the whole pipeline twice
                    # per document; reuse the XML built above.
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
def setUp(self):
    """Create a fresh Solr client pointed at a dummy endpoint."""
    endpoint = 'http://some.url'
    self.solr = Solr(endpoint)
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.
    """

    def __init__(self, period=None, from_date=None, until_date=None,
                 collection=None, issn=None, delete=False, differential=False,
                 load_indicators=False):
        """Store the run configuration and open the Solr connection.

        ``period`` is a number of days back from now and overrides
        ``from_date`` when given.
        """
        self.delete = delete
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.differential = differential
        self.load_indicators = load_indicators
        self.issn = issn
        self.solr = Solr(SOLR_URL, timeout=10)
        if period:
            self.from_date = datetime.now() - timedelta(days=period)

    def format_date(self, date):
        """
        Convert datetime.datetime to str return: ``2000-05-12``.

        :param date: built-in datetime object (or None)
        :returns: ``YYYY-MM-DD`` string, or None when no date is given
        """
        if not date:
            return None
        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Pipeline to tranform a dictionary to XML format

        :param article: article object to serialize
        :returns: bytes with the ``<add>`` XML document for Solr
        """
        pipeline_itens = [
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Orcid(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas()
        ]

        # Citation indicators are opt-in (expensive to compute).
        if self.load_indicators is True:
            pipeline_itens.append(pipeline_xml.ReceivedCitations())

        pipeline_itens.append(pipeline_xml.TearDown())

        ppl = plumber.Pipeline(*pipeline_itens)

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def differential_mode(self):
        """Synchronize the index against ArticleMeta by id+processing date.

        Only documents whose ``<id>-<processing_date>`` signature differs
        are (re)indexed; stale ids are removed when ``delete`` is set.
        """
        art_meta = ThriftClient()

        logger.info("Running with differential mode")
        ind_ids = set()
        art_ids = set()

        # all ids in search index
        logger.info("Loading Search Index ids.")
        itens_query = []
        if self.collection:
            itens_query.append('in:%s' % self.collection)

        if self.issn:
            itens_query.append('issn:%s' % self.issn)

        query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)
        list_ids = json.loads(self.solr.select(
            {'q': query, 'fl': 'id,scielo_processing_date',
             'rows': 1000000}))['response']['docs']

        for id in list_ids:
            ind_ids.add('%s-%s' % (
                id['id'], id.get('scielo_processing_date', '1900-01-01')))

        # all ids in articlemeta
        logger.info("Loading ArticleMeta ids.")
        for item in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            only_identifiers=True
        ):
            art_ids.add('%s-%s-%s' % (
                item.code, item.collection, item.processing_date))

        # Ids to remove (compare only the first 27 chars: code-collection,
        # ignoring the trailing processing date).
        if self.delete is True:
            logger.info("Running remove records process.")
            remove_ids = set([i[:27] for i in ind_ids]) - \
                set([i[:27] for i in art_ids])
            logger.info("Removing (%d) documents from search index." %
                        len(remove_ids))
            total_to_remove = len(remove_ids)
            if total_to_remove > 0:
                for ndx, to_remove_id in enumerate(remove_ids, 1):
                    logger.debug("Removing (%d/%d): %s" % (
                        ndx, total_to_remove, to_remove_id))
                    self.solr.delete('id:%s' % to_remove_id, commit=False)

        # Ids to include
        logger.info("Running include records process.")
        include_ids = art_ids - ind_ids
        logger.info("Including (%d) documents to search index."
                    % len(include_ids))
        total_to_include = len(include_ids)
        if total_to_include > 0:
            for ndx, to_include_id in enumerate(include_ids, 1):
                logger.debug("Including (%d/%d): %s" % (
                    ndx, total_to_include, to_include_id))
                # Signature layout: <23-char code>-<3-char collection>-<date>.
                # The trailing processing date is not needed to fetch the
                # document (the original stored it with a wrong slice and
                # never used it).
                code = to_include_id[:23]
                collection = to_include_id[24: 27]
                document = art_meta.document(code=code, collection=collection)
                try:
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=False)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

    def common_mode(self):
        """Index every matching document; optionally remove stale ids."""
        art_meta = ThriftClient()

        logger.info("Running without differential mode")
        logger.info("Indexing in {0}".format(self.solr.url))
        for document in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            from_date=self.format_date(self.from_date),
            until_date=self.format_date(self.until_date)
        ):
            logger.debug("Loading document %s" % '_'.join(
                [document.collection_acronym, document.publisher_id]))
            try:
                xml = self.pipeline_to_xml(document)
                self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue

        if self.delete is True:
            logger.info("Running remove records process.")
            ind_ids = set()
            art_ids = set()

            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)
            list_ids = json.loads(self.solr.select(
                {'q': query, 'fl': 'id', 'rows': 1000000}))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(
                collection=self.collection,
                issn=self.issn,
                only_identifiers=True
            ):
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove.
            # BUG FIX: the original read len(remove_ids) before assigning
            # remove_ids (NameError); compute the difference first.
            remove_ids = ind_ids - art_ids
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." %
                        total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (
                    ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)

    def run(self):
        """
        Run the process for update article in Solr.
        """
        if self.differential is True:
            self.differential_mode()
        else:
            self.common_mode()

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
class UpdatePreprint(object):
    """
    Process to get article in Pre-Print Server and index in Solr.
    """

    usage = """\
Process to index Pre-Prints articles to SciELO Solr.
"""

    parser = argparse.ArgumentParser(textwrap.dedent(usage))

    parser.add_argument(
        '-p', '--period',
        type=int,
        help='index articles from specific period, use number of hours.')

    parser.add_argument(
        '-d', '--delete',
        dest='delete',
        help='delete query ex.: q=type:"preprint (Lucene Syntax).')

    parser.add_argument(
        '-solr_url', '--solr_url',
        dest='solr_url',
        help='Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --solr_url to set the solr_url (preferable).')

    parser.add_argument(
        '-oai_url', '--oai_url',
        dest='oai_url',
        default="https://preprints.scielo.org/index.php/scielo/oai",
        help='OAI URL, processing try to get the variable from environment ``OAI_URL`` otherwise use --oai_url to set the oai_url (preferable).')

    parser.add_argument(
        '-v', '--version',
        action='version',
        version='version: 0.1-beta')

    def __init__(self):
        """Resolve endpoints from env/CLI and compute the harvest window."""
        self.args = self.parser.parse_args()
        solr_url = os.environ.get('SOLR_URL')
        oai_url = os.environ.get('OAI_URL')

        if not solr_url and not self.args.solr_url:
            raise argparse.ArgumentTypeError(
                '--solr_url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
            )

        if not oai_url and not self.args.oai_url:
            raise argparse.ArgumentTypeError(
                '--oai_url or ``OAI_URL`` enviroment variable must be the set, use --help.'
            )

        if not solr_url:
            self.solr = Solr(self.args.solr_url, timeout=10)
        else:
            self.solr = Solr(solr_url, timeout=10)

        # BUG FIX: self.from_date was only assigned when --period was given,
        # so run() crashed with AttributeError otherwise.  Default to None
        # and harvest the full repository in that case.
        self.from_date = None
        if self.args.period:
            self.from_date = datetime.now() - timedelta(hours=self.args.period)

    def pipeline_to_xml(self, article):
        """
        Pipeline to tranform a dictionary to XML format

        :param article: OAI record XML to serialize
        :returns: bytes with the ``<add>`` XML document for Solr
        """
        ppl = plumber.Pipeline(pipeline_xml.SetupDocument(),
                               pipeline_xml.DocumentID(),
                               pipeline_xml.URL(),
                               pipeline_xml.DOI(),
                               pipeline_xml.Languages(),
                               pipeline_xml.Fulltexts(),
                               pipeline_xml.PublicationDate(),
                               pipeline_xml.Keywords(),
                               pipeline_xml.Collection(),
                               pipeline_xml.DocumentType(),
                               pipeline_xml.Titles(),
                               pipeline_xml.Abstract(),
                               pipeline_xml.Authors(),
                               pipeline_xml.TearDown())

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update Pre-prints in Solr.
        """
        if self.args.delete:
            self.solr.delete(self.args.delete, commit=True)
        else:
            logger.info("Indexing in {0}".format(self.solr.url))
            sickle = Sickle(self.args.oai_url)
            harvest_params = {'metadataPrefix': 'oai_dc'}
            # Only restrict the harvest window when a period was requested.
            if self.from_date is not None:
                harvest_params['from'] = self.from_date.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
            records = sickle.ListRecords(**harvest_params)

            for record in records:
                try:
                    xml = self.pipeline_to_xml(record.xml)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
def main(settings, *args, **xargs):
    """Find duplicated articles in the index, keep the SCL 'main' copy,
    delete the duplicates and log every decision to a dated CSV file.
    """
    solr = Solr(settings['endpoints']['solr'], timeout=int(settings['request']['timeout']))

    parser = argparse.ArgumentParser(description='Script to handle article duplication on article index')
    parser.add_argument('-d', '--debug', action='store_true', help='execute the script in DEBUG mode (don\'t update the index)')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.1')
    args = parser.parse_args()

    if args.debug:
        log.setLevel(logging.DEBUG)

    log.info('Start find duplication script')

    # set csv file for register duplication articles
    csv_filename = '{0}-{1}.csv'.format(settings['csv']['filename_prefix'],
                                        datetime.now().strftime('%Y-%m-%d'))
    # NOTE(review): 'wb' mode with csv.writer is Python-2 style; the file
    # handle is never closed explicitly — confirm intended.
    csv_file = open(csv_filename, 'wb')
    csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)

    total_duplicated = 0
    offset = 0
    fail_list = []
    while True:
        try:
            duplication_lst = get_duplication_list(solr, offset)
            total_for_process = len(duplication_lst)
            if total_for_process == 0:
                break;
            log.info('Processing {0} duplication entries'.format(total_for_process))
            offset += int(settings['params']['limit_offset'])
            for dup_code in duplication_lst:
                # ignore partial upgrade duplication signature (SOLR-4016)
                if dup_code[0] == '0000000000000000':
                    continue
                process_list = get_duplication_articles(solr, dup_code[0])
                if process_list:
                    main_article = [article['id'] for article in process_list if article['in'][0] == 'scl']
                    # only process if is identified only one main article from SCL collection
                    if len(main_article) == 1:
                        for update_article in process_list:
                            update_id = update_article['id']
                            # if is the main article (SCL colection) update index
                            # otherwise delete article duplication
                            if update_id == main_article[0]:
                                log.info('Updating colection element of article: {0}'.format(update_id))
                                save_csv_entry(csv_writer, update_article, 'updated')
                                if not args.debug:
                                    status = update_main_article(solr, update_id, process_list)
                            else:
                                log.info('Deleting duplicated article: {0}'.format(update_id))
                                save_csv_entry(csv_writer, update_article, 'duplication deleted')
                                if not args.debug:
                                    delete_query = 'id:"{0}"'.format(update_id)
                                    status = solr.delete(delete_query)
                                total_duplicated += 1
                                # NOTE(review): in debug mode `status` may be
                                # unbound here (only assigned inside the
                                # `not args.debug` branches) — confirm.
                                if status != 0:
                                    log.error('Unable to delete article {0}, code:{1}'.format(
                                        update_id, status))
                            # check for udpate solr status (update or delete)
                            if not args.debug and status != 0:
                                log.error('Unable to update article {0}, code:{1}'.format(
                                    update_id, status))
                                fail_list.append(update_id)
                    # skip
                    else:
                        log.debug('Skipping articles due missing main article of SCL collection :{0}'.format(
                            [art['id'].encode('utf-8') for art in process_list])
                        )
                        # save list of ignored articles to csv file
                        for art in process_list:
                            save_csv_entry(csv_writer, art, 'ignored due missing main article')
                # write a empty line for separate next group of duplication articles
                csv_writer.writerow([' '])
        except Exception as e:
            log.critical('Unexpected error: {0}'.format(e))

    # commit at end to avoid offset process gap
    commit(solr, debug=args.debug)

    # script summary
    summary(total_duplicated, fail_list, args.debug)
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.
    """

    def __init__(self, period=None, from_date=None, until_date=None,
                 collection=None, issn=None, delete=False, sanitization=False):
        """Store the run configuration and open the Solr connection.

        ``period`` is a number of days back from now and overrides
        ``from_date`` when given.
        """
        self.delete = delete
        self.sanitization = sanitization
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.issn = issn
        self.solr = Solr(SOLR_URL, timeout=10)
        if period:
            self.from_date = datetime.now() - timedelta(days=period)

    def format_date(self, date):
        """
        Convert datetime.datetime to str return: ``2000-05-12``.

        :param date: built-in datetime object (or None)
        :returns: ``YYYY-MM-DD`` string, or None when no date is given
        """
        if not date:
            return None
        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Pipeline to tranform a dictionary to XML format

        :param article: article object to serialize
        :returns: bytes with the ``<add>`` XML document for Solr
        """
        ppl = plumber.Pipeline(pipeline_xml.SetupDocument(),
                               pipeline_xml.DocumentID(),
                               pipeline_xml.DOI(),
                               pipeline_xml.Collection(),
                               pipeline_xml.DocumentType(),
                               pipeline_xml.URL(),
                               pipeline_xml.Authors(),
                               pipeline_xml.Titles(),
                               pipeline_xml.OriginalTitle(),
                               pipeline_xml.Pages(),
                               pipeline_xml.WOKCI(),
                               pipeline_xml.WOKSC(),
                               pipeline_xml.JournalAbbrevTitle(),
                               pipeline_xml.Languages(),
                               pipeline_xml.AvailableLanguages(),
                               pipeline_xml.Fulltexts(),
                               pipeline_xml.PublicationDate(),
                               pipeline_xml.SciELOPublicationDate(),
                               pipeline_xml.SciELOProcessingDate(),
                               pipeline_xml.Abstract(),
                               pipeline_xml.AffiliationCountry(),
                               pipeline_xml.AffiliationInstitution(),
                               pipeline_xml.Sponsor(),
                               pipeline_xml.Volume(),
                               pipeline_xml.SupplementVolume(),
                               pipeline_xml.Issue(),
                               pipeline_xml.SupplementIssue(),
                               pipeline_xml.ElocationPage(),
                               pipeline_xml.StartPage(),
                               pipeline_xml.EndPage(),
                               pipeline_xml.JournalTitle(),
                               pipeline_xml.IsCitable(),
                               pipeline_xml.Permission(),
                               pipeline_xml.Keywords(),
                               pipeline_xml.JournalISSNs(),
                               pipeline_xml.SubjectAreas(),
                               pipeline_xml.ReceivedCitations(),
                               pipeline_xml.TearDown())

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.
        """
        art_meta = ThriftClient()

        if self.delete:
            self.solr.delete(self.delete, commit=True)
        else:
            logger.info("Indexing in {0}".format(self.solr.url))
            for document in art_meta.documents(
                    collection=self.collection,
                    issn=self.issn,
                    from_date=self.format_date(self.from_date),
                    until_date=self.format_date(self.until_date)):
                logger.debug("Loading document %s" % '_'.join(
                    [document.collection_acronym, document.publisher_id]))
                try:
                    xml = self.pipeline_to_xml(document)
                    # BUG FIX: the original ran pipeline_to_xml(document) a
                    # second time here; reuse the XML built above.
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # NOTE(review): placement reconstructed from collapsed source —
        # sanitization runs after the indexing stage.
        if self.sanitization is True:
            logger.info("Running sanitization process")
            ind_ids = set()
            art_ids = set()

            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = '*:*' if len(itens_query) == 0 else ' AND '.join(
                itens_query)
            list_ids = json.loads(
                self.solr.select({
                    'q': query,
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(collection=self.collection,
                                           issn=self.issn,
                                           only_identifiers=True):
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for id in remove_ids:
                logger.debug("Removing id: %s" % id)
                self.solr.delete('id:%s' % id, commit=True)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.
    """

    usage = """\
Process to index article to SciELO Solr.

This process collects articles in the Article meta using thrift and index
in SciELO Solr.

With this process it is possible to process all the article or some specific
by collection, issn from date to until another date and a period like 7 days.
"""

    parser = argparse.ArgumentParser(textwrap.dedent(usage))

    parser.add_argument('-p', '--period',
                        type=int,
                        help='index articles from specific period, use number of days.')

    parser.add_argument('-f', '--from',
                        dest='from_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles from specific date. YYYY-MM-DD.')

    parser.add_argument('-u', '--until',
                        dest='until_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles until this specific date. YYYY-MM-DD (default today).',
                        default=datetime.now())

    parser.add_argument('-c', '--collection',
                        dest='collection',
                        default=None,
                        help='use the acronym of the collection eg.: spa, scl, col.')

    parser.add_argument('-i', '--issn',
                        dest='issn',
                        default=None,
                        help='journal issn.')

    parser.add_argument('-d', '--delete',
                        dest='delete',
                        default=None,
                        help='delete query ex.: q=*:* (Lucene Syntax).')

    parser.add_argument('-s', '--sanitization',
                        dest='sanitization',
                        default=False,
                        action='store_true',
                        help='Remove objects from the index that are no longer present in the database.')

    parser.add_argument('-url', '--url',
                        dest='solr_url',
                        help='Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --url to set the url(preferable).')

    parser.add_argument('-v', '--version',
                        action='version',
                        version='version: 0.2')

    def __init__(self):
        """Parse the CLI arguments and build the Solr client.

        ``SOLR_URL`` from the environment takes precedence over ``--url``;
        ``--period`` rewrites the parsed ``from_date`` relative to now.
        """
        self.args = self.parser.parse_args()
        solr_url = os.environ.get('SOLR_URL')

        if not solr_url and not self.args.solr_url:
            raise argparse.ArgumentTypeError('--url or ``SOLR_URL`` enviroment variable must be the set, use --help.')

        if not solr_url:
            self.solr = Solr(self.args.solr_url, timeout=10)
        else:
            self.solr = Solr(solr_url, timeout=10)

        if self.args.period:
            self.args.from_date = datetime.now() - timedelta(days=self.args.period)

    def format_date(self, date):
        """
        Convert datetime.datetime to str return: ``2000-05-12``.

        :param date: built-in datetime object (or None)
        :returns: ``YYYY-MM-DD`` string, or None when no date is given
        """
        if not date:
            return None
        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Pipeline to tranform a dictionary to XML format

        :param article: article object to serialize
        :returns: bytes with the ``<add>`` XML document for Solr
        """
        ppl = plumber.Pipeline(
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas(),
            pipeline_xml.ReceivedCitations(),
            pipeline_xml.TearDown()
        )

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.
        """
        art_meta = ThriftClient()

        if self.args.delete:
            # Plain delete-by-query mode.
            self.solr.delete(self.args.delete, commit=True)
        elif self.args.sanitization:
            # set of index ids
            ind_ids = set()
            # set of articlemeta ids
            art_ids = set()

            # all ids in index
            list_ids = json.loads(self.solr.select(
                {'q': '*:*', 'fl': 'id', 'rows': 1000000}))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for id in remove_ids:
                self.solr.delete('id:%s' % id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)
        else:
            # Get article identifiers
            logger.info("Indexing in {0}".format(self.solr.url))
            for document in art_meta.documents(
                collection=self.args.collection,
                issn=self.args.issn,
                from_date=self.format_date(self.args.from_date),
                until_date=self.format_date(self.args.until_date)
            ):
                try:
                    xml = self.pipeline_to_xml(document)
                    # BUG FIX: the original invoked pipeline_to_xml(document)
                    # twice per document; reuse the XML built above.
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()