Esempio n. 1
0
class TestSolr(unittest.TestCase):
    """Checks that ``Solr.select`` returns the text served by the patched
    ``SolrAPI.requests.get``."""

    def setUp(self):
        self.solr = Solr('http://some.url')

    def tearDown(self):
        pass

    @staticmethod
    def _serve(mock_requests, body):
        """Make the patched ``requests.get`` answer 200 with ``body`` as text."""
        reply = Mock()
        reply.text = body
        reply.status_code = 200
        mock_requests.get.return_value = reply

    @patch('SolrAPI.requests')
    def test_select_method(self, mock_requests):
        body = '{"responseHeader":{"status":0,"QTime":1,"params":{"q":"pickles","wt":"json"}},"{response": {"numFound": 1, "start": 0,"docs": []}}'
        self._serve(mock_requests, body)

        outcome = self.solr.select(params={'q': 'pickles'})

        self.assertEqual(outcome, body)

    @patch('SolrAPI.requests')
    def test_select_method_without_params(self, mock_requests):
        body = '{"responseHeader":{"status":0,"QTime":1,"params":{"wt":"json"}},"response":{"numFound":0,"start":0,"docs":[]}}}'
        self._serve(mock_requests, body)

        outcome = self.solr.select({})

        self.assertEqual(outcome, body)

    @patch('SolrAPI.requests')
    def test_select_method_change_return_format(self, mock_requests):
        body = '<?xml version="1.0" encoding="UTF-8"?><response><lst name="responseHeader"><int name="status">0</int><int name="QTime">1</int><lst name="params"><str name="q">pickles</str<str name="wt">xml</str></lst></lst><result name="response" numFound="0" start="0"></result></lst></response>'
        self._serve(mock_requests, body)

        outcome = self.solr.select({'q': 'pickles'}, format='xml')

        self.assertEqual(outcome, body)
Esempio n. 2
0
 def __init__(self, period=None, from_date=None, until_date=None,
              collection=None, issn=None, delete=False, differential=False,
              load_indicators=False):
     """
     Store the run options and create the Solr client.

     A truthy ``period`` (number of days) replaces ``from_date`` with
     "now minus ``period`` days".
     """
     self.collection, self.issn = collection, issn
     self.until_date = until_date
     self.delete, self.differential = delete, differential
     self.load_indicators = load_indicators
     self.solr = Solr(SOLR_URL, timeout=10)
     # ``period`` takes precedence over an explicit ``from_date``.
     self.from_date = (
         datetime.now() - timedelta(days=period) if period else from_date
     )
Esempio n. 3
0
    def __init__(self):
        """
        Parse the command line, create the Solr client and resolve ``--period``.

        The ``SOLR_URL`` environment variable is used when set; otherwise the
        parsed ``solr_url`` argument is used. A truthy ``period`` overwrites
        ``args.from_date`` with "now minus ``period`` days".

        :raises argparse.ArgumentTypeError: when neither source gives a URL.
        """
        self.args = self.parser.parse_args()

        chosen_url = os.environ.get('SOLR_URL') or self.args.solr_url

        if not chosen_url:
            raise argparse.ArgumentTypeError('--url or ``SOLR_URL`` enviroment variable must be the set, use --help.')

        self.solr = Solr(chosen_url, timeout=10)

        days = self.args.period
        if days:
            self.args.from_date = datetime.now() - timedelta(days=days)
Esempio n. 4
0
 def __init__(self,
              period=None,
              from_date=None,
              until_date=None,
              collection=None,
              issn=None,
              delete=False,
              sanitization=False):
     """
     Store the run options and create the Solr client.

     A truthy ``period`` (number of days) replaces ``from_date`` with
     "now minus ``period`` days".
     """
     self.collection, self.issn = collection, issn
     self.until_date = until_date
     self.delete, self.sanitization = delete, sanitization
     self.solr = Solr(SOLR_URL, timeout=10)
     # ``period`` takes precedence over an explicit ``from_date``.
     self.from_date = (
         datetime.now() - timedelta(days=period) if period else from_date
     )
Esempio n. 5
0
    def __init__(self):
        """
        Read the command-line arguments and set up the Solr client.

        The URL comes from the ``SOLR_URL`` environment variable when it is
        set, and from the parsed ``solr_url`` argument otherwise. When
        ``period`` is truthy, ``args.from_date`` becomes "now minus
        ``period`` days".

        :raises argparse.ArgumentTypeError: when no Solr URL is available.
        """
        self.args = self.parser.parse_args()

        url_from_env = os.environ.get('SOLR_URL')
        target_url = url_from_env if url_from_env else self.args.solr_url

        if not target_url:
            message = '--url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
            raise argparse.ArgumentTypeError(message)

        self.solr = Solr(target_url, timeout=10)

        if self.args.period:
            window = timedelta(days=self.args.period)
            self.args.from_date = datetime.now() - window
Esempio n. 6
0
def _send_batch(solr, batch):
    """Post one batch of documents to Solr as JSON and commit it."""
    print("Sending...")
    solr.update(json.dumps(batch), headers={'Content-Type': 'text/json'})
    print("Commiting...")
    solr.commit()


def main():
    """
    Rebuild the index from ``DATA_FILE``.

    Every existing document is deleted first. Each input line then becomes a
    document whose ``id`` is columns 0-11 and whose ``text`` is columns
    536-544 of the line. Documents are sent in batches of 10000, each batch
    followed by a commit, and the index is optimized at the end.
    """
    batch_size = 10000

    solr = Solr('http://localhost:8080/solr/nem', timeout=30)

    solr.delete('*:*')
    data_list = []

    # The context manager guarantees the input file is closed, even when a
    # request to Solr raises.
    with open(DATA_FILE) as data_file:
        for count, line in enumerate(data_file):

            data_list.append({'id': line[0:12], 'text': line[536:545]})
            print(count)

            if len(data_list) == batch_size:
                _send_batch(solr, data_list)
                # Start a fresh batch
                data_list = []

    # Send the trailing partial batch; without this the last
    # (line count % batch_size) documents would never be indexed.
    if data_list:
        _send_batch(solr, data_list)

    solr.optimize()
Esempio n. 7
0
 def __init__(self, collection=None, issn=None):
     """Keep the optional collection/issn filters and create the Solr client."""
     self.collection, self.issn = collection, issn
     self.solr = Solr(SOLR_URL, timeout=10)
Esempio n. 8
0
class UpdateSearch(object):
    """
    Record per-document access totals in the Solr index.
    """
    def __init__(self, collection=None, issn=None):
        """Keep the optional collection/issn filters and create the Solr client."""
        self.collection, self.issn = collection, issn
        self.solr = Solr(SOLR_URL, timeout=10)

    def set_accesses(self, document_id, accesses):
        """
        Build the ``<add>`` payload that sets ``total_access`` on one document.

        :param document_id: value written to the ``id`` field.
        :param accesses: value written (via ``str``) to ``total_access``.

        :returns: the serialized XML as produced by ``ET.tostring``.
        """
        root = ET.Element('add')
        doc_node = ET.SubElement(root, 'doc')

        id_field = ET.SubElement(doc_node, 'field')
        id_field.set('name', 'id')
        id_field.text = document_id

        access_field = ET.SubElement(doc_node, 'field')
        access_field.set('name', 'total_access')
        # ``update="set"`` marks the field as a value replacement.
        access_field.set('update', 'set')
        access_field.text = str(accesses)

        return ET.tostring(root, encoding="utf-8", method="xml")

    def run(self):
        """
        Push the access total of every already-indexed document to Solr.

        Documents yielded by the article meta client that are not present in
        the index are skipped. Updates are sent uncommitted; one commit and
        an optimize happen at the end.
        """

        art_meta = ArticleMetaThriftClient()
        art_accesses = AccessThriftClient(domain="ratchet.scielo.org:11660")

        logger.info("Loading Solr available document ids")

        filters = []
        if self.collection:
            filters.append('in:%s' % self.collection)
        if self.issn:
            filters.append('issn:%s' % self.issn)
        query = ' AND '.join(filters) if filters else '*:*'

        raw_response = self.solr.select({
            'q': query,
            'fl': 'id',
            'rows': 1000000
        })
        indexed_docs = json.loads(raw_response)['response']['docs']
        available_ids = {entry['id'] for entry in indexed_docs}

        logger.info("Recording accesses for documents in {0}".format(
            self.solr.url))

        documents = art_meta.documents(collection=self.collection,
                                       issn=self.issn)

        for document in documents:

            solr_id = '-'.join(
                [document.publisher_id, document.collection_acronym])

            if solr_id not in available_ids:
                continue

            logger.debug("Loading accesses for document %s" % solr_id)

            stats = art_accesses.document(document.publisher_id,
                                          document.collection_acronym)
            # Fall back to zero when the service reports no total.
            total_accesses = int(
                stats.get('access_total', {'value': 0})['value'])

            xml = self.set_accesses(solr_id, total_accesses)

            # A failed update is logged and the loop moves on.
            try:
                self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
def main(settings, *args, **xargs):
    """
    Index articles into Solr, page by page, for a date window.

    The window comes from the command line: ``--from`` / ``--until`` set the
    bounds (both default to now) and ``--period`` moves the lower bound back
    by that many days. Identifiers are fetched with ``get_identifiers`` in
    pages of ``settings['params']['limit_offset']``; every article's Solr
    XML is downloaded and, unless ``--debug`` is given, sent to Solr. A
    commit is issued after each page and a summary at the end.

    :param settings: nested mapping read for ``endpoints.solr``,
        ``endpoints.article``, ``request.timeout`` and
        ``params.limit_offset``.
    """

    solr = Solr(settings['endpoints']['solr'], timeout=int(settings['request']['timeout']))

    from_date = datetime.now()
    until_date = datetime.now()

    parser = argparse.ArgumentParser(description='Script to update Solr')

    parser.add_argument('-p', '--period',
                        type=int,
                        help='index articles from specific period, use number of days.')

    parser.add_argument('-f', '--from',
                        dest='from_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles from specific date. YYYY-MM-DD')

    parser.add_argument('-u', '--until',
                        dest='until_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles until this specific date. YYYY-MM-DD (default today)',
                        default=datetime.now())

    parser.add_argument('-c', '--collection',
                        dest='collection',
                        default=None,
                        help='use the acronym of the collection eg.: spa, scl, col.')

    parser.add_argument('-d', '--debug',
                        action='store_true',
                        help='execute the script in DEBUG mode (don\'t update the index)')

    parser.add_argument('-v', '--version',
                        action='version',
                        version='%(prog)s 0.1')

    args = parser.parse_args()

    if args.from_date:
        from_date = args.from_date

    if args.until_date:
        until_date = args.until_date

    if args.period:
        from_date -= timedelta(days=args.period)

    from_date = from_date.strftime("%Y-%m-%d")
    until_date = until_date.strftime("%Y-%m-%d")

    if args.debug:
        log.setLevel(logging.DEBUG)

    log.info('Start update solr index script with params from={0} and until={1}'.format(
        from_date, until_date))

    total = 0
    offset = 0
    fail_list = []
    sum_processed = 0

    # A failure raised before ``offset`` advances makes the next iteration
    # repeat the identical request. Allow a few retries for transient
    # errors, then stop instead of looping forever.
    max_stalled_attempts = 3
    stalled_attempts = 0

    while True:
        offset_before = offset
        try:
            total, article_lst = get_identifiers(from_date, until_date,
                args.collection, offset)

            if len(article_lst) == 0:
                break

            sum_processed += len(article_lst)

            log.info('Indexing {0} of {1} articles'.format(sum_processed, total))

            # Advance first: if an article below fails, the rest of this
            # page is abandoned and the loop carries on with the next page.
            offset += int(settings['params']['limit_offset'])

            for article in article_lst:

                article_code = str(article['code'])

                code_url = '{0}?code={1}&format=xmliahx'.format(
                    settings['endpoints']['article'], article_code)

                log.debug('URL used for retrieve solr xml of article {0}'.format(code_url))

                solr_xml = _fetch_data(code_url).text

                log.info('Indexing article {0}'.format(article_code))

                if not args.debug:
                    status = solr.update(solr_xml)

                    if status != 0:
                        log.error('Unable to index article {0}, code:{1}'.format(
                            article_code, status))
                        fail_list.append(article_code)

            # commit on any offset cycle
            commit(solr, debug=args.debug)

        except Exception as e:
            log.critical('Unexpected error: {0}'.format(e))

            if offset != offset_before:
                # Progress was made; the next iteration asks for a new page.
                stalled_attempts = 0
                continue

            stalled_attempts += 1
            if stalled_attempts >= max_stalled_attempts:
                log.critical('Giving up after {0} consecutive failures at offset {1}'.format(
                    stalled_attempts, offset))
                break
        else:
            stalled_attempts = 0

    summary(total, fail_list, args.debug)
Esempio n. 10
0
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.

    The command line is parsed in ``__init__``; ``run`` then performs one of
    three actions: delete by query, sanitization (remove index entries that
    are gone from the article meta), or indexing.
    """

    usage = """\
    Process to index article to SciELO Solr.

    This process collects articles in the Article meta using thrift and index
    in SciELO Solr.

    With this process it is possible to process all the article or some specific
    by collection, issn from date to until another date and a period like 7 days.
    """

    parser = argparse.ArgumentParser(textwrap.dedent(usage))

    parser.add_argument(
        '-p',
        '--period',
        type=int,
        help='index articles from specific period, use number of days.')

    parser.add_argument('-f',
                        '--from',
                        dest='from_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles from specific date. YYYY-MM-DD.')

    # NOTE: this default is evaluated once, when the class body executes.
    parser.add_argument(
        '-u',
        '--until',
        dest='until_date',
        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
        nargs='?',
        help=
        'index articles until this specific date. YYYY-MM-DD (default today).',
        default=datetime.now())

    parser.add_argument(
        '-c',
        '--collection',
        dest='collection',
        default=None,
        help='use the acronym of the collection eg.: spa, scl, col.')

    parser.add_argument('-i',
                        '--issn',
                        dest='issn',
                        default=None,
                        help='journal issn.')

    parser.add_argument('-d',
                        '--delete',
                        dest='delete',
                        default=None,
                        help='delete query ex.: q=*:* (Lucene Syntax).')

    parser.add_argument(
        '-s',
        '--sanitization',
        dest='sanitization',
        default=False,
        action='store_true',
        help=
        'Remove objects from the index that are no longer present in the database.'
    )

    parser.add_argument(
        '-url',
        '--url',
        dest='solr_url',
        help=
        'Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --url to set the url(preferable).'
    )

    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='version: 0.2')

    def __init__(self):
        """
        Parse the command line and create the Solr client.

        The ``SOLR_URL`` environment variable is used when set; otherwise the
        ``--url`` argument is used. A truthy ``--period`` overwrites
        ``args.from_date`` with "now minus ``period`` days".

        :raises argparse.ArgumentTypeError: when no Solr URL is available.
        """

        self.args = self.parser.parse_args()

        solr_url = os.environ.get('SOLR_URL') or self.args.solr_url

        if not solr_url:
            raise argparse.ArgumentTypeError(
                '--url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
            )

        self.solr = Solr(solr_url, timeout=10)

        if self.args.period:
            self.args.from_date = datetime.now() - timedelta(
                days=self.args.period)

    def format_date(self, date):
        """
        Convert datetime.datetime to str return: ``2000-05-12``.

        :param date: built-in datetime object, or a falsy value.

        :returns: str, or ``None`` when ``date`` is falsy.
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run one article through the XML pipeline and wrap the result in an
        ``<add>`` root element.

        :param article: the article handed to the pipeline.

        :returns: the serialized XML as produced by ``ET.tostring``.
        """

        ppl = plumber.Pipeline(
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas(),
            pipeline_xml.ReceivedCitations(),
            pipeline_xml.TearDown(),
        )

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.

        ``--delete`` deletes by the given query. ``--sanitization`` removes
        index ids that the article meta no longer lists for the allowed
        collections. Otherwise every document matching the collection, issn
        and date filters is indexed. The index is always committed and
        optimized at the end.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # all ids in index
            indexed_docs = json.loads(
                self.solr.select({
                    'q': '*:*',
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            ind_ids = set(entry['id'] for entry in indexed_docs)

            # all ids in articlemeta
            art_ids = set()

            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for remove_id in remove_ids:
                self.solr.delete('id:%s' % remove_id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            # Get article identifiers

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                    collection=self.args.collection,
                    issn=self.args.issn,
                    from_date=self.format_date(self.args.from_date),
                    until_date=self.format_date(self.args.until_date)):

                # A failing document is logged and skipped.
                try:
                    # Build the payload once and send that same payload.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Esempio n. 11
0
 def setUp(self):
     """Give every test a Solr client bound to a placeholder URL."""
     client = Solr('http://some.url')
     self.solr = client
Esempio n. 12
0
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.

    ``run`` works in one of two modes: differential (index only documents
    whose id/processing date is missing from the index) or common (index
    every document matching the filters). Both can also remove index entries
    that the article meta no longer lists.
    """

    def __init__(self, period=None, from_date=None, until_date=None,
                 collection=None, issn=None, delete=False, differential=False,
                 load_indicators=False):
        """
        Store the run options and create the Solr client.

        A truthy ``period`` (number of days) replaces ``from_date`` with
        "now minus ``period`` days".
        """
        self.delete = delete
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.differential = differential
        self.load_indicators = load_indicators
        self.issn = issn
        self.solr = Solr(SOLR_URL, timeout=10)
        if period:
            self.from_date = datetime.now() - timedelta(days=period)

    def format_date(self, date):
        """
        Convert datetime.datetime to str return: ``2000-05-12``.

        :param date: built-in datetime object, or a falsy value.

        :returns: str, or ``None`` when ``date`` is falsy.
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run one article through the XML pipeline and wrap the result in an
        ``<add>`` root element.

        ``ReceivedCitations`` is part of the pipeline only when
        ``self.load_indicators`` is ``True``.

        :param article: the article handed to the pipeline.

        :returns: the serialized XML as produced by ``ET.tostring``.
        """

        pipeline_itens = [
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Orcid(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas()
        ]

        if self.load_indicators is True:
            pipeline_itens.append(pipeline_xml.ReceivedCitations())

        pipeline_itens.append(pipeline_xml.TearDown())

        ppl = plumber.Pipeline(*pipeline_itens)

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def _build_query(self):
        """Return the Solr query for the configured collection/issn filters."""
        itens_query = []
        if self.collection:
            itens_query.append('in:%s' % self.collection)

        if self.issn:
            itens_query.append('issn:%s' % self.issn)

        return ' AND '.join(itens_query) if itens_query else '*:*'

    def _index_document(self, document):
        """Convert one document to XML and send it uncommitted; log failures."""
        try:
            xml = self.pipeline_to_xml(document)
            self.solr.update(xml, commit=False)
        except ValueError as e:
            logger.error("ValueError: {0}".format(e))
            logger.exception(e)
        except Exception as e:
            logger.error("Error: {0}".format(e))
            logger.exception(e)

    def differential_mode(self):
        """
        Index only what differs between the article meta and the index.

        Both sides are reduced to ``<id>-<processing date>`` keys. Keys that
        exist only in the article meta are (re)indexed. When ``self.delete``
        is ``True``, ids that exist only in the index are deleted.
        """
        art_meta = ThriftClient()

        logger.info("Running with differential mode")
        ind_ids = set()
        art_ids = set()

        # all ids in search index
        logger.info("Loading Search Index ids.")
        list_ids = json.loads(self.solr.select(
            {'q': self._build_query(), 'fl': 'id,scielo_processing_date', 'rows': 1000000}))['response']['docs']

        for entry in list_ids:
            ind_ids.add('%s-%s' % (entry['id'], entry.get('scielo_processing_date', '1900-01-01')))

        # all ids in articlemeta
        logger.info("Loading ArticleMeta ids.")
        for item in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            only_identifiers=True
        ):
            art_ids.add('%s-%s-%s' % (item.code, item.collection, item.processing_date))

        # Ids to remove
        if self.delete is True:
            logger.info("Running remove records process.")
            # The first 27 characters are taken as the Solr id; this assumes
            # a 23-character code, a dash and a 3-character collection
            # acronym -- TODO confirm against the identifier format.
            remove_ids = set([i[:27] for i in ind_ids]) - set([i[:27] for i in art_ids])
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." % total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)

        # Ids to include
        logger.info("Running include records process.")
        include_ids = art_ids - ind_ids
        total_to_include = len(include_ids)
        logger.info("Including (%d) documents to search index." % total_to_include)
        for ndx, to_include_id in enumerate(include_ids, 1):
            logger.debug("Including (%d/%d): %s" % (ndx, total_to_include, to_include_id))
            # Same fixed-width layout assumption as for the removals above.
            code = to_include_id[:23]
            collection = to_include_id[24: 27]
            document = art_meta.document(code=code, collection=collection)
            self._index_document(document)

    def common_mode(self):
        """
        Index every document matching the collection, issn and date filters.

        When ``self.delete`` is ``True``, ids present in the index but absent
        from the article meta (for the same collection/issn filters) are
        deleted afterwards.
        """
        art_meta = ThriftClient()

        logger.info("Running without differential mode")
        logger.info("Indexing in {0}".format(self.solr.url))
        for document in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            from_date=self.format_date(self.from_date),
            until_date=self.format_date(self.until_date)
        ):

            logger.debug("Loading document %s" % '_'.join([document.collection_acronym, document.publisher_id]))

            self._index_document(document)

        if self.delete is True:
            logger.info("Running remove records process.")
            ind_ids = set()
            art_ids = set()

            list_ids = json.loads(self.solr.select(
                {'q': self._build_query(), 'fl': 'id', 'rows': 1000000}))['response']['docs']

            for entry in list_ids:
                ind_ids.add(entry['id'])

            # all ids in articlemeta
            for item in art_meta.documents(
                collection=self.collection,
                issn=self.issn,
                only_identifiers=True
            ):
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove. The set must be computed before it is measured
            # and logged.
            remove_ids = ind_ids - art_ids
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." % total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)

    def run(self):
        """
        Run the process for update article in Solr.

        Dispatches to ``differential_mode`` or ``common_mode`` and then
        commits and optimizes the index.
        """
        if self.differential is True:
            self.differential_mode()
        else:
            self.common_mode()

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Esempio n. 13
0
class UpdatePreprint(object):
    """
    Process to get article in Pre-Print Server and index in Solr.

    The command line is parsed in ``__init__``; ``run`` then either deletes
    by query or harvests records over OAI and indexes them.
    """

    usage = """\
    Process to index Pre-Prints articles to SciELO Solr.
    """

    parser = argparse.ArgumentParser(textwrap.dedent(usage))

    parser.add_argument(
        '-p',
        '--period',
        type=int,
        help='index articles from specific period, use number of hours.')

    parser.add_argument(
        '-d',
        '--delete',
        dest='delete',
        help='delete query ex.: q=type:"preprint (Lucene Syntax).')

    parser.add_argument(
        '-solr_url',
        '--solr_url',
        dest='solr_url',
        help=
        'Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --solr_url to set the solr_url (preferable).'
    )

    parser.add_argument(
        '-oai_url',
        '--oai_url',
        dest='oai_url',
        default="https://preprints.scielo.org/index.php/scielo/oai",
        help=
        'OAI URL, processing try to get the variable from environment ``OAI_URL`` otherwise use --oai_url to set the oai_url (preferable).'
    )

    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version='version: 0.1-beta')

    def __init__(self):
        """
        Parse the command line, resolve both URLs and create the Solr client.

        For each URL the environment variable (``SOLR_URL`` / ``OAI_URL``)
        is used when set; otherwise the command-line value is used.
        ``self.from_date`` is "now minus ``--period`` hours", or ``None``
        when no period was given.

        :raises argparse.ArgumentTypeError: when a URL cannot be resolved.
        """

        self.args = self.parser.parse_args()

        solr_url = os.environ.get('SOLR_URL') or self.args.solr_url
        oai_url = os.environ.get('OAI_URL') or self.args.oai_url

        if not solr_url:
            raise argparse.ArgumentTypeError(
                '--solr_url or ``SOLR_URL`` enviroment variable must be the set, use --help.'
            )

        if not oai_url:
            raise argparse.ArgumentTypeError(
                '--oai_url or ``OAI_URL`` enviroment variable must be the set, use --help.'
            )

        self.solr = Solr(solr_url, timeout=10)

        # Keep the resolved endpoint so ``run`` honours ``OAI_URL`` the same
        # way ``SOLR_URL`` is honoured.
        self.oai_url = oai_url

        # Always define the attribute; ``run`` reads it even without --period.
        self.from_date = None

        if self.args.period:
            self.from_date = datetime.now() - timedelta(hours=self.args.period)

    def pipeline_to_xml(self, article):
        """
        Run one record through the XML pipeline and wrap the result in an
        ``<add>`` root element.

        :param article: the record handed to the pipeline.

        :returns: the serialized XML as produced by ``ET.tostring``.
        """

        ppl = plumber.Pipeline(
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.URL(),
            pipeline_xml.DOI(),
            pipeline_xml.Languages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.Keywords(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.Titles(),
            pipeline_xml.Abstract(),
            pipeline_xml.Authors(),
            pipeline_xml.TearDown(),
        )

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update Pre-prints in Solr.

        With ``--delete`` the given query is deleted. Otherwise records are
        listed from the OAI endpoint (restricted by ``from`` only when a
        period was given) and indexed one by one, each with its own commit.
        The index is committed and optimized at the end in both cases.
        """

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        else:

            logger.info("Indexing in {0}".format(self.solr.url))

            sickle = Sickle(self.oai_url)

            list_params = {'metadataPrefix': 'oai_dc'}

            # Only restrict the harvest when a period was requested.
            # NOTE(review): ``from_date`` comes from ``datetime.now()``
            # (local time) but is formatted with a ``Z`` suffix -- confirm
            # whether UTC is expected.
            if self.from_date is not None:
                list_params['from'] = self.from_date.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")

            records = sickle.ListRecords(**list_params)

            for record in records:
                # A failing record is logged and skipped.
                try:
                    xml = self.pipeline_to_xml(record.xml)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Esempio n. 14
0
def main(settings, *args, **xargs):
    """
    Find duplicated articles in the Solr index and resolve each group.

    Duplication signatures are fetched page by page with
    ``get_duplication_list``.  For every signature the matching articles are
    loaded with ``get_duplication_articles``; when exactly one of them has
    ``'scl'`` as its first ``in`` value, that article is passed to
    ``update_main_article`` and the remaining ones are deleted from the
    index.  Groups without exactly one such article are only reported.
    Every decision is written to a dated CSV file.

    With ``--debug`` on the command line the update and delete requests are
    skipped; the actions are still logged and written to the CSV file.

    :param settings: nested mapping; the keys read here are
        ``endpoints.solr``, ``request.timeout``, ``csv.filename_prefix``
        and ``params.limit_offset``.
    :param args: not used; options come from the command line.
    :param xargs: not used.
    """

    solr = Solr(settings['endpoints']['solr'], timeout=int(settings['request']['timeout']))

    parser = argparse.ArgumentParser(description='Script to handle article duplication on article index')

    parser.add_argument('-d', '--debug',
                        action='store_true',
                        help='execute the script in DEBUG mode (don\'t update the index)')

    parser.add_argument('-v', '--version',
                        action='version',
                        version='%(prog)s 0.1')

    # Kept under its own name so the ``*args`` parameter is not overwritten.
    options = parser.parse_args()

    if options.debug:
        log.setLevel(logging.DEBUG)

    log.info('Start find duplication script')

    # CSV file that registers what was done with each duplicated article
    csv_filename = '{0}-{1}.csv'.format(settings['csv']['filename_prefix'],
                                        datetime.now().strftime('%Y-%m-%d'))

    total_duplicated = 0
    offset = 0
    fail_list = []

    # How many times in a row the same page may fail to load before giving up.
    max_stalled_attempts = 3
    stalled_attempts = 0

    # NOTE(review): binary mode is what the Python 2 csv module expects; a
    # Python 3 run would need text mode with newline='' -- confirm the target
    # interpreter before changing it.
    with open(csv_filename, 'wb') as csv_file:
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)

        while True:
            offset_before = offset

            try:
                duplication_lst = get_duplication_list(solr, offset)
                total_for_process = len(duplication_lst)

                if total_for_process == 0:
                    break

                log.info('Processing {0} duplication entries'.format(total_for_process))

                offset += int(settings['params']['limit_offset'])
                stalled_attempts = 0

                for dup_code in duplication_lst:

                    # ignore partial upgrade duplication signature (SOLR-4016)
                    if dup_code[0] == '0000000000000000':
                        continue

                    process_list = get_duplication_articles(solr, dup_code[0])

                    if process_list:
                        main_article = [article['id'] for article in process_list
                                        if article['in'][0] == 'scl']

                        # only process when exactly one main article from the
                        # SCL collection is identified
                        if len(main_article) == 1:

                            for update_article in process_list:
                                update_id = update_article['id']
                                # the main article (SCL collection) is updated,
                                # every other article of the group is deleted
                                if update_id == main_article[0]:
                                    log.info('Updating colection element of article: {0}'.format(update_id))
                                    save_csv_entry(csv_writer, update_article, 'updated')

                                    if not options.debug:
                                        status = update_main_article(solr, update_id, process_list)

                                else:
                                    log.info('Deleting duplicated article: {0}'.format(update_id))
                                    save_csv_entry(csv_writer, update_article, 'duplication deleted')

                                    if not options.debug:
                                        delete_query = 'id:"{0}"'.format(update_id)
                                        status = solr.delete(delete_query)
                                        # counted per delete request, before
                                        # the returned status is inspected
                                        total_duplicated += 1
                                        if status != 0:
                                            log.error('Unable to delete article {0}, code:{1}'.format(
                                                update_id, status))

                                # check the solr status of the update or delete
                                if not options.debug and status != 0:
                                    log.error('Unable to update article {0}, code:{1}'.format(
                                        update_id, status))
                                    fail_list.append(update_id)

                        # skip
                        else:
                            log.debug('Skipping articles due missing main article of SCL collection :{0}'.format(
                                [art['id'].encode('utf-8') for art in process_list]))

                            # save list of ignored articles to csv file
                            for art in process_list:
                                save_csv_entry(csv_writer, art, 'ignored due missing main article')

                    # write an empty line to separate the next group of
                    # duplicated articles
                    csv_writer.writerow([' '])

            except Exception as e:
                log.critical('Unexpected error: {0}'.format(e))

                # The offset only moves once a page has been fetched.  If it
                # did not move, the next iteration repeats the very same
                # request, so a persistent failure would never leave the loop.
                if offset == offset_before:
                    stalled_attempts += 1
                    if stalled_attempts >= max_stalled_attempts:
                        log.critical('Giving up after {0} failed attempts at offset {1}'.format(
                            stalled_attempts, offset))
                        break

    # commit at end to avoid offset process gap
    commit(solr, debug=options.debug)
    # script summary
    summary(total_duplicated, fail_list, options.debug)
Esempio n. 15
0
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.
    """
    def __init__(self,
                 period=None,
                 from_date=None,
                 until_date=None,
                 collection=None,
                 issn=None,
                 delete=False,
                 sanitization=False):
        """
        Store the selection criteria and open the Solr client.

        :param period: number of days; when given, ``from_date`` is replaced
            by "now minus ``period`` days".
        :param from_date: datetime used as lower bound of the selection.
        :param until_date: datetime used as upper bound of the selection.
        :param collection: collection value used to filter the documents.
        :param issn: journal ISSN used to filter the documents.
        :param delete: delete query; when set, ``run`` sends it to Solr
            instead of indexing.
        :param sanitization: when ``True``, ``run`` also removes from the
            index the ids that article meta no longer lists.
        """
        self.delete = delete
        self.sanitization = sanitization
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.issn = issn
        self.solr = Solr(SOLR_URL, timeout=10)
        if period:
            # a period always wins over an explicit from_date
            self.from_date = datetime.now() - timedelta(days=period)

    def format_date(self, date):
        """
        Convert a datetime.datetime to str, e.g. ``2000-05-12``.

        :param date: built-in datetime object, or a false value

        :returns: str, or ``None`` when ``date`` is false
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run ``article`` through the XML pipeline and wrap the result.

        :param article: the single item fed to the pipeline.

        :returns: the serialized ``<add>`` element holding every element
            produced by the pipeline.
        """

        ppl = plumber.Pipeline(
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas(),
            pipeline_xml.ReceivedCitations(),
            pipeline_xml.TearDown()
        )

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.

        Either sends the delete query or indexes the selected documents,
        then optionally sanitizes the index, and finally commits and
        optimizes it.
        """

        art_meta = ThriftClient()

        if self.delete:

            self.solr.delete(self.delete, commit=True)
        else:

            logger.info("Indexing in {0}".format(self.solr.url))
            for document in art_meta.documents(
                    collection=self.collection,
                    issn=self.issn,
                    from_date=self.format_date(self.from_date),
                    until_date=self.format_date(self.until_date)):

                logger.debug("Loading document %s" % '_'.join(
                    [document.collection_acronym, document.publisher_id]))

                try:
                    # Build the XML once and send that same payload; the
                    # pipeline must not run twice for one document.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        if self.sanitization is True:
            logger.info("Running sanitization process")

            # restrict the index query to the same collection/issn, if any
            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = ' AND '.join(itens_query) if itens_query else '*:*'

            indexed_docs = json.loads(
                self.solr.select({
                    'q': query,
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            # all ids in the index
            ind_ids = set(doc['id'] for doc in indexed_docs)

            # all ids in articlemeta
            art_ids = set()
            for item in art_meta.documents(collection=self.collection,
                                           issn=self.issn,
                                           only_identifiers=True):
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove: indexed but no longer listed by articlemeta
            remove_ids = ind_ids - art_ids
            for doc_id in remove_ids:
                logger.debug("Removing id: %s" % doc_id)
                self.solr.delete('id:%s' % doc_id, commit=True)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Esempio n. 16
0
class UpdateSearch(object):
    """
    Process to get article in article meta and index in Solr.

    The command line options are declared once, at class level, in
    ``parser``; ``__init__`` parses them and ``run`` acts on them.
    """

    usage = """\
    Process to index article to SciELO Solr.

    This process collects articles in the Article meta using thrift and index
    in SciELO Solr.

    With this process it is possible to process all the article or some specific
    by collection, issn from date to until another date and a period like 7 days.
    """

    # The text is passed as ``description``: the first positional argument
    # of ArgumentParser is ``prog``, which would turn the whole text into
    # the program name shown in the usage line.
    parser = argparse.ArgumentParser(
        description=textwrap.dedent(usage),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('-p', '--period',
                        type=int,
                        help='index articles from specific period, use number of days.')

    parser.add_argument('-f', '--from',
                        dest='from_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles from specific date. YYYY-MM-DD.')

    parser.add_argument('-u', '--until',
                        dest='until_date',
                        type=lambda x: datetime.strptime(x, '%Y-%m-%d'),
                        nargs='?',
                        help='index articles until this specific date. YYYY-MM-DD (default today).',
                        default=datetime.now())

    parser.add_argument('-c', '--collection',
                        dest='collection',
                        default=None,
                        help='use the acronym of the collection eg.: spa, scl, col.')

    parser.add_argument('-i', '--issn',
                        dest='issn',
                        default=None,
                        help='journal issn.')

    parser.add_argument('-d', '--delete',
                        dest='delete',
                        default=None,
                        help='delete query ex.: q=*:* (Lucene Syntax).')

    parser.add_argument('-s', '--sanitization',
                        dest='sanitization',
                        default=False,
                        action='store_true',
                        help='Remove objects from the index that are no longer present in the database.')

    parser.add_argument('-url', '--url',
                        dest='solr_url',
                        help='Solr RESTFul URL, processing try to get the variable from environment ``SOLR_URL`` otherwise use --url to set the url(preferable).')

    parser.add_argument('-v', '--version',
                        action='version',
                        version='version: 0.2')

    def __init__(self):
        """
        Parse the command line and open the Solr client.

        The Solr URL is read from the ``SOLR_URL`` environment variable
        and, only when that is empty, from ``--url``.

        :raises argparse.ArgumentTypeError: when neither source gives a URL.
        """

        self.args = self.parser.parse_args()

        solr_url = os.environ.get('SOLR_URL')

        if not solr_url and not self.args.solr_url:
            raise argparse.ArgumentTypeError('--url or ``SOLR_URL`` enviroment variable must be the set, use --help.')

        if not solr_url:
            self.solr = Solr(self.args.solr_url, timeout=10)
        else:
            self.solr = Solr(solr_url, timeout=10)

        if self.args.period:
            # a period always wins over an explicit --from
            self.args.from_date = datetime.now() - timedelta(days=self.args.period)

    def format_date(self, date):
        """
        Convert a datetime.datetime to str, e.g. ``2000-05-12``.

        :param date: built-in datetime object, or a false value

        :returns: str, or ``None`` when ``date`` is false
        """
        if not date:
            return None

        return date.strftime('%Y-%m-%d')

    def pipeline_to_xml(self, article):
        """
        Run ``article`` through the XML pipeline and wrap the result.

        :param article: the single item fed to the pipeline.

        :returns: the serialized ``<add>`` element holding every element
            produced by the pipeline.
        """

        ppl = plumber.Pipeline(
            pipeline_xml.SetupDocument(),
            pipeline_xml.DocumentID(),
            pipeline_xml.DOI(),
            pipeline_xml.Collection(),
            pipeline_xml.DocumentType(),
            pipeline_xml.URL(),
            pipeline_xml.Authors(),
            pipeline_xml.Titles(),
            pipeline_xml.OriginalTitle(),
            pipeline_xml.Pages(),
            pipeline_xml.WOKCI(),
            pipeline_xml.WOKSC(),
            pipeline_xml.JournalAbbrevTitle(),
            pipeline_xml.Languages(),
            pipeline_xml.AvailableLanguages(),
            pipeline_xml.Fulltexts(),
            pipeline_xml.PublicationDate(),
            pipeline_xml.SciELOPublicationDate(),
            pipeline_xml.SciELOProcessingDate(),
            pipeline_xml.Abstract(),
            pipeline_xml.AffiliationCountry(),
            pipeline_xml.AffiliationInstitution(),
            pipeline_xml.Sponsor(),
            pipeline_xml.Volume(),
            pipeline_xml.SupplementVolume(),
            pipeline_xml.Issue(),
            pipeline_xml.SupplementIssue(),
            pipeline_xml.ElocationPage(),
            pipeline_xml.StartPage(),
            pipeline_xml.EndPage(),
            pipeline_xml.JournalTitle(),
            pipeline_xml.IsCitable(),
            pipeline_xml.Permission(),
            pipeline_xml.Keywords(),
            pipeline_xml.JournalISSNs(),
            pipeline_xml.SubjectAreas(),
            pipeline_xml.ReceivedCitations(),
            pipeline_xml.TearDown()
        )

        xmls = ppl.run([article])

        # Add root document
        add = ET.Element('add')

        for xml in xmls:
            add.append(xml)

        return ET.tostring(add, encoding="utf-8", method="xml")

    def run(self):
        """
        Run the process for update article in Solr.

        Exactly one of three actions is taken, in this order of precedence:
        send the ``--delete`` query, sanitize the index (``--sanitization``),
        or index the selected documents.  The index is committed and
        optimized afterwards in every case.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # all ids in index
            indexed_docs = json.loads(self.solr.select(
                {'q': '*:*', 'fl': 'id', 'rows': 1000000}))['response']['docs']

            # set of index ids
            ind_ids = set(doc['id'] for doc in indexed_docs)

            # set of articlemeta ids, limited to the allowed collections
            art_ids = set()
            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove: indexed but not listed by articlemeta
            remove_ids = ind_ids - art_ids

            for doc_id in remove_ids:
                self.solr.delete('id:%s' % doc_id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                collection=self.args.collection,
                issn=self.args.issn,
                from_date=self.format_date(self.args.from_date),
                until_date=self.format_date(self.args.until_date)
            ):

                try:
                    # Build the XML once and send that same payload; the
                    # pipeline must not run twice for one document.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()