Esempio n. 1
0
    def common_mode(self):
        """
        Index the selected documents and, when requested, prune stale ids.

        Every document returned by ArticleMeta for ``self.collection``,
        ``self.issn`` and the ``self.from_date``/``self.until_date`` window is
        converted with ``self.pipeline_to_xml`` and sent to Solr. A failure on
        one document is logged and the loop moves on to the next document.

        When ``self.delete`` is ``True``, ids that are present in the search
        index but absent from ArticleMeta (under the same collection/issn
        filter) are deleted from the index.

        All updates and deletes are sent with ``commit=False``; this method
        never issues a commit itself.
        """
        art_meta = ThriftClient()

        logger.info("Running without differential mode")
        logger.info("Indexing in {0}".format(self.solr.url))
        for document in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            from_date=self.format_date(self.from_date),
            until_date=self.format_date(self.until_date)
        ):

            logger.debug("Loading document %s" % '_'.join([document.collection_acronym, document.publisher_id]))

            try:
                xml = self.pipeline_to_xml(document)
                self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
            except Exception as e:
                # Keep indexing the remaining documents whatever went wrong.
                logger.error("Error: {0}".format(e))
                logger.exception(e)

        if self.delete is True:
            logger.info("Running remove records process.")

            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = ' AND '.join(itens_query) if itens_query else '*:*'

            # all ids in search index
            response = json.loads(self.solr.select(
                {'q': query, 'fl': 'id', 'rows': 1000000}))
            ind_ids = set(doc['id'] for doc in response['response']['docs'])

            # all ids in articlemeta
            art_ids = set()
            for item in art_meta.documents(
                collection=self.collection,
                issn=self.issn,
                only_identifiers=True
            ):
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove: the difference has to be computed before it is
            # counted or logged, otherwise the name is still unbound.
            remove_ids = ind_ids - art_ids
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." % total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)
Esempio n. 2
0
def issue_ids_to_article_ids(collection, items):
    """
    Expand issue codes into the codes of the documents they contain.

    ``items`` maps an ISSN to an iterable of issue codes. For every issue
    code, ArticleMeta is queried (identifiers only) with a ``code_issue``
    filter inside ``collection``, and the ``code`` of each truthy result is
    collected.

    Returns a dictionary keyed by the same ISSNs:

        {'issn': [pid, pid, ...],
         'issn': [pid, pid, ...]}

    An ISSN whose issues yield no documents maps to an empty list.
    """
    client = ThriftClient(
        "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT),
        timeout=ARTICLE_META_THRIFT_TIMEOUT)

    pids_by_issn = {}

    for issn, issue_codes in items.items():
        # setdefault keeps one shared list per ISSN.
        pids = pids_by_issn.setdefault(issn, [])
        for issue_code in issue_codes:
            identifiers = client.documents(
                collection=collection,
                only_identifiers=True,
                extra_filter='{"code_issue":"%s"}' % issue_code)
            pids.extend(
                identifier.code for identifier in identifiers if identifier)

    return pids_by_issn
Esempio n. 3
0
def issue_ids_to_article_ids(collection, items):
    """
    Resolve issue codes to document codes, grouped by ISSN.

    ``items`` is a mapping of ISSN -> iterable of issue codes. Each issue
    code is looked up in ArticleMeta (identifiers only, restricted to
    ``collection``) through a ``code_issue`` filter.

    Returns a dictionary with one entry per ISSN found in ``items``:

        {'issn': [pid, pid, ...],
         'issn': [pid, pid, ...]}
    """
    address = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)
    article_meta = ThriftClient(address, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    result = {}

    for issn, issue_codes in items.items():
        collected = result.setdefault(issn, [])
        for issue_code in issue_codes:
            issue_filter = '{"code_issue":"%s"}' % issue_code
            for identifier in article_meta.documents(
                    collection=collection,
                    only_identifiers=True,
                    extra_filter=issue_filter):
                # Falsy entries coming from the client are ignored.
                if not identifier:
                    continue
                collected.append(identifier.code)

    return result
Esempio n. 4
0
class ExportDOI(object):
    """
    Collect document identifiers from ArticleMeta and hand them to a Depositor.

    The ArticleMeta server address is read from the ``ARTICLEMETA_THRIFTSERVER``
    environment variable, falling back to ``articlemeta.scielo.org:11621``.
    """

    def __init__(self,
                 collection,
                 issns=None,
                 from_date=FROM,
                 until_date=UNTIL):
        """
        Store the selection criteria and build the ArticleMeta/Depositor clients.

        When ``issns`` is empty or ``None`` a single ``None`` entry is used,
        so ``run`` performs exactly one query with ``issn=None``.
        """
        server = os.environ.get(
            'ARTICLEMETA_THRIFTSERVER', 'articlemeta.scielo.org:11621')
        self._articlemeta = ThriftClient(domain=server)
        self._depositor = Depositor()

        self.collection = collection
        self.issns = issns or [None]
        self.from_date = from_date
        self.until_date = until_date

    def run(self):
        """
        Deposit, one at a time, every document matching the stored criteria.

        Each identifier is turned into ``<collection>_<code>`` and passed to
        ``deposit_by_pids`` as a single-element list. The number of documents
        handled is logged at the end.
        """
        logger.info(
            'started collecting articles with processing dates '
            'between "%s" and "%s"', self.from_date, self.until_date)

        total = 0
        for issn in self.issns:
            identifiers = self._articlemeta.documents(
                collection=self.collection,
                issn=issn,
                from_date=self.from_date,
                until_date=self.until_date,
                only_identifiers=True)

            for identifier in identifiers:
                pid = '_'.join((identifier.collection, identifier.code))
                logger.info('collecting document for deposit: %s', pid)
                self._depositor.deposit_by_pids([pid])
                total += 1

        logger.info('finished collecting documents. total: %d', total)
Esempio n. 5
0
def load_articlemeta_documents_ids(collection, issns=None):
    """
    Return the ids of the ArticleMeta documents of ``collection``.

    Each id has the form ``<collection>_<code>_<processing date>``, where the
    processing date has its ``-`` characters removed. When ``issns`` is empty
    or ``None`` a single query with ``issn=None`` is made; otherwise one query
    is made per ISSN and the results are concatenated in order.
    """
    client = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER, admintoken=ADMINTOKEN)

    logger.info('Loading articlemeta documents ids')
    pids = []
    for issn in issns or [None]:
        identifiers = client.documents(
            collection, issn=issn, only_identifiers=True)
        for identifier in identifiers:
            pid = '_'.join((
                identifier.collection,
                identifier.code,
                identifier.processing_date.replace('-', ''),
            ))
            logger.debug('Loading articlemeta document id (%s)', pid)
            pids.append(pid)

    return pids
Esempio n. 6
0
    def run(self):
        """
        Record the total access count of each indexed document in Solr.

        The ids currently in Solr (filtered by ``self.collection`` and
        ``self.issn`` when set) are loaded first. Every ArticleMeta document
        whose ``<publisher_id>-<collection_acronym>`` id is among them gets
        its ``access_total`` value fetched from the access service (0 when
        the key is missing) and pushed to Solr through ``self.set_accesses``.

        A failing Solr update is logged and skipped. The index is committed
        and optimized once at the end.
        """
        art_meta = ArticleMetaThriftClient()
        art_accesses = AccessThriftClient(domain="ratchet.scielo.org:11660")

        logger.info("Loading Solr available document ids")

        filters = []
        if self.collection:
            filters.append('in:%s' % self.collection)
        if self.issn:
            filters.append('issn:%s' % self.issn)
        query = ' AND '.join(filters) if filters else '*:*'

        response = json.loads(
            self.solr.select({'q': query, 'fl': 'id', 'rows': 1000000}))
        available_ids = set(doc['id'] for doc in response['response']['docs'])

        logger.info(
            "Recording accesses for documents in {0}".format(self.solr.url))

        documents = art_meta.documents(
            collection=self.collection, issn=self.issn)

        for document in documents:
            solr_id = '-'.join(
                [document.publisher_id, document.collection_acronym])

            # Only documents already present in the index are updated.
            if solr_id not in available_ids:
                continue

            logger.debug("Loading accesses for document %s" % solr_id)

            accesses = art_accesses.document(
                document.publisher_id, document.collection_acronym)
            access_total = accesses.get('access_total', {'value': 0})
            total_accesses = int(access_total['value'])

            xml = self.set_accesses(solr_id, total_accesses)

            try:
                self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Esempio n. 7
0
    def run(self):
        """
        Run the process for update article in Solr.

        Exactly one of three modes is executed, chosen from ``self.args``:

        * ``delete``: the given query is deleted from the index.
        * ``sanitization``: every id found in the index but not in
          ArticleMeta (restricted to ``ALLOWED_COLLECTION``) is deleted.
        * otherwise: documents selected by collection, issn and date range
          are converted with ``self.pipeline_to_xml`` and sent to Solr; a
          failure on one document is logged and the loop continues.

        In every mode the index is committed and optimized at the end.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # all ids in index
            response = json.loads(
                self.solr.select({
                    'q': '*:*',
                    'fl': 'id',
                    'rows': 1000000
                }))
            ind_ids = set(doc['id'] for doc in response['response']['docs'])

            # all ids in articlemeta
            art_ids = set()
            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for doc_id in remove_ids:
                self.solr.delete('id:%s' % doc_id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                    collection=self.args.collection,
                    issn=self.args.issn,
                    from_date=self.format_date(self.args.from_date),
                    until_date=self.format_date(self.args.until_date)):

                try:
                    # Build the XML once and send that same payload; the
                    # pipeline used to be executed twice per document.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                except Exception as e:
                    # Keep indexing the remaining documents.
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Esempio n. 8
0
    def differential_mode(self):
        """
        Synchronize the search index with ArticleMeta by comparing id sets.

        Two sets of ``<id>-<processing date>`` strings are built: one from the
        search index (a missing ``scielo_processing_date`` counts as
        ``1900-01-01``) and one from ArticleMeta, both filtered by
        ``self.collection`` and ``self.issn`` when set.

        * When ``self.delete`` is ``True``, ids present only in the index
          (compared without the date suffix) are deleted.
        * Entries present only on the ArticleMeta side are fetched in full,
          converted with ``self.pipeline_to_xml`` and sent to Solr. A failure
          on one document is logged and the loop continues.

        All updates and deletes are sent with ``commit=False``; this method
        never issues a commit itself.
        """
        art_meta = ThriftClient()

        logger.info("Running with differential mode")
        ind_ids = set()
        art_ids = set()

        # all ids in search index
        logger.info("Loading Search Index ids.")
        itens_query = []
        if self.collection:
            itens_query.append('in:%s' % self.collection)

        if self.issn:
            itens_query.append('issn:%s' % self.issn)

        query = ' AND '.join(itens_query) if itens_query else '*:*'
        list_ids = json.loads(self.solr.select(
            {'q': query, 'fl': 'id,scielo_processing_date', 'rows': 1000000}))['response']['docs']

        for doc in list_ids:
            ind_ids.add('%s-%s' % (doc['id'], doc.get('scielo_processing_date', '1900-01-01')))

        # all ids in articlemeta
        logger.info("Loading ArticleMeta ids.")
        for item in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            only_identifiers=True
        ):
            art_ids.add('%s-%s-%s' % (item.code, item.collection, item.processing_date))

        # Ids to remove
        if self.delete is True:
            logger.info("Running remove records process.")
            # The first 27 characters are compared, i.e. the id without the
            # date suffix -- assumes a 23-character code, a dash and a
            # 3-character collection acronym; TODO confirm.
            remove_ids = set(i[:27] for i in ind_ids) - set(i[:27] for i in art_ids)
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." % total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)

        # Ids to include
        logger.info("Running include records process.")
        include_ids = art_ids - ind_ids
        total_to_include = len(include_ids)
        logger.info("Including (%d) documents to search index." % total_to_include)
        for ndx, to_include_id in enumerate(include_ids, 1):
            logger.debug("Including (%d/%d): %s" % (ndx, total_to_include, to_include_id))
            # Fixed-width slicing: the code itself may contain dashes
            # (presumably), so splitting on '-' is not used here.
            code = to_include_id[:23]
            collection = to_include_id[24:27]
            document = art_meta.document(code=code, collection=collection)
            try:
                xml = self.pipeline_to_xml(document)
                self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
            except Exception as e:
                # Keep including the remaining documents.
                logger.error("Error: {0}".format(e))
                logger.exception(e)
Esempio n. 9
0
    def run(self):
        """
        Run the process for update article in Solr.

        When ``self.delete`` is set, that query is deleted from the index.
        Otherwise the documents selected by collection, issn and date range
        are converted with ``self.pipeline_to_xml`` and sent to Solr; a
        failure on one document is logged and the loop continues.

        Afterwards, when ``self.sanitization`` is ``True``, every id found in
        the index but not in ArticleMeta (under the same collection/issn
        filter) is deleted.

        The index is committed and optimized at the end.
        """

        art_meta = ThriftClient()

        if self.delete:

            self.solr.delete(self.delete, commit=True)
        else:

            logger.info("Indexing in {0}".format(self.solr.url))
            for document in art_meta.documents(
                    collection=self.collection,
                    issn=self.issn,
                    from_date=self.format_date(self.from_date),
                    until_date=self.format_date(self.until_date)):

                logger.debug("Loading document %s" % '_'.join(
                    [document.collection_acronym, document.publisher_id]))

                try:
                    # Build the XML once and send that same payload; the
                    # pipeline used to be executed twice per document.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                except Exception as e:
                    # Keep indexing the remaining documents.
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)

        if self.sanitization is True:
            logger.info("Running sanitization process")

            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = ' AND '.join(itens_query) if itens_query else '*:*'

            # all ids in index
            response = json.loads(
                self.solr.select({
                    'q': query,
                    'fl': 'id',
                    'rows': 1000000
                }))
            ind_ids = set(doc['id'] for doc in response['response']['docs'])

            # all ids in articlemeta
            art_ids = set()
            for item in art_meta.documents(collection=self.collection,
                                           issn=self.issn,
                                           only_identifiers=True):
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids
            for doc_id in remove_ids:
                logger.debug("Removing id: %s" % doc_id)
                self.solr.delete('id:%s' % doc_id, commit=True)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Esempio n. 10
0
    def run(self):
        """
        Run the process for update article in Solr.

        Exactly one of three modes is executed, chosen from ``self.args``:

        * ``delete``: the given query is deleted from the index.
        * ``sanitization``: every id found in the index but not in
          ArticleMeta (restricted to ``ALLOWED_COLLECTION``) is deleted.
        * otherwise: documents selected by collection, issn and date range
          are converted with ``self.pipeline_to_xml`` and sent to Solr; a
          failure on one document is logged and the loop continues.

        In every mode the index is committed and optimized at the end.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # all ids in index
            response = json.loads(self.solr.select(
                {'q': '*:*', 'fl': 'id', 'rows': 1000000}))
            ind_ids = set(doc['id'] for doc in response['response']['docs'])

            # all ids in articlemeta
            art_ids = set()
            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for doc_id in remove_ids:
                self.solr.delete('id:%s' % doc_id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                collection=self.args.collection,
                issn=self.args.issn,
                from_date=self.format_date(self.args.from_date),
                until_date=self.format_date(self.args.until_date)
            ):

                try:
                    # Build the XML once and send that same payload; the
                    # pipeline used to be executed twice per document.
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                except Exception as e:
                    # Keep indexing the remaining documents.
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()