Example 1
def issue_ids_to_article_ids(collection, items):
    """
        Return a dictionary, like:

        {'issn':[pid, pid, ...],
         'issn':[pid, pid, ...],
         'issn':[pid, pid, ...]}
    """

    data_dict = {}

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN,
                        ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for issn, icodes in items.items():
        d = data_dict.setdefault(issn, [])
        for icode in icodes:
            for code in cl.documents(collection=collection,
                                     only_identifiers=True,
                                     extra_filter='{"code_issue":"%s"}' % icode):
                if code:
                    d.append(code.code)

    return data_dict
Example 2
def issue_ids_to_article_ids(collection, items):
    """
        Return a dictionary, like:

        {'issn':[pid, pid, ...],
         'issn':[pid, pid, ...],
         'issn':[pid, pid, ...]}
    """

    data_dict = {}

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for issn, icodes in items.items():
        d = data_dict.setdefault(issn, [])
        for icode in icodes:
            for code in cl.documents(collection=collection,
                                     only_identifiers=True,
                                     extra_filter='{"code_issue":"%s"}' %
                                     icode):
                if code:
                    d.append(code.code)

    return data_dict
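A minimal usage sketch for the helper above, assuming the ARTICLE_META_THRIFT_* constants are configured; the collection acronym, ISSN and issue code below are hypothetical:

# Hypothetical input: issue codes grouped by ISSN (values are illustrative).
issue_map = {'0102-6720': ['0102-672020160001']}
article_pids = issue_ids_to_article_ids('scl', issue_map)
# Expected shape: {'0102-6720': [pid, pid, ...]} -- one article PID list per ISSN.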
Example 3
    def common_mode(self):
        art_meta = ThriftClient()

        logger.info("Running without differential mode")
        logger.info("Indexing in {0}".format(self.solr.url))
        for document in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            from_date=self.format_date(self.from_date),
            until_date=self.format_date(self.until_date)
        ):

            logger.debug("Loading document %s" % '_'.join([document.collection_acronym, document.publisher_id]))

            try:
                xml = self.pipeline_to_xml(document)
                self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue

        if self.delete is True:
            logger.info("Running remove records process.")
            ind_ids = set()
            art_ids = set()

            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)

            list_ids = json.loads(self.solr.select(
                {'q': query, 'fl': 'id', 'rows': 1000000}))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(
                collection=self.collection,
                issn=self.issn,
                only_identifiers=True
            ):
                art_ids.add('%s-%s' % (item.code, item.collection))
            # Ids to remove
            remove_ids = ind_ids - art_ids
            total_to_remove = len(remove_ids)
            logger.info("Removing (%d) documents from search index." % total_to_remove)
            for ndx, to_remove_id in enumerate(remove_ids, 1):
                logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                self.solr.delete('id:%s' % to_remove_id, commit=False)
Example 4
def registry_dispatcher_document(self, code, collection):
    """
    This task receives a document code and collection to be queued for DOI registry
    """
    articlemeta = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER)
    document = articlemeta.document(code, collection)

    code = '_'.join([document.collection_acronym, document.publisher_id])
    log_title = 'Reading document: %s' % code
    logger.info(log_title)
    xml_file_name = '%s.xml' % code
    doi = document.doi or ''
    doi_prefix = document.doi.split('/')[0] if doi else ''
    now = datetime.now()

    if SUGGEST_DOI_IDENTIFICATION is True and not doi:
        doi_prefix = CROSSREF_PREFIX
        doi = '/'.join([
            CROSSREF_PREFIX, document.publisher_ahead_id
            or document.publisher_id
        ])

    depitem = Deposit(code=code,
                      pid=document.publisher_id,
                      issn=document.journal.scielo_issn,
                      volume=document.issue.volume,
                      number=document.issue.number,
                      issue_label=document.issue.label,
                      journal=document.journal.title,
                      journal_acronym=document.journal.acronym,
                      collection_acronym=document.collection_acronym,
                      xml_file_name=xml_file_name,
                      doi=doi,
                      publication_year=int(document.publication_date[0:4]),
                      prefix=doi_prefix,
                      has_submission_xml_valid_references=False,
                      submission_updated_at=now,
                      submission_status='waiting',
                      updated_at=now,
                      started_at=now)

    with transactional_session() as session:
        deposit = session.query(Deposit).filter_by(code=code).first()
        if deposit:
            logger.info(
                'deposit already exists. it will be deleted and '
                're-created: "%s"', code)
            session.delete(deposit)

        session.add(depitem)
    logger.info('deposit successfully created for "%s": %s', code,
                repr(depitem))

    chain(
        triage_deposit.s(code).set(queue='dispatcher'),
        load_xml_from_articlemeta.s().set(queue='dispatcher'),
        prepare_document.s().set(queue='dispatcher'),
        register_doi.s().set(queue='dispatcher'),
        request_doi_status.s().set(queue='releaser')).delay()
Example 5
def get_issn_by_acron(collection, acron):

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for journal in cl.journals(collection=collection):

        if journal.acronym == acron:
            return journal.scielo_issn
Example 6
def get_issn_by_acron(collection, acron):

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN,
                        ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for journal in cl.journals(collection=collection):

        if journal.acronym == acron:
            return journal.scielo_issn
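A short usage sketch, assuming the same ARTICLE_META_THRIFT_* constants are set; the collection and acronym values are only illustrative:

# Hypothetical lookup: resolve a journal acronym to its SciELO ISSN.
issn = get_issn_by_acron('scl', 'abcd')
if issn is None:
    # The function returns None implicitly when no journal matches the acronym.
    print('acronym not found in collection')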
Example 7
    def __init__(self,
                 collection,
                 issns=None,
                 from_date=FROM,
                 until_date=UNTIL):

        self._articlemeta = ThriftClient(domain=os.environ.get(
            'ARTICLEMETA_THRIFTSERVER', 'articlemeta.scielo.org:11621'))
        self._depositor = Depositor()
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.issns = issns or [None]
Example 8
def load_articlemeta_journals_ids(collection, issns=None):
    rc = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER, admintoken=ADMINTOKEN)

    journals_pids = []
    logger.info('Loading articlemeta journals ids')
    for issn in issns or [None]:
        for journal in rc.journals(collection, issn=issn, only_identifiers=True):
            logger.debug(
                'Loading articlemeta journal id (%s)',
                '_'.join([journal.collection, journal.code])
            )
            journals_pids.append('_'.join([journal.collection, journal.code, journal.processing_date.replace('-', '')]))

    return journals_pids
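A sketch of how the loader above might be called, assuming ARTICLEMETA_THRIFTSERVER and ADMINTOKEN are configured; the collection and ISSN filter are illustrative:

# Hypothetical call: restrict the journal id load to a single ISSN.
journal_pids = load_articlemeta_journals_ids('scl', issns=['0102-6720'])
# Each entry has the form '<collection>_<code>_<processing date without dashes>'.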
Example 9
class ExportDOI(object):
    def __init__(self,
                 collection,
                 issns=None,
                 from_date=FROM,
                 until_date=UNTIL):

        self._articlemeta = ThriftClient(domain=os.environ.get(
            'ARTICLEMETA_THRIFTSERVER', 'articlemeta.scielo.org:11621'))
        self._depositor = Depositor()
        self.collection = collection
        self.from_date = from_date
        self.until_date = until_date
        self.issns = issns or [None]

    def run(self):
        logger.info(
            'started collecting articles with processing dates '
            'between "%s" and "%s"', self.from_date, self.until_date)
        count = 0
        for issn in self.issns:

            for document in self._articlemeta.documents(
                    collection=self.collection,
                    issn=issn,
                    from_date=self.from_date,
                    until_date=self.until_date,
                    only_identifiers=True):

                code = '_'.join([document.collection, document.code])
                logger.info('collecting document for deposit: %s', code)
                self._depositor.deposit_by_pids([code])
                count += 1

        logger.info('finished collecting documents. total: %d', count)
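A usage sketch for ExportDOI, assuming the FROM/UNTIL defaults and the ARTICLEMETA_THRIFTSERVER environment variable are in place; the collection, ISSN and dates are illustrative:

# Hypothetical run: deposit every document processed in the given window.
exporter = ExportDOI('scl', issns=['0102-6720'],
                     from_date='2019-01-01', until_date='2019-12-31')
exporter.run()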
Example 10
def get_issns_by_acrons(collection, acrons):
    issn_list = []

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    acrons = set(acrons)

    for journal in cl.journals(collection=collection):

        if not acrons:
            break

        if journal.acronym in acrons:
            acrons.remove(journal.acronym)
            issn_list.append(journal.scielo_issn)

    return issn_list
Example 11
def get_issns_by_acrons(collection, acrons):
    issn_list = []

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN,
                        ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    acrons = set(acrons)

    for journal in cl.journals(collection=collection):

        if not acrons:
            break

        if journal.acronym in acrons:
            acrons.remove(journal.acronym)
            issn_list.append(journal.scielo_issn)

    return issn_list
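A usage sketch for the batch variant, with illustrative acronyms; note that acronyms not found in the collection are silently ignored:

# Hypothetical batch lookup for several journal acronyms.
issns = get_issns_by_acrons('scl', ['abcd', 'rsp'])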
Example 12
def issue_labels_to_ids(collection, items):
    """
        Return a dictionary, like:

        {'issn':set([id, id]),
         'issn':set([id, id]),
         'issn':set([id, id])}
    """

    data_dict = {}

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN, ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for issn, labels in items.items():
        d = data_dict.setdefault(issn, set())
        for label in labels:
            code = cl.get_issue_code_from_label(label, issn, collection)
            if code:
                d.add(code)

    return data_dict
Example 13
def issue_labels_to_ids(collection, items):
    """
        Return a dictionary, like:

        {'issn':set([id, id]),
         'issn':set([id, id]),
         'issn':set([id, id])}
    """

    data_dict = {}

    domain = "%s:%s" % (ARTICLE_META_THRIFT_DOMAIN,
                        ARTICLE_META_THRIFT_PORT)

    cl = ThriftClient(domain, timeout=ARTICLE_META_THRIFT_TIMEOUT)

    for issn, labels in items.items():
        d = data_dict.setdefault(issn, set())
        for label in labels:
            code = cl.get_issue_code_from_label(label, issn, collection)
            if code:
                d.add(code)

    return data_dict
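The label helper pairs naturally with issue_ids_to_article_ids from Examples 1 and 2; a sketch with hypothetical ISSN and labels:

# Hypothetical mapping of issue labels per ISSN.
labels = {'0102-6720': ['v29n1', 'v29n2']}
issue_ids = issue_labels_to_ids('scl', labels)
# The result (sets of issue codes) can feed issue_ids_to_article_ids:
article_pids = issue_ids_to_article_ids(
    'scl', {issn: sorted(codes) for issn, codes in issue_ids.items()})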
Example 14
def main():
    usage = """Povoa tabela de periódicos para uso da API SUSHI em relatórios COUNTER"""
    parser = argparse.ArgumentParser(usage)

    parser.add_argument(
        '-u',
        '--matomo_db_uri',
        default=MATOMO_DATABASE_STRING,
        dest='matomodb_uri',
        help='SQL connection string in the format '
             'mysql://username:password@host1:port/database'
    )

    parser.add_argument(
        '-t',
        '--use_thrift',
        dest='use_thrift',
        default=False,
        action='store_true',
        help='Use the ArticleMeta Thrift Client instead of the RestfulClient')

    parser.add_argument(
        '--logging_level',
        choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG', 'NOTSET'],
        dest='logging_level',
        default=LOGGING_LEVEL,
        help='Logging level')

    params = parser.parse_args()

    logging.basicConfig(level=params.logging_level)

    if not params.use_thrift:
        articlemeta = RestfulClient()
    else:
        articlemeta = ThriftClient()

    populate(articlemeta=articlemeta, db_session=SESSION_FACTORY())
Example 15
    def run(self):

        client = ThriftClient()

        logger.info('Creating zip file: %s', self.zip_name)
        logger.info('XML Format: %s', self.xml_format)

        with zipfile.ZipFile(self.zip_name,
                             'w',
                             compression=zipfile.ZIP_DEFLATED,
                             allowZip64=True) as thezip:
            for pid, collection, document in self.items():
                logger.debug('Loading XML file for %s',
                             '_'.join([collection, pid]))
                collection = trans_acronym.get(collection, collection)
                issn = pid[1:10]
                xml_file = '{0}/{1}/{2}.xml'.format(collection, issn, pid)
                thezip.writestr(xml_file, bytes(document.encode('utf-8')))

            with open(
                    os.path.dirname(__file__) +
                    '/templates/dumparticle_readme.txt', 'r') as readme_file:
                readmef = readme_file.read()
            readme = '{0}\r\n* Documents updated at: {1}\r\n'.format(
                readmef,
                datetime.datetime.now().isoformat())

            thezip.writestr("README.txt", bytes(readme.encode('utf-8')))

            if self.xml_format == 'xmlwos':
                xsd = getschema()
                if xsd:
                    thezip.writestr("schema/ThomsonReuters_publishing.xsd",
                                    bytes(xsd.encode('utf-8')))

        logger.info('Zip created: %s', self.zip_name)
        logger.info('Processing finished')
Example 16
def run(collection, issns, full_rebuild=False, force_delete=False, bulk_size=BULK_SIZE):

    rc = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER, admintoken=ADMINTOKEN)

    logger.info('Running Isis2mongo')
    logger.debug('Thrift Server: %s', ARTICLEMETA_THRIFTSERVER)
    logger.debug('Admin Token: %s', ADMINTOKEN)
    logger.info('Loading ArticleMeta identifiers for collection: %s', collection)

    articlemeta_documents = set(
        load_articlemeta_documents_ids(collection, issns))
    articlemeta_issues = set(
        load_articlemeta_issues_ids(collection, issns))
    articlemeta_journals = set(
        load_articlemeta_journals_ids(collection, issns))

    if full_rebuild is True:
        articlemeta_documents = set([])
        articlemeta_issues = set([])
        articlemeta_journals = set([])

    with DataBroker(uuid.uuid4()) as ctrl:
        update_issue_id = ''

        fields_to_update_after_loading_documents = []
        bulk = {}

        bulk_count = 0
        for coll, record in load_isis_records(collection, issns):
            bulk_count += 1
            bulk.setdefault(coll, [])
            bulk[coll].append(record)
            if bulk_count == bulk_size:
                bulk_count = 0
                ctrl.bulk_data(dict(bulk))
                bulk = {}

            # ctrl.write_record(coll, record)
            # Write field 4 in issue database
            rec_type = record.get('v706', [{'_': ''}])[0]['_']
            if rec_type == 'h':
                if update_issue_id == record['v880'][0]['_'][1:18]:
                    continue
                fields_to_update_after_loading_documents.append([
                    'issues',
                    'v4',
                    record['v4'][0]['_'],
                    record['v880'][0]['_'][1:18]
                ])
        # bulk residual data
        ctrl.bulk_data(dict(bulk))

        logger.info('Updating fields metadata')
        total_fields_to_update = len(fields_to_update_after_loading_documents)
        for ndx, item in enumerate(fields_to_update_after_loading_documents, 1):
            logger.debug("Updating (%d, %d) %s", ndx, total_fields_to_update, str(item))
            ctrl.update_field(*item)

        logger.info('Loading legacy identifiers')
        legacy_documents = set(ctrl.articles_ids)
        legacy_issues = set(ctrl.issues_ids)
        legacy_journals = set(ctrl.journals_ids)

        logger.info('Producing lists of differences between ArticleMeta and Legacy databases')
        new_documents = list(legacy_documents - articlemeta_documents)
        new_issues = list(legacy_issues - articlemeta_issues)
        new_journals = list(legacy_journals - articlemeta_journals)

        am_document_pids_only = set([i[0:27] for i in articlemeta_documents])
        lg_document_pids_only = set([i[0:27] for i in legacy_documents])
        to_remove_documents = list(am_document_pids_only - lg_document_pids_only)

        am_issue_pids_only = set([i[0:21] for i in articlemeta_issues])
        lg_issue_pids_only = set([i[0:21] for i in legacy_issues])
        to_remove_issues = list(am_issue_pids_only - lg_issue_pids_only)

        am_journals_pids_only = set([i[0:13] for i in articlemeta_journals])
        lg_journals_pids_only = set([i[0:13] for i in legacy_journals])
        to_remove_journals = list(am_journals_pids_only - lg_journals_pids_only)

        # Including and Updating Documents
        logger.info(
            'Documents being included into articlemeta (%d)',
            len(new_documents)
        )
        for ndx, item in enumerate(new_documents, 1):
            item = item.split('_')
            try:
                document_meta = ctrl.load_document(item[0], item[1])
            except:
                logger.error(
                    'Fail to load document into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            if not document_meta:
                logger.error(
                    'Fail to load document into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            try:
                rc.add_document(json.dumps(document_meta))
            except ServerError:
                logger.error(
                    'Fail to load document into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            logger.debug(
                'Document (%d, %d) loaded into Articlemeta (%s)',
                ndx, len(new_documents),
                '_'.join([item[0], item[1]])
            )

        # Removing Documents
        total_to_remove_documents = len(to_remove_documents)
        logger.info(
            'Documents to be removed from articlemeta (%d)',
            total_to_remove_documents
        )

        skip_deletion = False
        if total_to_remove_documents > SECURE_ARTICLE_DELETIONS_NUMBER:
            logger.info('Too many documents to be removed')
            if force_delete is False:
                skip_deletion = True
                logger.info('force_delete is set to %s, the remove task will be skipped', force_delete)

        for ndx, item in enumerate(to_remove_documents, 1):
            item = item.split('_')
            if skip_deletion is True:
                logger.debug(
                    'Document remove task (%d, %d) will be skipped (%s)',
                    ndx,
                    total_to_remove_documents,
                    '_'.join([item[0], item[1]])
                )
                continue
            try:
                rc.delete_document(item[1], item[0])
                logger.debug(
                    'Document (%d, %d) removed from Articlemeta (%s)',
                    ndx,
                    total_to_remove_documents,
                    '_'.join([item[0], item[1]])
                )
            except UnauthorizedAccess:
                logger.warning('Unauthorized access to remove items, check the ArticleMeta admin token')


        # Including and Updating Journals
        logger.info(
            'Journals being included into articlemeta (%d)',
            len(new_journals)
        )
        for ndx, item in enumerate(new_journals, 1):
            item = item.split('_')
            try:
                journal_meta = ctrl.load_journal(item[0], item[1])
            except:
                logger.error(
                    'Fail to load journal into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue
            if not journal_meta:
                logger.error(
                    'Fail to load journal into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            try:
                rc.add_journal(json.dumps(journal_meta))
            except ServerError:
                logger.error(
                    'Fail to load journal into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            logger.debug(
                'Journal (%d, %d) loaded into Articlemeta (%s)',
                ndx,
                len(new_journals),
                '_'.join([item[0], item[1]])
            )

        # Removing Journals
        total_to_remove_journals = len(to_remove_journals)
        logger.info(
            'Journals to be removed from articlemeta (%d)',
            total_to_remove_journals
        )

        skip_deletion = False
        if total_to_remove_journals > SECURE_JOURNAL_DELETIONS_NUMBER:
            logger.info('Too many journals to be removed')
            if force_delete is False:
                skip_deletion = True
                logger.info('force_delete is set to %s, the remove task will be skipped', force_delete)

        for ndx, item in enumerate(to_remove_journals, 1):
            item = item.split('_')
            if skip_deletion is True:
                logger.debug(
                    'Journal remove task (%d, %d) will be skipped (%s)',
                    ndx,
                    total_to_remove_journals,
                    '_'.join([item[0], item[1]])
                )
                continue
            try:
                rc.delete_journal(item[1], item[0])
                logger.debug(
                    'Journal (%d, %d) removed from Articlemeta (%s)',
                    ndx,
                    total_to_remove_journals,
                    '_'.join([item[0], item[1]])
                )
            except UnauthorizedAccess:
                logger.warning('Unauthorized access to remove items, check the ArticleMeta admin token')

        # Including and Updating Issues
        logger.info(
            'Issues being included into articlemeta (%d)',
            len(new_issues)
        )
        for ndx, item in enumerate(new_issues, 1):
            item = item.split('_')

            try:
                issue_meta = ctrl.load_issue(item[0], item[1])
            except:
                logger.error(
                    'Fail to load issue into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            if not issue_meta:
                logger.error(
                    'Fail to load issue into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            try:
                rc.add_issue(json.dumps(issue_meta))
            except ServerError:
                logger.error(
                    'Fail to load issue into Articlemeta (%s)',
                    '_'.join([item[0], item[1]])
                )
                continue

            logger.debug(
                'Issue (%d, %d) loaded into Articlemeta (%s)',
                ndx,
                len(new_issues),
                '_'.join([item[0], item[1]])
            )

        # Removing Issues
        total_to_remove_issues = len(to_remove_issues)
        logger.info(
            'Issues to be removed from articlemeta (%d)',
            total_to_remove_issues
        )

        skip_deletion = False
        if total_to_remove_issues > SECURE_ISSUE_DELETIONS_NUMBER:
            logger.info('Too many issues to be removed')
            if force_delete is False:
                skip_deletion = True
                logger.info('force_delete is set to %s, the remove task will be skipped', force_delete)

        for ndx, item in enumerate(to_remove_issues, 1):
            item = item.split('_')
            if skip_deletion is True:
                logger.debug(
                    'Issue remove task (%d, %d) will be skipped (%s)',
                    ndx,
                    total_to_remove_issues,
                    '_'.join([item[0], item[1]])
                )
                continue
            try:
                rc.delete_issue(item[1], item[0])
                logger.debug(
                    'Issue (%d, %d) removed from Articlemeta (%s)',
                    ndx,
                    total_to_remove_issues,
                    '_'.join([item[0], item[1]])
                )
            except UnauthorizedAccess:
                logger.warning('Unauthorized access to remove items, check the ArticleMeta admin token')

    logger.info('Process Isis2mongo Finished')
Example 17
    def run(self):
        """
        Run the process for update article in Solr.
        """

        art_meta = ThriftClient()

        if self.delete:

            self.solr.delete(self.delete, commit=True)
        else:

            logger.info("Indexing in {0}".format(self.solr.url))
            for document in art_meta.documents(
                    collection=self.collection,
                    issn=self.issn,
                    from_date=self.format_date(self.from_date),
                    until_date=self.format_date(self.until_date)):

                logger.debug("Loading document %s" % '_'.join(
                    [document.collection_acronym, document.publisher_id]))

                try:
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        if self.sanitization is True:
            logger.info("Running sanitization process")
            ind_ids = set()
            art_ids = set()

            itens_query = []
            if self.collection:
                itens_query.append('in:%s' % self.collection)

            if self.issn:
                itens_query.append('issn:%s' % self.issn)

            query = '*:*' if len(itens_query) == 0 else ' AND '.join(
                itens_query)

            list_ids = json.loads(
                self.solr.select({
                    'q': query,
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])
            # all ids in articlemeta
            for item in art_meta.documents(collection=self.collection,
                                           issn=self.issn,
                                           only_identifiers=True):
                art_ids.add('%s-%s' % (item.code, item.collection))
            # Ids to remove
            remove_ids = ind_ids - art_ids
            for id in remove_ids:
                logger.debug("Removing id: %s" % id)
                self.solr.delete('id:%s' % id, commit=True)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Example 18
}

ROBOTS = [
    i.strip() for i in open(utils.settings.get('robots_file', 'robots.txt'))
]
APACHE_LOG_FORMAT = utils.settings.get(
    'log_format',
    r'= %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"')
COMPILED_ROBOTS = [re.compile(i.lower()) for i in ROBOTS]
REGEX_ISSN = re.compile("^[0-9]{4}-[0-9]{3}[0-9xX]$")
REGEX_ISSUE = re.compile("^[0-9]{4}-[0-9]{3}[0-9xX][0-2][0-9]{3}[0-9]{4}$")
REGEX_ARTICLE = re.compile(
    "^[0-9]{4}-[0-9]{3}[0-9xX][0-2][0-9]{3}[0-9]{4}[0-9]{5}$")
REGEX_FBPE = re.compile(r"^[0-9]{4}-[0-9]{3}[0-9xX]\([0-9]{2}\)[0-9]{8}$")

am_client = ThriftClient(domain='articlemeta.scielo.org:11621')


def _allowed_collections():
    """Gets the list of collections from ArticleMeta
    """
    allowed_collections = []

    try:
        collections = am_client.collections()
    except Exception:
        logger.error('Fail to retrieve collections from thrift server')
        return allowed_collections

    return [i.code for i in collections]

Example 19
def load_xml_from_articlemeta(self, code):
    articlemeta = ThriftClient(domain=ARTICLEMETA_THRIFTSERVER)

    exc_log_title = ''

    with transactional_session() as session:
        deposit = session.query(Deposit).filter_by(code=code).first()

        log_title = 'Loading XML document from ArticleMeta (%s)' % code
        log_event(
            session, {
                'title': log_title,
                'type': 'submission',
                'status': 'info',
                'deposit_code': code
            })

        try:
            xml = articlemeta.document(deposit.pid,
                                       deposit.collection_acronym,
                                       fmt='xmlcrossref')
        except Exception as exc:
            logger.info('could not fetch Crossref XML for "%s": %s', code,
                        str(exc))
            logger.exception(exc)

            deposit.submission_status = 'error'
            deposit.submission_updated_at = datetime.now()
            deposit.updated_at = datetime.now()

            log_title = 'Fail to load XML document from ArticleMeta (%s)' % code
            log_event(
                session, {
                    'title': log_title,
                    'body': str(exc),
                    'type': 'submission',
                    'status': 'error',
                    'deposit_code': code
                })

            exc_log_title = log_title

        else:
            deposit.submission_status = 'waiting'
            deposit.submission_xml = xml
            deposit.submission_updated_at = datetime.now()
            deposit.updated_at = datetime.now()

            log_title = 'XML Document loaded from ArticleMeta (%s)' % code
            log_event(
                session, {
                    'title': log_title,
                    'type': 'submission',
                    'status': 'success',
                    'deposit_code': code
                })

    if exc_log_title:
        raise self.retry(exc=ComunicationError(exc_log_title))

    return code
Example 20
    def client(self):
        """
        Returns a new ThriftClient instance
        """
        client = ThriftClient(domain=self._domain, timeout=self.timeout)
        return client
Example 21
    def run(self):
        """
        Run the process for update article in Solr.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # set of index ids
            ind_ids = set()

            # set of articlemeta ids
            art_ids = set()

            # all ids in index
            list_ids = json.loads(self.solr.select(
                                    {'q': '*:*', 'fl': 'id', 'rows': 1000000}))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for id in remove_ids:
                self.solr.delete('id:%s' % id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            # Get article identifiers

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                collection=self.args.collection,
                issn=self.args.issn,
                from_date=self.format_date(self.args.from_date),
                until_date=self.format_date(self.args.until_date)
            ):

                try:
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Example 22
    def run(self):
        """
        Run the process for update article in Solr.
        """

        art_meta = ThriftClient()

        if self.args.delete:

            self.solr.delete(self.args.delete, commit=True)

        elif self.args.sanitization:

            # set of index ids
            ind_ids = set()

            # set of articlemeta ids
            art_ids = set()

            # all ids in index
            list_ids = json.loads(
                self.solr.select({
                    'q': '*:*',
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']

            for id in list_ids:
                ind_ids.add(id['id'])

            # all ids in articlemeta
            for item in art_meta.documents(only_identifiers=True):
                if item.collection not in ALLOWED_COLLECTION:
                    continue
                art_ids.add('%s-%s' % (item.code, item.collection))

            # Ids to remove
            remove_ids = ind_ids - art_ids

            for id in remove_ids:
                self.solr.delete('id:%s' % id, commit=True)

            logger.info("List of removed ids: %s" % remove_ids)

        else:

            # Get article identifiers

            logger.info("Indexing in {0}".format(self.solr.url))

            for document in art_meta.documents(
                    collection=self.args.collection,
                    issn=self.args.issn,
                    from_date=self.format_date(self.args.from_date),
                    until_date=self.format_date(self.args.until_date)):

                try:
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Example 23
    def differential_mode(self):
        art_meta = ThriftClient()

        logger.info("Running with differential mode")
        ind_ids = set()
        art_ids = set()

        # all ids in search index
        logger.info("Loading Search Index ids.")
        itens_query = []
        if self.collection:
            itens_query.append('in:%s' % self.collection)

        if self.issn:
            itens_query.append('issn:%s' % self.issn)

        query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)
        list_ids = json.loads(self.solr.select(
            {'q': query, 'fl': 'id,scielo_processing_date', 'rows': 1000000}))['response']['docs']

        for id in list_ids:
            ind_ids.add('%s-%s' % (id['id'], id.get('scielo_processing_date', '1900-01-01')))

        # all ids in articlemeta
        logger.info("Loading ArticleMeta ids.")
        for item in art_meta.documents(
            collection=self.collection,
            issn=self.issn,
            only_identifiers=True
        ):
            art_ids.add('%s-%s-%s' % (item.code, item.collection, item.processing_date))

        # Ids to remove
        if self.delete is True:
            logger.info("Running remove records process.")
            remove_ids = set([i[:27] for i in ind_ids]) - set([i[:27] for i in art_ids])
            logger.info("Removing (%d) documents from search index." % len(remove_ids))
            total_to_remove = len(remove_ids)
            if total_to_remove > 0:
                for ndx, to_remove_id in enumerate(remove_ids, 1):
                    logger.debug("Removing (%d/%d): %s" % (ndx, total_to_remove, to_remove_id))
                    self.solr.delete('id:%s' % to_remove_id, commit=False)

        # Ids to include
        logger.info("Running include records process.")
        include_ids = art_ids - ind_ids
        logger.info("Including (%d) documents to search index." % len(include_ids))
        total_to_include = len(include_ids)
        if total_to_include > 0:
            for ndx, to_include_id in enumerate(include_ids, 1):
                logger.debug("Including (%d/%d): %s" % (ndx, total_to_include, to_include_id))
                code = to_include_id[:23]
                collection = to_include_id[24:27]
                processing_date = to_include_id[-10:]
                document = art_meta.document(code=code, collection=collection)
                try:
                    xml = self.pipeline_to_xml(document)
                    self.solr.update(xml, commit=False)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue
Example 24
    def run(self):
        """
        Run the process for update article in Solr.
        """

        art_meta = ArticleMetaThriftClient()
        art_accesses = AccessThriftClient(domain="ratchet.scielo.org:11660")

        logger.info("Loading Solr available document ids")
        itens_query = []

        if self.collection:
            itens_query.append('in:%s' % self.collection)

        if self.issn:
            itens_query.append('issn:%s' % self.issn)

        query = '*:*' if len(itens_query) == 0 else ' AND '.join(itens_query)

        available_ids = set([
            i['id'] for i in json.loads(
                self.solr.select({
                    'q': query,
                    'fl': 'id',
                    'rows': 1000000
                }))['response']['docs']
        ])

        logger.info("Recording accesses for documents in {0}".format(
            self.solr.url))

        for document in art_meta.documents(collection=self.collection,
                                           issn=self.issn):

            solr_id = '-'.join(
                [document.publisher_id, document.collection_acronym])

            if solr_id not in available_ids:
                continue

            logger.debug("Loading accesses for document %s" % solr_id)

            total_accesses = int(
                art_accesses.document(document.publisher_id,
                                      document.collection_acronym).get(
                                          'access_total',
                                          {'value': 0})['value'])

            xml = self.set_accesses(solr_id, total_accesses)

            try:
                result = self.solr.update(xml, commit=False)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Example 25
def articlemeta(domain='articlemeta.scielo.org:11621'):

    return ThriftClient(domain=domain)
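A usage sketch for the factory above; the default domain comes from the signature, the documents() iteration mirrors the calls in the earlier examples, and the collection acronym is illustrative:

# Hypothetical: build a client and iterate document identifiers lazily.
client = articlemeta()
for doc in client.documents(collection='scl', only_identifiers=True):
    print(doc.collection, doc.code)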