Example #1
0
 def test_non_software_doi(self):
     """An empty mocked DOI response must be returned as-is and parse to an empty dict."""
     doi_id = "10.1016/j.dsr2.2008.10.030"  # Not software
     expected_response_content = ''
     expected_parsed_response = {}
     httpretty.enable(
     )  # enable HTTPretty so that it will monkey patch the socket module
     try:
         httpretty.register_uri(httpretty.GET,
                                self.app.conf['DOI_URL'] + doi_id,
                                body=expected_response_content)
         raw_metadata = doi.fetch_metadata(self.app.conf['DOI_URL'],
                                           self.app.conf['DATACITE_URL'],
                                           doi_id)
         parsed_metadata = doi.parse_metadata(raw_metadata)
         self.assertEqual(raw_metadata, expected_response_content)
         self.assertEqual(parsed_metadata, expected_parsed_response)
     finally:
         # Always undo the socket monkey patch, even when an assertion or
         # the fetch itself fails, so the mock does not leak into other tests
         httpretty.disable()
         httpretty.reset()  # clean up registered urls and request history
Example #2
0
 def test_software_doi(self):
     """A mocked software DOI response must be returned as-is and parse to the expected mock metadata."""
     doi_id = "10.5281/zenodo.11020"  # software
     expected_response_content = self.mock_data[doi_id]['raw']
     expected_parsed_response = self.mock_data[doi_id]['parsed']
     httpretty.enable(
     )  # enable HTTPretty so that it will monkey patch the socket module
     try:
         httpretty.register_uri(httpretty.GET,
                                self.app.conf['DOI_URL'] + doi_id,
                                body=expected_response_content)
         raw_metadata = doi.fetch_metadata(self.app.conf['DOI_URL'],
                                           self.app.conf['DATACITE_URL'],
                                           doi_id)
         parsed_metadata = doi.parse_metadata(raw_metadata)
         self.assertEqual(raw_metadata, expected_response_content)
         self.assertEqual(parsed_metadata, expected_parsed_response)
     finally:
         # Always undo the socket monkey patch, even when an assertion or
         # the fetch itself fails, so the mock does not leak into other tests
         httpretty.disable()
         httpretty.reset()  # clean up registered urls and request history
def task_process_new_citation(citation_change, force=False):
    """
    Process new citation:
    - Resolve the canonical bibcode of the citing record; if there is none,
      log an error and skip this citation
    - Reuse the citation target metadata and status stored in the database
      when the target is already known
    - For DOI targets not yet in the database, retrieve metadata from doi.org
      and parse it; the target is 'REGISTERED' only if it is software and a
      bibcode could be computed, otherwise it stays 'DISCARDED'
    - For PID and URL targets, only check if the link is alive (nothing is
      stored or emitted because the status is set to None)
    - When a status is set: store the citation target if it is new, emit the
      events/results for registered DOI targets and finally store the citation

    :param citation_change: citation change message being processed
    :param force: accepted for interface compatibility; not used in this body
    """
    canonical_citing_bibcode = api.get_canonical_bibcode(
        app, citation_change.citing)
    if canonical_citing_bibcode is None:
        logger.error(
            "The citing bibcode '%s' is not in the system yet, it will be skipped in this ingestion",
            citation_change.citing)
        return
    content_type = None
    is_link_alive = False
    status = u"DISCARDED"

    # Check if we already have the citation target in the DB
    metadata = db.get_citation_target_metadata(app, citation_change.content)
    citation_target_in_db = bool(metadata)  # False if dict is empty
    raw_metadata = metadata.get('raw', None)
    parsed_metadata = metadata.get('parsed', {})
    if citation_target_in_db:
        status = metadata.get(
            'status', u'DISCARDED')  # "REGISTERED" if it is a software record

    if citation_change.content_type == adsmsg.CitationChangeContentType.doi \
        and citation_change.content not in ["", None]:
        # Default values
        content_type = u"DOI"
        #
        if not citation_target_in_db:
            # Fetch DOI metadata (if HTTP request fails, an exception is raised
            # and the task will be re-queued (see app.py and adsputils))
            raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'],
                                              app.conf['DATACITE_URL'],
                                              citation_change.content)
            if raw_metadata:
                parsed_metadata = doi.parse_metadata(raw_metadata)
                is_software = parsed_metadata.get('doctype',
                                                  u'').lower() == "software"
                if parsed_metadata.get('bibcode') not in (None,
                                                          "") and is_software:
                    status = u"REGISTERED"
    elif citation_change.content_type == adsmsg.CitationChangeContentType.pid \
        and citation_change.content not in ["", None]:
        content_type = u"PID"
        status = None
        is_link_alive = url.is_alive(app.conf['ASCL_URL'] +
                                     citation_change.content)
        parsed_metadata = {'link_alive': is_link_alive}
    elif citation_change.content_type == adsmsg.CitationChangeContentType.url \
        and citation_change.content not in ["", None]:
        content_type = u"URL"
        status = None
        is_link_alive = url.is_alive(citation_change.content)
        parsed_metadata = {'link_alive': is_link_alive}
    else:
        # The logging module interpolates arguments with %-style formatting,
        # a '{}' placeholder would trigger a formatting error and lose the message
        logger.error(
            "Citation change should have doi, pid or url informed: %s",
            citation_change)
        status = None

    if status is not None:
        if not citation_target_in_db:
            # Create citation target in the DB
            db.store_citation_target(app, citation_change, content_type,
                                     raw_metadata, parsed_metadata, status)
        if status == u"REGISTERED":
            if citation_change.content_type == adsmsg.CitationChangeContentType.doi:
                if canonical_citing_bibcode != citation_change.citing:
                    # The citing bibcode differs from its canonical form: both
                    # identify the same record and we can signal the broker
                    event_data = webhook.identical_bibcodes_event_data(
                        citation_change.citing, canonical_citing_bibcode)
                    if event_data:
                        dump_prefix = citation_change.timestamp.ToDatetime(
                        ).strftime("%Y%m%d_%H%M%S")
                        logger.debug(
                            "Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'",
                            citation_change.citing, canonical_citing_bibcode)
                        task_emit_event.delay(event_data, dump_prefix)
                citation_target_bibcode = parsed_metadata.get('bibcode')
                # The new bibcode and the DOI are identical
                event_data = webhook.identical_bibcode_and_doi_event_data(
                    citation_target_bibcode, citation_change.content)
                if event_data:
                    dump_prefix = citation_change.timestamp.ToDatetime(
                    ).strftime("%Y%m%d_%H%M%S")
                    logger.debug(
                        "Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'",
                        citation_target_bibcode, citation_change.content)
                    task_emit_event.delay(event_data, dump_prefix)
                # Get citations from the database and transform the stored bibcodes into their canonical ones as registered in Solr.
                original_citations = db.get_citations_by_bibcode(
                    app, citation_target_bibcode)
                citations = api.get_canonical_bibcodes(app, original_citations)
                # Add canonical bibcode of current detected citation
                if canonical_citing_bibcode and canonical_citing_bibcode not in citations:
                    citations.append(canonical_citing_bibcode)
                logger.debug("Calling 'task_output_results' with '%s'",
                             citation_change)
                task_output_results.delay(citation_change, parsed_metadata,
                                          citations)
            logger.debug("Calling '_emit_citation_change' with '%s'",
                         citation_change)
            _emit_citation_change(citation_change, parsed_metadata)
        # Store the citation at the very end, so that if an exception is raised before
        # this task can be re-run in the future without key collisions in the database
        db.store_citation(app, citation_change, content_type, raw_metadata,
                          parsed_metadata, status)
def task_maintenance_metadata(dois, bibcodes):
    """
    Maintenance operation:
    - Get all the registered citation targets (or only a subset of them if DOIs and/or bibcodes are specified)
    - For each, retrieve metadata and, if it is still 'software' with a computed bibcode:
        - Keep the registered bibcode for concept DOIs
        - If the bibcode changed, emit an 'IsIdenticalTo' event and move the former bibcode to 'alternate_bibcode'
        - Update the citation target metadata in the database
    - For each updated DOI target:
        - Get the citations bibcodes and transform them to their canonical form
        - Send to master an update with the new metadata and the current list of citations canonical bibcodes

    :param dois: DOIs to restrict the operation to (must support len())
    :param bibcodes: bibcodes to restrict the operation to (must support len())
    """
    n_requested = len(dois) + len(bibcodes)
    if n_requested == 0:
        registered_records = db.get_citation_targets(app, only_registered=True)
    else:
        registered_records = db.get_citation_targets_by_bibcode(
            app, bibcodes, only_registered=True)
        registered_records += db.get_citation_targets_by_doi(
            app, dois, only_registered=True)
        registered_records = _remove_duplicated_dict_in_list(
            registered_records)

    for registered_record in registered_records:
        updated = False
        bibcode_replaced = {}
        # Fetch DOI metadata (if HTTP request fails, an exception is raised
        # and the task will be re-queued (see app.py and adsputils))
        raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'],
                                          app.conf['DATACITE_URL'],
                                          registered_record['content'])
        if raw_metadata:
            parsed_metadata = doi.parse_metadata(raw_metadata)
            is_software = parsed_metadata.get('doctype',
                                              u'').lower() == "software"
            if not is_software:
                logger.error(
                    "The new metadata for '%s' has changed its 'doctype' and it is not 'software' anymore",
                    registered_record['bibcode'])
            elif parsed_metadata.get('bibcode') in (None, ""):
                logger.error(
                    "The new metadata for '%s' affected the metadata parser and it did not correctly compute a bibcode",
                    registered_record['bibcode'])
            else:
                # Detect concept DOIs: they have one or more versions of the software
                # and they are not a version of something else
                concept_doi = len(parsed_metadata.get(
                    'version_of',
                    [])) == 0 and len(parsed_metadata.get('versions', [])) >= 1
                different_bibcodes = registered_record[
                    'bibcode'] != parsed_metadata['bibcode']
                if concept_doi and different_bibcodes:
                    # Concept DOI publication date changes with newer software version
                    # and authors can also change (i.e., first author last name initial)
                    # but we want to respect the year in the bibcode, which corresponds
                    # to the year of the latest release when it was first ingested
                    # by ADS
                    parsed_metadata['bibcode'] = registered_record['bibcode']
                    # Temporary bugfix (some bibcodes have non-capital letter at the end):
                    parsed_metadata['bibcode'] = parsed_metadata[
                        'bibcode'][:-1] + parsed_metadata['bibcode'][-1].upper(
                        )
                    # Re-verify if bibcodes are still different (they could be if
                    # name parsing has changed):
                    different_bibcodes = registered_record[
                        'bibcode'] != parsed_metadata['bibcode']
                if different_bibcodes:
                    # These two bibcodes are identical and we can signal the broker
                    event_data = webhook.identical_bibcodes_event_data(
                        registered_record['bibcode'],
                        parsed_metadata['bibcode'])
                    if event_data:
                        # No citation change message exists yet at this point
                        # (it is only built further below once the record is
                        # updated), so the dump prefix uses the current date
                        dump_prefix = datetime.now().strftime(
                            "%Y%m%d")  # "%Y%m%d_%H%M%S"
                        logger.debug(
                            "Calling 'task_emit_event' for '%s' IsIdenticalTo '%s'",
                            registered_record['bibcode'],
                            parsed_metadata['bibcode'])
                        task_emit_event.delay(event_data, dump_prefix)
                    #
                    logger.warning(
                        "Parsing the new metadata for citation target '%s' produced a different bibcode: '%s'. The former will be moved to the 'alternate_bibcode' list, and the new one will be used as the main one.",
                        registered_record['bibcode'],
                        parsed_metadata.get('bibcode', None))
                    alternate_bibcode = parsed_metadata.get(
                        'alternate_bibcode', [])
                    alternate_bibcode += registered_record.get(
                        'alternate_bibcode', [])
                    if registered_record['bibcode'] not in alternate_bibcode:
                        alternate_bibcode.append(registered_record['bibcode'])
                    parsed_metadata['alternate_bibcode'] = alternate_bibcode
                    bibcode_replaced = {
                        'previous': registered_record['bibcode'],
                        'new': parsed_metadata['bibcode']
                    }
                updated = db.update_citation_target_metadata(
                    app, registered_record['bibcode'], raw_metadata,
                    parsed_metadata)
        if updated:
            citation_change = adsmsg.CitationChange(
                content=registered_record['content'],
                content_type=getattr(
                    adsmsg.CitationChangeContentType,
                    registered_record['content_type'].lower()),
                status=adsmsg.Status.updated,
                timestamp=datetime.now())
            if citation_change.content_type == adsmsg.CitationChangeContentType.doi:
                # Get citations from the database and transform the stored bibcodes into their canonical ones as registered in Solr.
                original_citations = db.get_citations_by_bibcode(
                    app, registered_record['bibcode'])
                citations = api.get_canonical_bibcodes(app, original_citations)
                logger.debug("Calling 'task_output_results' with '%s'",
                             citation_change)
                task_output_results.delay(citation_change,
                                          parsed_metadata,
                                          citations,
                                          bibcode_replaced=bibcode_replaced)
Example #5
0
def task_maintenance_reevaluate(dois, bibcodes):
    """
    Maintenance operation:
    - Get all the citation targets with status 'DISCARDED' (or only a subset of them if DOIs and/or bibcodes are specified)
    - For each, fetch and parse its metadata again and, if it is 'software' and a bibcode was computed:
        - Update the citation target with the new metadata and the status 'REGISTERED'
        - Mark all its discarded citations as registered
        - For DOI targets, get the citations bibcodes, transform them to their canonical form
          and call 'task_output_results' with the new metadata and that list
    """
    if len(dois) + len(bibcodes) > 0:
        # Only the requested subset, without repeated records
        candidates = db.get_citation_targets_by_bibcode(
            app, bibcodes, only_status='DISCARDED')
        candidates += db.get_citation_targets_by_doi(
            app, dois, only_status='DISCARDED')
        candidates = _remove_duplicated_dict_in_list(candidates)
    else:
        candidates = db.get_citation_targets(app, only_status='DISCARDED')

    for target in candidates:
        # Fetch DOI metadata (if HTTP request fails, an exception is raised
        # and the task will be re-queued (see app.py and adsputils))
        raw_metadata = doi.fetch_metadata(app.conf['DOI_URL'],
                                          app.conf['DATACITE_URL'],
                                          target['content'])
        if not raw_metadata:
            continue

        parsed_metadata = doi.parse_metadata(raw_metadata)
        doctype = parsed_metadata.get('doctype', u'')
        if doctype.lower() != "software":
            logger.error("Discarded '%s', it is not 'software'",
                         target['content'])
            continue
        if parsed_metadata.get('bibcode') in (None, ""):
            logger.error(
                "The metadata for '%s' could not be parsed correctly and it did not correctly compute a bibcode",
                target['content'])
            continue

        # Promote the previously discarded target to a registered one
        registered = db.update_citation_target_metadata(app,
                                                        target['content'],
                                                        raw_metadata,
                                                        parsed_metadata,
                                                        status='REGISTERED')
        if not registered:
            continue
        db.mark_all_discarded_citations_as_registered(app, target['content'])

        citation_change = adsmsg.CitationChange(
            content=target['content'],
            content_type=getattr(adsmsg.CitationChangeContentType,
                                 target['content_type'].lower()),
            status=adsmsg.Status.new,
            timestamp=datetime.now())
        if citation_change.content_type == adsmsg.CitationChangeContentType.doi:
            # Get citations from the database and transform the stored bibcodes into their canonical ones as registered in Solr.
            stored_citations = db.get_citations_by_bibcode(
                app, parsed_metadata['bibcode'])
            canonical_citations = api.get_canonical_bibcodes(
                app, stored_citations)
            logger.debug("Calling 'task_output_results' with '%s'",
                         citation_change)
            # No bibcode is replaced in this operation, hence the empty dict
            task_output_results.delay(citation_change,
                                      parsed_metadata,
                                      canonical_citations,
                                      bibcode_replaced={})