Ejemplo n.º 1
0
    def test_09_hybrid_oa(self):
        """Check journal classification for OA, hybrid and unanswered lookups.

        ``workflow.doaj_lookup`` is replaced by a stub with a fixed answer
        (True, False, then None) and ``workflow.hybrid_or_oa`` is run against
        a fresh record each time.
        """

        def classify(lookup_answer):
            # stub out the DOAJ lookup, then classify a brand new record
            workflow.doaj_lookup = lambda msg: lookup_answer
            rec = models.Record()
            workflow.hybrid_or_oa(workflow.WorkflowMessage(record=rec))
            return rec

        # a positive lookup marks the journal as pure OA, with one note
        oa_record = classify(True)
        assert oa_record.journal_type == "oa"
        assert len(oa_record.provenance) == 1

        # a negative lookup marks the journal as hybrid, with one note
        hybrid_record = classify(False)
        assert hybrid_record.journal_type == "hybrid"
        assert len(hybrid_record.provenance) == 1

        # a None answer stands for "no lookup possible" (e.g. no issns, or the
        # lookup failed for unknown reasons): the journal type stays unset
        unknown_record = classify(None)
        assert unknown_record.journal_type is None
Ejemplo n.º 2
0
    def test_11_oag_callback_02_finished(self):
        """Fire the OAG callback with "finished" while two PMCIDs are outstanding.

        Both records are expected to come back marked as OAG errors, each with
        a single provenance note, and flagged as complete.
        """
        callback = workflow.oag_callback_closure()

        job = models.SpreadsheetJob()
        job.save()

        pmcids = ["PMC1234", "PMC9876"]

        # hand over independent copies so the state object never shares a
        # list with this test
        state = oagclient.RequestState(list(pmcids), max_retries=1)
        state.record_requested(list(pmcids))

        # tie the OAG request state to the spreadsheet job
        link = models.OAGRLink()
        link.spreadsheet_id = job.id
        link.oagrjob_id = state.id
        link.save()

        for pmcid in pmcids:
            rec = models.Record()
            rec.upload_id = job.id
            rec.pmcid = pmcid
            rec.save()

        # give the index a moment to catch up before and after the callback
        time.sleep(2)
        callback("finished", state)
        time.sleep(2)

        first = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
        second = next(models.Record.get_by_identifier("PMC9876", job.id, "pmcid"))

        assert first.in_oag is False
        assert len(first.provenance) == 1
        first_note = first.provenance[0][2]
        assert first_note.startswith("Attempted to retrieve PMC1234 1")
        assert first.oag_pmcid == "error"
        assert first.oag_complete is True

        assert second.in_oag is False
        assert second.oag_pmcid == "error"
        assert len(second.provenance) == 1
        second_note = second.provenance[0][2]
        assert second_note.startswith("Attempted to retrieve PMC9876 1")
        assert second.oag_complete is True
Ejemplo n.º 3
0
    def test_10_process_record_03_aam_no_licence(self):
        """Process a record whose full text is an AAM but carries no licence.

        The EPMC metadata and full text come from local fixture files; the
        first ``<license>`` element is stripped from the full text. The record
        is expected to end up with no licence and its PMCID registered for an
        OAG lookup.
        """
        def mock_get_md(*args, **kwargs):
            # read the fixture inside a context manager so the handle is closed
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.loads(f.read()))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            with open(EPMC_FT, "r") as f:
                xml = etree.fromstring(f.read())
            # drop the first licence element so no licence can be detected
            licences = xml.xpath("//license")
            licences[0].getparent().remove(licences[0])
            return epmc.EPMCFullText(etree.tostring(xml))

        def mock_doaj(*args, **kwargs):
            return True

        def mock_romeo(*args, **kwargs):
            pass

        def mock_core(*args, **kwargs):
            pass

        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj
        workflow.embargo = mock_romeo
        workflow.ou_core = mock_core

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is True
        assert record.aam is True
        assert record.aam_from_xml is True
        assert record.licence_type is None
        assert record.licence_source is None
        assert record.journal_type == "oa"

        # with no licence found, the PMCID must be queued for OAG
        assert len(oag) == 1
        assert oag[0]["id"] == "PMC4219345"
        assert oag[0]["type"] == "pmcid"
Ejemplo n.º 4
0
    def test_03_handle_oag_response_01_pmcid_success(self):
        """Feed the OAG record callback a successful licence result for a PMCID.

        The stored record should gain the licence (sourced from EPMC), the AAM
        flags and two provenance notes, and nothing should be queued for a
        re-run.
        """
        # set up the job and the record that the callback will enhance
        job = models.SpreadsheetJob()
        job.save()

        original = models.Record()
        original.upload_id = job.id
        original.pmcid = "PMC1234"
        original.save()

        time.sleep(2)

        # an OAG response in which a licence has been detected
        identifier = {"id": "PMC1234", "type": "pmcid"}
        provenance = {
            "accepted_author_manuscript": True,  # FIXME: provisional
            "description": "Provenance PMC1234"
        }
        licence = {"type": "cc-by", "provenance": provenance}
        oag_result = {"identifier": [identifier], "license": [licence]}

        # run the callback; a success must not queue anything for re-run
        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)
        assert len(rerun) == 0

        # give the index a moment to catch up
        time.sleep(2)

        stored = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
        assert isinstance(stored, models.Record)
        assert stored.id == original.id
        assert stored.pmcid == "PMC1234"

        # licence added, source=epmc, pmcid=success, provenance added, aam set
        assert stored.licence_type == "cc-by"
        assert stored.licence_source == "epmc"
        assert stored.oag_pmcid == "success"
        assert stored.aam_from_epmc is True
        assert stored.aam is True

        # the third element of each provenance entry is the note text
        notes = [note for _, _, note in stored.provenance]
        assert len(notes) == 2
        assert "PMC1234 - Provenance PMC1234" in notes
        assert "Detected AAM status from EPMC web page" in notes
Ejemplo n.º 5
0
    def test_06_epmc_compliance_data(self):
        """Extract compliance data from the EPMC metadata fixture into a record.

        Checks the EPMC/OA flags and the single ISSN that
        ``workflow.extract_metadata`` copies onto the record.
        """
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)

        # read the fixture inside a context manager so the handle is closed
        with open(EPMC_MD, "r") as f:
            data = json.loads(f.read())
        epmc_md = epmcmod.EPMCMetadata(data)

        workflow.extract_metadata(msg, epmc_md)

        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
Ejemplo n.º 6
0
    def test_07_ft_info(self):
        """Extract full-text information from the EPMC full-text fixture.

        Checks that ``workflow.extract_fulltext_info`` flags the record as
        having full-text XML and as an AAM detected from that XML, leaving two
        provenance notes.
        """
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)

        # read the fixture inside a context manager so the handle is closed
        with open(EPMC_FT, "r") as f:
            data = f.read()
        ft = epmc.EPMCFullText(data)

        workflow.extract_fulltext_info(msg, ft)

        assert record.has_ft_xml is True
        assert len(record.provenance) == 2
        assert record.aam is True
        assert record.aam_from_xml is True
Ejemplo n.º 7
0
    def test_09_hybrid_oa(self):
        """Check that journals are classified as "oa" or "hybrid".

        ``workflow.doaj_lookup`` is stubbed to answer True and then False;
        each run of ``workflow.hybrid_or_oa`` should set the matching journal
        type and leave exactly one provenance note.
        """
        # (stubbed DOAJ answer, journal type expected on the record)
        scenarios = [
            (True, "oa"),
            (False, "hybrid"),
        ]

        for answer, expected_type in scenarios:
            # bind the answer as a default so each stub keeps its own value
            workflow.doaj_lookup = lambda msg, answer=answer: answer

            rec = models.Record()
            message = workflow.WorkflowMessage(record=rec)
            workflow.hybrid_or_oa(message)

            assert rec.journal_type == expected_type
            assert len(rec.provenance) == 1
Ejemplo n.º 8
0
    def test_03_doaj(self):
        """Run ``workflow.doaj_lookup`` for an OA journal ISSN and an invented one."""
        rec = models.Record()
        message = workflow.WorkflowMessage(record=rec)

        # an ISSN belonging to an OA journal
        rec.issn = "1338-3973"
        assert workflow.doaj_lookup(message) is True

        # an ISSN that was made up for this test
        rec.issn = "1234-5678"
        assert workflow.doaj_lookup(message) is False
Ejemplo n.º 9
0
    def test_03_handle_oag_response_02_pmcid_fto(self):
        """Feed the OAG record callback a failed-to-obtain result for a PMCID.

        The record also carries a DOI, so the DOI should be queued for a
        re-run; the stored record gets the "fto" status, the AAM flags and two
        provenance notes, but no licence.
        """
        # set up the job and the record that the callback will enhance
        job = models.SpreadsheetJob()
        job.save()

        rec = models.Record()
        rec.upload_id = job.id
        rec.pmcid = "PMC1234"
        rec.doi = "10.1234"
        rec.save()
        time.sleep(2)

        # an OAG response reporting that no licence could be obtained
        provenance = {
            "accepted_author_manuscript": True,  # FIXME: provisional
            "description": "FTO PMC1234"
        }
        oag_result = {
            "identifier": [{"id": "PMC1234", "type": "pmcid"}],
            "license": [{
                "type": "failed-to-obtain-license",
                "provenance": provenance
            }]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # the DOI should have been queued for the re-run
        assert len(rerun) == 1
        queued = rerun[0]
        assert queued["id"] == "10.1234"
        assert queued["type"] == "doi"

        # give the index a moment to catch up
        time.sleep(2)

        stored = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
        assert isinstance(stored, models.Record)

        # provenance added, pmcid=fto, aam set
        assert stored.licence_type is None
        assert stored.oag_pmcid == "fto"
        assert stored.aam_from_epmc is True
        assert stored.aam is True
        notes = [note for _, _, note in stored.provenance]
        assert len(notes) == 2
        assert "PMC1234 - FTO PMC1234" in notes
        assert "Detected AAM status from EPMC web page" in notes
Ejemplo n.º 10
0
    def test_03_handle_oag_response_04_pmcid_no_change(self):
        """Check that an FTO result leaves an already-complete record untouched.

        The record starts with a licence and an AAM status taken from XML; the
        failed-to-obtain OAG response should neither alter it nor queue a
        re-run.
        """
        job = models.SpreadsheetJob()
        job.save()

        # a record that already knows its licence and AAM status
        rec = models.Record()
        rec.upload_id = job.id
        rec.pmcid = "PMC1234"
        rec.licence_type = "CC BY"
        rec.aam = True
        rec.aam_from_xml = True
        rec.save()
        time.sleep(2)

        # an OAG response reporting that no licence could be obtained
        provenance = {
            "accepted_author_manuscript": False,  # FIXME: provisional
            "description": "You won't see this PMC1234"
        }
        oag_result = {
            "identifier": [{"id": "PMC1234", "type": "pmcid"}],
            "license": [{
                "type": "failed-to-obtain-license",
                "provenance": provenance
            }]
        }

        # run the callback; nothing should be queued for re-run
        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)
        assert len(rerun) == 0

        # give the index a moment to catch up
        time.sleep(2)

        stored = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
        assert isinstance(stored, models.Record)

        # expecting no changes
        assert stored.licence_type == "CC BY"
        assert stored.licence_source is None
        assert stored.oag_pmcid is None
        assert stored.aam_from_epmc is False
        assert stored.aam is True
        notes = [note for _, _, note in stored.provenance]
        assert len(notes) == 0
Ejemplo n.º 11
0
    def test_02_get_fulltext_xml(self):
        """Fetch EPMC full text for one PMCID that resolves and one that does not."""
        rec = models.Record()
        message = workflow.WorkflowMessage(record=rec)

        # retrieval that should succeed and yield the expected title
        rec.pmcid = PMCID_SUCCESS
        fulltext = workflow.get_epmc_fulltext(message)
        assert fulltext is not None
        assert fulltext.title == PMCID_SUCCESS_FT_TITLE, fulltext.title

        # retrieval that should fail and yield nothing
        rec.pmcid = PMCID_ERROR
        assert workflow.get_epmc_fulltext(message) is None
Ejemplo n.º 12
0
    def test_03_handle_oag_response_07_doi_fto(self):
        """Feed the OAG record callback a failed-to-obtain result for a DOI.

        The record also carries a PMID, so the PMID should be queued for a
        re-run; the stored record gets the "fto" DOI status and one provenance
        note, with no licence and no AAM status.
        """
        job = models.SpreadsheetJob()
        job.save()

        # the record that the callback will enhance
        rec = models.Record()
        rec.upload_id = job.id
        rec.doi = "10.1234"
        rec.pmid = "1234"
        rec.save()
        time.sleep(2)

        # an OAG response reporting that no licence could be obtained
        oag_result = {
            "identifier": [{"id": "10.1234", "type": "doi"}],
            "license": [{
                "type": "failed-to-obtain-license",
                "provenance": {"description": "FTO 10.1234"}
            }]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # the PMID should have been queued for the re-run
        assert len(rerun) == 1
        queued = rerun[0]
        assert queued["id"] == "1234"
        assert queued["type"] == "pmid"

        # give the index a moment to catch up
        time.sleep(2)

        stored = next(models.Record.get_by_identifier("10.1234", job.id))
        assert isinstance(stored, models.Record)

        # provenance added, doi=fto, pmid reprocess
        assert stored.licence_type is None
        assert stored.oag_doi == "fto"
        assert stored.aam is None
        notes = [note for _, _, note in stored.provenance]
        assert len(notes) == 1
        assert "10.1234 - FTO 10.1234" in notes
Ejemplo n.º 13
0
    def test_03_handle_oag_response_06_doi_success(self):
        """Feed the OAG record callback a successful licence result for a DOI.

        The stored record should gain the licence (sourced from the
        publisher), the "success" DOI status and one provenance note; nothing
        should be queued for a re-run and the AAM status stays unset.
        """
        job = models.SpreadsheetJob()
        job.save()

        # the record that the callback will enhance
        rec = models.Record()
        rec.upload_id = job.id
        rec.doi = "10.1234"
        rec.save()
        time.sleep(2)

        # an OAG response in which a licence has been detected
        oag_result = {
            "identifier": [{"id": "10.1234", "type": "doi"}],
            "license": [{
                "type": "cc-by",
                "provenance": {"description": "Provenance 10.1234"}
            }]
        }

        # run the callback; a success must not queue anything for re-run
        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)
        assert len(rerun) == 0

        # give the index a moment to catch up
        time.sleep(2)

        # the "doi" type argument is deliberately left out of this lookup
        matches = models.Record.get_by_identifier("10.1234", job.id)
        stored = next(matches)
        assert isinstance(stored, models.Record)

        # licence added, source=publisher, doi=success, provenance added
        assert stored.licence_type == "cc-by"
        assert stored.licence_source == "publisher"
        assert stored.oag_doi == "success"
        assert stored.aam is None
        notes = [note for _, _, note in stored.provenance]
        assert len(notes) == 1
        assert "10.1234 - Provenance 10.1234" in notes
Ejemplo n.º 14
0
    def test_10_process_record_02_no_md(self):
        """Process a record for which the EPMC metadata lookup returns nothing.

        The record should get no confidence value and one provenance note, and
        nothing should be registered for OAG.
        """
        # stub the metadata lookup so it yields neither metadata nor confidence
        workflow.get_epmc_md = lambda *args, **kwargs: (None, None)

        rec = models.Record()
        rec.pmcid = "PMC4219345"
        rec.id = rec.makeid()

        oag_register = []
        message = workflow.WorkflowMessage(record=rec, oag_register=oag_register)
        workflow.process_record(message)

        assert rec.confidence is None
        assert len(rec.provenance) == 1
        assert len(oag_register) == 0
Ejemplo n.º 15
0
    def test_10_process_record_01_everything(self):
        """Process a record for which metadata, full text and a licence are all found.

        The EPMC metadata and full text come from local fixture files and the
        DOAJ lookup is stubbed to answer False. The record is expected to be
        fully populated with a licence from the EPMC XML, classified as
        hybrid, and nothing should be registered for OAG.
        """
        def mock_get_md(*args, **kwargs):
            # read the fixture inside a context manager so the handle is closed
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.loads(f.read()))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            with open(EPMC_FT, "r") as f:
                data = f.read()
            return epmc.EPMCFullText(data)

        def mock_doaj(*args, **kwargs):
            return False

        def mock_romeo(*args, **kwargs):
            pass

        def mock_core(*args, **kwargs):
            pass

        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj
        workflow.embargo = mock_romeo
        workflow.ou_core = mock_core

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is True
        assert record.aam is True
        assert record.aam_from_xml is True
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert record.journal_type == "hybrid"

        # a licence was found in the XML, so there is nothing to ask OAG
        assert len(oag) == 0
Ejemplo n.º 16
0
    def test_10_process_record_04_licence_no_aam(self):
        """Process a record whose full text has a licence but no manuscript id.

        The EPMC metadata and full text come from local fixture files; the
        first manuscript ``article-id`` element is stripped from the full
        text. The record is expected to carry the licence from the EPMC XML
        with ``aam`` set to False, and nothing should be registered for OAG.
        """
        def mock_get_md(*args, **kwargs):
            # read the fixture inside a context manager so the handle is closed
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.loads(f.read()))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            with open(EPMC_FT, "r") as f:
                xml = etree.fromstring(f.read())
            # drop the first manuscript article-id from the document
            aids = xml.xpath("//article-id[@pub-id-type='manuscript']")
            aids[0].getparent().remove(aids[0])
            return epmc.EPMCFullText(etree.tostring(xml))

        def mock_doaj(*args, **kwargs):
            return True

        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is True
        assert record.aam is False
        assert record.aam_from_xml is True
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert record.journal_type == "oa"

        # a licence was found in the XML, so there is nothing to ask OAG
        assert len(oag) == 0
Ejemplo n.º 17
0
    def test_03_handle_oag_response_05_pmcid_error(self):
        """Feed the OAG record callback an error result for a PMCID.

        The record also carries a PMID, so the PMID should be queued for a
        re-run; the stored record gets the "error" status and one provenance
        note carrying the error text, with no licence.
        """
        job = models.SpreadsheetJob()
        job.save()

        # the record that the callback will enhance
        rec = models.Record()
        rec.upload_id = job.id
        rec.pmcid = "PMC1234"
        rec.pmid = "1234"
        rec.save()
        time.sleep(2)

        # an OAG error response; here the identifier is a single object
        # rather than a list
        failed_identifier = {"id": "PMC1234", "type": "pmcid"}
        oag_result = {"identifier": failed_identifier, "error": "broken!"}

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # the PMID should have been queued for the re-run
        assert len(rerun) == 1
        queued = rerun[0]
        assert queued["id"] == "1234"
        assert queued["type"] == "pmid"

        # give the index a moment to catch up
        time.sleep(2)

        stored = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
        assert isinstance(stored, models.Record)

        # provenance added, pmcid=error, pmid reprocess
        assert stored.licence_type is None
        assert stored.oag_pmcid == "error"
        notes = [note for _, _, note in stored.provenance]
        assert len(notes) == 1
        assert "PMC1234 - broken!" in notes
Ejemplo n.º 18
0
    def test_15_record_maxed_02_no_match(self):
        """Call ``workflow.record_maxed`` for an identifier no record in the job has.

        The only stored record has a different PMCID, so it should stay
        incomplete and gain no provenance.
        """
        job = models.SpreadsheetJob()
        job.save()

        stored = models.Record()
        stored.pmcid = "PMC1234"
        stored.upload_id = job.id
        stored.save()

        time.sleep(2)

        # report a maxed-out identifier that does not belong to the record
        maxed_info = {"requested": 20, "init": "2001-01-01T09:30:00Z"}
        rerun = []
        workflow.record_maxed("PMC9876", maxed_info, job, rerun)

        time.sleep(2)

        # reload the record from the index and confirm it is untouched
        reloaded = models.Record.pull(stored.id)
        assert reloaded.oag_complete is False
        assert len(reloaded.provenance) == 0
Ejemplo n.º 19
0
    def test_10_process_record_05_no_ft(self):
        """Process a record for which metadata exists but no full text is available.

        The EPMC metadata comes from a local fixture file and the full-text
        lookup is stubbed to return None. The record is expected to have no
        full-text XML, no AAM status and no licence, and its PMCID should be
        registered for an OAG lookup.
        """
        def mock_get_md(*args, **kwargs):
            # read the fixture inside a context manager so the handle is closed
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.loads(f.read()))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            return None

        def mock_doaj(*args, **kwargs):
            return False

        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is False
        assert record.aam is None
        assert record.aam_from_xml is False
        assert record.licence_type is None
        assert record.licence_source is None
        assert record.journal_type == "hybrid"

        # with no licence found, the PMCID must be queued for OAG
        assert len(oag) == 1
        assert oag[0]["id"] == "PMC4219345"
        assert oag[0]["type"] == "pmcid"
Ejemplo n.º 20
0
    def test_05_populate_identifiers(self):
        """Populate record identifiers from the EPMC metadata fixture.

        The first pass starts from an empty record and expects all three
        identifiers to be filled in. The second pass expects identifiers
        already on the record to be kept, with only the deleted DOI restored.
        """
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)

        # read the fixture inside a context manager so the handle is closed
        with open(EPMC_MD, "r") as f:
            data = json.loads(f.read())
        epmc_md = epmcmod.EPMCMetadata(data)

        workflow.populate_identifiers(msg, epmc_md)

        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"

        # overwrite two identifiers and remove the third, then populate again
        record.pmcid = "PMC000000"
        record.pmid = "0000000"
        del record.doi

        workflow.populate_identifiers(msg, epmc_md)

        # existing values are kept; only the missing DOI is filled in
        assert record.pmcid == "PMC000000"
        assert record.pmid == "0000000"
        assert record.doi == "10.1186/1471-2121-14-52"
Ejemplo n.º 21
0
    def test_12_licence_translate(self):
        """Check that the "free-to-read" licence type becomes "non-standard-licence".

        The translation is checked directly and then end to end, by passing an
        OAG result with that licence type through the record callback.
        """
        translated = workflow.translate_licence_type("free-to-read")
        assert translated == "non-standard-licence"

        job = models.SpreadsheetJob()
        job.save()

        # the record that the callback will enhance
        rec = models.Record()
        rec.upload_id = job.id
        rec.pmcid = "PMC1234"
        rec.save()
        time.sleep(2)

        # an OAG response carrying the licence type that needs translating
        provenance = {
            "accepted_author_manuscript": False,  # FIXME: provisional
            "description": "FtR PMC1234"
        }
        oag_result = {
            "identifier": [{"id": "PMC1234", "type": "pmcid"}],
            "license": [{"type": "free-to-read", "provenance": provenance}]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # give the index a moment to catch up
        time.sleep(2)

        stored = next(models.Record.get_by_identifier("PMC1234", job.id))
        assert isinstance(stored, models.Record)
        assert stored.licence_type == "non-standard-licence"
Ejemplo n.º 22
0
def parse_csv(job):
    """Load the rows of a job's uploaded spreadsheet and save them as Records.

    The sheet is read from ``<UPLOAD_DIR>/<job.id>.csv``. Every row becomes a
    ``models.Record`` holding the raw source data, its 1-based position in the
    sheet and, where present, normalised PMCID/PMID/DOI values plus the
    article title. A provenance note is written for each identifier supplied,
    whether or not it could be normalised.

    :param job: the spreadsheet job whose ``id`` names the uploaded file
    :raises WorkflowException: if the ``UPLOAD_DIR`` setting is missing or empty
    """
    app.logger.info("Loading records from " + job.id)

    # work out where the uploaded file lives
    upload_dir = app.config.get("UPLOAD_DIR")
    if upload_dir is None or upload_dir == "":
        raise WorkflowException("UPLOAD_DIR is not set")

    # FIXME: what happens if the sheet can't be read
    sheet = sheets.MasterSheet(os.path.join(upload_dir, job.id + ".csv"))

    # one entry per identifier: the name shared by the spreadsheet column and
    # the record attribute, the normaliser, and the provenance notes for the
    # valid and invalid cases
    identifier_rules = (
        ("pmcid", normalise_pmcid,
         "normalised PMCID %(source)s to %(target)s",
         "PMCID %(source)s was syntactically invalid, so ignoring"),
        ("pmid", normalise_pmid,
         "normalised PMID %(source)s to %(target)s",
         "PMID %(source)s was syntactically invalid, so ignoring"),
        ("doi", normalise_doi,
         "normalised DOI %(source)s to %(target)s",
         "DOI %(source)s was syntactically invalid, so ignoring"),
    )

    count = 0
    for count, row in enumerate(sheet.objects(), 1):
        rec = models.Record()
        rec.upload_id = job.id
        rec.upload_pos = count
        rec.set_source_data(**row)

        # copy the identifiers over into the locations where they can be
        # normalised and used for lookup
        for field, normalise, valid_note, invalid_note in identifier_rules:
            raw = row.get(field)
            if raw is None or raw == "":
                continue

            normalised = normalise(raw)
            if normalised is None:
                note = invalid_note % {"source": raw}
            else:
                setattr(rec, field, normalised)
                # report the value as read back from the record
                note = valid_note % {
                    "source": raw,
                    "target": getattr(rec, field)
                }
            rec.add_provenance("importer", note)

        title = row.get("article_title")
        if title is not None and title != "":
            rec.title = title

        rec.save()

    app.logger.info("Loaded " + str(count) + " records from spreadsheet")

    # FIXME: I'm not totally convinced this a/ works or b/ is a good idea
    # Refresh can behave quite strangely, sometimes,
    # refresh the index so the data is ready to use
    models.Record.refresh()
Ejemplo n.º 23
0
    def test_13_duplicate_check(self):
        """Check that ``workflow.duplicate_check`` annotates duplicated identifiers only.

        A job is filled with records that are unique or duplicated on PMCID,
        PMID and DOI. After the check, unique records must have no provenance,
        and every record sharing an identifier must carry a provenance note
        naming that identifier type.
        """
        job = models.SpreadsheetJob()
        job.save()

        def add_record(pmcid=None, pmid=None, doi=None):
            # save one record for this job carrying only the identifiers given
            rec = models.Record()
            rec.upload_id = job.id
            if pmcid is not None:
                rec.pmcid = pmcid
            if pmid is not None:
                rec.pmid = pmid
            if doi is not None:
                rec.doi = doi
            rec.save()

        # one unique and two duplicated records per identifier type
        add_record(pmcid="PMCunique")
        add_record(pmcid="PMCdupe")
        add_record(pmcid="PMCdupe")

        add_record(pmid="unique")
        add_record(pmid="dupe")
        add_record(pmid="dupe")

        add_record(doi="10.unique")
        add_record(doi="10.dupe")
        add_record(doi="10.dupe")

        # one that is a duplicate of everything
        add_record(pmcid="PMCdupe", pmid="dupe", doi="10.dupe")

        # one that is confused about its duplication
        add_record(pmcid="PMCdupe", pmid="dupe", doi="10.notdupe")

        time.sleep(2)
        workflow.duplicate_check(job)
        time.sleep(2)

        def count_untouched(identifier, id_type):
            # count the matches, insisting that none of them has provenance
            found = 0
            for rec in models.Record.get_by_identifier(identifier, job.id, id_type):
                found += 1
                assert len(rec.provenance) == 0
            return found

        def count_flagged(identifier, id_type, label):
            # count the matches, insisting that each has a note mentioning label
            found = 0
            for rec in models.Record.get_by_identifier(identifier, job.id, id_type):
                found += 1
                assert any(label in entry[2] for entry in rec.provenance)
            return found

        # unique identifiers - no provenance, one result each
        assert count_untouched("PMCunique", "pmcid") == 1
        assert count_untouched("unique", "pmid") == 1
        assert count_untouched("10.unique", "doi") == 1

        # duplicated identifiers - every match is annotated
        assert count_flagged("PMCdupe", "pmcid", "PMCID") == 4
        assert count_flagged("dupe", "pmid", "PMID") == 4
        assert count_flagged("10.dupe", "doi", "DOI") == 3
Ejemplo n.º 24
0
    def test_14_oag_record_callback_duplicate(self):
        """Check that one OAG result is applied to every record sharing its PMCID.

        Two distinct records with the same PMCID are stored for the job; after
        the callback both must carry the licence, the AAM flags, the two
        provenance notes and the completion flag.
        """
        # first make ourselves a job/record that we want to enhance
        job = models.SpreadsheetJob()
        job.save()

        # make two distinct records with the same ids
        for _ in range(2):
            duplicate = models.Record()
            duplicate.upload_id = job.id
            duplicate.pmcid = "PMC1234"
            duplicate.save()

        time.sleep(2)

        # construct the OAG response object, which has detected a licence
        oag_result = {
            "identifier": [{
                "id": "PMC1234",
                "type": "pmcid"
            }],
            "license": [{
                "type": "cc-by",
                "provenance": {
                    "accepted_author_manuscript": True,
                    "description": "Provenance PMC1234"
                }
            }]
        }

        # call the oag record callback
        oag_rerun = []
        workflow.oag_record_callback(oag_result, oag_rerun, job)

        # give the index a moment to catch up
        time.sleep(2)

        # read the duplicate records out of the index
        records = list(
            models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))

        # there should be 2 of them
        assert len(records) == 2
        for record in records:
            # check the loop variable itself; the original checked a name
            # leaked from the list comprehension instead
            assert isinstance(record, models.Record)

            # both records should have the same data
            # licence added, source=epmc, pmcid=success, provenance added, aam set
            assert record.licence_type == "cc-by"
            assert record.licence_source == "epmc"
            assert record.oag_pmcid == "success"
            assert record.aam_from_epmc is True
            assert record.aam is True
            provs = [n for b, w, n in record.provenance]
            assert len(provs) == 2
            assert "PMC1234 - Provenance PMC1234" in provs
            assert "Detected AAM status from EPMC web page" in provs
            assert record.oag_complete is True
Ejemplo n.º 25
0
    # both an identifier and its type are required; otherwise show usage and stop
    if args.identifier is None or args.type is None:
        parser.print_help()
        exit()

    # only the three identifier types handled below are accepted (case-insensitive)
    if args.type.lower() not in ["pmcid", "pmid", "doi"]:
        print "Type must be one of pmcid, pmid or doi"
        parser.print_help()
        exit()

    # we must create a job with a single record for it to be run
    job = models.SpreadsheetJob()
    job.contact_email = "*****@*****.**"
    job.save()

    # the single record is attached to the job as its first (and only) row
    record = models.Record()
    record.upload_id = job.id
    record.upload_pos = 1

    # store the supplied identifier on the field matching its declared type
    if args.type.lower() == "pmcid":
        record.pmcid = args.identifier
    elif args.type.lower() == "pmid":
        record.pmid = args.identifier
    elif args.type.lower() == "doi":
        record.doi = args.identifier
    record.save()

    # NOTE(review): presumably waits for the saved job/record to become
    # available in the index before they are used - confirm
    time.sleep(2)

    # bundle the job, the record and an empty OAG register into a workflow message
    oag_register = []
    msg = workflow.WorkflowMessage(job, record, oag_register)
Ejemplo n.º 26
0
    def test_08_ft_licence(self):
        """Exercise licence extraction from EPMC full-text XML for each licence location.

        The fixture's ``<license>`` element is mutated in place between cases
        so that the licence appears, in turn, in the ``license-type``
        attribute, the xlink ``href`` attribute, the licence paragraph text
        (as a URL and as words), a second licence paragraph, as unrecognised
        text, and finally not at all.  After each mutation the tree is
        re-serialised and run through ``workflow.extract_fulltext_licence``
        against a fresh record.
        """
        # read the fixture inside a context manager so the handle is always closed
        with open(EPMC_FT, "r") as fixture:
            data = fixture.read()
        xml = etree.fromstring(data)

        licence = xml.xpath("//license")[0]
        lp = licence.find("license-p")
        xlink_href = "{http://www.w3.org/1999/xlink}href"

        def extract():
            # serialise the tree as it currently stands and run the licence
            # extraction against a brand new record, which is returned
            ft = epmc.EPMCFullText(etree.tostring(xml))
            record = models.Record()
            msg = workflow.WorkflowMessage(record=record)
            workflow.extract_fulltext_licence(msg, ft)
            return record

        # licence in type attribute
        # note the missing "-"; to test the licence representation variations at the same time
        licence.set("license-type", "cc by")
        licence.set(xlink_href, "http://random.url")
        lp.clear()
        record = extract()
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in href attribute
        licence.set("license-type", "open access")
        licence.set(xlink_href,
                    "http://creativecommons.org/licenses/by-nd/3.0")
        record = extract()
        assert record.licence_type == "cc-by-nd"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in text
        licence.set("license-type", "open access")
        licence.set(xlink_href, "http://random.url")
        lp.text = "licence is <a href='http://creativecommons.org/licenses/by-nc-nd/3.0'>http://creativecommons.org/licenses/by-nc-nd/3.0</a>"
        record = extract()
        assert record.licence_type == "cc-by-nc-nd"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in /second/ licence paragraph
        lp.text = "some waffle"
        lp2 = etree.SubElement(licence, "license-p")
        lp2.text = "licence is <a href='http://creativecommons.org/licenses/by/3.0'>http://creativecommons.org/licenses/by/3.0</a>"
        record = extract()
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in words in text
        licence.set("license-type", "open access")
        licence.set(xlink_href, "http://random.url")
        lp.text = "This is a Creative Commons Attribution-NonCommercial licenced article"
        licence.remove(lp2)
        record = extract()
        assert record.licence_type == "cc-by-nc"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence present but unrecognised
        lp.text = "wibble wibble wobble"
        record = extract()
        assert record.licence_type == "non-standard-licence"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # no licence element present
        licence.getparent().remove(licence)
        record = extract()
        assert record.licence_type is None
        assert record.licence_source is None
        assert len(record.provenance) == 0
Ejemplo n.º 27
0
    def test_01_export(self):
        """Export a job holding three differently-populated records and check the CSV.

        The first record has every field set, the second only a PMCID, and the
        third a PMID with an explicit ``None`` title, an empty licence and two
        provenance entries.  The CSV produced by ``workflow.output_csv`` is
        parsed back and compared row by row.
        """
        # the job's own content does not matter here; it only groups the records
        job = models.SpreadsheetJob()
        job.save()

        stamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

        # record 1: all fields filled in correctly
        full = models.Record()
        full.pmcid = "PMC1234"
        full.pmid = "1234"
        full.doi = "10.1234"
        full.title = "The Title"
        full.has_ft_xml = True
        full.in_epmc = True
        full.aam = True
        full.is_oa = True
        full.licence_type = "CC0"
        full.licence_source = "publisher"
        full.journal_type = "hybrid"
        full.confidence = 0.9
        full.add_provenance("test", "provenance", stamp)
        full.upload_id = job.id
        full.upload_pos = 1
        full.journal = "Journal of Science"
        full.issn = ["1234-5678", "9876-5432"]
        full.save()

        # record 2: nothing but a PMCID
        pmcid_only = models.Record()
        pmcid_only.pmcid = "PMC9876"
        pmcid_only.upload_id = job.id
        pmcid_only.upload_pos = 2
        pmcid_only.save()

        # record 3: PMID, blank title/licence and two provenance entries
        sparse = models.Record()
        sparse.pmid = "9876"
        sparse.upload_id = job.id
        sparse.upload_pos = 3
        sparse.title = None
        sparse.licence_type = ""
        sparse.add_provenance("test", "provenance", stamp)
        sparse.add_provenance("test", "more", stamp)
        sparse.save()

        # refresh the index ready for querying
        models.SpreadsheetJob.refresh()
        models.Record.refresh()

        # run the export and parse the resulting CSV text back into rows
        rows = list(csv.reader(StringIO(workflow.output_csv(job))))

        # one header row plus one row per record
        assert len(rows) == 4
        header, first, second, third = rows

        expected_header = [
            'PMCID', 'PMID', 'DOI', "Journal title", "ISSN", 'Article title',
            "Fulltext in EPMC?", 'XML Fulltext?', 'AAM?', 'Open Access?',
            'Licence', 'Licence Source', 'Journal Type',
            'Correct Article Confidence', 'Standard Compliance?',
            'Deluxe Compliance?', 'Compliance Processing Ouptut'
        ]
        assert header == expected_header

        expected_first = [
            'PMC1234', '1234', '10.1234', "Journal of Science",
            "1234-5678, 9876-5432", 'The Title', "True", 'True', 'True',
            'True', 'CC0', 'publisher', 'hybrid', '0.9', "True", "True",
            "[%s test] provenance" % stamp
        ]
        assert first == expected_first

        expected_second = [
            "PMC9876", "", "", "", "", "", "", "", "unknown", "", "unknown",
            "", "", "", "False", "False", ""
        ]
        assert second == expected_second

        expected_third = [
            "", "9876", "", "", "", "", "", "", "unknown", "", "unknown", "",
            "", "", "False", "False",
            "[%s test] provenance\n\n[%s test] more" % (stamp, stamp)
        ]
        assert third == expected_third
Ejemplo n.º 28
0
    def test_02_send_to_oag(self):
        """Check which identifier, if any, ``register_with_oag`` queues for a record.

        A single record is mutated from case to case, covering the
        combinations of PMCID / DOI / PMID presence, AAM status and licence
        presence.  Each case starts from its own register list and asserts
        on what the call appended to it.
        """
        record = models.Record()

        def register(existing=None):
            # run the registration for the record's current state against the
            # given register (a fresh empty one by default) and return it
            queue = [] if existing is None else existing
            msg = workflow.WorkflowMessage(record=record, oag_register=queue)
            workflow.register_with_oag(msg)
            return queue

        # Has PMCID, AAM and Licence
        record.pmcid = "PMC1234"
        record.aam_from_xml = True
        record.licence_type = "CC BY"
        record.id = record.makeid()
        oag = register()
        assert len(oag) == 0

        # Has PMCID, AAM, but no licence
        record.pmcid = "PMC1234"
        record.aam_from_xml = True
        del record.licence_type
        oag = register()
        assert len(oag) == 1
        assert oag[0].get("id") == "PMC1234"
        assert oag[0].get("type") == "pmcid"

        # Has PMCID, not AAM, Licence
        record.pmcid = "PMC1234"
        record.aam_from_xml = False
        record.licence_type = "CC BY"
        oag = register()
        assert len(oag) == 1
        assert oag[0].get("id") == "PMC1234"
        assert oag[0].get("type") == "pmcid"

        # Has PMCID, not AAM or Licence
        record.pmcid = "PMC1234"
        record.aam_from_xml = False
        del record.licence_type
        oag = register()
        assert len(oag) == 1
        assert oag[0].get("id") == "PMC1234"
        assert oag[0].get("type") == "pmcid"

        # No PMCID, has DOI and Licence
        del record.pmcid
        # the DOI has to actually be present for this case to exercise the
        # "DOI plus licence" path rather than the "no identifiers" one
        record.doi = "10.1234"
        record.aam_from_xml = False
        record.licence_type = "CC BY"
        oag = register()
        assert len(oag) == 0

        # No PMCID, has DOI, no Licence
        record.doi = "10.1234"
        del record.licence_type
        oag = register()
        assert len(oag) == 1
        assert oag[0].get("id") == "10.1234"
        assert oag[0].get("type") == "doi"

        # No PMCID or DOI, has PMID but no Licence
        del record.doi
        record.pmid = "1234"
        oag = register()
        assert len(oag) == 1
        assert oag[0].get("id") == "1234"
        assert oag[0].get("type") == "pmid"

        # No identifiers or licence
        del record.pmid
        del record.licence_type
        oag = register()
        assert len(oag) == 0

        # identifier which has previously been added to the run
        record.pmid = "1234"
        oag = register([{"id": "1234", "type": "pmid"}])
        assert len(oag) == 1
Ejemplo n.º 29
0
    def test_11_oag_callback_01_cycle(self):
        """Run the OAG callback for a "cycle" event carrying one result and one error.

        A request state for two PMCIDs is loaded with a response containing a
        successful licence detection for one and an error for the other.  Two
        matching records are saved against a job linked to that state, the
        callback is invoked, and the stored records are checked for the
        expected licence / error data.
        """
        cb = workflow.oag_callback_closure()
        assert cb is not None

        import types
        # isinstance is the idiomatic type check for the returned closure
        assert isinstance(cb, types.FunctionType)

        job = models.SpreadsheetJob()
        job.save()

        state = oagclient.RequestState(["PMC1234", "PMC9876"])
        oag_response = {
            "results": [{
                "identifier": [{
                    "id": "PMC1234",
                    "type": "epmc",
                    "canonical": "PMC1234"
                }],
                "license": [{
                    "type": "cc-by",
                    "provenance": {
                        "description": "SUCCESS"
                    }
                }]
            }],
            "errors": [{
                "identifier": {
                    "id": "PMC9876",
                    "type": "epmc",
                    "canonical": "PMC9876"
                },
                "error": "ERROR"
            }]
        }
        state.record_result(oag_response)

        # link the spreadsheet job to the OAG request state by their ids
        oagrlink = models.OAGRLink()
        oagrlink.spreadsheet_id = job.id
        oagrlink.oagrjob_id = state.id
        oagrlink.save()

        # one record per identifier in the response
        for pmcid in ("PMC1234", "PMC9876"):
            record = models.Record()
            record.upload_id = job.id
            record.pmcid = pmcid
            record.save()

        time.sleep(2)

        cb("cycle", state)

        time.sleep(2)

        # builtin next() works on both Python 2.6+ and Python 3 iterators,
        # unlike the Python-2-only .next() method
        r1 = next(models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))
        r2 = next(models.Record.get_by_identifier("PMC9876", job.id, "pmcid"))

        # the successful identifier picks up the licence and its provenance
        assert r1.in_oag is False
        assert len(r1.provenance) == 1
        assert "SUCCESS" in r1.provenance[0][2]
        assert r1.oag_pmcid == "success"
        assert r1.licence_source == "epmc"
        assert r1.licence_type == "cc-by"
        assert r1.oag_complete is True

        # the failed identifier is marked as an error, with the error recorded
        assert r2.in_oag is False
        assert r2.oag_pmcid == "error"
        assert len(r2.provenance) == 1
        assert "ERROR" in r2.provenance[0][2]
        assert r2.oag_complete is True
Ejemplo n.º 30
0
    def test_01_oag_rerun(self):
        """Check which fallback identifier ``add_to_rerun`` queues after a lookup type was sent.

        One record is mutated between cases; each case states the identifier
        type already sent and asserts on what ends up in the rerun list.
        """
        record = models.Record()

        def rerun(sent_type, queue=None):
            # call add_to_rerun for the record's current state against the
            # given list (a fresh empty one by default); hand back both the
            # list and the call's return value
            queue = [] if queue is None else queue
            outcome = workflow.add_to_rerun(record, sent_type, queue)
            return queue, outcome

        # PMCID sent, no DOI or PMID
        pending, added = rerun("pmcid")
        assert len(pending) == 0
        assert added is False

        # PMCID sent, PMID only
        record.pmid = "1234"
        pending, added = rerun("pmcid")
        assert len(pending) == 1
        assert pending[0]["id"] == "1234"
        assert pending[0]["type"] == "pmid"
        assert added is True

        # PMCID sent, DOI only
        del record.pmid
        record.doi = "10.1234"
        pending, _ = rerun("pmcid")
        assert len(pending) == 1
        assert pending[0]["id"] == "10.1234"
        assert pending[0]["type"] == "doi"

        # PMCID sent, PMID and DOI available
        record.pmid = "1234"
        record.doi = "10.1234"
        pending, _ = rerun("pmcid")
        assert len(pending) == 1
        assert pending[0]["id"] == "10.1234"
        assert pending[0]["type"] == "doi"

        # DOI sent, no PMID
        del record.pmid
        record.doi = "10.1234"
        pending, _ = rerun("doi")
        assert len(pending) == 0

        # DOI sent, PMID available
        record.pmid = "1234"
        record.doi = "10.1234"
        pending, _ = rerun("doi")
        assert len(pending) == 1
        assert pending[0]["id"] == "1234"
        assert pending[0]["type"] == "pmid"

        # PMID sent
        record.pmid = "1234"
        del record.doi
        pending, _ = rerun("pmid")
        assert len(pending) == 0

        # duplicate of existing object on stack
        pending, _ = rerun("pmid", [{"id": "1234", "type": "pmid"}])
        assert len(pending) == 1