Beispiel #1
0
    def test_04_process_oag(self):
        # Push a four-identifier register through workflow.process_oag and
        # check that the spreadsheet job ends up linked to an OAGR job whose
        # state reports all four identifiers as pending.
        spreadsheet_job = models.SpreadsheetJob()
        spreadsheet_job.save()

        # (id, type) pairs which are expanded into the register entries
        identifiers = [
            ("PMC1234", "pmcid"),
            ("10.1234", "doi"),
            ("10.5678", "doi"),
            ("abcd", "pmid"),
        ]
        register = [
            {"id": ident, "type": id_type} for ident, id_type in identifiers
        ]

        workflow.process_oag(register, spreadsheet_job)

        # presumably lets the index catch up before it is queried
        time.sleep(2)

        # the link must exist and point both ways
        oagr_link = models.OAGRLink.by_spreadsheet_id(spreadsheet_job.id)
        assert oagr_link is not None
        assert oagr_link.spreadsheet_id == spreadsheet_job.id
        assert oagr_link.oagrjob_id is not None

        # the linked OAGR job must be retrievable, with everything pending
        oagr_job = oagr.dao.JobsDAO.pull(oagr_link.oagrjob_id)
        assert oagr_job is not None
        job_state = oagr_job.state()
        assert len(job_state.pending) == 4
Beispiel #2
0
    def test_02_parse_csv(self):
        # Smoke test: saves a submitted job with a fixed id and hands it to
        # workflow.parse_csv.  Nothing is asserted about the outcome, so the
        # test only fails if one of the calls raises.
        submission = models.SpreadsheetJob()
        submission.filename = "test_submission.csv"
        submission.contact_email = "*****@*****.**"
        submission.status_code = "submitted"
        submission.id = "test_submission"
        submission.save()

        workflow.parse_csv(submission)
Beispiel #3
0
    def test_03_handle_oag_response_01_pmcid_success(self):
        # A cc-by licence reported for a PMCID should be written onto the
        # matching record, together with the AAM flag, and nothing should be
        # queued for a re-run.
        job = models.SpreadsheetJob()
        job.save()

        original = models.Record()
        original.upload_id = job.id
        original.pmcid = "PMC1234"
        original.save()

        time.sleep(2)

        # OAG response reporting a licence for the PMCID
        licence_provenance = {
            "accepted_author_manuscript": True,  # FIXME: provisional
            "description": "Provenance PMC1234"
        }
        oag_result = {
            "identifier": [{"id": "PMC1234", "type": "pmcid"}],
            "license": [{"type": "cc-by", "provenance": licence_provenance}]
        }

        # run the record callback; the re-run list must stay empty
        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)
        assert len(rerun) == 0

        # give the index a moment to catch up
        time.sleep(2)

        matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
        updated = matches.next()
        assert isinstance(updated, models.Record)
        assert updated.id == original.id
        assert updated.pmcid == "PMC1234"

        # licence, source, per-identifier status and AAM flags are all set
        assert updated.licence_type == "cc-by"
        assert updated.licence_source == "epmc"
        assert updated.oag_pmcid == "success"
        assert updated.aam_from_epmc is True
        assert updated.aam is True

        # two provenance notes: one for the licence, one for the AAM status
        notes = [note for _first, _second, note in updated.provenance]
        assert len(notes) == 2
        assert "PMC1234 - Provenance PMC1234" in notes
        assert "Detected AAM status from EPMC web page" in notes
Beispiel #4
0
    def test_03_handle_oag_response_02_pmcid_fto(self):
        # When OAG fails to obtain a licence for the PMCID and the record also
        # carries a DOI, the DOI should be queued for a re-run while the
        # record keeps no licence but gains the AAM information.
        job = models.SpreadsheetJob()
        job.save()

        source_record = models.Record()
        source_record.upload_id = job.id
        source_record.pmcid = "PMC1234"
        source_record.doi = "10.1234"
        source_record.save()
        time.sleep(2)

        # OAG response reporting a failure to obtain the licence
        fto_provenance = {
            "accepted_author_manuscript": True,  # FIXME: provisional
            "description": "FTO PMC1234"
        }
        oag_result = {
            "identifier": [{"id": "PMC1234", "type": "pmcid"}],
            "license": [{
                "type": "failed-to-obtain-license",
                "provenance": fto_provenance
            }]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # exactly one re-run entry: the record's DOI
        assert len(rerun) == 1
        queued = rerun[0]
        assert queued["id"] == "10.1234"
        assert queued["type"] == "doi"

        # give the index a moment to catch up
        time.sleep(2)

        matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
        updated = matches.next()
        assert isinstance(updated, models.Record)

        # no licence, pmcid marked fto, AAM flags set
        assert updated.licence_type is None
        assert updated.oag_pmcid == "fto"
        assert updated.aam_from_epmc is True
        assert updated.aam is True

        notes = [note for _first, _second, note in updated.provenance]
        assert len(notes) == 2
        assert "PMC1234 - FTO PMC1234" in notes
        assert "Detected AAM status from EPMC web page" in notes
Beispiel #5
0
    def test_03_handle_oag_response_04_pmcid_no_change(self):
        # A record that already has a licence and an AAM status taken from
        # the XML is expected to come through a failed-to-obtain-license
        # response unchanged, with nothing queued for a re-run.
        job = models.SpreadsheetJob()
        job.save()

        existing = models.Record()
        existing.upload_id = job.id
        existing.pmcid = "PMC1234"
        existing.licence_type = "CC BY"
        existing.aam = True
        existing.aam_from_xml = True
        existing.save()
        time.sleep(2)

        # OAG response whose contents should be ignored for this record
        ignored_provenance = {
            "accepted_author_manuscript": False,  # FIXME: provisional
            "description": "You won't see this PMC1234"
        }
        oag_result = {
            "identifier": [{"id": "PMC1234", "type": "pmcid"}],
            "license": [{
                "type": "failed-to-obtain-license",
                "provenance": ignored_provenance
            }]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # nothing to re-run
        assert len(rerun) == 0

        # give the index a moment to catch up
        time.sleep(2)

        matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
        reloaded = matches.next()
        assert isinstance(reloaded, models.Record)

        # every field keeps its pre-callback value and no provenance is added
        assert reloaded.licence_type == "CC BY"
        assert reloaded.licence_source is None
        assert reloaded.oag_pmcid is None
        assert reloaded.aam_from_epmc is False
        assert reloaded.aam is True
        notes = [note for _first, _second, note in reloaded.provenance]
        assert len(notes) == 0
Beispiel #6
0
    def test_03_handle_oag_response_07_doi_fto(self):
        # When OAG fails to obtain a licence for the DOI and the record also
        # carries a PMID, the PMID should be queued for a re-run and the DOI
        # marked as fto on the record.
        job = models.SpreadsheetJob()
        job.save()

        source_record = models.Record()
        source_record.upload_id = job.id
        source_record.doi = "10.1234"
        source_record.pmid = "1234"
        source_record.save()
        time.sleep(2)

        # OAG response reporting a failure to obtain the licence for the DOI
        oag_result = {
            "identifier": [{"id": "10.1234", "type": "doi"}],
            "license": [{
                "type": "failed-to-obtain-license",
                "provenance": {"description": "FTO 10.1234"}
            }]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # exactly one re-run entry: the record's PMID
        assert len(rerun) == 1
        queued = rerun[0]
        assert queued["id"] == "1234"
        assert queued["type"] == "pmid"

        # give the index a moment to catch up
        time.sleep(2)

        # the identifier type argument is deliberately omitted here
        updated = models.Record.get_by_identifier("10.1234", job.id).next()
        assert isinstance(updated, models.Record)

        # no licence, doi marked fto, AAM untouched, one provenance note
        assert updated.licence_type is None
        assert updated.oag_doi == "fto"
        assert updated.aam is None
        notes = [note for _first, _second, note in updated.provenance]
        assert len(notes) == 1
        assert "10.1234 - FTO 10.1234" in notes
Beispiel #7
0
    def test_03_handle_oag_response_06_doi_success(self):
        # A cc-by licence reported for a DOI should be recorded on the
        # matching record with "publisher" as its source, and nothing should
        # be queued for a re-run.
        job = models.SpreadsheetJob()
        job.save()

        source_record = models.Record()
        source_record.upload_id = job.id
        source_record.doi = "10.1234"
        source_record.save()
        time.sleep(2)

        # OAG response reporting a licence for the DOI
        oag_result = {
            "identifier": [{"id": "10.1234", "type": "doi"}],
            "license": [{
                "type": "cc-by",
                "provenance": {"description": "Provenance 10.1234"}
            }]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # nothing to re-run
        assert len(rerun) == 0

        # give the index a moment to catch up
        time.sleep(2)

        # the "doi" type argument is deliberately left out of the lookup
        matches = models.Record.get_by_identifier("10.1234", job.id)
        updated = matches.next()
        assert isinstance(updated, models.Record)

        # licence and source set, doi marked success, AAM untouched
        assert updated.licence_type == "cc-by"
        assert updated.licence_source == "publisher"
        assert updated.oag_doi == "success"
        assert updated.aam is None
        notes = [note for _first, _second, note in updated.provenance]
        assert len(notes) == 1
        assert "10.1234 - Provenance 10.1234" in notes
Beispiel #8
0
    def test_11_oag_callback_02_finished(self):
        # Drive the OAG callback with a "finished" event for a request state
        # in which both identifiers were requested once (max_retries=1) and
        # check that both records are closed off with an error status.
        callback = workflow.oag_callback_closure()

        job = models.SpreadsheetJob()
        job.save()

        pmcids = ["PMC1234", "PMC9876"]
        state = oagclient.RequestState(list(pmcids), max_retries=1)
        state.record_requested(list(pmcids))

        # tie the request state to the spreadsheet job
        link = models.OAGRLink()
        link.spreadsheet_id = job.id
        link.oagrjob_id = state.id
        link.save()

        # one record per requested identifier
        for pmcid in pmcids:
            rec = models.Record()
            rec.upload_id = job.id
            rec.pmcid = pmcid
            rec.save()

        time.sleep(2)

        callback("finished", state)

        time.sleep(2)

        first = models.Record.get_by_identifier(
            "PMC1234", job.id, "pmcid").next()
        second = models.Record.get_by_identifier(
            "PMC9876", job.id, "pmcid").next()

        assert first.in_oag is False
        assert len(first.provenance) == 1
        first_note = first.provenance[0][2]
        assert first_note.startswith("Attempted to retrieve PMC1234 1")
        assert first.oag_pmcid == "error"
        assert first.oag_complete is True

        assert second.in_oag is False
        assert second.oag_pmcid == "error"
        assert len(second.provenance) == 1
        second_note = second.provenance[0][2]
        assert second_note.startswith("Attempted to retrieve PMC9876 1")
        assert second.oag_complete is True
Beispiel #9
0
    def test_03_handle_oag_response_05_pmcid_error(self):
        # An OAG error for the PMCID should queue the record's PMID for a
        # re-run and leave an error status plus one provenance note behind.
        job = models.SpreadsheetJob()
        job.save()

        source_record = models.Record()
        source_record.upload_id = job.id
        source_record.pmcid = "PMC1234"
        source_record.pmid = "1234"
        source_record.save()
        time.sleep(2)

        # OAG error object: here the identifier is a single dict, not a list
        failed_identifier = {"id": "PMC1234", "type": "pmcid"}
        oag_result = {"identifier": failed_identifier, "error": "broken!"}

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # exactly one re-run entry: the record's PMID
        assert len(rerun) == 1
        queued = rerun[0]
        assert queued["id"] == "1234"
        assert queued["type"] == "pmid"

        # give the index a moment to catch up
        time.sleep(2)

        matches = models.Record.get_by_identifier("PMC1234", job.id, "pmcid")
        updated = matches.next()
        assert isinstance(updated, models.Record)

        # no licence, pmcid marked error, error message kept as provenance
        assert updated.licence_type is None
        assert updated.oag_pmcid == "error"
        notes = [note for _first, _second, note in updated.provenance]
        assert len(notes) == 1
        assert "PMC1234 - broken!" in notes
Beispiel #10
0
def csv_upload(flask_file_handle, filename, contact_email):
    """Store an uploaded CSV on disk and save a job record for it.

    A new SpreadsheetJob is populated with the supplied filename and contact
    email and the status "submitted".  The file is written to
    ``<UPLOAD_DIR>/<job id>.csv`` before the job itself is saved.

    :param flask_file_handle: object whose ``save(path)`` writes the upload
    :param filename: name recorded on the job
    :param contact_email: contact address recorded on the job
    :return: the saved SpreadsheetJob
    :raises WorkflowException: if UPLOAD_DIR is missing or empty in the
        application config
    """
    job = models.SpreadsheetJob()
    job.filename = filename
    job.contact_email = contact_email
    job.status_code = "submitted"
    job.id = job.makeid()

    # the upload directory comes from the application config
    upload_dir = app.config.get("UPLOAD_DIR")
    if upload_dir is None or upload_dir == "":
        raise WorkflowException("UPLOAD_DIR is not set")

    # file first, then the record of the upload
    destination = os.path.join(upload_dir, job.id + ".csv")
    flask_file_handle.save(destination)
    job.save()

    # hand the job back in case the caller wants to do something with it
    return job
Beispiel #11
0
    def test_15_record_maxed_02_no_match(self):
        # Report an identifier as maxed out that does not belong to the only
        # record in the job; that record is expected to stay incomplete and
        # without provenance.
        job = models.SpreadsheetJob()
        job.save()

        unrelated = models.Record()
        unrelated.pmcid = "PMC1234"
        unrelated.upload_id = job.id
        unrelated.save()

        time.sleep(2)

        maxed_info = {"requested": 20, "init": "2001-01-01T09:30:00Z"}
        rerun = []

        # PMC9876 matches no record in this job
        workflow.record_maxed("PMC9876", maxed_info, job, rerun)

        time.sleep(2)

        reloaded = models.Record.pull(unrelated.id)
        assert reloaded.oag_complete is False
        assert len(reloaded.provenance) == 0
Beispiel #12
0
    def test_12_licence_translate(self):
        # "free-to-read" must be translated to "non-standard-licence", both
        # by the helper directly and when it arrives through the OAG record
        # callback.
        translated = workflow.translate_licence_type("free-to-read")
        assert translated == "non-standard-licence"

        job = models.SpreadsheetJob()
        job.save()

        source_record = models.Record()
        source_record.upload_id = job.id
        source_record.pmcid = "PMC1234"
        source_record.save()
        time.sleep(2)

        # OAG response reporting a free-to-read licence
        ftr_provenance = {
            "accepted_author_manuscript": False,  # FIXME: provisional
            "description": "FtR PMC1234"
        }
        oag_result = {
            "identifier": [{"id": "PMC1234", "type": "pmcid"}],
            "license": [{"type": "free-to-read", "provenance": ftr_provenance}]
        }

        rerun = []
        workflow.oag_record_callback(oag_result, rerun, job)

        # give the index a moment to catch up
        time.sleep(2)

        updated = models.Record.get_by_identifier("PMC1234", job.id).next()
        assert isinstance(updated, models.Record)

        assert updated.licence_type == "non-standard-licence"
Beispiel #13
0
    def test_14_oag_record_callback_duplicate(self):
        # Two distinct records sharing one PMCID must both be updated by a
        # single OAG response for that identifier.

        # first make ourselves a job/record that we want to enhance
        job = models.SpreadsheetJob()
        job.save()

        # make two distinct records with the same ids
        record = models.Record()
        record.upload_id = job.id
        record.pmcid = "PMC1234"
        record.save()

        record = models.Record()
        record.upload_id = job.id
        record.pmcid = "PMC1234"
        record.save()

        time.sleep(2)

        # construct the OAG response object, which has detected a licence
        oag_result = {
            "identifier": [{
                "id": "PMC1234",
                "type": "pmcid"
            }],
            "license": [{
                "type": "cc-by",
                "provenance": {
                    "accepted_author_manuscript": True,
                    "description": "Provenance PMC1234"
                }
            }]
        }

        # call the oag record callback
        oag_rerun = []
        workflow.oag_record_callback(oag_result, oag_rerun, job)

        # give the index a moment to catch up
        time.sleep(2)

        # read the duplicate records out of the index
        records = list(
            models.Record.get_by_identifier("PMC1234", job.id, "pmcid"))

        # there should be 2 of them
        assert len(records) == 2
        for record in records:
            # check the record being iterated; the previous code checked a
            # variable leaked from a list comprehension, which only ever
            # referred to the last record (and is undefined on Python 3)
            assert isinstance(record, models.Record)

            # both records should have the same data
            # licence added, source=epmc, pmcid=success, provenance added, aam set
            assert record.licence_type == "cc-by"
            assert record.licence_source == "epmc"
            assert record.oag_pmcid == "success"
            assert record.aam_from_epmc is True
            assert record.aam is True
            provs = [n for b, w, n in record.provenance]
            assert len(provs) == 2
            assert "PMC1234 - Provenance PMC1234" in provs
            assert "Detected AAM status from EPMC web page" in provs
            assert record.oag_complete is True
Beispiel #14
0
    def test_13_duplicate_check(self):
        # Fill a job with unique and duplicated identifiers of each type, run
        # workflow.duplicate_check, then verify that unique records carry no
        # provenance while every duplicate carries a note naming the
        # identifier type it is duplicated on.
        job = models.SpreadsheetJob()
        job.save()

        def make_record(pmcid=None, pmid=None, doi=None):
            # save a record in this job with only the given identifiers set
            rec = models.Record()
            rec.upload_id = job.id
            if pmcid is not None:
                rec.pmcid = pmcid
            if pmid is not None:
                rec.pmid = pmid
            if doi is not None:
                rec.doi = doi
            rec.save()

        # pmcid: one unique, two duplicates
        make_record(pmcid="PMCunique")
        make_record(pmcid="PMCdupe")
        make_record(pmcid="PMCdupe")

        # pmid: one unique, two duplicates
        make_record(pmid="unique")
        make_record(pmid="dupe")
        make_record(pmid="dupe")

        # doi: one unique, two duplicates
        make_record(doi="10.unique")
        make_record(doi="10.dupe")
        make_record(doi="10.dupe")

        # one that is a duplicate of everything
        make_record(pmcid="PMCdupe", pmid="dupe", doi="10.dupe")

        # one that is confused about its duplication
        make_record(pmcid="PMCdupe", pmid="dupe", doi="10.notdupe")

        time.sleep(2)

        workflow.duplicate_check(job)

        time.sleep(2)

        def assert_unique(identifier, id_type):
            # exactly one matching record, and it has no provenance
            found = 0
            matches = models.Record.get_by_identifier(
                identifier, job.id, id_type)
            for rec in matches:
                found += 1
                assert len(rec.provenance) == 0
            assert found == 1

        def assert_duplicated(identifier, id_type, label, expected):
            # `expected` matching records, each with at least one provenance
            # note whose text contains `label`
            found = 0
            matches = models.Record.get_by_identifier(
                identifier, job.id, id_type)
            for rec in matches:
                found += 1
                assert any(label in entry[2] for entry in rec.provenance)
            assert found == expected

        # unique identifiers - no provenance, one result each
        assert_unique("PMCunique", "pmcid")
        assert_unique("unique", "pmid")
        assert_unique("10.unique", "doi")

        # duplicated identifiers - provenance on every match
        assert_duplicated("PMCdupe", "pmcid", "PMCID", 4)
        assert_duplicated("dupe", "pmid", "PMID", 4)
        assert_duplicated("10.dupe", "doi", "DOI", 3)
Beispiel #15
0
    def test_11_oag_callback_01_cycle(self):
        # Drive the OAG callback with a "cycle" event for a request state
        # holding one successful result and one error, and check that both
        # records are updated accordingly.
        cb = workflow.oag_callback_closure()
        assert cb is not None

        # the closure must hand back a plain function; isinstance is the
        # idiomatic type check rather than comparing type() for equality
        import types
        assert isinstance(cb, types.FunctionType)

        job = models.SpreadsheetJob()
        job.save()

        state = oagclient.RequestState(["PMC1234", "PMC9876"])
        oag_response = {
            "results": [{
                "identifier": [{
                    "id": "PMC1234",
                    "type": "epmc",
                    "canonical": "PMC1234"
                }],
                "license": [{
                    "type": "cc-by",
                    "provenance": {
                        "description": "SUCCESS"
                    }
                }]
            }],
            "errors": [{
                "identifier": {
                    "id": "PMC9876",
                    "type": "epmc",
                    "canonical": "PMC9876"
                },
                "error": "ERROR"
            }]
        }
        state.record_result(oag_response)

        # tie the request state to the spreadsheet job
        oagrlink = models.OAGRLink()
        oagrlink.spreadsheet_id = job.id
        oagrlink.oagrjob_id = state.id
        oagrlink.save()

        # one record per identifier in the response
        record = models.Record()
        record.upload_id = job.id
        record.pmcid = "PMC1234"
        record.save()

        record = models.Record()
        record.upload_id = job.id
        record.pmcid = "PMC9876"
        record.save()

        time.sleep(2)

        cb("cycle", state)

        time.sleep(2)

        r1 = models.Record.get_by_identifier("PMC1234", job.id, "pmcid").next()
        r2 = models.Record.get_by_identifier("PMC9876", job.id, "pmcid").next()

        # the successful identifier gets its licence recorded
        assert r1.in_oag is False
        assert len(r1.provenance) == 1
        assert "SUCCESS" in r1.provenance[0][2]
        assert r1.oag_pmcid == "success"
        assert r1.licence_source == "epmc"
        assert r1.licence_type == "cc-by"
        assert r1.oag_complete is True

        # the failed identifier is marked as an error
        assert r2.in_oag is False
        assert r2.oag_pmcid == "error"
        assert len(r2.provenance) == 1
        assert "ERROR" in r2.provenance[0][2]
        assert r2.oag_complete is True
Beispiel #16
0
    def test_01_export(self):
        # Export a job holding three records (fully populated, PMCID only,
        # PMID with two provenance notes) through workflow.output_csv and
        # compare the parsed CSV with the expected header and rows.
        job = models.SpreadsheetJob()
        job.save()

        stamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

        # record 1: every field filled in
        full = models.Record()
        full.pmcid = "PMC1234"
        full.pmid = "1234"
        full.doi = "10.1234"
        full.title = "The Title"
        full.has_ft_xml = True
        full.in_epmc = True
        full.aam = True
        full.is_oa = True
        full.licence_type = "CC0"
        full.licence_source = "publisher"
        full.journal_type = "hybrid"
        full.confidence = 0.9
        full.add_provenance("test", "provenance", stamp)
        full.upload_id = job.id
        full.upload_pos = 1
        full.journal = "Journal of Science"
        full.issn = ["1234-5678", "9876-5432"]
        full.save()

        # record 2: nothing but a PMCID
        pmcid_only = models.Record()
        pmcid_only.pmcid = "PMC9876"
        pmcid_only.upload_id = job.id
        pmcid_only.upload_pos = 2
        pmcid_only.save()

        # record 3: a PMID, empty title/licence and two provenance notes
        pmid_only = models.Record()
        pmid_only.pmid = "9876"
        pmid_only.upload_id = job.id
        pmid_only.upload_pos = 3
        pmid_only.title = None
        pmid_only.licence_type = ""
        pmid_only.add_provenance("test", "provenance", stamp)
        pmid_only.add_provenance("test", "more", stamp)
        pmid_only.save()

        # refresh the index ready for querying
        models.SpreadsheetJob.refresh()
        models.Record.refresh()

        exported = workflow.output_csv(job)
        rows = list(csv.reader(StringIO(exported)))

        expected_header = [
            'PMCID', 'PMID', 'DOI', "Journal title", "ISSN", 'Article title',
            "Fulltext in EPMC?", 'XML Fulltext?', 'AAM?', 'Open Access?',
            'Licence', 'Licence Source', 'Journal Type',
            'Correct Article Confidence', 'Standard Compliance?',
            'Deluxe Compliance?', 'Compliance Processing Ouptut'
        ]
        expected_full = [
            'PMC1234', '1234', '10.1234', "Journal of Science",
            "1234-5678, 9876-5432", 'The Title', "True", 'True', 'True',
            'True', 'CC0', 'publisher', 'hybrid', '0.9', "True", "True",
            '[' + stamp + ' test] provenance'
        ]
        expected_pmcid_only = [
            "PMC9876", "", "", "", "", "", "", "", "unknown", "", "unknown",
            "", "", "", "False", "False", ""
        ]
        expected_pmid_only = [
            "", "9876", "", "", "", "", "", "", "unknown", "", "unknown", "",
            "", "", "False", "False",
            '[' + stamp + ' test] provenance\n\n[' + stamp + ' test] more'
        ]

        # header plus one row per record, in upload_pos order
        assert len(rows) == 4
        assert rows[0] == expected_header
        assert rows[1] == expected_full
        assert rows[2] == expected_pmcid_only
        assert rows[3] == expected_pmid_only
Beispiel #17
0
    parser.add_argument("-i",
                        "--identifier",
                        help="identifier to run through the system")
    args = parser.parse_args()

    if args.identifier is None or args.type is None:
        parser.print_help()
        exit()

    if args.type.lower() not in ["pmcid", "pmid", "doi"]:
        print "Type must be one of pmcid, pmid or doi"
        parser.print_help()
        exit()

    # we must create a job with a single record for it to be run
    job = models.SpreadsheetJob()
    job.contact_email = "*****@*****.**"
    job.save()

    record = models.Record()
    record.upload_id = job.id
    record.upload_pos = 1

    if args.type.lower() == "pmcid":
        record.pmcid = args.identifier
    elif args.type.lower() == "pmid":
        record.pmid = args.identifier
    elif args.type.lower() == "doi":
        record.doi = args.identifier
    record.save()