Exemple #1
0
    def test_09_hybrid_oa(self):
        """Check how workflow.hybrid_or_oa classifies a journal.

        workflow.doaj_lookup is replaced by stubs returning True, False and
        None in turn; the resulting journal_type is expected to be "oa",
        "hybrid" and None respectively.
        """

        def classify(lookup_result):
            # Install a stub lookup that always answers ``lookup_result``,
            # run the classification on a fresh record and hand it back.
            workflow.doaj_lookup = lambda msg: lookup_result
            fresh = models.Record()
            workflow.hybrid_or_oa(workflow.WorkflowMessage(record=fresh))
            return fresh

        # a positive DOAJ lookup means a pure OA journal
        oa_record = classify(True)
        assert oa_record.journal_type == "oa"
        assert len(oa_record.provenance) == 1

        # a negative DOAJ lookup means a hybrid journal
        hybrid_record = classify(False)
        assert hybrid_record.journal_type == "hybrid"
        assert len(hybrid_record.provenance) == 1

        # a lookup that yields None (no issns present, or the DOAJ lookup
        # failed for unknown reasons) leaves the journal type unset
        unknown_record = classify(None)
        assert unknown_record.journal_type is None
Exemple #2
0
    def test_10_process_record_03_aam_no_licence(self):
        """Run process_record on fulltext XML that has no licence element.

        The EPMC metadata and fulltext lookups are stubbed to read the local
        fixture files (with the first <license> element removed from the
        fulltext), the DOAJ lookup is stubbed to return True, and
        workflow.embargo / workflow.ou_core are replaced with no-ops.  The
        record is expected to end up with no licence and to be added to the
        OAG register by its PMCID.
        """
        def mock_get_md(*args, **kwargs):
            # context manager so the fixture file handle is closed promptly
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.load(f))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            with open(EPMC_FT, "r") as f:
                data = f.read()
            xml = etree.fromstring(data)
            # strip the licence so the workflow cannot find one in the XML
            licences = xml.xpath("//license")
            licences[0].getparent().remove(licences[0])
            return epmc.EPMCFullText(etree.tostring(xml))

        def mock_doaj(*args, **kwargs):
            return True

        def mock_romeo(*args, **kwargs):
            pass

        def mock_core(*args, **kwargs):
            pass

        # NOTE(review): the stubs stay installed on the workflow module after
        # this test; presumably a class-level tearDown restores them - confirm.
        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj
        workflow.embargo = mock_romeo
        workflow.ou_core = mock_core

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is True
        assert record.aam is True
        assert record.aam_from_xml is True
        assert record.licence_type is None
        assert record.licence_source is None
        assert record.journal_type == "oa"
        # no licence was found, so the record must be queued for OAG
        assert len(oag) == 1
        assert oag[0]["id"] == "PMC4219345"
        assert oag[0]["type"] == "pmcid"
Exemple #3
0
    def test_09_hybrid_oa(self):
        """Check how workflow.hybrid_or_oa classifies a journal.

        workflow.doaj_lookup is replaced by stubs returning True and then
        False; the resulting journal_type is expected to be "oa" and "hybrid"
        respectively, each with a single provenance entry.
        """

        def classify(lookup_result):
            # Install a stub lookup that always answers ``lookup_result``,
            # run the classification on a fresh record and hand it back.
            workflow.doaj_lookup = lambda msg: lookup_result
            fresh = models.Record()
            workflow.hybrid_or_oa(workflow.WorkflowMessage(record=fresh))
            return fresh

        # a positive DOAJ lookup means a pure OA journal
        oa_record = classify(True)
        assert oa_record.journal_type == "oa"
        assert len(oa_record.provenance) == 1

        # a negative DOAJ lookup means a hybrid journal
        hybrid_record = classify(False)
        assert hybrid_record.journal_type == "hybrid"
        assert len(hybrid_record.provenance) == 1
Exemple #4
0
    def test_07_ft_info(self):
        """Check what extract_fulltext_info records from the fulltext fixture.

        After extraction the record is expected to be flagged as having
        fulltext XML and as an AAM (determined from the XML), with two
        provenance entries.
        """
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)

        # context manager so the fixture file handle is closed promptly
        with open(EPMC_FT, "r") as f:
            data = f.read()
        ft = epmc.EPMCFullText(data)

        workflow.extract_fulltext_info(msg, ft)

        assert record.has_ft_xml is True
        assert len(record.provenance) == 2
        assert record.aam is True
        assert record.aam_from_xml is True
Exemple #5
0
    def test_06_epmc_compliance_data(self):
        """Check what extract_metadata copies from the EPMC metadata fixture.

        The record is expected to be marked as in EPMC, not OA, and to carry
        exactly one ISSN ("1471-2121").
        """
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)

        # context manager so the fixture file handle is closed promptly
        with open(EPMC_MD, "r") as f:
            data = json.load(f)
        epmc_md = epmcmod.EPMCMetadata(data)

        workflow.extract_metadata(msg, epmc_md)

        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
Exemple #6
0
    def test_03_doaj(self):
        """Check workflow.doaj_lookup for an OA journal and an invented one."""
        rec = models.Record()
        message = workflow.WorkflowMessage(record=rec)

        # ISSN of an OA journal: the lookup must answer True
        rec.issn = "1338-3973"
        found = workflow.doaj_lookup(message)
        assert found is True

        # made-up ISSN: the lookup must answer False
        rec.issn = "1234-5678"
        found = workflow.doaj_lookup(message)
        assert found is False
    def test_02_get_fulltext_xml(self):
        """Check workflow.get_epmc_fulltext for a good and a bad PMCID."""
        rec = models.Record()
        message = workflow.WorkflowMessage(record=rec)

        # a PMCID for which the fulltext can be retrieved
        rec.pmcid = PMCID_SUCCESS
        fulltext = workflow.get_epmc_fulltext(message)
        assert fulltext is not None
        assert fulltext.title == PMCID_SUCCESS_FT_TITLE, fulltext.title

        # a PMCID for which retrieval fails yields None
        rec.pmcid = PMCID_ERROR
        fulltext = workflow.get_epmc_fulltext(message)
        assert fulltext is None
Exemple #8
0
    def test_10_process_record_02_no_md(self):
        """Run process_record when the EPMC metadata lookup finds nothing.

        With workflow.get_epmc_md stubbed to return (None, None), the record
        is expected to keep no confidence value, gain a single provenance
        entry and leave the OAG register empty.
        """
        # stub that mimics a metadata lookup with no match
        workflow.get_epmc_md = lambda *args, **kwargs: (None, None)

        rec = models.Record()
        rec.pmcid = "PMC4219345"
        rec.id = rec.makeid()

        oag_register = []
        message = workflow.WorkflowMessage(record=rec, oag_register=oag_register)
        workflow.process_record(message)

        assert rec.confidence is None
        assert len(rec.provenance) == 1
        assert oag_register == []
Exemple #9
0
    def test_10_process_record_01_everything(self):
        """Run process_record with metadata and complete fulltext available.

        The EPMC metadata and fulltext lookups are stubbed to read the local
        fixture files unchanged, the DOAJ lookup is stubbed to return False,
        and workflow.embargo / workflow.ou_core are replaced with no-ops.  The
        record is expected to come out as an AAM with a "cc-by" licence
        sourced from "epmc_xml", in a hybrid journal, with nothing added to
        the OAG register.
        """
        def mock_get_md(*args, **kwargs):
            # context manager so the fixture file handle is closed promptly
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.load(f))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            with open(EPMC_FT, "r") as f:
                data = f.read()
            return epmc.EPMCFullText(data)

        def mock_doaj(*args, **kwargs):
            return False

        def mock_romeo(*args, **kwargs):
            pass

        def mock_core(*args, **kwargs):
            pass

        # NOTE(review): the stubs stay installed on the workflow module after
        # this test; presumably a class-level tearDown restores them - confirm.
        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj
        workflow.embargo = mock_romeo
        workflow.ou_core = mock_core

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is True
        assert record.aam is True
        assert record.aam_from_xml is True
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert record.journal_type == "hybrid"
        # AAM status and licence both came from the XML: nothing queued for OAG
        assert len(oag) == 0
Exemple #10
0
    def test_10_process_record_04_licence_no_aam(self):
        """Run process_record on licensed fulltext that is not an AAM.

        The EPMC metadata and fulltext lookups are stubbed to read the local
        fixture files, with the first manuscript-type <article-id> removed
        from the fulltext; the DOAJ lookup is stubbed to return True.  The
        record is expected to have aam False (determined from the XML), a
        "cc-by" licence sourced from "epmc_xml", journal type "oa", and
        nothing added to the OAG register.
        """
        def mock_get_md(*args, **kwargs):
            # context manager so the fixture file handle is closed promptly
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.load(f))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            with open(EPMC_FT, "r") as f:
                data = f.read()
            xml = etree.fromstring(data)
            # drop the manuscript identifier from the fixture XML
            aids = xml.xpath("//article-id[@pub-id-type='manuscript']")
            aids[0].getparent().remove(aids[0])
            return epmc.EPMCFullText(etree.tostring(xml))

        def mock_doaj(*args, **kwargs):
            return True

        # NOTE(review): the stubs stay installed on the workflow module after
        # this test; presumably a class-level tearDown restores them - confirm.
        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is True
        assert record.aam is False
        assert record.aam_from_xml is True
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert record.journal_type == "oa"
        assert len(oag) == 0
Exemple #11
0
    def test_10_process_record_05_no_ft(self):
        """Run process_record when metadata is found but fulltext is not.

        The EPMC metadata lookup is stubbed to read the local fixture file,
        the fulltext lookup is stubbed to return None and the DOAJ lookup to
        return False.  The record is expected to have no fulltext-derived
        information (no AAM status, no licence), journal type "hybrid", and
        to be added to the OAG register by its PMCID.
        """
        def mock_get_md(*args, **kwargs):
            # context manager so the fixture file handle is closed promptly
            with open(EPMC_MD, "r") as f:
                md = epmcmod.EPMCMetadata(json.load(f))
            return md, 1.0

        def mock_get_ft(*args, **kwargs):
            return None

        def mock_doaj(*args, **kwargs):
            return False

        # NOTE(review): the stubs stay installed on the workflow module after
        # this test; presumably a class-level tearDown restores them - confirm.
        workflow.get_epmc_md = mock_get_md
        workflow.get_epmc_fulltext = mock_get_ft
        workflow.doaj_lookup = mock_doaj

        record = models.Record()
        record.pmcid = "PMC4219345"
        record.id = record.makeid()
        oag = []
        msg = workflow.WorkflowMessage(record=record, oag_register=oag)
        workflow.process_record(msg)

        assert record.confidence == 1.0
        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"
        assert record.in_epmc is True
        assert record.is_oa is False
        assert len(record.issn) == 1
        assert "1471-2121" in record.issn
        assert record.id is not None  # implies it has been saved
        assert record.has_ft_xml is False
        assert record.aam is None
        assert record.aam_from_xml is False
        assert record.licence_type is None
        assert record.licence_source is None
        assert record.journal_type == "hybrid"
        # nothing could be read from fulltext, so the record is queued for OAG
        assert len(oag) == 1
        assert oag[0]["id"] == "PMC4219345"
        assert oag[0]["type"] == "pmcid"
Exemple #12
0
    def test_05_populate_identifiers(self):
        """Check that populate_identifiers fills in only missing identifiers.

        On an empty record all three identifiers are taken from the EPMC
        metadata fixture.  When the record already carries a pmcid and pmid,
        those are expected to be left alone while the missing doi is filled
        in from the metadata.
        """
        record = models.Record()
        msg = workflow.WorkflowMessage(record=record)

        # context manager so the fixture file handle is closed promptly
        with open(EPMC_MD, "r") as f:
            data = json.load(f)
        epmc_md = epmcmod.EPMCMetadata(data)

        workflow.populate_identifiers(msg, epmc_md)

        assert record.pmcid == "PMC4219345"
        assert record.pmid == "24279897"
        assert record.doi == "10.1186/1471-2121-14-52"

        # pre-set two identifiers to different values and remove the third
        record.pmcid = "PMC000000"
        record.pmid = "0000000"
        del record.doi

        workflow.populate_identifiers(msg, epmc_md)

        # existing identifiers survive; only the missing doi is populated
        assert record.pmcid == "PMC000000"
        assert record.pmid == "0000000"
        assert record.doi == "10.1186/1471-2121-14-52"
Exemple #13
0
    def test_08_ft_licence(self):
        """Check extract_fulltext_licence against variations of the licence XML.

        The <license> element of the fulltext fixture is mutated in place for
        each case (licence in the type attribute, in the href, as a URL in the
        text, in a second paragraph, in words, unrecognised, and absent), and
        after each mutation the tree is re-serialised and run through
        workflow.extract_fulltext_licence on a fresh record.
        """
        # context manager so the fixture file handle is closed promptly
        with open(EPMC_FT, "r") as f:
            data = f.read()
        xml = etree.fromstring(data)

        licences = xml.xpath("//license")
        licence = licences[0]
        lp = licence.find("license-p")

        def extract_licence():
            # Serialise the tree in its current state, run the licence
            # extraction on a fresh record and return that record.
            ft = epmc.EPMCFullText(etree.tostring(xml))
            record = models.Record()
            msg = workflow.WorkflowMessage(record=record)
            workflow.extract_fulltext_licence(msg, ft)
            return record

        # licence in type attribute
        licence.set(
            "license-type", "cc by"
        )  # note the missing "-"; to test the licence representation variations at the same time
        licence.set("{http://www.w3.org/1999/xlink}href", "http://random.url")
        lp.clear()
        record = extract_licence()
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in href attribute
        licence.set("license-type", "open access")
        licence.set("{http://www.w3.org/1999/xlink}href",
                    "http://creativecommons.org/licenses/by-nd/3.0")
        record = extract_licence()
        assert record.licence_type == "cc-by-nd"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in text
        licence.set("license-type", "open access")
        licence.set("{http://www.w3.org/1999/xlink}href", "http://random.url")
        lp.text = "licence is <a href='http://creativecommons.org/licenses/by-nc-nd/3.0'>http://creativecommons.org/licenses/by-nc-nd/3.0</a>"
        record = extract_licence()
        assert record.licence_type == "cc-by-nc-nd"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in /second/ licence paragraph
        lp.text = "some waffle"
        lp2 = etree.SubElement(licence, "license-p")
        lp2.text = "licence is <a href='http://creativecommons.org/licenses/by/3.0'>http://creativecommons.org/licenses/by/3.0</a>"
        record = extract_licence()
        assert record.licence_type == "cc-by"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence in words in text
        licence.set("license-type", "open access")
        licence.set("{http://www.w3.org/1999/xlink}href", "http://random.url")
        lp.text = "This is a Creative Commons Attribution-NonCommercial licenced article"
        licence.remove(lp2)
        record = extract_licence()
        assert record.licence_type == "cc-by-nc"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # licence present but unrecognised
        lp.text = "wibble wibble wobble"
        record = extract_licence()
        assert record.licence_type == "non-standard-licence"
        assert record.licence_source == "epmc_xml"
        assert len(record.provenance) == 1

        # no licence element present
        licence.getparent().remove(licence)
        record = extract_licence()
        assert record.licence_type is None
        assert record.licence_source is None
        assert len(record.provenance) == 0
Exemple #14
0
    def test_02_send_to_oag(self):
        """Walk register_with_oag through identifier / AAM / licence combinations.

        A single record is mutated step by step; after each step the record is
        offered to workflow.register_with_oag and the contents of the OAG
        register are checked.
        """
        rec = models.Record()

        def registered(register=None):
            # Offer the record (in its current state) to register_with_oag
            # using the given register, or a fresh empty one, and return it.
            if register is None:
                register = []
            message = workflow.WorkflowMessage(record=rec, oag_register=register)
            workflow.register_with_oag(message)
            return register

        def assert_single_entry(register, identifier, id_type):
            # The register must hold exactly one entry with this id and type.
            assert len(register) == 1
            assert register[0].get("id") == identifier
            assert register[0].get("type") == id_type

        # PMCID with both AAM status and licence known: nothing to register
        rec.pmcid = "PMC1234"
        rec.aam_from_xml = True
        rec.licence_type = "CC BY"
        rec.id = rec.makeid()
        assert len(registered()) == 0

        # PMCID and AAM status, but the licence is missing
        rec.pmcid = "PMC1234"
        rec.aam_from_xml = True
        del rec.licence_type
        assert_single_entry(registered(), "PMC1234", "pmcid")

        # PMCID and licence, but AAM status not taken from the XML
        rec.pmcid = "PMC1234"
        rec.aam_from_xml = False
        rec.licence_type = "CC BY"
        assert_single_entry(registered(), "PMC1234", "pmcid")

        # PMCID only: neither AAM status nor licence
        rec.pmcid = "PMC1234"
        rec.aam_from_xml = False
        del rec.licence_type
        assert_single_entry(registered(), "PMC1234", "pmcid")

        # No PMCID, licence present (this test has not assigned a DOI yet)
        del rec.pmcid
        rec.aam_from_xml = False
        rec.licence_type = "CC BY"
        assert len(registered()) == 0

        # No PMCID, DOI present, no licence
        rec.doi = "10.1234"
        del rec.licence_type
        assert_single_entry(registered(), "10.1234", "doi")

        # Neither PMCID nor DOI; PMID present, no licence
        del rec.doi
        rec.pmid = "1234"
        assert_single_entry(registered(), "1234", "pmid")

        # Neither identifiers nor licence
        del rec.pmid
        del rec.licence_type
        assert len(registered()) == 0

        # an identifier already in the register must not be added again
        rec.pmid = "1234"
        existing = registered([{"id": "1234", "type": "pmid"}])
        assert len(existing) == 1
Exemple #15
0
    record = models.Record()
    record.upload_id = job.id
    record.upload_pos = 1

    if args.type.lower() == "pmcid":
        record.pmcid = args.identifier
    elif args.type.lower() == "pmid":
        record.pmid = args.identifier
    elif args.type.lower() == "doi":
        record.doi = args.identifier
    record.save()

    time.sleep(2)

    oag_register = []
    msg = workflow.WorkflowMessage(job, record, oag_register)
    workflow.process_record(msg)
    workflow.process_oag(oag_register, job)

    time.sleep(2)

    i = 0
    while True:
        i += 1
        pcc = job.pc_complete
        print i, job.pc_complete, "%",
        sys.stdout.flush()
        if int(pcc) == 100:
            break
        time.sleep(2)
    def test_01_get_epmc_metadata(self):
        """Check workflow.get_epmc_md across combinations of record identifiers.

        A single record is mutated step by step; after each step a new
        WorkflowMessage is built around it and passed to get_epmc_md, and the
        returned (metadata, confidence) pair is checked.
        """
        rec = models.Record()

        def lookup():
            # Query EPMC metadata for the record in its current state.
            return workflow.get_epmc_md(workflow.WorkflowMessage(record=rec))

        def assert_no_match():
            # A failed lookup yields neither metadata nor a confidence value.
            metadata, confidence = lookup()
            assert metadata is None
            assert confidence is None

        # an empty record should result in a failure
        assert_no_match()

        # a pmcid that yields a result
        rec.pmcid = PMCID_SUCCESS
        metadata, confidence = lookup()
        assert metadata is not None
        assert metadata.title == PMCID_SUCCESS_TITLE
        assert confidence == 1.0

        # a pmcid that does not yield a result
        rec.pmcid = PMCID_ERROR
        assert_no_match()

        # an invalid pmcid alongside a valid pmid
        rec.pmcid = PMCID_ERROR
        rec.pmid = PMID_SUCCESS
        metadata, confidence = lookup()
        assert metadata is not None
        assert metadata.title == PMID_SUCCESS_TITLE
        assert confidence == 1.0

        # an invalid pmid only
        del rec.pmcid
        rec.pmid = PMID_ERROR
        assert_no_match()

        # an invalid pmid alongside a valid doi
        rec.pmid = PMID_ERROR
        rec.doi = DOI_SUCCESS
        metadata, confidence = lookup()
        assert metadata is not None
        assert metadata.title == DOI_SUCCESS_TITLE
        assert confidence == 1.0

        # an invalid doi only
        del rec.pmid
        rec.doi = DOI_ERROR
        assert_no_match()

        # an invalid doi and a title which can be matched exactly
        rec.doi = DOI_ERROR
        rec.title = EXACT_TITLE
        metadata, confidence = lookup()
        assert metadata is not None
        assert metadata.pmcid == EXACT_TITLE_PMCID
        assert confidence < 1.0

        # a title that can be matched fuzzily
        del rec.doi
        rec.title = FUZZY_TITLE
        metadata, confidence = lookup()
        assert metadata is not None
        assert metadata.pmcid == FUZZY_TITLE_PMCID
        assert confidence < 1.0

        # a title that can't be matched in any way
        rec.title = TITLE_ERROR
        assert_no_match()