def test_01_read(self):
        """Read the sheet at TEST_SUBMISSION and spot-check each object it yields."""
        expected_fields = ("university", "pmcid", "journal_title")
        sheet = sheets.MasterSheet(path=TEST_SUBMISSION)

        saw_any = False
        for record in sheet.objects():
            saw_any = True
            # only a handful of fields are checked, enough to show the object looks sane
            for field in expected_fields:
                assert field in record

        # the sheet must have produced at least one object
        assert saw_any
    def test_05_defaults(self):
        """Check that a None "aam" and an empty "licence" are read back as "unknown"."""
        buf = StringIO()
        sheet = sheets.MasterSheet(writer=buf)

        record = {
            "aam" : None,
            "licence" : "",
            "university" : "A"
        }
        sheet.add_object(record)

        # exactly one object should come back, with the defaults applied
        count = 0
        for row in sheet.objects():
            count = count + 1
            assert row.get("aam") == "unknown"
            assert row.get("licence") == "unknown"
            assert row.get("university") == "A"
        assert count == 1
    def test_03_write_full(self):
        """Add records to a sheet created without an explicit spec and read them back."""
        out = StringIO()
        sheet = sheets.MasterSheet(writer=out)

        # first record: every key is one the sheet is expected to know about
        sheet.add_object({
            "university" : "A",
            "pmcid" : "a",
            "journal_title" : "1"
        })

        # one record should come back, padded out to one value per OUTPUT_ORDER column
        expected = {
            "university" : "A",
            "pmcid" : "a",
            "journal_title" : "1",
            "doi" : ""
        }
        seen = 0
        for rec in sheet.objects():
            seen += 1
            for key, value in expected.items():
                assert rec.get(key) == value
            assert len(rec.keys()) == len(sheet.OUTPUT_ORDER)
        assert seen == 1

        # second record: carries a key ("something_else") that must not survive
        sheet.add_object({
            "university" : "C",
            "pmcid" : "c",
            "something_else" : "Gamma"
        })

        # locate the new record and check it was defaulted / trimmed correctly
        seen = 0
        matched = False
        for rec in sheet.objects():
            seen += 1
            if rec.get("university") != "C":
                continue
            matched = True
            assert rec.get("journal_title") == ""
            assert rec.get("pmcid") == "c"
            assert "something_else" not in rec

        assert seen == 2
        assert matched
    def test_04_output(self):
        """Save a three-column sheet and check the header and data row of the CSV produced."""
        # note: the spec deliberately lists the columns in a different order from
        # the one the saved file is expected to use
        columns = ["journal_title", "university", "pmcid"]
        buf = StringIO()
        sheet = sheets.MasterSheet(writer=buf, spec=columns)

        # a single record that matches the spec exactly
        record = {
            "university" : "A",
            "pmcid" : "a",
            "journal_title" : "1"
        }
        sheet.add_object(record)

        # serialise the sheet into the StringIO buffer
        sheet.save()

        # rewind and parse what was written with the standard csv reader
        buf.seek(0)
        rows = list(csv.reader(buf))

        # one header row plus one data row
        assert len(rows) == 2
        assert rows[0] == ["University", "PMCID", "Journal title"]
        assert rows[1] == ["A", "a", "1"]
Exemple #5
0
def parse_csv(job):
    """
    Load the uploaded spreadsheet for ``job`` and save one models.Record per row.

    The file is read from ``<UPLOAD_DIR>/<job.id>.csv``.  Each row is stored as the
    record's source data together with the upload id and its 1-based position in the
    sheet.  Any non-empty PMCID, PMID or DOI is run through its normaliser; the outcome
    (normalised value, or a note that it was invalid) is recorded as an "importer"
    provenance entry.  A non-empty article title is copied to the record's title.

    Raises WorkflowException if UPLOAD_DIR is not configured.
    """
    app.logger.info("Loading records from " + job.id)

    # work out where the uploaded file lives
    upload_dir = app.config.get("UPLOAD_DIR")
    if upload_dir is None or upload_dir == "":
        raise WorkflowException("UPLOAD_DIR is not set")

    # FIXME: what happens if the sheet can't be read
    sheet = sheets.MasterSheet(os.path.join(upload_dir, job.id + ".csv"))

    # (field name, normaliser, note on success, note on failure), processed in this order.
    # The field name is both the key in the source object and the attribute on the record.
    identifier_rules = (
        (
            "pmcid", normalise_pmcid,
            "normalised PMCID %(source)s to %(target)s",
            "PMCID %(source)s was syntactically invalid, so ignoring",
        ),
        (
            "pmid", normalise_pmid,
            "normalised PMID %(source)s to %(target)s",
            "PMID %(source)s was syntactically invalid, so ignoring",
        ),
        (
            "doi", normalise_doi,
            "normalised DOI %(source)s to %(target)s",
            "DOI %(source)s was syntactically invalid, so ignoring",
        ),
    )

    count = 0
    for count, obj in enumerate(sheet.objects(), 1):
        record = models.Record()
        record.upload_id = job.id
        record.upload_pos = count
        record.set_source_data(**obj)

        # copy the identifiers into the places where they can be normalised and looked up
        for field, normalise, ok_note, bad_note in identifier_rules:
            raw = obj.get(field)
            if raw is None or raw == "":
                continue

            normalised = normalise(raw)
            if normalised is not None:
                setattr(record, field, normalised)
                # the target is read back from the record, not taken from the normaliser
                note = ok_note % {
                    "source": raw,
                    "target": getattr(record, field)
                }
            else:
                note = bad_note % {"source": raw}
            record.add_provenance("importer", note)

        title = obj.get("article_title")
        if title is not None and title != "":
            record.title = title

        record.save()

    app.logger.info("Loaded " + str(count) + " records from spreadsheet")

    # FIXME: I'm not totally convinced this a/ works or b/ is a good idea
    # Refresh can behave quite strangely, sometimes,
    # refresh the index so the data is ready to use
    models.Record.refresh()
Exemple #6
0
def output_csv(job):
    """
    Build the results spreadsheet for ``job`` and return it as a CSV string.

    Every record listed for the job's upload is flattened into a dict holding the
    identifiers, the results of the run and any original source columns, then added
    to a sheets.MasterSheet whose columns are taken from the first record.

    NOTE: if no records exist for the job, ``records[0]`` raises IndexError.
    """
    def serialise_provenance(r):
        # one "[when by] what" entry per provenance tuple, separated by a blank line
        return "\n\n".join(
            "[%(when)s %(by)s] %(what)s" % {
                "when": when,
                "by": by,
                "what": what
            }
            for by, when, what in r.provenance
        )

    def objectify(r):
        obj = {
            # the identifiers
            "pmcid": r.pmcid,
            "pmid": r.pmid,
            "doi": r.doi,
            "article_title": r.title,

            # the results of the run
            "in_epmc": r.in_epmc,
            "xml_ft_in_epmc": r.has_ft_xml,
            "aam": r.aam,
            "open_access": r.is_oa,
            "licence": r.licence_type,
            "licence_source": r.licence_source,
            "journal_type": r.journal_type,
            "confidence": r.confidence,
            "standard_compliance": r.standard_compliance,
            "deluxe_compliance": r.deluxe_compliance,
            "provenance": serialise_provenance(r),
            "issn": ", ".join(r.issn),

            # this is also a result of the run, but it can be overridden by the source data
            # if it was passed in and not empty
            "journal_title": r.journal
        }

        # add the original data if present, being careful not to overwrite the data we have produced
        if r.source is not None:
            # The fields we produced win over the source.  Use a real set rather than
            # obj.keys(): on Python 3 keys() is a view with no remove() method.
            overwrite = set(obj)

            # journal_title is the exception: a non-empty source value is kept
            jt = r.source.get("journal_title")
            if jt is not None and jt != "":
                overwrite.discard("journal_title")

            # work on a copy so the record's own source data is never modified
            original = deepcopy(r.source)
            for k in overwrite:
                if k in original:
                    del original[k]
            obj.update(original)

        return obj

    # get the records and work out what shape they are
    # (makes the assumption that all records have the same spec, which /should/ be true)
    records = models.Record.list_by_upload(job.id)
    spec = objectify(records[0])

    # create a master spreadsheet with the right shape; pass a concrete list of
    # column names rather than a dict view
    s = StringIO()
    sheet = sheets.MasterSheet(writer=s, spec=list(spec))

    # for each record, objectify it and add to the sheet
    for r in records:
        assert isinstance(r, models.Record)
        sheet.add_object(objectify(r))

    sheet.save()

    return s.getvalue()
    def test_02_write_subset(self):
        """Add conforming, short and over-long records to a three-column sheet and read them back."""
        # a minimal sheet with just three columns
        columns = ["university", "pmcid", "journal_title"]
        out = StringIO()
        sheet = sheets.MasterSheet(writer=out, spec=columns)

        # first record: matches the spec exactly
        sheet.add_object({
            "university" : "A",
            "pmcid" : "a",
            "journal_title" : "1"
        })

        # it should come back unchanged, with exactly the three spec'd keys
        expected = {
            "university" : "A",
            "pmcid" : "a",
            "journal_title" : "1"
        }
        seen = 0
        for rec in sheet.objects():
            seen += 1
            for key, value in expected.items():
                assert rec.get(key) == value
            assert len(rec.keys()) == 3
        assert seen == 1

        # second record: no pmcid supplied
        sheet.add_object({
            "university" : "B",
            "journal_title" : "2",
        })

        # the missing column should read back as an empty string
        seen = 0
        matched = False
        for rec in sheet.objects():
            seen += 1
            if rec.get("university") != "B":
                continue
            matched = True
            assert rec.get("journal_title") == "2"
            assert rec.get("pmcid") == ""

        assert seen == 2
        assert matched

        # third record: carries a key that is not part of the spec
        sheet.add_object({
            "university" : "C",
            "pmcid" : "c",
            "something_else" : "Gamma"
        })

        # the extra key should be dropped and the missing column defaulted
        seen = 0
        matched = False
        for rec in sheet.objects():
            seen += 1
            if rec.get("university") != "C":
                continue
            matched = True
            assert rec.get("journal_title") == ""
            assert rec.get("pmcid") == "c"
            assert "something_else" not in rec

        assert seen == 3
        assert matched
 def test_06_blank_rows(self):
     """Read the sheet at BLANK_LINES and check that exactly 20 objects are produced."""
     sheet = sheets.MasterSheet(path=BLANK_LINES)
     rows = list(sheet.objects())
     assert len(rows) == 20