def test_create_dataset_from_minimal_dataverse_json(shared_datadir):
    """Convert the minimal JPAL fixture and compare it with a hand-built Dataset.

    Reads ``jpal/jpal_minimal_record.json`` from ``shared_datadir``, parses it
    as JSON, feeds it to ``create_from_dataverse_json`` and asserts that the
    result equals the expected ``Dataset`` constructed below.
    """
    record_path = shared_datadir / "jpal/jpal_minimal_record.json"
    parsed_record = json.loads(record_path.read_text())
    actual = create_from_dataverse_json(parsed_record)

    expected_authors = [
        Author(authorName="Finch, Fiona", authorAffiliation="Birds Inc."),
    ]
    expected_contacts = [
        Contact(
            datasetContactName="Finch, Fiona",
            datasetContactEmail="*****@*****.**",
        ),
    ]
    expected_description = [
        Description(
            dsDescriptionValue="Darwin's finches (also known"
            " as the Galápagos finches) are a group of about"
            " fifteen species of passerine birds.",
        ),
    ]
    # The converter always puts the JPAL Dataverse distributor first.
    expected_distributors = [
        Distributor(
            distributorName="The Abdul Latif Jameel Poverty Action Lab Dataverse",
            distributorURL="https://dataverse.harvard.edu/dataverse/jpal",
        ),
    ]

    expected = Dataset(
        alternativeURL="https://doi.org/00.0000/DVN/00002",
        authors=expected_authors,
        contacts=expected_contacts,
        description=expected_description,
        distributionDate="2020-01-01",
        distributors=expected_distributors,
        subjects=["Medicine, Health and Life Sciences"],
        title="Darwin's Finches",
    )
    assert expected == actual
def test_minimal_dataset(dataverse_minimal_json_record):
    """Serialize a five-field Dataset and compare it with the minimal fixture.

    The record holds one author, one contact, one description, one subject
    and a title; ``asdict()`` must equal ``dataverse_minimal_json_record``.
    """
    finch_text = (
        "Darwin's finches (also known"
        " as the Galápagos finches) are a group of about"
        " fifteen species of passerine birds."
    )
    new_record = Dataset(
        authors=[
            Author(authorName="Finch, Fiona", authorAffiliation="Birds Inc."),
        ],
        contacts=[
            Contact(
                datasetContactName="Finch, Fiona",
                datasetContactEmail="*****@*****.**",
            ),
        ],
        description=[Description(dsDescriptionValue=finch_text)],
        subjects=["Medicine, Health and Life Sciences"],
        title="Darwin's Finches",
    )
    assert new_record.asdict() == dataverse_minimal_json_record
def test_asdict_removes_null_values(dataverse_partial_json_record):
    """Serialize a partially populated Dataset and compare with the partial fixture.

    Every sub-record is created with only some of its fields and
    ``kindOfData`` is passed explicitly as ``None``. Both sides are dumped to
    JSON with sorted keys so the comparison ignores key order.
    """
    first_author = Author(
        authorName="LastAuthor1, FirstAuthor1",
        authorAffiliation="AuthorAffiliation1",
    )
    first_contact = Contact(
        datasetContactName="LastContact1, FirstContact1",
        datasetContactEmail="*****@*****.**",
    )
    first_description = Description(dsDescriptionValue="DescriptionText 1")
    contributor = Contributor(
        contributorName="LastContributor1, FirstContributor1",
    )
    distributor = Distributor(
        distributorName="LastDistributor1, FirstDistributor1",
    )
    keyword = Keyword(keywordValue="KeywordTerm1")
    grant_number = GrantNumber(grantNumberValue="GrantInformationGrantNumber1")
    other_id = OtherId(otherIdValue="OtherIDIdentifier1")
    producer = Producer(producerName="LastProducer1, FirstProducer1")
    publication = Publication(
        publicationCitation="RelatedPublicationCitation1",
        publicationURL="http://RelatedPublicationURL1.org",
    )
    series_info = Series(seriesInformation="SeriesInformation")

    new_record = Dataset(
        authors=[first_author],
        contacts=[first_contact],
        description=[first_description],
        subjects=["Agricultural Sciences"],
        title="Replication Data for: Title",
        keywords=[keyword],
        otherIds=[other_id],
        publications=[publication],
        producers=[producer],
        contributors=[contributor],
        grantNumbers=[grant_number],
        distributors=[distributor],
        kindOfData=None,
        series=series_info,
    )

    serialized = json.dumps(new_record.asdict(), sort_keys=True)
    wanted = json.dumps(dataverse_partial_json_record, sort_keys=True)
    assert serialized == wanted
def test_full_dataset(dataverse_full_json_record):
    """Serialize a Dataset with all fields supplied and compare with the full fixture.

    Both ``asdict()`` output and ``dataverse_full_json_record`` are dumped to
    JSON with sorted keys before being compared, so key order is irrelevant.
    """
    first_author = Author(
        authorName="LastAuthor1, FirstAuthor1",
        authorAffiliation="AuthorAffiliation1",
        authorIdentifier="AuthorIdentifier1",
        authorIdentifierScheme="ORCID",
    )
    first_contact = Contact(
        datasetContactName="LastContact1, FirstContact1",
        datasetContactEmail="*****@*****.**",
        datasetContactAffiliation="ContactAffiliation1",
    )
    first_description = Description(
        dsDescriptionValue="DescriptionText 1",
        dsDescriptionDate="1000-01-01",
    )
    contributor = Contributor(
        contributorName="LastContributor1, FirstContributor1",
        contributorType="Data Collector",
    )
    distributor = Distributor(
        distributorName="LastDistributor1, FirstDistributor1",
        distributorURL="http://DistributorURL1.org",
    )
    keyword = Keyword(keywordValue="KeywordTerm1")
    grant_number = GrantNumber(
        grantNumberValue="GrantInformationGrantNumber1",
        grantNumberAgency="GrantInformationGrantAgency1",
        grantNumberInformation="GrantInformationInformation1",
    )
    other_id = OtherId(
        otherIdValue="OtherIDIdentifier1",
        otherIdAgency="OtherIDAgency1",
    )
    producer = Producer(
        producerName="LastProducer1, FirstProducer1",
        producerURL="http://ProducerURL1.org",
    )
    publication = Publication(
        publicationCitation="RelatedPublicationCitation1",
        publicationIDNumber="RelatedPublicationIDNumber1",
        publicationIDType="ark",
        publicationURL="http://RelatedPublicationURL1.org",
    )
    series_info = Series(
        seriesName="SeriesName",
        seriesInformation="SeriesInformation",
    )
    covered_period = TimePeriodCovered(
        timePeriodCoveredStart="1005-01-01",
        timePeriodCoveredEnd="1005-01-02",
    )

    new_record = Dataset(
        authors=[first_author],
        alternativeURL="http://AlternativeURL.org",
        contacts=[first_contact],
        description=[first_description],
        subjects=[
            "Agricultural Sciences",
            "Business and Management",
            "Engineering",
            "Law",
        ],
        title="Replication Data for: Title",
        keywords=[keyword],
        otherIds=[other_id],
        publications=[publication],
        notesText="Notes1",
        producers=[producer],
        productionPlace="ProductionPlace",
        contributors=[contributor],
        grantNumbers=[grant_number],
        distributors=[distributor],
        distributionDate="1004-01-01",
        language=["English"],
        timePeriodsCovered=[covered_period],
        kindOfData=["KindOfData1", "KindOfData2"],
        series=series_info,
        license="CC0",
        termsOfUse="CC0 Waiver",
    )

    serialized = json.dumps(new_record.asdict(), sort_keys=True)
    wanted = json.dumps(dataverse_full_json_record, sort_keys=True)
    assert serialized == wanted
def test_create_whoas_dim_xml(whoas_oai_server, dspace_oai_xml_series_name_record):
    """Check ``create_from_whoas_dim_xml`` against four WHOAS fixture records.

    A ``GetRecord`` request for item ``1912/6867`` is mocked to return
    ``dspace_oai_xml_series_name_record``. Records 0, 1, 4 and 5 of
    ``whoas_oai_server`` are then converted and compared with the expected
    values defined below.
    """
    with requests_mock.Mocker() as mocker:
        mocker.get(
            "http+mock://example.com/oai?verb=GetRecord&metadataPrefix=dim"
            "&identifier=oai%3Adarchive.mblwhoilibrary.org:1912/6867",
            text=dspace_oai_xml_series_name_record,
        )
        client = OAIClient("http+mock://example.com/oai", "dim", "Test_Collection")

        # ---- expected values shared by the scenarios below ----
        expected_title = (
            "Animals on the Move and Deep‐Sea Vents: Dataset for Spherical Display "
            "Systems"
        )
        expected_authors = [
            Author(
                authorName="Beaulieu, Stace E.",
                authorAffiliation="Woods Hole",
                authorIdentifierScheme=None,
                authorIdentifier=None,
            ),
            Author(
                authorName="Brickley, Annette",
                authorAffiliation="Woods Hole",
                authorIdentifierScheme=None,
                authorIdentifier=None,
            ),
        ]
        expected_contacts = [
            Contact(
                datasetContactName="Woods Hole Open Access Server",
                datasetContactEmail="*****@*****.**",
            ),
        ]
        expected_description = [
            Description(
                dsDescriptionValue="This educational package was developed.",
                dsDescriptionDate=None,
            ),
            Description(
                dsDescriptionValue="Sample abstract",
                dsDescriptionDate=None,
            ),
        ]
        expected_distributors = [
            Distributor(distributorName="Esteemed Publishing Conglomerate"),
        ]
        expected_grants = [
            GrantNumber(
                grantNumberInformation="Funding for this educational package."
            ),
        ]
        expected_keywords = [
            Keyword(keywordValue="Migration"),
            Keyword(keywordValue="Larval dispersal"),
        ]
        expected_notes = (
            "This zipped file contains educational materials. "
            "This educational package is Copyright ©2019 Woods"
            " Hole Oceanographic Institution."
        )
        other_ids_full = [
            OtherId(
                otherIdValue="https://hdl.handle.net/1912/2368",
                otherIdAgency=None,
            ),
            OtherId(otherIdValue="10.26025/8ke9-av98", otherIdAgency=None),
        ]
        other_ids_no_description = [
            OtherId(
                otherIdValue="https://hdl.handle.net/1912/2371",
                otherIdAgency=None,
            ),
        ]
        other_ids_invalid_date = [
            OtherId(
                otherIdValue="https://hdl.handle.net/1912/2372",
                otherIdAgency=None,
            ),
        ]
        expected_publications = [
            Publication(publicationCitation="Associated publication"),
        ]
        expected_series = Series(
            seriesName="Series Title",
            seriesInformation="https://hdl.handle.net/1912/6867",
        )
        complete_periods = [
            TimePeriodCovered(
                timePeriodCoveredStart="2019-06-04",
                timePeriodCoveredEnd="2019-06-04",
            ),
        ]
        start_only_periods = [
            TimePeriodCovered(timePeriodCoveredStart="2019-06-04"),
        ]
        expected_subjects = ["Earth and Environmental Sciences"]

        def verify_populated(
            record, description, other_ids, periods, check_notes=True
        ):
            """Run the assertions shared by the three fully populated records."""
            # No attribute of the converted record may be an empty list.
            for value in record.__dict__.values():
                assert value != []
            assert record.title == expected_title
            assert record.authors == expected_authors
            assert record.contacts == expected_contacts
            assert record.description == description
            assert record.subjects == expected_subjects
            assert record.distributors == expected_distributors
            assert record.grantNumbers == expected_grants
            assert record.keywords == expected_keywords
            assert record.language == ["English"]
            if check_notes:
                assert record.notesText == expected_notes
            assert record.otherIds == other_ids
            assert record.publications == expected_publications
            assert record.series == expected_series
            assert record.timePeriodsCovered == periods
            assert record.license == "Attribution 4.0 International"
            assert record.termsOfUse == "Attribution 4.0 International"

        # minimal record: only the five core fields are compared
        dataset = create_from_whoas_dim_xml(whoas_oai_server[0], client)
        assert dataset.title == expected_title
        assert dataset.authors == expected_authors
        assert dataset.contacts == expected_contacts
        assert dataset.description == expected_description
        assert dataset.subjects == expected_subjects

        # full record
        dataset = create_from_whoas_dim_xml(whoas_oai_server[1], client)
        verify_populated(
            dataset, expected_description, other_ids_full, complete_periods
        )

        # record with no description: the title is expected as the description,
        # and notesText is not checked for this record
        dataset = create_from_whoas_dim_xml(whoas_oai_server[4], client)
        verify_populated(
            dataset,
            [Description(dsDescriptionValue=expected_title)],
            other_ids_no_description,
            complete_periods,
            check_notes=False,
        )

        # record with invalid date: only the start of the period is expected
        dataset = create_from_whoas_dim_xml(whoas_oai_server[5], client)
        verify_populated(
            dataset,
            expected_description,
            other_ids_invalid_date,
            start_only_periods,
        )
def create_from_dataverse_json(data: dict) -> Dataset:
    """Build a ``Dataset`` from a Dataverse JSON record.

    Top-level ``persistentUrl`` and ``publicationDate`` become
    ``alternativeURL`` and ``distributionDate``. ``license`` and
    ``termsOfUse`` come from ``data["datasetVersion"]``. The remaining
    keyword arguments are filled from the entries of
    ``datasetVersion.metadataBlocks.citation.fields``, selected by each
    entry's ``typeName``; entries with an unrecognized ``typeName`` are
    ignored.

    The distributor list always starts with the JPAL Dataverse entry; any
    distributors present in the record are appended after it.

    Args:
        data: Parsed JSON record.

    Returns:
        The ``Dataset`` created from the collected keyword arguments.

    Raises:
        KeyError: If ``datasetVersion``, the path to the citation fields, or a
            directly indexed sub-field (e.g. ``authorName``) is missing.
    """
    kwargs: Dict[str, Any] = {
        "alternativeURL": data.get("persistentUrl"),
        "distributionDate": data.get("publicationDate"),
        "distributors": [
            Distributor(
                distributorName="The Abdul Latif Jameel Poverty Action Lab Dataverse",
                distributorURL="https://dataverse.harvard.edu/dataverse/jpal",
            )
        ],
    }

    version = data["datasetVersion"]
    kwargs["license"] = version.get("license")
    kwargs["termsOfUse"] = version.get("termsOfUse")

    # Helper defined elsewhere in the project; presumably yields None when the
    # requested sub-field is absent -- confirm against its definition.
    optional = get_optional_value

    # Field types whose raw value is stored under a keyword of the same name.
    copied_as_is = (
        "kindOfData",
        "language",
        "notesText",
        "productionPlace",
        "title",
    )

    for field in version["metadataBlocks"]["citation"]["fields"]:
        kind = field["typeName"]

        if kind in copied_as_is:
            kwargs[kind] = field["value"]
        elif kind == "subject":
            kwargs["subjects"] = field["value"]
        elif kind == "author":
            kwargs["authors"] = [
                Author(
                    authorAffiliation=item["authorAffiliation"]["value"],
                    authorIdentifier=optional(item, "authorIdentifier"),
                    authorIdentifierScheme=optional(item, "authorIdentifierScheme"),
                    authorName=item["authorName"]["value"],
                )
                for item in field["value"]
            ]
        elif kind == "datasetContact":
            kwargs["contacts"] = [
                Contact(
                    datasetContactAffiliation=optional(
                        item, "datasetContactAffiliation"
                    ),
                    datasetContactEmail=item["datasetContactEmail"]["value"],
                    datasetContactName=item["datasetContactName"]["value"],
                )
                for item in field["value"]
            ]
        elif kind == "contributor":
            kwargs["contributors"] = [
                Contributor(
                    contributorName=optional(item, "contributorName"),
                    contributorType=optional(item, "contributorType"),
                )
                for item in field["value"]
            ]
        elif kind == "dsDescription":
            kwargs["description"] = [
                Description(
                    dsDescriptionDate=optional(item, "dsDescriptionDate"),
                    dsDescriptionValue=item["dsDescriptionValue"]["value"],
                )
                for item in field["value"]
            ]
        elif kind == "distributor":
            # Appended to, not replacing, the default JPAL distributor.
            record_distributors = [
                Distributor(
                    distributorName=optional(item, "distributorName"),
                    distributorURL=optional(item, "distributorURL"),
                )
                for item in field["value"]
            ]
            kwargs["distributors"].extend(record_distributors)
        elif kind == "grantNumber":
            kwargs["grantNumbers"] = [
                GrantNumber(
                    grantNumberAgency=optional(item, "grantNumberAgency"),
                    grantNumberValue=optional(item, "grantNumberValue"),
                )
                for item in field["value"]
            ]
        elif kind == "keyword":
            kwargs["keywords"] = [
                Keyword(keywordValue=optional(item, "keywordValue"))
                for item in field["value"]
            ]
        elif kind == "otherId":
            kwargs["otherIds"] = [
                OtherId(
                    otherIdAgency=optional(item, "otherIdAgency"),
                    otherIdValue=optional(item, "otherIdValue"),
                )
                for item in field["value"]
            ]
        elif kind == "producer":
            kwargs["producers"] = [
                Producer(
                    producerName=optional(item, "producerName"),
                    producerURL=optional(item, "producerURL"),
                )
                for item in field["value"]
            ]
        elif kind == "publication":
            kwargs["publications"] = [
                Publication(
                    publicationCitation=optional(item, "publicationCitation"),
                    publicationIDNumber=optional(item, "publicationIDNumber"),
                    publicationIDType=optional(item, "publicationIDType"),
                    publicationURL=optional(item, "publicationURL"),
                )
                for item in field["value"]
            ]
        elif kind == "series":
            # Unlike the list-valued fields, the series value is used as a
            # single compound object.
            kwargs["series"] = Series(
                seriesName=optional(field["value"], "seriesName"),
                seriesInformation=optional(field["value"], "seriesInformation"),
            )
        elif kind == "timePeriodCovered":
            kwargs["timePeriodsCovered"] = [
                TimePeriodCovered(
                    timePeriodCoveredStart=optional(item, "timePeriodCoveredStart"),
                    timePeriodCoveredEnd=optional(item, "timePeriodCoveredEnd"),
                )
                for item in field["value"]
            ]

    return Dataset(**kwargs)
def test_create_dataset_from_full_dataverse_json(shared_datadir):
    """Convert the complete JPAL fixture and compare it with a hand-built Dataset.

    Reads ``jpal/jpal_complete_record.json`` from ``shared_datadir``, parses it
    as JSON, feeds it to ``create_from_dataverse_json`` and asserts that the
    result equals the expected ``Dataset`` assembled below.
    """
    record_path = shared_datadir / "jpal/jpal_complete_record.json"
    parsed_record = json.loads(record_path.read_text())
    actual = create_from_dataverse_json(parsed_record)

    expected_authors = [
        Author(
            authorAffiliation="AuthorAffiliation1",
            authorIdentifier="AuthorIdentifier1",
            authorIdentifierScheme="ORCID",
            authorName="LastAuthor1, FirstAuthor1",
        ),
        Author(
            authorAffiliation="AuthorAffiliation2",
            authorIdentifier="AuthorIdentifier2",
            authorIdentifierScheme="ORCID",
            authorName="LastAuthor2, FirstAuthor2",
        ),
    ]
    expected_contacts = [
        Contact(
            datasetContactAffiliation="ContactAffiliation1",
            datasetContactEmail="*****@*****.**",
            datasetContactName="LastContact1, FirstContact1",
        ),
        Contact(
            datasetContactAffiliation="ContactAffiliation2",
            datasetContactEmail="*****@*****.**",
            datasetContactName="LastContact2, FirstContact2",
        ),
    ]
    expected_contributors = [
        Contributor(
            contributorName="LastContributor1, FirstContributor1",
            contributorType="Data Collector",
        ),
        Contributor(
            contributorName="LastContributor2, FirstContributor2",
            contributorType="Researcher",
        ),
    ]
    expected_description = [
        Description(
            dsDescriptionDate="2020-01-01",
            dsDescriptionValue="DescriptionText 1",
        ),
        Description(dsDescriptionValue="DescriptionText 2"),
    ]
    # The JPAL Dataverse entry comes first, followed by the record's own
    # distributors.
    expected_distributors = [
        Distributor(
            distributorName="The Abdul Latif Jameel Poverty Action Lab Dataverse",
            distributorURL="https://dataverse.harvard.edu/dataverse/jpal",
        ),
        Distributor(
            distributorName="LastDistributor1, FirstDistributor1",
            distributorURL="http://DistributorURL1.org",
        ),
        Distributor(
            distributorName="LastDistributor2, FirstDistributor2",
            distributorURL="http://DistributorURL2.org",
        ),
    ]
    expected_grants = [
        GrantNumber(
            grantNumberAgency="GrantInformationGrantAgency1",
            grantNumberValue="GrantInformationGrantNumber1",
        ),
    ]
    expected_keywords = [
        Keyword(keywordValue="KeywordTerm1"),
        Keyword(keywordValue="KeywordTerm2"),
    ]
    expected_other_ids = [
        OtherId(
            otherIdAgency="OtherIDAgency1",
            otherIdValue="OtherIDIdentifier1",
        ),
        OtherId(
            otherIdAgency="OtherIDAgency2",
            otherIdValue="OtherIDIdentifier2",
        ),
    ]
    expected_producers = [
        Producer(
            producerName="LastProducer1, FirstProducer1",
            producerURL="http://ProducerURL1.org",
        ),
        Producer(
            producerName="LastProducer2, FirstProducer2",
            producerURL="http://ProducerURL2.org",
        ),
    ]
    expected_publications = [
        Publication(
            publicationCitation="RelatedPublicationCitation1",
            publicationIDNumber="RelatedPublicationIDNumber1",
            publicationIDType="ark",
            publicationURL="http://RelatedPublicationURL1.org",
        ),
        Publication(
            publicationCitation="RelatedPublicationCitation2",
            publicationIDNumber="RelatedPublicationIDNumber2",
            publicationIDType="doi",
            publicationURL="https://doi.org/RelatedPublicationURL2",
        ),
    ]
    expected_series = Series(
        seriesInformation="SeriesInformation",
        seriesName="SeriesName",
    )
    expected_periods = [
        TimePeriodCovered(
            timePeriodCoveredStart="1005-01-01",
            timePeriodCoveredEnd="1005-01-02",
        ),
        TimePeriodCovered(
            timePeriodCoveredStart="2020-01-01",
            timePeriodCoveredEnd="2020-01-02",
        ),
    ]

    expected = Dataset(
        alternativeURL="https://doi.org/00.0000/DVN/00001",
        authors=expected_authors,
        contacts=expected_contacts,
        contributors=expected_contributors,
        description=expected_description,
        distributionDate="2020-06-27",
        distributors=expected_distributors,
        grantNumbers=expected_grants,
        keywords=expected_keywords,
        kindOfData=["KindOfData1", "KindOfData2"],
        language=["English", "Swahili"],
        license="CC0",
        notesText="Notes1",
        otherIds=expected_other_ids,
        producers=expected_producers,
        productionPlace="ProductionPlace",
        publications=expected_publications,
        series=expected_series,
        subjects=[
            "Agricultural Sciences",
            "Business and Management",
            "Engineering",
            "Law",
        ],
        termsOfUse="CC0 Waiver",
        timePeriodsCovered=expected_periods,
        title="Replication Data for: Title",
    )
    assert expected == actual