def test_token_right_create_dataset_rights(self):
    BASE_URL = os.getenv("BASE_URL")
    api_su = NativeApi(BASE_URL, os.getenv("API_TOKEN_SUPERUSER"))
    api_nru = NativeApi(BASE_URL, os.getenv("API_TOKEN_TEST_NO_RIGHTS"))

    resp = api_su.get_info_version()
    assert resp.json()["data"]["version"] == "4.18.1"
    assert resp.json()["data"]["build"] == "267-a91d370"
    # resp = api_nru.get_info_version()
    # assert resp.json()["data"]["version"] == "4.18.1"
    # assert resp.json()["data"]["build"] == "267-a91d370"

    ds = Dataset()
    ds.from_json(
        read_file(
            os.path.join(BASE_DIR, "tests/data/dataset_upload_min_default.json")))
    resp = api_su.create_dataset(":root", ds.json())
    pid = resp.json()["data"]["persistentId"]
    assert resp.json()["status"] == "OK"

    # with pytest.raises(ApiAuthorizationError):
    #     resp = api_nru.get_dataset(pid)

    resp = api_su.delete_dataset(pid)
    assert resp.json()["status"] == "OK"

def test_dataset_init_invalid(self):
    """Test Dataset.init() with invalid data."""
    pdv = Dataset()

    # invalid data
    for data in test_config["invalid_set_types"]:
        with pytest.raises(AssertionError):
            pdv.set(data)

def test_token_empty_string(self):
    BASE_URL = os.getenv("BASE_URL")
    api = NativeApi(BASE_URL, "")
    resp = api.get_info_version()
    assert resp.json()["data"]["version"] == "4.18.1"
    assert resp.json()["data"]["build"] == "267-a91d370"

    with pytest.raises(ApiAuthorizationError):
        ds = Dataset()
        ds.from_json(
            read_file(
                os.path.join(BASE_DIR, "tests/data/dataset_upload_min_default.json")))
        api.create_dataset(":root", ds.json())

def test_dataset_init_valid(self):
    """Test Dataset.__init__() with valid data."""
    # specific
    data = [
        (Dataset(), {}),
        (Dataset(dict_flat_set_min()), object_data_min()),
        (Dataset(dict_flat_set_full()), object_data_full()),
        (Dataset({}), {}),
    ]

    for pdv, data_eval in data:
        for key, val in data_eval.items():
            assert getattr(pdv, key) == data_eval[key]
        assert len(pdv.__dict__) - len(object_data_init()) == len(data_eval)

def test_token_no_rights(self):
    BASE_URL = os.getenv("BASE_URL")
    API_TOKEN = os.getenv("API_TOKEN_NO_RIGHTS")
    api = NativeApi(BASE_URL, API_TOKEN)
    resp = api.get_info_version()
    assert resp.json()["data"]["version"] == "4.15.1"
    assert resp.json()["data"]["build"] == "1377-701b56b"

    with pytest.raises(ApiAuthorizationError):
        ds = Dataset()
        ds.from_json(
            read_file(
                os.path.join(BASE_DIR, "tests/data/dataset_upload_min_default.json")))
        api.create_dataset(":root", ds.json())

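# The token tests above read their configuration from environment variables.
# A minimal sketch of the assumed setup for a local run; the URL and token
# values are placeholders, not real credentials.
# import os
# os.environ["BASE_URL"] = "https://demo.dataverse.org"
# os.environ["API_TOKEN_SUPERUSER"] = "xxxx-xxxx"        # superuser token
# os.environ["API_TOKEN_TEST_NO_RIGHTS"] = "xxxx-xxxx"   # token without create rights
# os.environ["API_TOKEN_NO_RIGHTS"] = "xxxx-xxxx"        # token without create rights
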
def data_object():
    """Get Dataset object.

    Returns
    -------
    pydataverse.models.Dataset
        :class:`Dataset` object.

    """
    return Dataset()

def test_dataset_set_dv_up(self, import_dataset_min_dict):
    """Test Dataset.set() with format=`dv_up`.

    Parameters
    ----------
    import_dataset_min_dict : dict
        Fixture, which returns a flat dataset dict().

    """
    ds = Dataset()
    data = import_dataset_min_dict
    ds.set(data)

    """dataset"""
    assert ds.license == 'CC0'
    assert ds.termsOfUse == 'CC0 Waiver'
    assert ds.termsOfAccess == 'Terms of Access'

    """citation"""
    assert ds.citation_displayName == 'Citation Metadata'
    assert ds.title == 'Replication Data for: Title'

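# Illustrative sketch of the flat dict the `import_dataset_min_dict` fixture
# is assumed to return, inferred from the assertions in the test above; only
# the asserted keys and values are taken from the test, everything else about
# the real fixture file may differ.
# example_min_dict = {
#     "license": "CC0",
#     "termsOfUse": "CC0 Waiver",
#     "termsOfAccess": "Terms of Access",
#     "citation_displayName": "Citation Metadata",
#     "title": "Replication Data for: Title",
# }
# Dataset.set() consumes such a flat dict and exposes each key as an attribute,
# which is what the test checks.
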
def test_dataset_is_valid_valid_not(self):
    """Test Dataset.is_valid() with non-valid data."""
    ds = Dataset()
    ds.import_metadata(TEST_DIR + '/data/dataset_full.json')
    ds.title = None

    assert not ds.is_valid()

def test_dataverse_json_all_valid(self, import_dataverse_min_dict):
    """Test Dataverse.json() with format=`all` and valid data.

    Parameters
    ----------
    import_dataverse_min_dict : dict
        Fixture, which returns a flat dataset dict() coming from
        `tests/data/dataverse_min.json`.

    """
    data = import_dataverse_min_dict
    dv = Dataverse()
    dv.set(data)
    dv.datasets = [Dataset()]
    dv.dataverses = [Dataverse()]
    dv.pid = 'doi:10.11587/EVMUHP'
    data = dv.json('all')

    assert data
    assert isinstance(data, str)

def test_dataverse_json_format_wrong_valid(self, import_dataverse_min_dict):
    """Test Dataverse.json() with non-valid format and valid data.

    Parameters
    ----------
    import_dataverse_min_dict : dict
        Fixture, which returns a flat dataset dict() coming from
        `tests/data/dataverse_min.json`.

    """
    data = import_dataverse_min_dict
    dv = Dataverse()
    dv.set(data)
    dv.datasets = [Dataset()]
    dv.dataverses = [Dataverse()]
    dv.pid = 'doi:10.11587/EVMUHP'
    data = dv.json('wrong')

    assert not data

def test_dataverse_dict_all_valid(self, import_dataverse_min_dict):
    """Test Dataverse.dict() with format=`all` and valid data.

    Parameters
    ----------
    import_dataverse_min_dict : dict
        Fixture, which returns a flat dataset dict() coming from
        `tests/data/dataverse_min.json`.

    """
    data = import_dataverse_min_dict
    dv = Dataverse()
    dv.set(data)
    dv.datasets = [Dataset()]
    dv.dataverses = [Dataverse()]
    dv.pid = 'doi:10.11587/EVMUHP'
    data = dv.dict('all')

    assert data
    assert isinstance(data, dict)
    assert data['alias'] == 'test-pyDataverse'
    assert data['name'] == 'Test pyDataverse'
    assert data['dataverseContacts'][0]['contactEmail'] == '*****@*****.**'
    assert data['pid'] == 'doi:10.11587/EVMUHP'

def create_testdata(config_file: str, force: bool) -> None:
    """Create testdata defined in a config file.

    Creates a pre-defined set of testdata on your instance. By default, the
    function uses the AUSSDA test data repository, which is so far not
    publicly available. If `PRODUCTION` is `true`, this function will not
    execute unless you add `--force` to the function call. This is to protect
    a production instance from unwanted changes.
    """
    # Init
    if config.PRODUCTION and not force:
        print(
            "Create testdata on a PRODUCTION instance not allowed. Use --force to force it."
        )
        sys.exit()
    pid_idx = []
    users = read_json(config.USER_FILENAME)
    workflow = read_json(os.path.join(ROOT_DIR, config_file))

    # Dataverses
    for dv_conf in workflow["dataverses"]:
        dv_alias = None
        if "create" in dv_conf:
            api = NativeApi(
                config.BASE_URL, users[dv_conf["create"]["user-handle"]]["api-token"])
            dv = Dataverse()
            dv_filename = os.path.join(ROOT_DIR, dv_conf["create"]["metadata-filename"])
            dv.from_json(read_file(dv_filename))
            if "update" in dv_conf["create"]:
                for key, val in dv_conf["create"]["update"].items():
                    kwargs = {key: val}
                    dv.set(kwargs)
            dv_alias = dv.get()["alias"]
            resp = api.create_dataverse(dv_conf["create"]["parent"], dv.json())
        if "publish" in dv_conf:
            api = NativeApi(
                config.BASE_URL, users[dv_conf["publish"]["user-handle"]]["api-token"])
            if not dv_alias and "alias" in dv_conf["publish"]:
                dv_alias = dv_conf["publish"]["alias"]
            resp = api.publish_dataverse(dv_alias)

    # Datasets
    for ds_conf in workflow["datasets"]:
        pid = None
        if "create" in ds_conf:
            api = NativeApi(
                config.BASE_URL, users[ds_conf["create"]["user-handle"]]["api-token"])
            ds = Dataset()
            ds_filename = os.path.join(ROOT_DIR, ds_conf["create"]["metadata-filename"])
            ds.from_json(read_file(ds_filename))
            if "update" in ds_conf["create"]:
                for key, val in ds_conf["create"]["update"].items():
                    kwargs = {key: val}
                    ds.set(kwargs)
            resp = api.create_dataset(dv_alias, ds.json())
            pid = resp.json()["data"]["persistentId"]
            pid_idx.append(pid)
        if "publish" in ds_conf:
            if not pid:
                print("ERROR: PID missing!")
                sys.exit()
            api = NativeApi(
                config.BASE_URL, users[ds_conf["publish"]["user-handle"]]["api-token"])
            resp = api.publish_dataset(pid, release_type="major")

    # Datafiles
    for dataset_id, ds_datafiles in workflow["datafiles"].items():
        if int(dataset_id) == workflow["datasets"][int(dataset_id)]["id"]:
            pid = pid_idx[int(dataset_id)]
        else:
            print("ERROR: Dataset ID not matching.")
            sys.exit()
        for df_conf in ds_datafiles:
            if "upload" in df_conf:
                api = NativeApi(
                    config.BASE_URL,
                    users[df_conf["upload"]["user-handle"]]["api-token"],
                )
                metadata = read_json(df_conf["upload"]["metadata-filename"])
                df = Datafile()
                df.set(metadata)
                if "update" in df_conf["upload"]:
                    for key, val in df_conf["upload"]["update"].items():
                        kwargs = {key: val}
                        df.set(kwargs)
                df.set({"pid": pid})
                filename = df_conf["upload"]["filename"]
                resp = api.upload_datafile(pid, filename, df.json())
                # Tabular files (.sav, .dta) are ingested by Dataverse and need more time.
                if filename[-4:] == ".sav" or filename[-4:] == ".dta":
                    sleep(30)
                else:
                    sleep(3)
            if "publish-dataset" in df_conf:
                api = NativeApi(
                    config.BASE_URL,
                    users[df_conf["publish-dataset"]["user-handle"]]["api-token"],
                )
                if df_conf["publish-dataset"]:
                    resp = api.publish_dataset(pid, release_type="major")

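# Illustrative sketch (not taken from the repository) of the workflow config
# that create_testdata() above expects: top-level "dataverses", "datasets" and
# "datafiles" keys with the nested "create"/"publish"/"upload" entries the
# function reads. File names, user handles and ids are hypothetical.
EXAMPLE_WORKFLOW = {
    "dataverses": [
        {
            "create": {
                "user-handle": "admin",
                "metadata-filename": "data/dataverse_min.json",
                "update": {"alias": "test-dv"},
                "parent": ":root",
            },
            "publish": {"user-handle": "admin", "alias": "test-dv"},
        }
    ],
    "datasets": [
        {
            "id": 0,
            "create": {
                "user-handle": "admin",
                "metadata-filename": "data/dataset_min.json",
            },
            "publish": {"user-handle": "admin"},
        }
    ],
    "datafiles": {
        "0": [
            {
                "upload": {
                    "user-handle": "admin",
                    "metadata-filename": "data/datafile.json",
                    "filename": "data/datafile.csv",
                },
                "publish-dataset": {"user-handle": "admin"},
            }
        ]
    },
}
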
def test_dataset_import_metadata_format_wrong(self):
    """Test Dataset.import_metadata() with non-valid format."""
    ds = Dataset()
    ds.import_metadata(TEST_DIR + '/data/dataset_full.json', 'wrong')

    assert isinstance(ds.datafiles, list)
    assert len(ds.datafiles) == 0

    """Metadata: dataset"""
    assert not ds.license
    assert not ds.termsOfUse
    assert not ds.termsOfAccess

    """Metadata: citation"""
    assert not ds.citation_displayName
    assert not ds.title
    assert not ds.subtitle
    assert not ds.alternativeTitle
    assert not ds.alternativeURL
    assert isinstance(ds.otherId, list)
    assert len(ds.otherId) == 0
    assert isinstance(ds.author, list)
    assert len(ds.author) == 0
    assert isinstance(ds.datasetContact, list)
    assert len(ds.datasetContact) == 0
    assert isinstance(ds.dsDescription, list)
    assert len(ds.dsDescription) == 0
    assert isinstance(ds.subject, list)
    assert len(ds.subject) == 0
    assert isinstance(ds.topicClassification, list)
    assert len(ds.topicClassification) == 0
    assert isinstance(ds.publication, list)
    assert len(ds.publication) == 0
    assert not ds.notesText
    assert isinstance(ds.producer, list)
    assert len(ds.producer) == 0
    assert not ds.productionDate
    assert not ds.productionPlace
    assert isinstance(ds.contributor, list)
    assert len(ds.contributor) == 0
    assert isinstance(ds.grantNumber, list)
    assert len(ds.grantNumber) == 0
    assert isinstance(ds.distributor, list)
    assert len(ds.distributor) == 0
    assert not ds.distributionDate
    assert not ds.depositor
    assert not ds.dateOfDeposit
    assert isinstance(ds.timePeriodCovered, list)
    assert len(ds.timePeriodCovered) == 0
    assert isinstance(ds.dateOfCollection, list)
    assert len(ds.dateOfCollection) == 0
    assert isinstance(ds.kindOfData, list)
    assert len(ds.kindOfData) == 0
    assert not ds.seriesName
    assert not ds.seriesInformation
    assert isinstance(ds.software, list)
    assert len(ds.software) == 0
    assert isinstance(ds.relatedMaterial, list)
    assert len(ds.relatedMaterial) == 0
    assert isinstance(ds.relatedDatasets, list)
    assert len(ds.relatedDatasets) == 0
    assert isinstance(ds.otherReferences, list)
    assert len(ds.otherReferences) == 0
    assert isinstance(ds.dataSources, list)
    assert len(ds.dataSources) == 0
    assert not ds.originOfSources
    assert not ds.characteristicOfSources
    assert not ds.accessToSources

    """Metadata: geospatial"""
    assert not ds.geospatial_displayName
    assert isinstance(ds.geographicCoverage, list)
    assert len(ds.geographicCoverage) == 0
    assert not ds.geographicUnit
    assert isinstance(ds.geographicBoundingBox, list)
    assert len(ds.geographicBoundingBox) == 0

    """Metadata: socialscience"""
    assert not ds.socialscience_displayName
    assert isinstance(ds.unitOfAnalysis, list)
    assert len(ds.unitOfAnalysis) == 0
    assert isinstance(ds.universe, list)
    assert len(ds.universe) == 0
    assert not ds.timeMethod
    assert not ds.dataCollector
    assert not ds.collectorTraining
    assert not ds.frequencyOfDataCollection
    assert not ds.samplingProcedure
    assert not ds.targetSampleActualSize
    assert not ds.targetSampleSizeFormula
    assert not ds.socialScienceNotesType
    assert not ds.socialScienceNotesSubject
    assert not ds.socialScienceNotesText
    assert not ds.deviationsFromSampleDesign
    assert not ds.collectionMode
    assert not ds.researchInstrument
    assert not ds.dataCollectionSituation
    assert not ds.actionsToMinimizeLoss
    assert not ds.controlOperations
    assert not ds.weighting
    assert not ds.cleaningOperations
    assert not ds.datasetLevelErrorNotes
    assert not ds.responseRate
    assert not ds.samplingErrorEstimates
    assert not ds.otherDataAppraisal

    """Metadata: journal"""
    assert not ds.journal_displayName
    assert isinstance(ds.journalVolumeIssue, list)
    assert len(ds.journalVolumeIssue) == 0
    assert not ds.journalArticleType

def test_dataset_import_metadata_dv_up(self):
    """Test Dataset.import_metadata() with format=`dv_up`."""
    ds = Dataset()
    ds.import_metadata(TEST_DIR + '/data/dataset_full.json')

    """dataset"""
    assert ds.license == 'CC0'
    assert ds.termsOfUse == 'CC0 Waiver'
    assert ds.termsOfAccess == 'Terms of Access'

    """citation"""
    assert ds.citation_displayName == 'Citation Metadata'
    assert ds.title == 'Replication Data for: Title'
    assert ds.subtitle == 'Subtitle'
    assert ds.alternativeTitle == 'Alternative Title'
    assert ds.alternativeURL == 'http://AlternativeURL.org'
    assert isinstance(ds.otherId, list)
    assert len(ds.otherId) == 1
    for d in ds.otherId:
        assert d['otherIdAgency'] in ['OtherIDAgency1']
        assert d['otherIdValue'] in ['OtherIDIdentifier1']
    assert isinstance(ds.author, list)
    assert len(ds.author) == 1
    for d in ds.author:
        assert d['authorName'] in ['LastAuthor1, FirstAuthor1']
        assert d['authorAffiliation'] in ['AuthorAffiliation1']
        assert d['authorIdentifierScheme'] in ['ORCID']
        assert d['authorIdentifier'] in ['AuthorIdentifier1']
    assert isinstance(ds.datasetContact, list)
    assert len(ds.datasetContact) == 1
    for d in ds.datasetContact:
        assert d['datasetContactName'] in ['LastContact1, FirstContact1']
        assert d['datasetContactAffiliation'] in ['ContactAffiliation1']
        assert d['datasetContactEmail'] in ['*****@*****.**']
    assert isinstance(ds.dsDescription, list)
    assert len(ds.dsDescription) == 1
    for d in ds.dsDescription:
        assert d['dsDescriptionValue'] in ['DescriptionText2']
        assert d['dsDescriptionDate'] in ['1000-02-02']
    assert ds.subject == [
        'Agricultural Sciences', 'Business and Management', 'Engineering',
        'Law'
    ]
    assert isinstance(ds.keyword, list)
    assert len(ds.keyword) == 1
    for d in ds.keyword:
        assert d['keywordValue'] in ['KeywordTerm1']
        assert d['keywordVocabulary'] in ['KeywordVocabulary1']
        assert d['keywordVocabularyURI'] in ['http://KeywordVocabularyURL1.org']
    assert isinstance(ds.topicClassification, list)
    assert len(ds.topicClassification) == 1
    for d in ds.topicClassification:
        assert d['topicClassValue'] in ['Topic Class Value1']
        assert d['topicClassVocab'] in ['Topic Classification Vocabulary']
    assert isinstance(ds.publication, list)
    assert len(ds.publication) == 1
    for d in ds.publication:
        assert d['publicationCitation'] in ['RelatedPublicationCitation1']
        assert d['publicationIDType'] in ['ark']
        assert d['publicationIDNumber'] in ['RelatedPublicationIDNumber1']
        assert d['publicationURL'] in ['http://RelatedPublicationURL1.org']
    assert ds.notesText == 'Notes1'
    assert isinstance(ds.producer, list)
    assert len(ds.producer) == 1
    for d in ds.producer:
        assert d['producerName'] in ['LastProducer1, FirstProducer1']
        assert d['producerAffiliation'] in ['ProducerAffiliation1']
        assert d['producerAbbreviation'] in ['ProducerAbbreviation1']
        assert d['producerURL'] in ['http://ProducerURL1.org']
        assert d['producerLogoURL'] in ['http://ProducerLogoURL1.org']
    assert ds.productionDate == '1003-01-01'
    assert ds.productionPlace == 'ProductionPlace'
    assert isinstance(ds.contributor, list)
    assert len(ds.contributor) == 1
    for d in ds.contributor:
        assert d['contributorType'] in ['Data Collector']
        assert d['contributorName'] in ['LastContributor1, FirstContributor1']
    assert isinstance(ds.grantNumber, list)
    assert len(ds.grantNumber) == 1
    for d in ds.grantNumber:
        assert d['grantNumberAgency'] in ['GrantInformationGrantAgency1']
        assert d['grantNumberValue'] in ['GrantInformationGrantNumber1']
    assert isinstance(ds.distributor, list)
    assert len(ds.distributor) == 1
    for d in ds.distributor:
        assert d['distributorName'] in ['LastDistributor1, FirstDistributor1']
        assert d['distributorAffiliation'] in ['DistributorAffiliation1']
        assert d['distributorAbbreviation'] in ['DistributorAbbreviation1']
        assert d['distributorURL'] in ['http://DistributorURL1.org']
        assert d['distributorLogoURL'] in ['http://DistributorLogoURL1.org']
    assert ds.distributionDate == '1004-01-01'
    assert ds.depositor == 'LastDepositor, FirstDepositor'
    assert ds.dateOfDeposit == '1002-01-01'
    assert isinstance(ds.timePeriodCovered, list)
    assert len(ds.timePeriodCovered) == 1
    for d in ds.timePeriodCovered:
        assert d['timePeriodCoveredStart'] in ['1005-01-01']
        assert d['timePeriodCoveredEnd'] in ['1005-01-02']
    assert isinstance(ds.dateOfCollection, list)
    assert len(ds.dateOfCollection) == 1
    for d in ds.dateOfCollection:
        assert d['dateOfCollectionStart'] in ['1006-01-01']
        assert d['dateOfCollectionEnd'] in ['1006-01-01']
    assert ds.kindOfData == ['KindOfData1', 'KindOfData2']
    assert ds.seriesName == 'SeriesName'
    assert ds.seriesInformation == 'SeriesInformation'
    assert isinstance(ds.software, list)
    assert len(ds.software) == 1
    for d in ds.software:
        assert d['softwareName'] in ['SoftwareName1']
        assert d['softwareVersion'] in ['SoftwareVersion1']
    assert ds.relatedMaterial == ['RelatedMaterial1', 'RelatedMaterial2']
    assert ds.relatedDatasets == ['RelatedDatasets1', 'RelatedDatasets2']
    assert ds.otherReferences == ['OtherReferences1', 'OtherReferences2']
    assert ds.dataSources == ['DataSources1', 'DataSources2']
    assert ds.originOfSources == 'OriginOfSources'
    assert ds.characteristicOfSources == 'CharacteristicOfSourcesNoted'
    assert ds.accessToSources == 'DocumentationAndAccessToSources'

    """geospatial"""
    assert ds.geospatial_displayName == 'Geospatial Metadata'
    assert isinstance(ds.geographicCoverage, list)
    assert len(ds.geographicCoverage) == 1
    for d in ds.geographicCoverage:
        assert d['country'] in ['Afghanistan']
        assert d['state'] in ['GeographicCoverageStateProvince1']
        assert d['city'] in ['GeographicCoverageCity1']
        assert d['otherGeographicCoverage'] in ['GeographicCoverageOther1']
    assert ds.geographicUnit == ['GeographicUnit1', 'GeographicUnit2']
    assert isinstance(ds.geographicBoundingBox, list)
    assert len(ds.geographicBoundingBox) == 1
    for d in ds.geographicBoundingBox:
        assert d['westLongitude'] in ['10']
        assert d['eastLongitude'] in ['20']
        assert d['northLongitude'] in ['30']
        assert d['southLongitude'] in ['40']

    """socialscience"""
    assert ds.socialscience_displayName == 'Social Science and Humanities Metadata'
    assert ds.unitOfAnalysis == ['UnitOfAnalysis1', 'UnitOfAnalysis2']
    assert ds.universe == ['Universe1', 'Universe2']
    assert ds.timeMethod == 'TimeMethod'
    assert ds.dataCollector == 'LastDataCollector1, FirstDataCollector1'
    assert ds.collectorTraining == 'CollectorTraining'
    assert ds.frequencyOfDataCollection == 'Frequency'
    assert ds.samplingProcedure == 'SamplingProcedure'
    assert ds.targetSampleActualSize == '100'
    assert ds.targetSampleSizeFormula == 'TargetSampleSizeFormula'
    assert ds.deviationsFromSampleDesign == 'MajorDeviationsForSampleDesign'
    assert ds.collectionMode == 'CollectionMode'
    assert ds.researchInstrument == 'TypeOfResearchInstrument'
    assert ds.dataCollectionSituation == 'CharacteristicsOfDataCollectionSituation'
    assert ds.actionsToMinimizeLoss == 'ActionsToMinimizeLosses'
    assert ds.controlOperations == 'ControlOperations'
    assert ds.weighting == 'Weighting'
    assert ds.cleaningOperations == 'CleaningOperations'
    assert ds.datasetLevelErrorNotes == 'StudyLevelErrorNotes'
    assert ds.responseRate == 'ResponseRate'
    assert ds.samplingErrorEstimates == 'EstimatesOfSamplingError'
    assert ds.otherDataAppraisal == 'OtherFormsOfDataAppraisal'
    assert ds.socialScienceNotesType == 'NotesType'
    assert ds.socialScienceNotesSubject == 'NotesSubject'
    assert ds.socialScienceNotesText == 'NotesText'

    """journal"""
    assert ds.journal_displayName == 'Journal Metadata'
    assert isinstance(ds.journalVolumeIssue, list)
    assert len(ds.journalVolumeIssue) == 1
    for d in ds.journalVolumeIssue:
        assert d['journalVolume'] in ['JournalVolume1']
        assert d['journalIssue'] in ['JournalIssue1']
        assert d['journalPubDate'] in ['1008-01-01']
    assert ds.journalArticleType == 'abstract'

def test_dataset_is_valid_valid(self):
    """Test Dataset.is_valid() with valid data."""
    ds = Dataset()
    ds.import_metadata(TEST_DIR + '/data/dataset_full.json')

    assert ds.is_valid()

class DataverseData():
    def __init__(self, REPO, validate=False):
        self.ext = PARSABLE_EXTENSIONS
        self.REPO = REPO
        self.mapping_dsid2pid = {}
        self.validate_df = validate
        self.g = Github(GITHUB_TOKEN)
        self.repo = self.g.get_repo(REPO)
        self.urls_found = {}
        self.ds_id = 0
        self.DEBUG = True

    def githubsearch(self, thisquery):
        repositories = self.g.search_repositories(query=thisquery, sort='updated')
        return repositories

    def search(self, thisquery):
        search_api = SearchApi(BASE_URL, API_TOKEN)
        return search_api.search(thisquery).json()['data']

    def if_exist(self, thisquery):
        self.exists = False
        repoquery = "authorName:%s" % (thisquery)
        try:
            for item in self.search(repoquery)['items'][0]['authors']:
                if item == thisquery:
                    self.exists = True
                    print(item)
        except Exception:
            self.exists = False
        if self.DEBUG:
            print(self.exists)
        return self.exists

    def datasync(self):
        native_api = NativeApi(BASE_URL, API_TOKEN)
        # Turn the md5 string into a 6-digit integer.
        self.ds_id = str(int(self.make_dataset_id(self.REPO).hexdigest(), 16))[:6]
        metadata = self.make_dataset_metadata(self.REPO)
        print(metadata)
        self.ds = Dataset()
        self.ds.set(metadata)
        self.ds.displayName = metadata['title']
        self.ds.json = metadata
        print(self.ds.get())
        if self.DEBUG:
            print("[datasync]")
            print(self.ds)
            print(self.ds_id)
            print(self.ds.displayName)
        self.create_dataset(native_api, self.ds, DV_ALIAS, self.ds_id, BASE_URL)
        if self.DEBUG:
            print(metadata)
        self.upload_files_to_dataverse(self.ds_id, self.urls_found)
        return True

    def extract_urls(self, content: str) -> list:
        matches = re.findall(r"(http[^\s'\"\\]+)", content)
        pattern = re.compile(r"([^/\w]+)$")
        return [pattern.sub("", match) for match in matches]

    def decode_github_content(self, content: str) -> str:
        return base64.b64decode(content).decode("utf-8")

    def make_dataset_id(self, repo_name):
        return hashlib.md5(repo_name.encode("utf-8"))

    def make_default_dataset(self, data, repo_name):
        ds_id = self.make_dataset_id(repo_name)
        data[ds_id] = {'metadata': self.make_dataset_metadata(repo_name)}
        return data

    def make_dataset_metadata(self, repo_name):
        metadata = {}
        repo = self.g.get_repo(repo_name)
        metadata['termsOfAccess'] = ''
        metadata['title'] = 'Automatic uploads from {} github repository'.format(repo_name)
        metadata['subtitle'] = 'Automatic uploads from {} github repository'.format(repo_name)
        metadata['author'] = [{
            "authorName": repo_name,
            "authorAffiliation": "CoronaWhy"
        }]
        metadata['dsDescription'] = [{'dsDescriptionValue': ''}]
        metadata['dsDescription'] = [{
            'dsDescriptionValue': format(repo.get_topics())
        }]
        if len(metadata['dsDescription']) < 3:
            metadata['dsDescription'] = [{'dsDescriptionValue': 'coronavirus'}]
        metadata['subject'] = ['Medicine, Health and Life Sciences']
        metadata['keyword'] = repo.get_topics()
        metadata['datasetContact'] = [{
            'datasetContactName': 'https://github.com/{}'.format(repo_name),
            'datasetContactEmail': '*****@*****.**'
        }]
        return metadata

    def make_file_metadata(self, repo_name, file, url):
        metadata = {}
        metadata['description'] = file
        metadata['filename'] = url
        metadata['datafile_id'] = hashlib.md5(url.encode("utf-8"))
        metadata['dataset_id'] = hashlib.md5(repo_name.encode("utf-8"))
        return metadata

    def create_dataset(self, api, ds, dv_alias, ds_id, base_url):
        if self.DEBUG:
            print("\n\n[create_dataset]")
            print(ds.get())
            # print(ds.to_json())
        resp = ''
        try:
            resp = api.create_dataset(dv_alias, ds.json())
            pid = resp.json()['data']['persistentId']
        except Exception:
            # print(resp.content)
            return resp, self.mapping_dsid2pid
        self.mapping_dsid2pid[ds_id] = pid
        time.sleep(1)
        print('{0}/dataset.xhtml?persistentId={1}&version=DRAFT'.format(base_url, pid))
        return resp

    # Implementation adapted from
    # http://guides.dataverse.org/en/latest/api/native-api.html#id62
    def upload_datafile(self, server, api_key, p_id, repo_name, filename,
                        repo_file, url, columns):
        dataverse_server = server
        api_key = api_key
        persistentId = p_id

        files = {'file': (url.split('/')[-1], open(filename, 'rb'))}
        desc = "Data snapshot from %s" % url
        cat = [repo_name.split('/')[1]]
        for col in columns:
            cat.append(col)
        params = dict(description=desc, directoryLabel=repo_file, categories=cat)
        params_as_json_string = json.dumps(params)
        payload = dict(jsonData=params_as_json_string)
        url_persistent_id = '%s/api/datasets/:persistentId/add?persistentId=%s&key=%s' % (
            dataverse_server, persistentId, api_key)

        print('-' * 40)
        print('making request')
        r = requests.post(url_persistent_id, data=payload, files=files)
        print('-' * 40)
        try:
            print(r.json())
        except Exception:
            print(r.content)
        print(r.status_code)
        return

    def collect_urls(self):
        contents = self.repo.get_contents("")
        DEBUG = False
        while contents:
            file_content = contents.pop(0)
            urlfullpath = "%s/%s/%s/%s" % (gitroot, self.REPO, gitblob, file_content.path)
            rawurl = "%s/%s/%s/%s" % (gituserroot, self.REPO, gitmaster, file_content.path)
            rawurl = rawurl.replace(' ', '%20')
            if file_content.type == "dir":
                contents.extend(self.repo.get_contents(file_content.path))
                continue
            if len(PARSABLE_EXTENSIONS) == 0 or file_content.name.split('.')[-1] in PARSABLE_EXTENSIONS:
                if DEBUG:
                    print("%s -> %s" % (urlfullpath, rawurl))
                self.urls_found[file_content.path] = rawurl
        print('Found {} URLs'.format(len(self.urls_found)))
        return self.urls_found

    def upload_files_to_dataverse(self, ds_id, urls_found):
        for file, url in urls_found.items():
            columns = []
            # for url in urls:
            if file:
                print(url)
                try:
                    # Retrieve the file into a temp location; if the URL is
                    # broken, urlretrieve raises and we skip this file.
                    tmpfile = urllib.request.urlretrieve(url)
                except Exception:
                    continue
                try:
                    filename = 'file://{}'.format(tmpfile[0])
                    # TODO: try gzipped datasets as well
                    # if not re.findall(r'(gz$|np$|nt$)', filename):
                    #     pd.read_csv(filename)  # try reading it as csv, if fails continue
                    print("%s -> %s" % (filename, url))
                    if self.validate_df:
                        if re.search(r"(xls|xlsx)", url):
                            df = pd.read_excel(filename)
                        elif re.search(r"json", url):
                            df = pd.read_json(filename)
                        else:
                            df = pd.read_csv(filename)
                        columns = list(df.columns)
                        if self.DEBUG:
                            print("Columns: %s" % df.columns)
                    metadata = self.make_file_metadata(self.REPO, file, url)
                    print('- uploading the following dataset {}'.format(url))
                except Exception:
                    continue
                self.upload_datafile(BASE_URL, API_TOKEN, self.ds_id, self.REPO,
                                     tmpfile[0], file, url, columns)
        return
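# Hypothetical usage sketch of the DataverseData class above, assuming the
# module-level constants it relies on (GITHUB_TOKEN, BASE_URL, API_TOKEN,
# DV_ALIAS, PARSABLE_EXTENSIONS, gitroot, gituserroot, gitblob, gitmaster)
# are configured; the repository name is made up for illustration.
# dv_data = DataverseData("CoronaWhy/example-repo", validate=True)
# if not dv_data.if_exist("CoronaWhy/example-repo"):
#     dv_data.collect_urls()   # gather raw GitHub URLs of parsable files
#     dv_data.datasync()       # create the dataset and upload the files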