Example #1
    def test_dataset_init_invalid(self):
        """Test Dataset.init() with invalid data."""
        pdv = Dataset()

        # invalid data
        for data in test_config["invalid_set_types"]:
            with pytest.raises(AssertionError):
                pdv.set(data)
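
A note on `test_config`: the mapping used above is defined elsewhere in the test suite. A minimal sketch of what its `invalid_set_types` entry could contain, assuming `Dataset.set()` only accepts a `dict`:

# Hypothetical test_config; the real values live in the shared test
# configuration, not in this excerpt.
test_config = {
    "invalid_set_types": [
        None,            # not a dict
        42,              # int
        "a string",      # str
        ["a", "list"],   # list
        ("a", "tuple"),  # tuple
    ]
}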
Example #2
    def test_dataset_set_dv_up(self, import_dataset_min_dict):
        """Test Dataset.set() with format=`dv_up`.

        Parameters
        ----------
        import_dataset_min_dict : dict
            Fixture, which returns a flat dataset dict().

        """
        ds = Dataset()
        data = import_dataset_min_dict
        ds.set(data)
        """dataset"""
        assert ds.license == 'CC0'
        assert ds.termsOfUse == 'CC0 Waiver'
        assert ds.termsOfAccess == 'Terms of Access'
        """citation"""
        assert ds.citation_displayName == 'Citation Metadata'
        assert ds.title == 'Replication Data for: Title'
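
The `import_dataset_min_dict` fixture is not part of this excerpt. A minimal sketch, assuming it returns a flat dict whose keys match the attributes asserted above:

import pytest

# Hypothetical fixture; the key names are taken from the assertions above,
# not from the actual pyDataverse test suite.
@pytest.fixture
def import_dataset_min_dict():
    return {
        "license": "CC0",
        "termsOfUse": "CC0 Waiver",
        "termsOfAccess": "Terms of Access",
        "citation_displayName": "Citation Metadata",
        "title": "Replication Data for: Title",
    }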
Example #3
def create_testdata(config_file: str, force: bool) -> None:
    """Create testdata defined in a config file.

    Creates a pre-defined set of testdata on your
    instance. By default, the function uses the
    AUSSDA test data repository, which is not yet
    publicly available. If `PRODUCTION` is `true`,
    this function will not execute unless you add
    `--force` to the call. This protects a
    production instance from unwanted changes.

    """
    # Init
    if config.PRODUCTION and not force:
        print(
            "Creating testdata on a PRODUCTION instance is not allowed. "
            "Use --force to override."
        )
        sys.exit()
    pid_idx = []
    users = read_json(config.USER_FILENAME)
    workflow = read_json(os.path.join(ROOT_DIR, config_file))

    # Dataverses
    for dv_conf in workflow["dataverses"]:
        dv_alias = None
        if "create" in dv_conf:
            api = NativeApi(
                config.BASE_URL,
                users[dv_conf["create"]["user-handle"]]["api-token"])
            dv = Dataverse()
            dv_filename = os.path.join(ROOT_DIR,
                                       dv_conf["create"]["metadata-filename"])
            dv.from_json(read_file(dv_filename))
            if "update" in dv_conf["create"]:
                for key, val in dv_conf["create"]["update"].items():
                    kwargs = {key: val}
                    dv.set(kwargs)
            dv_alias = dv.get()["alias"]
            resp = api.create_dataverse(dv_conf["create"]["parent"], dv.json())

        if "publish" in dv_conf:
            api = NativeApi(
                config.BASE_URL,
                users[dv_conf["publish"]["user-handle"]]["api-token"])
            if not dv_alias and "alias" in dv_conf["publish"]:
                dv_alias = dv_conf["publish"]["alias"]
            resp = api.publish_dataverse(dv_alias)

    # Datasets
    for ds_conf in workflow["datasets"]:
        pid = None
        if "create" in ds_conf:
            api = NativeApi(
                config.BASE_URL,
                users[ds_conf["create"]["user-handle"]]["api-token"])
            ds = Dataset()
            ds_filename = os.path.join(ROOT_DIR,
                                       ds_conf["create"]["metadata-filename"])
            ds.from_json(read_file(ds_filename))
            if "update" in ds_conf["create"]:
                for key, val in ds_conf["create"]["update"].items():
                    kwargs = {key: val}
                    ds.set(kwargs)
            resp = api.create_dataset(dv_alias, ds.json())
            pid = resp.json()["data"]["persistentId"]
            pid_idx.append(pid)

        if "publish" in ds_conf:
            if not pid:
                print("ERROR: PID missing!")
                sys.exit()
            api = NativeApi(
                config.BASE_URL,
                users[ds_conf["publish"]["user-handle"]]["api-token"])
            resp = api.publish_dataset(pid, release_type="major")

    # Datafiles
    for dataset_id, ds_datafiles in workflow["datafiles"].items():
        if int(dataset_id) == workflow["datasets"][int(dataset_id)]["id"]:
            pid = pid_idx[int(dataset_id)]
        else:
            print("ERROR: Dataset ID not matching.")
            sys.exit()
        for df_conf in ds_datafiles:
            if "upload" in df_conf:
                api = NativeApi(
                    config.BASE_URL,
                    users[df_conf["upload"]["user-handle"]]["api-token"],
                )
                metadata = read_json(df_conf["upload"]["metadata-filename"])
                df = Datafile()
                df.set(metadata)
                if "update" in df_conf["upload"]:
                    for key, val in df_conf["upload"]["update"].items():
                        kwargs = {key: val}
                        df.set(kwargs)
                df.set({"pid": pid})
                filename = df_conf["upload"]["filename"]
                resp = api.upload_datafile(pid, filename, df.json())
                if filename[-4:] == ".sav" or filename[-4:] == ".dta":
                    sleep(30)
                else:
                    sleep(3)
        if "publish-dataset" in df_conf:
            api = NativeApi(
                config.BASE_URL,
                users[df_conf["publish-dataset"]["user-handle"]]["api-token"],
            )
            if df_conf["publish-dataset"]:
                resp = api.publish_dataset(pid, release_type="major")
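
The layout of the workflow config file can be inferred from the keys `create_testdata()` reads. A minimal sketch follows, written as a Python dict (the real file is JSON); every filename, alias and user handle is a placeholder:

# Hypothetical workflow config; keys mirror what create_testdata() accesses,
# all values are placeholders.
workflow = {
    "dataverses": [
        {
            "create": {
                "user-handle": "admin",
                "metadata-filename": "data/dataverse.json",
                "parent": ":root",
                "update": {"alias": "test_dv"},
            },
            "publish": {"user-handle": "admin", "alias": "test_dv"},
        }
    ],
    "datasets": [
        {
            "id": 0,
            "create": {
                "user-handle": "admin",
                "metadata-filename": "data/dataset.json",
            },
            "publish": {"user-handle": "admin"},
        }
    ],
    "datafiles": {
        "0": [
            {
                "upload": {
                    "user-handle": "admin",
                    "metadata-filename": "data/datafile.json",
                    "filename": "data/file.csv",
                }
            },
            {"publish-dataset": {"user-handle": "admin"}},
        ]
    },
}
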
class DataverseData():
    def __init__(self, REPO, validate=False):
        self.ext = PARSABLE_EXTENSIONS
        self.REPO = REPO
        self.mapping_dsid2pid = {}
        self.validate_df = validate
        self.g = Github(GITHUB_TOKEN)
        self.repo = self.g.get_repo(REPO)
        self.urls_found = {}
        self.ds_id = 0
        self.DEBUG = True

    def githubsearch(self, thisquery):
        repositories = self.g.search_repositories(query=thisquery,
                                                  sort='updated')
        return repositories

    def search(self, thisquery):
        search_api = SearchApi(BASE_URL, API_TOKEN)
        return search_api.search(thisquery).json()['data']

    def if_exist(self, thisquery):
        self.exists = False
        repoquery = "authorName:%s" % (thisquery)
        try:
            for item in self.search(repoquery)['items'][0]['authors']:
                if item == thisquery:
                    self.exists = True
                print(item)
        except Exception:
            self.exists = False
        if self.DEBUG:
            print(self.exists)
        return self.exists

    def datasync(self):
        native_api = NativeApi(BASE_URL, API_TOKEN)
        self.ds_id = str(
            int(self.make_dataset_id(self.REPO).hexdigest(),
                16))[:6]  # turn the md5 hex digest into a 6-digit identifier
        metadata = self.make_dataset_metadata(self.REPO)
        print(metadata)
        self.ds = Dataset()
        self.ds.set(metadata)
        self.ds.displayName = metadata['title']
        # The metadata is already applied via set(); assigning the raw dict to
        # `self.ds.json` would shadow the Dataset.json() method used later in
        # create_dataset().
        print(self.ds.get())
        if self.DEBUG:
            print("[datasync]")
            print(self.ds)
            print(self.ds_id)
            print(self.ds.displayName)
        self.create_dataset(native_api, self.ds, DV_ALIAS, self.ds_id,
                            BASE_URL)
        if self.DEBUG:
            print(metadata)
        self.upload_files_to_dataverse(self.ds_id, self.urls_found)
        return True

    def extract_urls(self, content: str) -> list:
        # Find http(s) URLs in the text and strip trailing punctuation or quotes.
        matches = re.findall(r"(http[^\s'\"\\]+)", content)
        pattern = re.compile(r"([^/\w]+)$")
        return [pattern.sub("", match) for match in matches]

    def decode_github_content(self, content: str) -> str:
        return base64.b64decode(content).decode("utf-8")

    def make_dataset_id(self, repo_name):
        return hashlib.md5(repo_name.encode("utf-8"))

    def make_default_dataset(self, data, repo_name):
        ds_id = self.make_dataset_id(repo_name).hexdigest()
        data[ds_id] = {'metadata': self.make_dataset_metadata(repo_name)}
        return data

    def make_dataset_metadata(self, repo_name):
        metadata = {}
        repo = self.g.get_repo(repo_name)
        metadata['termsOfAccess'] = ''
        metadata['title'] = 'Automatic uploads from {} github repository'.format(
            repo_name)
        metadata['subtitle'] = 'Automatic uploads from {} github repository'.format(
            repo_name)
        metadata['author'] = [{
            "authorName": repo_name,
            "authorAffiliation": "CoronaWhy"
        }]
        # Use the repository topics as the description; fall back to a generic
        # keyword if the topics string is too short to be useful.
        description = format(repo.get_topics())
        if len(description) < 3:
            description = 'coronavirus'
        metadata['dsDescription'] = [{'dsDescriptionValue': description}]

        metadata['subject'] = ['Medicine, Health and Life Sciences']
        metadata['keyword'] = repo.get_topics()
        metadata['datasetContact'] = [{
            'datasetContactName': 'https://github.com/{}'.format(repo_name),
            'datasetContactEmail': '*****@*****.**'
        }]

        return metadata

    def make_file_metadata(self, repo_name, file, url):
        metadata = {}

        metadata['description'] = file
        metadata['filename'] = url
        metadata['datafile_id'] = hashlib.md5(url.encode("utf-8"))
        metadata['dataset_id'] = hashlib.md5(repo_name.encode("utf-8"))
        return metadata

    def create_dataset(self, api, ds, dv_alias, ds_id, base_url):
        if self.DEBUG:
            print("\n\n[create_dataset]")
            print(ds.get())
            # print(ds.to_json())
        resp = ''
        try:
            resp = api.create_dataset(dv_alias, ds.json())
            pid = resp.json()['data']['persistentId']
        except Exception:
            # print(resp.content)
            return resp, self.mapping_dsid2pid

        self.mapping_dsid2pid[ds_id] = pid
        time.sleep(1)
        print('{0}/dataset.xhtml?persistentId={1}&version=DRAFT'.format(
            base_url, pid))
        return resp

    # Implementation adapted from http://guides.dataverse.org/en/latest/api/native-api.html#id62
    def upload_datafile(self, server, api_key, p_id, repo_name, filename,
                        repo_file, url, columns):
        dataverse_server = server
        persistentId = p_id

        files = {'file': (url.split('/')[-1], open(filename, 'rb'))}
        desc = "Data snapshot from %s" % url
        cat = [repo_name.split('/')[1]]
        for col in columns:
            cat.append(col)
        params = dict(description=desc,
                      directoryLabel=repo_file,
                      categories=cat)

        params_as_json_string = json.dumps(params)

        payload = dict(jsonData=params_as_json_string)

        url_persistent_id = '%s/api/datasets/:persistentId/add?persistentId=%s&key=%s' % (
            dataverse_server, persistentId, api_key)

        print('-' * 40)
        print('making request')
        r = requests.post(url_persistent_id, data=payload, files=files)

        print('-' * 40)
        try:
            print(r.json())
        except ValueError:  # the response body is not JSON
            print(r.content)
        print(r.status_code)
        return

    def collect_urls(self):
        contents = self.repo.get_contents("")
        DEBUG = False
        while contents:
            file_content = contents.pop(0)
            urlfullpath = "%s/%s/%s/%s" % (gitroot, self.REPO, gitblob,
                                           file_content.path)
            rawurl = "%s/%s/%s/%s" % (gituserroot, self.REPO, gitmaster,
                                      file_content.path)
            rawurl = rawurl.replace(' ', '%20')
            if file_content.type == "dir":
                contents.extend(self.repo.get_contents(file_content.path))
                continue

            if len(PARSABLE_EXTENSIONS) == 0 or file_content.name.split(
                    '.')[-1] in PARSABLE_EXTENSIONS:
                if DEBUG:
                    print("%s -> %s" % (urlfullpath, rawurl))
                self.urls_found[file_content.path] = rawurl

        print('Found {} URLs'.format(len(self.urls_found)))
        return self.urls_found

    def upload_files_to_dataverse(self, ds_id, urls_found):
        for file, url in urls_found.items():
            columns = []
            #for url in urls:
            if file:
                print(url)
                try:
                    # Retrieve the file into a temp file; a broken URL raises
                    # here and we simply skip this file.
                    tmpfile = urllib.request.urlretrieve(url)
                except Exception:
                    continue

                try:
                    filename = 'file://{}'.format(tmpfile[0])
                    # TODO: try gzipped datasets as well
                    #if not re.findall(r'(gz$|np$|nt$)', filename):
                    #    pd.read_csv(filename) # try reading it as csv, if fails continue
                    print("%s -> %s" % (filename, url))
                    if self.validate_df:
                        if re.search(r"(xls|xlsx)", url):
                            df = pd.read_excel(filename)
                            columns = list(df.columns)
                        elif re.search(r"json", url):
                            df = pd.read_excel(filename)
                            columns = list(df.columns)
                        else:
                            df = pd.read_csv(filename)
                            columns = list(df.columns)
                        if self.DEBUG:
                            print("Columns: %s" % df.columns)
                    metadata = self.make_file_metadata(self.REPO, file, url)
                    print('- uploading the following dataset {}'.format(url))
                except Exception:
                    continue

                # The upload endpoint expects the dataset's persistent ID, not
                # the internal 6-digit id, so look it up in the mapping.
                pid = self.mapping_dsid2pid.get(self.ds_id, self.ds_id)
                self.upload_datafile(BASE_URL, API_TOKEN, pid, self.REPO,
                                     tmpfile[0], file, url, columns)
        return
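
A possible way to drive `DataverseData` end to end; the repository name below is a placeholder and the module-level constants (`BASE_URL`, `API_TOKEN`, `GITHUB_TOKEN`, `DV_ALIAS`, `PARSABLE_EXTENSIONS`, etc.) are assumed to be configured elsewhere:

# Hypothetical driver; "CoronaWhy/example-repo" is a placeholder repository name.
if __name__ == "__main__":
    dv_data = DataverseData("CoronaWhy/example-repo", validate=False)
    dv_data.collect_urls()                       # gather raw file URLs from GitHub
    if not dv_data.if_exist("CoronaWhy/example-repo"):
        dv_data.datasync()                       # create the dataset and upload files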