def get_citation_from_dataset_schema_or_None(self):
    """
    Return the citation text from self.dataset.dataset_schema_info.

    The value looked up is:
        self.dataset.dataset_schema_info['citation'][0]['text']

    Returns:
        - err_resp(...) if this object reports an error, the schema info is
          empty, "citation" is empty or not a list, or the first citation
          entry is not a dict containing a "text" key
        - ok_resp(None) if there is no "citation" key at all
        - ok_resp(<citation text>) otherwise
    """
    if self.has_error():
        # Shouldn't happen...
        return err_resp(self.get_err_msg())

    schema_info = self.dataset.dataset_schema_info
    if not schema_info:
        return err_resp('".dataset_schema_info" is empty')

    # A missing citation is not treated as an error
    if 'citation' not in schema_info:
        return ok_resp(None)

    # If the citation key is found, then do error checking....
    citations = schema_info['citation']
    if (not citations) or (not isinstance(citations, list)):
        return err_resp(
            '"citation" within ".dataset_schema_info" is empty or not a list')

    # The first entry must be a dict. Without this check, a None entry makes
    # the "in" test raise TypeError and a str entry turns it into a
    # substring test.
    first_citation = citations[0]
    if not isinstance(first_citation, dict) or 'text' not in first_citation:
        return err_resp(
            '"[\'citation\'][0][\'text\']" not found in ".dataset_schema_info"')

    return ok_resp(first_citation['text'])
def get_file_specific_schema_info(full_schema_info, file_id=None, file_persistent_id=None):
    """
    Navigate the JSON-LD schema.org info to retrieve file specific info.

    Each entry of the schema's "distribution" list is checked, in order:
      1. if a file_id was given, against the end of the entry's contentUrl
      2. if a file_persistent_id was given, against the end of the entry's
         identifier (using the part of the persistent id after the last ":")

    Example of the schema section being searched:

    "distribution":[
      {
         "@type":"DataDownload",
         "name":"Crisis.PDF",
         "fileFormat":"application/pdf",
         "contentSize":677112,
         "description":"Article related to this study: The Supreme Court During Crisis: How War Affects Only Nonwar Cases",
         "@id":"https://doi.org/10.7910/DVN/OLD7MB/PZPDJF",
         "identifier":"https://doi.org/10.7910/DVN/OLD7MB/PZPDJF",
         "contentUrl":"https://dataverse.harvard.edu/api/access/datafile/101646"
      },
      (etc)
    ]

    :param full_schema_info: dict holding the schema.org info
    :param file_id: file id expected at the end of "contentUrl"
    :param file_persistent_id: persistent id, e.g. "doi:10.7910/DVN/OLD7MB/PZPDJF"
    :return: ok_resp(<matching distribution entry>) or err_resp(<message>)
    """
    if not isinstance(full_schema_info, dict):
        return err_resp('"full_schema_info" must be a Python dict')

    if dv_static.SCHEMA_KEY_DISTRIBUTION not in full_schema_info:
        return err_resp(
            f'"{dv_static.SCHEMA_KEY_DISTRIBUTION}" not found in the schema')

    # Only build the URL suffix when a file id was actually supplied;
    # otherwise the suffix would be the literal "/None" (or a bare "/")
    url_ending = f'/{file_id}' if file_id not in (None, '') else None

    file_doi = file_persistent_id.split(':')[-1] if file_persistent_id else None

    for file_info in full_schema_info[dv_static.SCHEMA_KEY_DISTRIBUTION]:

        # Try to match the /{fileId} to the end of the contentUrl
        #   example "contentUrl": "https://dataverse.harvard.edu/api/access/datafile/101646"
        #
        if url_ending and dv_static.SCHEMA_KEY_CONTENTURL in file_info:
            content_url = file_info[dv_static.SCHEMA_KEY_CONTENTURL]
            if content_url and content_url.endswith(url_ending):
                return ok_resp(file_info)

        # If there's a file DOI, try to match it with the identifier
        #
        #   example "identifier": "https://doi.org/10.7910/DVN/B7DHBK/BSNYLQ"
        #
        if file_doi and dv_static.SCHEMA_KEY_IDENTIFIER in file_info:
            identifier = file_info[dv_static.SCHEMA_KEY_IDENTIFIER]
            if identifier and identifier.endswith(file_doi):
                return ok_resp(file_info)

    # Nothing matched; say which identifier was searched for.
    # The DOI branch tests "file_doi", not the loop variable, which is
    # unbound when the distribution list is empty.
    if file_id:
        user_msg = f'Did not find fileId "{file_id}"'
    elif file_doi:
        user_msg = f'Did not find file DOI "{file_doi}"'
    else:
        user_msg = ''

    return err_resp(f'Info for file not found in the schema. {user_msg}')
def save(self, **kwargs):
    """
    Validate the statistics of a release request and return the result.

    No database rows are created here; the method is called "save" only
    because it is reached through a POST.

    Expects validated data of the form:
    {
        "analysis_plan_id": abcd-1234,
        "dp_statistics": [{
             "error": "",
             "label": "EyeHeight",
             "locked": false,
             "epsilon": 0.0625,
             "variable": "eyeHeight",
             "statistic": "mean",
             "fixed_value": "5",
             "handle_as_fixed": true,
             "missing_values_handling": "insert_fixed"
         }, {
             "error": "",
             "label": "EyeHeight",
             "locked": false,
             "epsilon": 0.0625,
             "variable": "eyeHeight",
             "statistic": "count",
             "fixed_value": "5",
             "handle_as_fixed": true,
             "missing_values_handling": "insert_fixed"
         }
        ]
    }

    :param kwargs: must include "opendp_user", an instance of the user model
    :return: err_resp(<message>) if the user is not a user-model instance or
        the validation utility reports an error;
        otherwise ok_resp(validate_util.validation_info)
    """
    requesting_user = kwargs.get('opendp_user')
    if not isinstance(requesting_user, get_user_model()):
        return err_resp('Not an OpenDP User')

    plan_id = self.validated_data['analysis_plan_id']
    stats_specs = self.validated_data['dp_statistics']

    validate_util = ValidateReleaseUtil.validate_mode(
        requesting_user, plan_id, stats_specs)

    # An error on the utility itself is reported before (and instead of)
    # the validation info for the individual statistics
    if validate_util.has_error():
        return err_resp(validate_util.get_err_msg())

    return ok_resp(validate_util.validation_info)
def get_variable_index(self, var_name: str) -> BasicResponse:
    """Retrieve the variable index from the data_profile for a specific variable name

    Example data structure:
      {"dataset":{
          "rowCount":6610,
          "variableCount":20,
          "variableOrder":[
             [0, "ccode"],
             [1, "country"],
             [2, "cname"],
            ]
        }
        etc
      }

    :param var_name: variable name, e.g. "cname" would return 2
    :return: ok_resp(<index>) for the first entry whose name equals var_name
        or camel_to_snake(var_name); err_resp(<message>) if the profile is
        missing or malformed, or no entry matches
    """
    if not self.data_profile:
        return err_resp('Data profile not available')

    if 'dataset' not in self.data_profile:
        return err_resp('Dataset information not available in profile')

    if 'variableOrder' not in self.data_profile['dataset']:
        return err_resp(
            '"variableOrder" information not available in profile (id:2')

    variable_order = self.data_profile['dataset']['variableOrder']
    if not variable_order:
        return err_resp('Bad "variableOrder" information in profile.')

    try:
        for idx, feature in variable_order:
            if feature == var_name:
                return ok_resp(idx)
            if feature == camel_to_snake(var_name):
                # Temp workaround!!! See Issue #300
                # https://github.com/opendp/dpcreator/issues/300
                return ok_resp(idx)
    except (ValueError, TypeError):
        # ValueError: an entry doesn't have exactly two items
        # TypeError: an entry isn't iterable at all (e.g. an int or None)
        return err_resp(
            'Bad "variableOrder" information in profile. (id:3)')

    return err_resp(f'Index not found for variable "{var_name}"')
def get_dataset_info(self) -> BasicResponse:
    """
    Look up the DataSetInfo whose object_id matches the validated "object_id".

    Must only be called after ".is_valid()" has been checked.

    :return: ok_resp(<DataSetInfo>) or, if no such object exists,
        err_resp(dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND)
    """
    assert self.is_valid(), \
        'Do not call this method before checking ".is_valid()"'

    wanted_object_id = self.validated_data.get('object_id')
    try:
        dataset_info = DataSetInfo.objects.get(object_id=wanted_object_id)
    except DataSetInfo.DoesNotExist:
        return err_resp(dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND)
    else:
        return ok_resp(dataset_info)
def get_variable_order(self, as_indices=False) -> BasicResponse:
    """
    Retrieve the "variableOrder" list from the data_profile.

    Example data structure:
      {"dataset":{
          "rowCount":6610,
          "variableCount":20,
          "variableOrder":[
             [0, "ccode"],
             [1, "country"],
             [2, "cname"],
            ]
        }
        etc
      }

    :param as_indices: if True, return only the indices, e.g. [0, 1, 2]
    :return: ok_resp(<variableOrder list or list of indices>), or
        err_resp(<message>) when the profile lacks the needed information
        or the entries cannot be unpacked into (index, name) pairs
    """
    profile = self.data_profile
    if not profile:
        return err_resp('Data profile not available')

    if 'dataset' not in profile:
        return err_resp('Dataset information not available in profile')

    dataset_section = profile['dataset']
    if 'variableOrder' not in dataset_section:
        return err_resp(
            '"variableOrder" information not available in profile (id:2')

    variable_order = dataset_section['variableOrder']

    # Default: hand back the list exactly as stored
    if not as_indices:
        return ok_resp(variable_order)

    try:
        indices = []
        for idx, _var_name in variable_order:
            indices.append(idx)
        return ok_resp(indices)
    except Exception as ex_obj:
        user_msg = (
            f'"variableOrder" information not in proper format: {variable_order}'
            f' (exception: {ex_obj}')
        return err_resp(user_msg)
def get_dataset_info_with_user_check(
        self, user: get_user_model()) -> BasicResponse:
    """
    Look up the DataSetInfo matching the validated "object_id" AND whose
    creator is the given user.

    Must only be called after ".is_valid()" has been checked.

    :param user: the user who must be the creator of the DataSetInfo
    :return: ok_resp(<DataSetInfo>) or, if no such object exists,
        err_resp(dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND_CURRENT_USER)
    """
    assert self.is_valid(), \
        'Do not call this method before checking ".is_valid()"'

    lookup = dict(object_id=self.validated_data.get('object_id'),
                  creator=user)
    try:
        dataset_info = DataSetInfo.objects.get(**lookup)
    except DataSetInfo.DoesNotExist:
        return err_resp(
            dstatic.ERR_MSG_DATASET_INFO_NOT_FOUND_CURRENT_USER)
    else:
        return ok_resp(dataset_info)
def create_plan(dataset_object_id: str,
                opendp_user: get_user_model()) -> BasicResponse:
    """
    Create and save a new AnalysisPlan for a dataset.

    The plan is created with:
        analyst       - the given opendp_user
        name          - "Plan " plus a 7-character random alphanumeric string
        dataset       - the DataSetInfo found via dataset_object_id/creator
        is_complete   - False
        variable_info - copied from the dataset's depositor_setup_info
        user_step     - AnalysisPlan.AnalystSteps.STEP_0700_VARIABLES_CONFIRMED

    :param dataset_object_id: DataSetInfo.object_id
    :param opendp_user: must be an instance of the user model and the
        creator of the DataSetInfo
    :return: ok_resp(<AnalysisPlan>, message='Plan created!') on success;
        otherwise err_resp(<message>, data=<HTTP status code>)
    """
    # --- Input checks ---
    if not dataset_object_id:
        return err_resp(astatic.ERR_MSG_DATASET_ID_REQUIRED,
                        data=status.HTTP_400_BAD_REQUEST)

    user_model = get_user_model()
    if not isinstance(opendp_user, user_model):
        return err_resp(astatic.ERR_MSG_USER_REQUIRED,
                        data=status.HTTP_400_BAD_REQUEST)

    # --- Retrieve the DataSetInfo; it must belong to this user ---
    try:
        ds_info = DataSetInfo.objects.get(object_id=dataset_object_id,
                                          creator=opendp_user)
    except DataSetInfo.DoesNotExist:
        return err_resp(astatic.ERR_MSG_NO_DATASET,
                        data=status.HTTP_400_BAD_REQUEST)

    # --- The DepositorSetupInfo has to be complete ---
    if not ds_info.depositor_setup_info.is_complete:
        return err_resp(astatic.ERR_MSG_SETUP_INCOMPLETE,
                        data=status.HTTP_422_UNPROCESSABLE_ENTITY)

    # --- Create the plan! ---
    new_plan = AnalysisPlan(
        analyst=opendp_user,
        name=f'Plan {get_rand_alphanumeric(7)}',  # need a better name here!
        dataset=ds_info,
        is_complete=False,
        variable_info=ds_info.depositor_setup_info.variable_info,
        user_step=AnalysisPlan.AnalystSteps.STEP_0700_VARIABLES_CONFIRMED,
    )
    new_plan.save()

    return ok_resp(new_plan, message='Plan created!')
def get_dataset_size(self) -> BasicResponse:
    """
    Retrieve the "rowCount" value from the data_profile -- not always available.

    :return: ok_resp(<rowCount>), or err_resp(<message>) when the profile,
        its "dataset" section, or a non-None "rowCount" is missing
    """
    profile = self.data_profile
    if not profile:
        return err_resp('Data profile not available')

    if 'dataset' not in profile:
        return err_resp('Dataset information not available in profile')

    dataset_section = profile['dataset']
    if 'rowCount' not in dataset_section:
        return err_resp('"rowCount" information not available in profile.')

    # The key may be present but explicitly set to None
    row_count = dataset_section['rowCount']
    if row_count is None:
        return err_resp(
            '"rowCount" information not available in profile (id:2')

    return ok_resp(row_count)
def get_user_info(self, user_api_token=None):
    """
    Placeholder until pyDataverse API is updated.

    Makes a GET request to "{host}/api/v1/users/:me", sending the API token
    in the "X-Dataverse-key" header.

    :param user_api_token: token to use; falls back to self.api_token when
        not given (or falsy)
    :return: ok_resp(<parsed JSON body>) for an HTTP 200 whose body is a JSON
        object with a non-error status; otherwise err_resp(<message>)
    """
    api_token = user_api_token if user_api_token else self.api_token

    # Per the original comment, format_dv_url is expected to remove any
    # trailing "/" from the host -- TODO confirm
    ye_host = RegisteredDataverse.format_dv_url(self._host)

    # format url
    dv_url = f'{ye_host}/api/v1/users/:me'

    # make the request
    # NOTE(review): no timeout is passed, so this call can block
    # indefinitely if the server never responds.
    headers = {'X-Dataverse-key': api_token}
    try:
        response = requests.get(dv_url, headers=headers)
    except (ConnectionError, requests.exceptions.ConnectionError) as err_obj:
        # requests' ConnectionError is not a subclass of the builtin one,
        # so both are listed explicitly
        return err_resp(f'Failed to connect. {err_obj}')

    if response.status_code == 200:
        if not response.content:
            # In this instance the response content is an empty string or
            # None -- shouldn't happen...
            return err_resp("Dataverse returned an HTTP 200 status code but failed to return a response.")

        # Parse once; a 200 with a non-JSON body is reported, not raised
        try:
            resp_json = response.json()
        except ValueError:
            return err_resp("Dataverse returned an HTTP 200 status code but the response was not valid JSON.")

        dv_status = (resp_json.get(dv_static.DV_KEY_STATUS)
                     if isinstance(resp_json, dict) else None)
        if not dv_status:
            return err_resp("Dataverse response failed to return a 'status'.")

        if dv_status == dv_static.STATUS_VAL_ERROR:
            user_msg = resp_json.get(dv_static.DV_KEY_MESSAGE,
                                     '(No message from Dataverse)')
            return err_resp(f"Dataverse error: {user_msg}")

        return ok_resp(resp_json)

    # Non-200: prefer the "message" from a JSON body, if there is one
    try:
        json_resp = response.json()
    except ValueError:
        json_resp = None

    if isinstance(json_resp, dict) and 'message' in json_resp:
        return err_resp(json_resp['message'])

    return err_resp(f'Status code: {response.status_code} {response.text}')
def make_test_handoff_object(self):
    """
    For unit tests, make a DataverseHandoff object with the same params.

    The params come from self.as_dict(), minus the site url entry. The
    handoff's dv_installation is set to the RegisteredDataverse found for
    self.site_url, and the handoff is then saved.

    :return: ok_resp(data=<DataverseHandoff>), or err_resp(<message>) when
        no RegisteredDataverse is found for self.site_url
    """
    params = self.as_dict()
    # The site url is not passed to the constructor; it is only used below
    # to look up the RegisteredDataverse
    params.pop(dv_static.DV_PARAM_SITE_URL, None)

    dv_handoff = DataverseHandoff(**params)

    reg_dv = RegisteredDataverse.get_registered_dataverse(self.site_url)
    if not reg_dv:
        # f-string so the actual site_url appears in the message
        return err_resp(
            f'No RegisteredDataverse for site_url {self.site_url}')

    dv_handoff.dv_installation = reg_dv
    dv_handoff.save()

    return ok_resp(data=dv_handoff)
def get_name_from_dataset_schema(self) -> BasicResponse:
    """
    Return the "name" value found in self.dataset.dataset_schema_info.

    The value looked up is:
        self.dataset.dataset_schema_info['name']

    :return: ok_resp(<name>), or err_resp(<message>) when this object
        reports an error, the schema info is empty, or "name" is missing
        or empty
    """
    if self.has_error():
        # Shouldn't happen...
        return err_resp(self.get_err_msg())

    schema_info = self.dataset.dataset_schema_info
    if not schema_info:
        return err_resp('".dataset_schema_info" is empty')

    if 'name' not in schema_info:
        return err_resp(
            '"name" not found in ".dataset_schema_info" not found')

    ds_name = schema_info['name']
    if ds_name:
        return ok_resp(ds_name)

    return err_resp('"name" within ".dataset_schema_info" is empty')
def profile_dataset_info(dataset_object_id: DataSetInfo.object_id, websocket_id=None) -> BasicResponse:
    """
    Using the DataSetInfo object_id, download and profile a dataset.

    The work is done by constructing a DownloadAndProfileUtil with the
    given object id and websocket id.

    Return value depends on "websocket_id":
      - websocket_id is None (assumed to mean: called w/o celery, so complex
        objects can be returned): ok_resp(<DownloadAndProfileUtil>) or
        err_resp(<error message>)
      - websocket_id is defined: a plain dict
            {'success': True/False, 'message': "A user message"}

    :param dataset_object_id: DataSetInfo.object_id
    :param websocket_id: optional id, passed through to DownloadAndProfileUtil
    """
    dp_util = DownloadAndProfileUtil(dataset_object_id, websocket_id)

    if dp_util.has_error():
        err_msg = dp_util.get_err_msg()
        if websocket_id:
            # key is "message", matching the documented dict shape
            return dict(success=False, message=err_msg)
        return err_resp(err_msg)

    if websocket_id:
        return dict(success=True, message='Profile in process')

    return ok_resp(dp_util)
def update_dataverse_user(self):
    """
    Update an existing DataverseUser with the values held on this object.

    The DataverseUser is looked up by (opendp_user, registered_dataverse);
    its persistent_id, first_name, last_name and email are overwritten and
    the object is saved.

    :return: ok_resp(<DataverseUser>), or
        err_resp('Dataverse user does not exist') if the lookup fails
    """
    try:
        dv_user = DataverseUser.objects.get(
            user=self.opendp_user,
            dv_installation=self.registered_dataverse)
    except DataverseUser.DoesNotExist:
        return err_resp('Dataverse user does not exist')

    # Update the parameters
    new_values = (
        ('persistent_id', self.dataverse_persistent_id),
        ('first_name', self.first_name),
        ('last_name', self.last_name),
        ('email', self.email),
    )
    for field_name, field_value in new_values:
        setattr(dv_user, field_name, field_value)

    # Save it!
    dv_user.save()

    return ok_resp(dv_user)
def get_file_info(self):
    """
    Return selected entries from self.dataset.file_schema_info.

    Ideal result:
        {
            "name": "crisis.tab"
            "identifier": "https://doi.org/10.7910/DVN/OLD7MB/ZI4N3J",
            "fileFormat": "text/tab-separated-values",
        }

    "name" is required; "identifier" and "fileFormat" are set to None
    when they are not present.

    :return: ok_resp(<dict with name/identifier/fileFormat>), or
        err_resp(<message>) when this object reports an error, the file
        schema info is empty, or "name" is missing
    """
    if self.has_error():
        # Shouldn't happen!
        return err_resp(self.get_err_msg())

    schema_info = self.dataset.file_schema_info
    if not schema_info:
        return err_resp('".file_schema_info" is empty')

    if 'name' not in schema_info:
        return err_resp(
            '"name" not found in ".file_schema_info" not found')

    file_dict = {'name': schema_info['name']}

    # Optional entries default to None
    for optional_key in ('identifier', 'fileFormat'):
        if optional_key in schema_info:
            file_dict[optional_key] = schema_info[optional_key]
        else:
            file_dict[optional_key] = None

    return ok_resp(file_dict)
def retrieve_analysis(analysis_object_id: str,
                      opendp_user: get_user_model()) -> BasicResponse:
    """
    Retrieve an existing AnalysisPlan object by its object_id and analyst.

    :param analysis_object_id: AnalysisPlan.object_id
    :param opendp_user: must be an instance of the user model and the
        analyst of the plan
    :return: ok_resp(<AnalysisPlan>, message='Plan retrieved!') on success;
        otherwise err_resp(<message>, data=status.HTTP_400_BAD_REQUEST)
    """
    if not analysis_object_id:
        return err_resp(astatic.ERR_MSG_ANALYSIS_ID_REQUIRED,
                        data=status.HTTP_400_BAD_REQUEST)

    if not isinstance(opendp_user, get_user_model()):
        return err_resp(astatic.ERR_MSG_USER_REQUIRED,
                        data=status.HTTP_400_BAD_REQUEST)

    # -------------------------------
    # Retrieve AnalysisPlan object
    # -------------------------------
    try:
        plan = AnalysisPlan.objects.get(object_id=analysis_object_id,
                                        analyst=opendp_user)
    except AnalysisPlan.DoesNotExist:
        return err_resp(astatic.ERR_MSG_NO_ANALYSIS_PLAN,
                        data=status.HTTP_400_BAD_REQUEST)

    # Nothing is created here, so the message says "retrieved"
    return ok_resp(plan, message='Plan retrieved!')