def inherit_metadata(self, reference_target_id):
    """
    using reference_target as the basis, copy metadata across to description targets
    :param reference_target_id:
    :return:
    """
    reference_description = DataFile().get_record(reference_target_id).get("description", dict())
    reference_attributes = reference_description.get("attributes", dict())
    reference_stages = reference_description.get("stages", list())

    for target in self.description_targets:
        # 'focus' on target
        self.set_datafile_id(target["recordID"])

        # use batch stages to update targets
        self.update_datafile_stage(reference_stages)

        # find and add attributes from the reference
        for k, v in reference_attributes.items():
            if k not in self.get_datafile_attributes():
                self.update_datafile_attributes({'ref': k, 'data': v})

    self.update_targets_datafiles()

    return
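# Illustrative usage sketch (hypothetical): inherit_metadata is driven by a helper object
# that has already been primed with description targets. The class name and the exact
# shape of description_targets below are assumptions for illustration only.
#
#   helper = DataFileDescriptionHelper()                      # hypothetical helper class
#   helper.description_targets = [{"recordID": "5cc0..."},    # datafiles to receive metadata
#                                 {"recordID": "5cc1..."}]
#   helper.inherit_metadata(reference_target_id="5cbf...")    # copy stages/attributes across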
def get_datafiles_rating(self):
    """
    function handles the evaluation of metadata rating for datafiles
    :return: list of datafiles with their associated metadata rating
    """
    datafiles_rating = list()

    for df_id in self.item_ids:
        default_rating = \
            d_utils.json_to_pytype(lkup.METADATA_RATING_TEMPLATE_LKUPS["rating_template"])["properties"][-1]

        item_rating = dict()
        item_rating["rating_level"] = default_rating["rating_level"]
        item_rating["rating_level_description"] = default_rating["rating_level_description"]

        d_r = dict(item_id=df_id, item_rating=item_rating)

        attributes = DataFile().get_record_property(df_id, "description_attributes")
        deposition_context = DataFile().get_record_property(df_id, "target_repository")

        if deposition_context:
            d_r["item_rating"] = self.rate_metadata(attributes, deposition_context)

        datafiles_rating.append(d_r)

    return datafiles_rating
def hash_upload(request):
    # utility method to create an md5 hash of a given file path

    # open uploaded file
    file_id = request.GET['file_id']
    print('hash started ' + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)
    file_name = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)

    # now hash opened file
    md5 = hashlib.md5()
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)

    file_obj.hash = md5.hexdigest()
    file_obj.save()

    output_dict = {'output_hash': md5.hexdigest(), 'file_id': file_id}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_hash")] = file_obj.hash

    profile_id = request.session['profile_id']
    component = "datafile"

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component=component,
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(output_dict)
    print('hash complete ' + file_id)
    return HttpResponse(out, content_type='json')
def get_datafiles_json(target_id=None):
    """
    returns all datafile records, or a single record when target_id is supplied
    :return:
    """
    from dal.copo_da import DataFile

    profile_id = get_current_request().session['profile_id']

    if target_id:
        datafiles = list()
        datafiles.append(DataFile().get_record(target_id))
    else:
        datafiles = DataFile(profile_id).get_all_records()

    value_field = str("id")
    label_field = str("datafile_name")
    search_field = ["id", "datafile_name"]
    secondary_label_field = ["meta_datafile_name"]

    elem_json = dict(value_field=value_field,
                     label_field=label_field,
                     secondary_label_field=secondary_label_field,
                     search_field=search_field,
                     options=list())

    for sd in datafiles:
        elem_json.get("options").append(
            {
                value_field: str(sd["_id"]),
                label_field: sd["name"],
                secondary_label_field[0]: sd["name"]
            })

    return elem_json
def generate_copo_datafiles_data(profile_id, data_file=None):
    d = DataFile(profile_id)

    # branch out, if a single record is provided
    if data_file:
        chunked_upload = ChunkedUpload.objects.get(id=int(data_file["file_id"]))
        row = [chunked_upload.filename, str(data_file["_id"])]
        return {"row_data": row, "table_id": "datafile_table"}

    datafiles = d.get_all_datafiles()

    # headers
    columns = [{"title": "File"}, {"title": " "}]
    dataSet = []

    # data
    for df in datafiles:
        # get details of the file from the file object
        chunked_upload = ChunkedUpload.objects.get(id=int(df["file_id"]))
        row = [chunked_upload.filename, str(df["_id"])]
        dataSet.append(row)

    # define action buttons for the table. ALWAYS include the class 'copo-dt' in className!!!
    action_buttons = [
        {'text': 'Describe', 'className': 'copo-dt btn btn-primary', 'iconClass': 'fa fa-tags'},
        {'text': 'Delete', 'className': 'copo-dt btn btn-danger', 'iconClass': 'fa fa-trash-o'}
    ]

    return {"columns": columns, "dataSet": dataSet, "table_id": "datafile_table", "action_buttons": action_buttons}
def submit(self, sub_id, dataFile_ids):
    # physically transfer files
    path2library = os.path.join(BASE_DIR, REPOSITORIES['ASPERA']['resource_path'])

    # change these to be collected properly
    user_name = REPOSITORIES['ASPERA']['user_token']
    password = REPOSITORIES['ASPERA']['password']

    # create transfer record
    transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']

    self.submission = Submission().get_record(sub_id)
    self.profile = Profile().get_record(self.submission['profile_id'])
    remote_path = d_utils.get_ena_remote_path(sub_id)

    # get each file in the bundle
    file_path = []
    for idx, f_id in enumerate(dataFile_ids):
        mongo_file = DataFile().get_record(f_id)
        self.d_files.append(mongo_file)
        file_path.append(mongo_file.get("file_location", str()))

    self._do_aspera_transfer(transfer_token=transfer_token,
                             user_name=user_name,
                             password=password,
                             remote_path=remote_path,
                             file_path=file_path,
                             path2library=path2library,
                             sub_id=sub_id)
def save_stage_data(auto_fields):
    d = DataFile()

    datafile_id = auto_fields["datafile"]
    current_stage = auto_fields["current_stage"]

    description_stages = d.GET(datafile_id)["description"]["stages"]
    stage = [elem for elem in description_stages if elem["ref"] == current_stage]

    # get schema for resolving ontology terms
    onto_schema = d_utils.json_to_pytype(lkup.DB_TEMPLATES["ONTOLOGY_ANNOTATION"])

    if stage:
        data = {}
        stage_items = stage[0]["items"]

        if stage_items:
            for sti in stage_items:
                # handle ontology term
                if sti["control"].lower() == "ontology term":
                    a = {}
                    for k in onto_schema["properties"]:
                        if sti["id"] + "." + k in auto_fields.keys():
                            a[k] = auto_fields[sti["id"] + "." + k]
                    data[sti["id"]] = a
                else:
                    data[sti["id"]] = auto_fields[sti["id"]]

        d.save_description_stage(datafile_id, {"ref": current_stage, "data": data})
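# Illustrative only: a hypothetical auto_fields payload for save_stage_data above, showing
# how dotted "ontology term" keys are grouped under their item id. The item id "organism",
# the stage ref and the ONTOLOGY_ANNOTATION property names used here are assumptions for
# illustration, not values taken from the codebase.
example_auto_fields = {
    "datafile": "5cc0f8...",             # datafile record id (placeholder)
    "current_stage": "sample_details",   # ref of the stage being saved
    "organism.annotationValue": "Homo sapiens",
    "organism.termAccession": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
    "organism.termSource": "NCBITAXON",
}
# save_stage_data would collapse the dotted keys and persist something like:
# {"ref": "sample_details", "data": {"organism": {"annotationValue": "...",
#                                                 "termAccession": "...",
#                                                 "termSource": "..."}}}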
def do_sanitise_submissions(self):
    records = self.da_object.get_all_records()

    for submission in records:
        if "bundle_meta" not in submission:
            bundle_meta = list()

            for file_id in submission.get("bundle", list()):
                datafile = DataFile().get_record(file_id)
                if datafile:
                    upload_status = False

                    if str(submission.get("complete", False)).lower() == 'true':
                        upload_status = True

                    bundle_meta.append(
                        dict(file_id=file_id,
                             file_path=datafile.get("file_location", str()),
                             upload_status=upload_status))

            submission["bundle_meta"] = bundle_meta
            submission['target_id'] = str(submission.pop('_id'))
            self.da_object.save_record(dict(), **submission)

    self.context["sanitise_status"] = True

    return self.context
def zip_file(request):
    # need to get a reference to the file to zip
    file_id = request.GET['file_id']
    print("zip started " + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)

    # get the name of the file to zip and change its suffix to .gz
    output_file_location = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)
    output_file_name = file_obj.filename + '.gz'

    try:
        # open the file as a gzip archive...set compression level
        temp_name = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()) + '.tmp')
        myzip = gzip.open(temp_name, 'wb', compresslevel=1)
        src = open(output_file_location, 'r')

        # write input file to gzip archive in n byte chunks
        n = 100000000
        for chunk in iter(lambda: src.read(n), ''):
            myzip.write(bytes(chunk, 'UTF-8'))
    finally:
        myzip.close()
        src.close()

    print('zip complete ' + file_id)

    # now need to delete the old file and update the file record with the new file
    new_file_name = output_file_location + '.gz'
    os.rename(temp_name, new_file_name)
    os.remove(output_file_location)

    # calculate new file size
    stats = os.stat(new_file_name)
    new_file_size = stats.st_size / 1000 / 1000

    # update filename
    file_obj.filename = output_file_name
    file_obj.file.name = new_file_name
    # update file size
    file_obj.offset = stats.st_size
    file_obj.save()

    out = {'zipped': True, 'file_name': output_file_name, 'file_size': new_file_size}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)
    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_size")] = u.filesize_toString(file_obj.offset)
    auto_fields[DataFile().get_qualified_field("name")] = output_file_name
    auto_fields[DataFile().get_qualified_field("file_location")] = new_file_name

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component="datafile",
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(out)
    return HttpResponse(out, content_type='text/plain')
def ena_description(auto_fields):
    # get current stage, output next stage
    stage_dict = {}

    datafile_id = auto_fields["datafile"]
    current_stage = auto_fields["current_stage"]

    static_list = d_utils.json_to_pytype(lkup.WIZARD_FILES["ena"])["properties"]
    unified_list = static_list

    # get stages from db if they exist. stages (both static and dynamic) are held in the db,
    # to provide a single or unified point of reference
    d = DataFile()
    description = d.GET(datafile_id)["description"]
    description_stages = description["stages"]

    if len(description_stages) > 0:
        unified_list = description_stages
    else:
        description["stages"] = unified_list
        fields = {"description": description}
        d.edit_datafile(datafile_id, fields)

    # generate and merge dynamic stages with static if not already generated
    if unified_list == static_list:  # only static stages exist, generate dynamic
        dynamic_elements = get_dynamic_elements_ena(datafile_id)  # ENA dynamic stages, contingent upon study_type

        if dynamic_elements:
            unified_list = unified_list + dynamic_elements
            # merge and save stages
            description["stages"] = unified_list
            fields = {"description": description}
            d.edit_datafile(datafile_id, fields)

    # now, resolve stages for the wizard
    next_stage_indx = 0
    listed_stage = [indx for indx, elem in enumerate(unified_list) if elem["ref"] == current_stage]
    if listed_stage:
        next_stage_indx = listed_stage[0] + 1

    try:
        elem = unified_list[next_stage_indx]
        if not is_stage_present(datafile_id, elem["ref"]):
            stage_dict = get_stage_display(elem, datafile_id)
    except:
        pass

    if not stage_dict and current_stage == unified_list[-1]["ref"]:
        # reached last stage of wizard, 'review' now
        # stage_dict = wtags.get_review_html(get_stages_display(datafile_id))
        pass

    return stage_dict
def setUpClass(cls):
    settings.UNIT_TESTING = True

    # create user
    cls.user = User.objects.create_user(username='******', first_name="jonny", last_name="appleseed",
                                        email='*****@*****.**', password='******')
    cls.user.save()

    # create profile
    p_dict = {"copo_id": "000000000", "description": "Test Description", "user_id": 1, "title": "Test Title"}
    cls.pid = Profile().save_record(dict(), **p_dict)

    # create datafile
    p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "dummy_datafile_cgcore.json")
    with open(p) as f:
        p_dict = json.loads(f.read())
    p_dict["file_location"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "fish.png")
    p_dict["name"] = "fish.png"
    profile = Profile().get_collection_handle().find_one({"copo_id": "000000000"})
    p_dict["profile_id"] = str(cls.pid["_id"])
    cls.d = DataFile().get_collection_handle().insert(p_dict)

    # create submission
    p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures",
                     "dummy_cgcore_dataverse_submission_existing.json")
    with open(p) as f:
        p_dict = json.loads(f.read())
    p_dict["bundle_meta"][0]["file_path"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures",
                                                         "fish.png")
    p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
    p_dict["profile_id"] = str(cls.pid["_id"])
    p_dict["bundle"].append(str(cls.d))
    cls.s_dv = Submission().get_collection_handle().insert(p_dict)
def get_info_for_new_dataverse(request):
    # method to prepopulate dataverse creation form with currently available metadata values
    out = dict()
    p_id = request.session['profile_id']
    profile = Profile().get_record(p_id)
    out['dvAlias'] = str(profile['title']).lower()

    person_list = list(Person(p_id).get_people_for_profile())
    out['dvPerson'] = person_list

    orcid = Orcid().get_orcid_profile(request.user)
    try:
        affiliation = orcid.get('op', {}).get('activities_summary', {}).get('employments', {}) \
            .get('employment_summary', {})[0].get('organization', "").get('name', "")
    except:
        affiliation = ""
    out['dsAffiliation'] = affiliation

    df = list(DataFile().get_for_profile(p_id))
    file = df[0]
    out['dvName'] = profile.get('title', "")
    out['dsTitle'] = file.get('description', {}).get('attributes', {}) \
        .get('title_author_contributor', {}).get('dcterms:title', "")
    out['dsDescriptionValue'] = file.get('description', {}).get('attributes', {}) \
        .get('subject_description', {}).get('dcterms:description', "")
    out['dsSubject'] = file.get('description', {}).get('attributes', {}) \
        .get('subject_description', {}).get('dcterms:subject', "")

    return HttpResponse(json_util.dumps(out))
def annotate_meta(request, file_id):
    if "ss_data" in request.session:
        del request.session["ss_data"]
    if "ss_sheet_names" in request.session:
        del request.session["ss_sheet_names"]

    df = DataFile().get_record(ObjectId(file_id))
    name = df["name"]

    if name.endswith(('xls', 'xlsx')):
        return render(request, 'copo/copo_annotate_spreadsheet.html',
                      {'file_id': file_id, 'file_name': name, 'file_type': "ss"})
    elif name.endswith("csv"):
        return render(request, 'copo/copo_annotate_spreadsheet.html',
                      {'file_id': file_id, 'file_name': name, 'file_type': "csv"})
    elif name.endswith(("txt", "tsv")):
        return render(request, 'copo/copo_annotate_spreadsheet.html',
                      {'file_id': file_id, 'file_name': name, 'file_type': "tab"})
    elif name.endswith('pdf'):
        html = ""
        records = Annotation().get_all_records()

        if "annotation_html" not in request.session:
            # if True:
            folder_name = str(uuid.uuid1())
            full_path = os.path.join(settings.MEDIA_ROOT, folder_name)
            os.makedirs(full_path)
            run("ebook-convert " + df["file_location"] + " " + full_path +
                " --no-images --pretty-print --insert-blank-line")
            with open(os.path.join(full_path, "index.html"), 'r') as f:
                html = f.read()
            shutil.rmtree(full_path)
            request.session["annotation_html"] = html
        else:
            print("using session text data")
            html = request.session["annotation_html"]

        return render(request, 'copo/copo_annotate_pdf.html',
                      {'html': html, 'file_id': file_id, 'file_name': name, "file_type": "pdf"})
def _make_dataset_xml(self, sub):
    meta = sub['meta']
    # iterate through meta to get fields
    d = dict()
    datafile = DataFile().get_record(ObjectId(sub['bundle'][0]))
    df = datafile['description']['attributes']

    xml = '<?xml version="1.0"?>'
    xml = xml + '<entry xmlns="http://www.w3.org/2005/Atom" xmlns:dcterms="http://purl.org/dc/terms/">'
    xml = xml + '<dcterms:contributor>' + "*****@*****.**" + '</dcterms:contributor>'

    for item in meta["fields"]:
        if type(item["vals"]) == type(""):
            tail = item["dc"].split(".")[1]
            xml = xml + "<dcterms:" + tail + '>' + item["vals"] + "</dcterms:" + tail + '>'
        elif type(item["vals"]) == type(list()):
            for val in item["vals"]:
                tail = item["dc"].split(".")[1]
                xml = xml + '<dcterms:' + tail + '>' + val + '</dcterms:' + tail + '>'

    xml = xml + "</entry>"

    path = os.path.dirname(datafile['file_location'])
    xml_path = os.path.join(path, 'xml.xml')
    with open(xml_path, 'w+') as f:
        f.write(xml)

    return xml_path
def do_submission_xml(sub_id):
    sub = Submission().get_record(sub_id)
    dfs = list()
    for d in sub["bundle"]:
        dfs.append(DataFile().get_record(d))
    df = dfs[0]

    submission = Element("SUBMISSION")

    # get names of files in bundle and append here

    # do alias
    alias = make_alias(sub)
    submission.set("alias", alias + "_sub")
    submission.set("broker_name", df["description"]["attributes"]["study_type"]["study_broker"])
    submission.set("center_name", df["description"]["attributes"]["study_type"]["study_analysis_center_name"])
    submission_date = datetime.datetime.now().isoformat()
    submission.set("submission_date", submission_date)
    submission.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
    submission.set("xsi:noNamespaceSchemaLocation", "ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.submission.xsd")

    contacts = Element("CONTACTS")
    copo_contact = Element("CONTACT")
    copo_contact.set("inform_on_error", "*****@*****.**")
    copo_contact.set("inform_on_status", "*****@*****.**")
    copo_contact.set("name", "COPO Support")
    contacts.append(copo_contact)

    people = Person(sub["profile_id"]).get_people_for_profile()
    for p in people:
        c = Element("CONTACT")
        c.set("name", p["firstName"] + " " + p["lastName"])
        if [x for x in p["roles"] if x["annotationValue"] == "SRA Inform On Status"]:
            c.set("inform_on_status", p["email"])
        if [x for x in p["roles"] if x["annotationValue"] == "SRA Inform On Error"]:
            c.set("inform_on_error", p["email"])
        contacts.append(c)

    submission.append(contacts)

    actions = Element("ACTIONS")
    action = Element("ACTION")
    add = Element("ADD")
    add.set("schema", "analysis")
    add.set("source", "analysis.xml")
    action.append(add)
    actions.append(action)
    submission.append(actions)

    return prettify(submission)
def make_alias(sub):
    bundle = sub['bundle']
    filenames = ""
    for b in bundle:
        file = DataFile().get_record(b)
        filenames = filenames + "-" + file['name']
    alias = str(sub["_id"]) + ':' + sub['repository'] + ":" + filenames
    return alias
def tearDownClass(cls):
    u = User.objects.get(pk=1)
    u.delete()
    Profile().get_collection_handle().remove({"copo_id": "000000000"})
    DataFile().get_collection_handle().remove({"_id": cls.d})
    # Submission().get_collection_handle().remove({"_id": cls.s_dv})
    Submission().get_collection_handle().remove({"_id": cls.s_ckan_new})
    Submission().get_collection_handle().remove({"_id": cls.s_ckan_existing})
def tearDownClass(cls):
    u = User.objects.get(username=settings.TEST_USER_NAME)
    u.delete()
    Profile().get_collection_handle().remove({"copo_id": "000000000"})
    DataFile().get_collection_handle().remove({"test_file": True})
    Repository().get_collection_handle().remove({"_id": cls.r["_id"]})
    Submission().get_collection_handle().remove({"_id": cls.s_dv})
    Submission().get_collection_handle().remove({"_id": cls.s_ds_new})
    Submission().get_collection_handle().remove({"_id": cls.s_ds_existing})
def send_files(self, sub, ds):
    for id in sub['bundle']:
        file = DataFile().get_record(ObjectId(id))
        file_location = file['file_location']
        file_name = file['name']
        with open(file_location, 'rb') as f:
            contents = f.read()
            ds.upload_file(file_name, contents, zip_files=False)
def do_description_summary(self):
    record = DataFile().get_record(self.param_dict.get("target_id"))
    self.context['description'] = htags.resolve_description_data(record.get("description", dict()), dict())

    description_token = record.get('description_token', str())
    self.context['description']['description_record'] = dict()

    if description_token:
        description_record = Description().GET(description_token)
        if description_record:
            if not description_record["name"]:
                description_record["name"] = "N/A"
            self.context['description']['description_record'] = dict(name=description_record["name"],
                                                                     id=str(description_record["_id"]))

    return self.context
def automate_num_cols(request):
    file_id = request.GET.get("file_id", "")
    file_obj = DataFile().get_record(file_id)

    try:
        d = pandas.read_csv(file_obj["file_location"], nrows=4)
    except UnicodeDecodeError as e:
        d = pandas.read_excel(file_obj["file_location"], nrows=4)

    headers = d.columns.values.tolist()
    cols = len(d.columns)
    output = {"num": cols, "headers": headers}
    return HttpResponse(json.dumps(output))
def submit(self, sub_id, dataFile_ids):
    submission_record = Submission().get_record(sub_id)

    # bundle_meta, if present, should provide a better picture of what datafiles need to be uploaded
    if "bundle_meta" in submission_record:
        pending_files = [x["file_id"] for x in submission_record['bundle_meta'] if not x["upload_status"]]
        dataFile_ids = pending_files

    # physically transfer files
    path2library = os.path.join(BASE_DIR, REPOSITORIES['ASPERA']['resource_path'])

    # change these to be collected properly
    user_name = REPOSITORIES['ASPERA']['user_token']
    password = REPOSITORIES['ASPERA']['password']

    # create transfer record
    transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']

    self.submission = Submission().get_record(sub_id)
    self.profile = Profile().get_record(self.submission['profile_id'])
    remote_path = d_utils.get_ena_remote_path(sub_id)

    # get each file in the bundle
    file_path = []
    for idx, f_id in enumerate(dataFile_ids):
        mongo_file = DataFile().get_record(ObjectId(f_id))
        self.d_files.append(mongo_file)
        file_path.append(mongo_file.get("file_location", str()))

    case = self._do_aspera_transfer(transfer_token=transfer_token,
                                    user_name=user_name,
                                    password=password,
                                    remote_path=remote_path,
                                    file_path=file_path,
                                    path2library=path2library,
                                    sub_id=sub_id)
    return case
def delete_annotation(request):
    col_idx = request.GET["col_idx"]
    sheet_name = request.GET["sheet_name"]
    file_id = request.GET["file_id"]
    iri = request.GET["iri"]
    uid = request.user.id
    doc = Annotation().decrement_or_delete_annotation(uid, iri)
    doc = DataFile().delete_annotation(col_idx=col_idx, sheet_name=sheet_name, file_id=file_id)
    return HttpResponse("Hello World")
def do_study_xml(sub_id):
    # get submission object from mongo
    sub = Submission().get_record(sub_id)

    # get datafile objects
    dfs = list()
    for d in sub["bundle"]:
        dfs.append(DataFile().get_record(d))
    df = dfs[0]

    # get profile object
    p = Profile().get_record(df["profile_id"])

    # Do STUDY_SET
    study_set = Element("STUDY_SET")
    study_set.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
    study_set.set("xsi:noNamespaceSchemaLocation", "ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.study.xsd")

    # Do STUDY
    study = Element("STUDY")
    study.set("alias", str(sub["_id"]))
    study.set("center_name", df["description"]["attributes"]["study_type"]["study_analysis_center_name"])
    study_set.append(study)

    # Do DESCRIPTOR
    descriptor = Element("DESCRIPTOR")
    # create element, append to parent and add text
    SubElement(descriptor, "STUDY_TITLE").text = p["title"]

    study_type = Element("STUDY_TYPE")
    es = get_study_type_enumeration(df["description"]["attributes"]["study_type"]["study_type"])
    # es = df["description"]["attributes"]["study_type"]["study_type"]
    study_type.set("existing_study_type", es)
    descriptor.append(study_type)

    SubElement(descriptor, "STUDY_ABSTRACT").text = p["description"]
    study.append(descriptor)

    # Do STUDY_ATTRIBUTES
    study_attributes = Element("STUDY_ATTRIBUTES")

    # do attribute for date
    study_attribute = Element("STUDY_ATTRIBUTE")
    SubElement(study_attribute, "TAG").text = "Submission Date"
    SubElement(study_attribute, "VALUE").text = datetime.datetime.now().strftime('%Y-%m-%d')
    study_attributes.append(study_attribute)

    # here we can loop to add other STUDY_ATTRIBUTES

    study.append(study_attributes)

    return prettify(study_set)
def refresh_display(request):
    file_id = request.GET["file_id"]
    file = DataFile().get_record(file_id)
    path = file["file_location"]
    data = list()
    filetype = None

    if file["name"].endswith("csv"):
        filetype = "csv"
    elif file["name"].endswith(("txt", "tsv")):
        filetype = "tab"
    elif file["name"].endswith(("xls", "xlsx")):
        filetype = "xls"

    if "ss_data" in request.session:
        # if data previously loaded then just load from session
        data = json_util.loads(request.session["ss_data"])
        sheet_names = json_util.loads(request.session["ss_sheet_names"])
    else:
        try:
            sheet_names = pandas.ExcelFile(path).sheet_names
        except Exception as e:
            # support CSV here (N.B. CSV does not support multiple sheets)
            sheet_names = [file["name"]]

        # read entire spreadsheet
        if filetype == "xls":
            for name in sheet_names:
                d = pandas.read_excel(path, sheet_name=name, nrows=4).fillna(0)
                out = list()
                out.append(d.columns.tolist())
                out.extend(d.values.tolist())
                data.append(out)
            try:
                request.session["ss_data"] = json_util.dumps(data)
                request.session["ss_sheet_names"] = json_util.dumps(sheet_names)
            except:
                pass
        elif filetype == "csv":
            d = pandas.read_csv(path, nrows=4)
            d = d.fillna('')
            out = list()
            out.append(d.columns.tolist())
            out.extend(d.values.tolist())
            data.append(out)
        elif filetype == "tab":
            d = pandas.read_csv(path, sep='\t', nrows=4)
            d = d.fillna('')
            out = list()
            out.append(d.columns.tolist())
            out.extend(d.values.tolist())
            data.append(out)

    return HttpResponse(json_util.dumps({"data": data, "names": sheet_names}))
def do_un_describe(self):
    datafile_ids = [ObjectId(i) for i in self.param_dict.get("datafile_ids")]

    DataFile().get_collection_handle().update_many(
        {"_id": {"$in": datafile_ids}},
        {"$set": {"description": dict()}})

    return self.context
def resolve_submission_id(request, submission_id):
    sub = Submission().get_record(submission_id)

    # get all file metadata
    output = dict()
    files = list()
    for f in sub.get("bundle", list()):
        file = DataFile().get_record(f)
        files.append(file["description"]["attributes"])

    output["files"] = files
    output["accessions"] = sub["accessions"]
    output["metadata"] = {}
    output["metadata"]["dc"] = sub["meta"]["fields"]

    return HttpResponse(j.dumps(output))
def do_assembly_submission(self, sub_id, remote_path, transfer_token):
    # make dir for manifest file
    conv_dir = os.path.join(self._dir, sub_id)
    if not os.path.exists(conv_dir):
        os.makedirs(conv_dir)

    # file for metadata
    sub = Submission().get_record(sub_id)
    datafile = DataFile().get_record(sub["bundle"][0])
    metadata = datafile["description"]["attributes"]["study_type"]

    # make manifest
    with open(os.path.join(conv_dir, "manifest.manifest"), 'w+') as manifest:
        for key in metadata.keys():
            line = key.upper() + "\t" + metadata[key] + "\n"
            manifest.write(line)

        agp_flag = False
        fasta_flag = False
        for f in sub["bundle"]:
            file = DataFile().get_record(ObjectId(f))
            if file["name"].endswith("fasta"):
                fasta = "FASTA\t" + file["file_location"] + "\n"
                fasta_flag = True
                manifest.write(fasta)
            if file["name"].endswith('agp'):
                agp = "AGP\t" + file["file_location"] + "\n"
                agp_flag = True
                manifest.write(agp)

    if agp_flag and fasta_flag:
        # proceed to submission
        pass
    else:
        return {"status": 428, "message": "You must supply an AGP file and a FASTA file"}
def resolve_deposition_context(self):
    """
    this returns an inferred deposition destination for a datafile.
    we assume here that the target destination of the file can be inferred based on its type
    :param:
    :return string destination:
    """
    # get file details
    datafile = DataFile().get_record(self.datafile_id)
    ft = datafile.get("file_type", "unknown")
    if ft == '':
        ft = 'unknown'

    deposition_context = 'default'

    # match against documented destinations
    for k, v in lkup.REPO_FILE_EXTENSIONS.items():
        if ft in v:
            deposition_context = k
            break

    return deposition_context
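# Illustrative sketch only: a minimal, hypothetical lookup of the shape
# resolve_deposition_context above iterates over -- repository keys mapped to the file
# types they accept. The real keys and extensions live in lkup.REPO_FILE_EXTENSIONS and
# may differ from what is shown here.
EXAMPLE_REPO_FILE_EXTENSIONS = {
    "ena": ["fastq", "fastq.gz", "bam", "cram"],   # assumed sequence-read types
    "figshare": ["png", "jpg", "pdf"],             # assumed figure/document types
}
# With such a mapping, a datafile whose file_type is "bam" would resolve to "ena",
# while an unrecognised type falls back to the 'default' deposition context.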
def send_file_annotation(request):
    col_idx = request.POST["col_idx"]
    sheet_name = request.POST["sheet_name"]
    col_header = request.POST["col_header"]
    iri = request.POST["iri"]
    label = request.POST["label"]
    id = request.POST["id"]
    obo_id = request.POST.get("obo_id", "")
    ontology_name = request.POST["ontology_name"]
    ontology_prefix = request.POST["ontology_prefix"]
    short_form = request.POST["short_form"]
    type = request.POST["type"]
    file_id = request.POST["file_id"]
    file_name = request.POST["file_name"]
    description = request.POST["description"]

    data = {
        "column_idx": col_idx,
        "column_header": col_header,
        "sheet_name": sheet_name,
        "iri": iri,
        "obo_id": obo_id,
        "label": label,
        "id": id,
        "ontology_name": ontology_name,
        "ontology_prefix": ontology_prefix,
        "short_form": short_form,
        "type": type,
        "description": description,
        "uid": request.user.id,
        "file_id": file_id,
        "file_name": file_name
    }

    if Annotation().add_or_increment_term(data):
        annotations = DataFile().update_file_level_metadata(file_id, data)
    else:
        annotations = {"status": 500, "message": "Could not add annotation"}

    return HttpResponse(json_util.dumps({"annotation": annotations}))
def data_wiz(request):
    context = {}
    step = int(request.POST['step'])
    datafile = request.POST['datafile']

    d = DataFile()

    dispatch_description = {
        'ena': wizh.ena_description,
        'figshare': wizh.figshare_description,
        'default': wizh.default_description
    }

    if step == 1:
        # first stage in the process where the wizard has only just been initiated.
        # infer the deposition context (target repository) based on the file:
        # if we can infer the context, switch immediately to that, else request destination from user
        description_attributes = d.GET(datafile)['description']['attributes']

        try:
            deposition_context = description_attributes['deposition_context']['deposition_context']
        except:
            deposition_context = None

        if not deposition_context:
            # try to resolve, and save deposition context as an implicit stage
            deposition_context = wizh.get_deposition_context(datafile)
            if deposition_context:
                d.save_description_stage(datafile, {'ref': 'deposition_context',
                                                    'data': {'deposition_context': deposition_context}})

        if deposition_context:
            # as there might be previous description data,
            # we want to be able to load this, else follow the step-wise wizard path
            if len(description_attributes) > 1:
                context['stages'] = wizh.get_stages_display(datafile)
            else:
                # follow the step-wise wizard path
                auto_fields = {"current_stage": "deposition_context", "datafile": datafile}
                context['stage'] = dispatch_description[deposition_context](auto_fields)
        else:
            # if we couldn't infer the deposition context, ask for the user's intervention
            df_wizard_dict = copy.deepcopy(lkup.DF_WIZARD)
            context['stage'] = {
                "title": "Verify Destination",
                "content": wtags.get_deposition_html(deposition_context)
            }

    elif step >= 2:
        auto_fields = ast.literal_eval(request.POST['auto_fields'])

        deposition_context = auto_fields["deposition_context"]
        if deposition_context == "":
            deposition_context = "default"

        auto_fields["datafile"] = datafile
        context['stage'] = dispatch_description[deposition_context](auto_fields)

    elif step == -1:
        # save all stages
        auto_fields = ast.literal_eval(request.POST['auto_fields'])

        # get the deposition context from the first element
        for a_f in auto_fields:
            auto_field = ast.literal_eval(a_f)
            auto_field["datafile"] = datafile
            wizh.save_stage_data(auto_field)

    out = jsonpickle.encode(context)
    return HttpResponse(out, content_type='json')
def refresh_annotations(request):
    file_id = request.GET["file_id"]
    sheet_name = request.GET["sheet_name"]
    annotations = DataFile().get_file_level_metadata_for_sheet(file_id, sheet_name)
    return HttpResponse(json_util.dumps({"annotations": annotations}))
def _submit(self, sub_id, dataFile_ids):
    for f_id in dataFile_ids:
        mongo_file = DataFile().get_record(f_id)
        c = ChunkedUpload.objects.get(pk=int(mongo_file["file_id"]))
        file_path = os.path.join(self.MEDIA_ROOT, str(c.file))
        orig_name = c.filename

        sub = mongo_file['description']['attributes']

        data = dict()
        data['defined_type'] = sub.get('type_category', dict()).get('type')
        data['title'] = sub.get('title_author_description', dict()).get('title')

        authors = sub.get('title_author_description', dict()).get('author').split(',')
        lst = list()
        for x in authors:
            lst.append({'name': x})
        data['authors'] = lst

        data['description'] = sub.get('title_author_description', dict()).get('description')

        cat = sub.get('type_category', dict()).get('categories')
        if cat:
            cat = cat.split(',')
            cat = list(map(int, cat))
            data['categories'] = cat
        else:
            data['categories'] = list()

        data['tags'] = sub.get('tags', dict()).get('keywords').split(',')
        for idx, t in enumerate(data['tags']):
            if len(t) < 3:
                if len(t) == 1:
                    t = t + (2 * t)
                elif len(t) == 2:
                    t = t + t
                data['tags'][idx] = t

        data['references'] = sub.get('tags', dict()).get('references').split(',')
        for idx, x in enumerate(data['references']):
            if x != '':
                if (not x.startswith('http')) or (not x.startswith('https')):
                    if not x.startswith('www'):
                        data['references'][idx] = 'http://www.' + x
                    else:
                        data['references'][idx] = 'http://' + x
        if len(data['references']) == 1 and data['references'][0] == '':
            # if blank ref, pop
            data.pop('references')

        data['funding'] = sub.get('tags', dict()).get('funding')
        data['licenses'] = sub.get('tags', dict()).get('licenses')
        data['publish'] = sub.get('figshare_publish', dict()).get('should_publish')

        # Create article
        # data = json.dumps({'title': orig_name, 'defined_type': 'figure'})
        endpoint = 'account/articles'
        resp = requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS,
                             data=json.dumps(data))
        article_id = json.loads(resp.content.decode('utf8'))['location'].rsplit('/', 1)[1]

        # Get file info
        # with open(file_path, 'rb') as fin:
        #     fin.seek(0, 2)  # Go to end of file
        #     size = fin.tell()
        size = c.offset
        info = json.dumps({'name': orig_name, 'size': size})

        # Initiate upload
        endpoint = 'account/articles/{}/files'.format(article_id)
        resp = requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS, data=info)
        file_id = json.loads(resp.content.decode('utf-8'))['location'].rsplit('/', 1)[1]

        # Get upload/parts info
        endpoint = 'account/articles/{}/files/{}'.format(article_id, file_id)
        resp = requests.get(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)
        url = '{upload_url}'.format(**json.loads(resp.content.decode('utf-8')))
        parts = json.loads(requests.get(url).content.decode('utf-8'))['parts']

        # start upload timer
        t = datetime.datetime.now()

        # Upload parts
        with open(file_path, 'rb') as fin:
            for idx, part in enumerate(parts):
                percent_done = idx / len(parts) * 100
                size = part['endOffset'] - part['startOffset'] + 1
                address = '{}/{}'.format(url, part['partNo'])
                x = datetime.datetime.now()
                requests.put(address, data=fin.read(size))
                delta = datetime.datetime.now() - x
                # calculate current upload rate in MB per second
                bw = (size / delta.total_seconds()) / 1000 / 1000
                fields = {
                    'transfer_rate': bw,
                    'pct_completed': percent_done
                }
                RemoteDataFile().update_transfer(self.transfer_token, fields)

        # Mark file upload as completed
        upload_time = datetime.datetime.now() - t
        requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)
        fields = {
            'pct_completed': 100,
            'transfer_status': 'success',
            'completed_on': str(datetime.datetime.now()),
            'article_id': article_id
        }
        RemoteDataFile().update_transfer(self.transfer_token, fields)

        if data['publish'] == 'True':
            # publish api
            endpoint = 'account/articles/{}/publish'.format(article_id)
            resp = requests.post(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)
            location = json.loads(resp.content.decode('utf8'))['location']

            # get accession data
            endpoint = 'articles/{}'.format(article_id)
            resp = requests.get(self.BASE_URL.format(endpoint=endpoint), headers=self.HEADERS)

            # save accessions to mongo profile record
            s = Submission().get_record(sub_id)
            s['article_id'] = json.loads(resp.content.decode('utf8'))['figshare_url']
            s['complete'] = True
            s['status'] = 'published'
            s['target_id'] = str(s.pop('_id'))
            Submission().save_record(dict(), **s)
        else:
            # save accessions to mongo profile record
            s = Submission().get_record(sub_id)
            s['article_id'] = article_id
            s['complete'] = True
            s['status'] = 'not published'
            s['target_id'] = str(s.pop('_id'))
            Submission().save_record(dict(), **s)

    # mark submission as complete
    Submission().mark_submission_complete(sub_id, article_id=article_id)
    Submission().mark_submission_complete(sub_id)
    Submission().mark_figshare_article_id(sub_id=sub_id, article_id=article_id)
def test_get_datafile(self):
    df = DataFile().get_record(self.d)
    self.assertEquals(df["name"], "fish.png")
def setUpClass(cls):
    cls.factory = RequestFactory()
    settings.UNIT_TESTING = True

    # create user
    cls.user = User.objects.create_user(username='******', first_name=settings.TEST_USER_NAME,
                                        last_name="appleseed", email='*****@*****.**', password='******')
    cls.user.save()

    # create profile
    p_dict = {
        "copo_id": "000000000",
        "description": "Test Description",
        "user_id": cls.user.id,
        "title": "Test Title"
    }
    cls.pid = Profile().save_record(dict(), **p_dict)

    # create datafile
    p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "dummy_datafile.json")
    with open(p) as f:
        p_dict = json.loads(f.read())
    p_dict["file_location"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "fish.png")
    p_dict["name"] = "fish.png"
    profile = Profile().get_collection_handle().find_one({"copo_id": "000000000"})
    p_dict["profile_id"] = str(cls.pid["_id"])
    cls.d = DataFile().get_collection_handle().insert(p_dict)

    # create dataverse repository
    p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "dummy_dataverse_repo.json")
    with open(p) as f:
        p_dict = json.loads(f.read())
    cls.r = Repository().save_record(dict(), **p_dict)

    # create submission record for dataverse
    p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "dummy_dataverse_submission.json")
    with open(p) as f:
        p_dict = json.loads(f.read())
    p_dict["bundle_meta"][0]["file_path"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures",
                                                         "fish.png")
    p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
    p_dict["profile_id"] = str(cls.pid["_id"])
    p_dict["bundle"].append(str(cls.d))
    cls.s_dv = Submission().get_collection_handle().insert(p_dict)

    # create submission record for new dspace
    p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "dummy_dspace_submission.json")
    with open(p) as f:
        p_dict = json.loads(f.read())
    p_dict["bundle_meta"][0]["file_path"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures",
                                                         "fish.png")
    p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
    p_dict["profile_id"] = str(cls.pid["_id"])
    p_dict["bundle"].append(str(cls.d))
    p_dict["meta"]["new_or_existing"] = "new"

    # query for collection id
    resp = requests.post("http://demo.dspace.org/rest/collections")
    collections = json.loads(resp.content.decode("utf-8"))
    collection = collections[0]
    p_dict["meta"]["identifier"] = collection["uuid"]
    cls.s_ds_new = Submission().get_collection_handle().insert(p_dict)

    # create submission record for existing dspace
    p = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "dummy_dspace_submission.json")
    with open(p) as f:
        p_dict = json.loads(f.read())
    p_dict["bundle_meta"][0]["file_path"] = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures",
                                                         "fish.png")
    p_dict["bundle_meta"][0]["file_id"] = str(cls.d)
    p_dict["profile_id"] = str(cls.pid["_id"])
    p_dict["bundle"].append(str(cls.d))
    p_dict["meta"]["new_or_existing"] = "existing"

    # query for item id
    resp = requests.post("http://demo.dspace.org/rest/items")
    items = json.loads(resp.content.decode("utf-8"))
    item = items[0]
    p_dict["meta"]["identifier"] = item["uuid"]
    p_dict["item_id"] = item["uuid"]
    cls.s_ds_existing = Submission().get_collection_handle().insert(p_dict)

    cls.ckan_api = "http://demo.ckan.org/api/3/action/"
def update_targets_datafiles(self):
    bulk = DataFile().get_collection_handle().initialize_unordered_bulk_op()

    for k, v in self.targets_datafiles.items():
        bulk.find({'_id': ObjectId(k)}).update({'$set': {"description": v.get("description", dict())}})

    bulk.execute()
def extract_repo_fields(self, datafile_id=str(), repo=str()):
    """
    given a datafile id and a repository type, function returns a list of dictionaries of fields matching the repo
    :param datafile_id:
    :param repo:
    :return:
    """
    from dal.copo_da import DataFile, CGCore
    from dal.copo_base_da import DataSchemas

    if not repo:  # no repository to filter by
        return list()

    repo_type_option = lkup.DROP_DOWNS["REPO_TYPE_OPTIONS"]
    repo_type_option = [x for x in repo_type_option if x["value"].lower() == repo.lower()]

    if not repo_type_option:
        return list()

    repo_type_option = repo_type_option[0]

    cg_schema = DataSchemas("COPO").get_ui_template_node('cgCore')

    # filter schema items by repo
    cg_schema = [x for x in cg_schema if
                 x.get("target_repo", str()).strip() != str() and
                 repo_type_option.get("abbreviation", str()) in
                 [y.strip() for y in x.get("target_repo").split(',')]]

    record = DataFile().get_record(datafile_id)
    description = record.get("description", dict())

    attributes = description.get("attributes", dict())
    stages = description.get("stages", list())

    schema_df = pd.DataFrame(cg_schema)
    schema_df.id = schema_df.id.str.lower().str.split(".").str[-1]
    schema_df.index = schema_df.id
    schema_df = schema_df[['ref', 'id', 'prefix']]
    schema_df = schema_df[~schema_df['ref'].isna()]

    # get all stage items
    all_items = [item for st in stages for item in st.get("items", list())]

    # filter stage items - stage items should conform to specifications of the repo
    schema_ids = list(schema_df.id)
    items = {item.get("id", str()).lower().split(".")[-1]: st.get("ref", "").lower()
             for st in stages for item in st.get("items", list())
             if item.get("id", str()).lower().split(".")[-1] in schema_ids}

    # ...also, account for any filtering performed by client agents (e.g., dependencies in COPO Wizard),
    # within the context of the target repo
    schema_df = schema_df[schema_df.index.isin(items.keys())]

    # obtain attributes for filtered stage items
    target_stages = list(set(items.values()))
    datafile_attributes = [v for k, v in attributes.items() if k in target_stages]

    new_dict = dict()
    for d in datafile_attributes:
        new_dict.update(d)

    new_dict_series = pd.Series(new_dict)
    new_dict_series.index = new_dict_series.index.str.lower()
    schema_df['vals'] = new_dict_series
    schema_df['vals'] = schema_df['vals'].fillna('')

    schema_df = schema_df[['ref', 'id', 'vals', 'prefix']]

    # get composite attributes
    composite_attrib = [x for x in all_items if x["id"] in list(schema_df.id) and x.get("create_new_item", False)]

    # expand composite attributes
    for cattrib in composite_attrib:
        comp_series = schema_df.loc[cattrib["id"]]
        schema_df = schema_df[~schema_df.id.isin([cattrib["id"]])]
        children_schemas = [x for x in cg_schema if
                            x.get("dependency", str()).lower() == comp_series.ref.lower()]

        accessions = comp_series.vals
        if isinstance(accessions, str):
            accessions = accessions.split(",")

        object_ids = [ObjectId(x) for x in accessions if x.strip()]

        records = list()
        if len(object_ids):
            records = cursor_to_list(CGCore().get_collection_handle().find({"_id": {"$in": object_ids}}))

        attr_list = list()
        for child in children_schemas:
            child_dict = dict(ref=child["ref"], id=child["id"].split(".")[-1], prefix=child["prefix"], vals=[])
            attr_list.append(child_dict)
            for rec in records:
                child_dict["vals"].append(rec.get(child_dict["id"], str()))

        if attr_list:
            attr_df = pd.DataFrame(attr_list)
            attr_df.index = attr_df.id
            schema_df = pd.concat([schema_df, attr_df])

    schema_df.rename(index=str, columns={"ref": "dc", "id": "copo_id"}, inplace=True)

    dc_list = schema_df.to_dict('records')

    return dc_list
def submit(self, sub_id, dataFile_ids=None):
    s = Submission().get_record(ObjectId(sub_id))

    if s["meta"]["new_or_existing"] == "new":
        # create and get item_id
        data = self._create_ckan_metadata(s)
        fullurl = self.host["url"] + "package_create"
        resp = requests.post(fullurl, json=data, headers=self.headers)

        if resp.status_code == 200:
            # package was created normally
            data = json.loads(resp.content.decode("utf-8"))
            dataset_id = data["result"]["id"]
            data = {"package_id": dataset_id}
            fullurl = self.host["url"] + "resource_create"
        elif resp.status_code == 400:
            instance = re.findall("https", fullurl)
            if len(instance) == 0:
                fullurl = fullurl.replace("http", "https")
                resp = requests.post(fullurl, json=data, headers=self.headers)
                if resp.status_code != 200:
                    details = json.loads(resp.content.decode("utf-8"))
                    try:
                        msg = details["error"]["message"]
                    except KeyError:
                        msg = details["error"]["name"][0]
                    return json.dumps({"status": resp.status_code, "message": msg})
                else:
                    data = json.loads(resp.content.decode("utf-8"))
                    dataset_id = data["result"]["id"]
                    data = {"package_id": dataset_id}
                    fullurl = self.host["url"] + "resource_create"
        elif resp.status_code == 409:
            # there is a conflict so update rather than create
            print(resp.reason)
            fullurl = self.host["url"] + "package_show"
            resp = requests.post(fullurl, json={"name_or_id": data["name"]})
            data = json.loads(resp.content.decode("utf-8"))
            dataset_id = data["result"]["id"]
            data = {"package_id": dataset_id}
            fullurl = self.host["url"] + "resource_create"
        else:
            return json.dumps({"status": resp.status_code, "message": resp.reason + " - " + resp.text})
    else:
        data = {"package_id": s["meta"]["identifier"]}

    # now we have a dataset id to which to add the datafile
    for f in s["bundle"]:
        # data = dict()
        df = DataFile().get_record(ObjectId(f))

        # upload file
        # get correct bitstream file extension lookup
        try:
            filename, file_extension = os.path.splitext(df["name"])
            if "." in file_extension:
                file_extension = file_extension.replace(".", "")
            ext = self.get_media_type_from_file_ext(file_extension)
        except:
            ext = ""

        now = str(datetime.date.today())
        print(df["name"])
        data["name"] = df["name"]
        data["created"] = now
        data["mimetype"] = ext

        fullurl = self.host["url"] + "resource_create"
        url = parse.urlparse(self.host["url"])
        # data["url"] = urljoin(self.hostname, "dataset/" + str(uuid.uuid4()))

        with open(df["file_location"], 'rb') as f:
            files = [('upload', (df["name"], f, ext))]
            # data["upload"] = files
            try:
                print(self.headers)
                resp = requests.post(fullurl, data=data, files=files, headers=self.headers)
                # print(resp.json()['headers'])
            except (TypeError, ValueError) as e:
                print(e)
                # for some reason this fails the first time
                resp = requests.post(fullurl, data=data, files=files, headers=self.headers)
            except TypeError as t:
                print(t)

        if resp.status_code == 200:
            req = ThreadLocal.get_current_request()
            details = json.loads(resp.content.decode("utf-8"))
            details["result"]["repo_url"] = self.host["url"]
            # details["result"]["url"] = req.build_absolute_uri("/") + "rest/get_accession_data?sub_id=" + sub_id
            self._update_and_complete_submission(details, sub_id)
        elif resp.status_code == 400:
            # try again checking for https
            instance = re.findall("https", fullurl)
            if len(instance) == 0:
                fullurl = fullurl.replace("http", "https")
                resp = requests.post(fullurl, data=data, files=f, headers=self.headers)
                if resp.status_code != 200:
                    msg = json.loads(resp.content.decode("utf-8"))["error"]["message"]
                    return {"status": resp.status_code, "message": msg}
                details = json.loads(resp.content.decode("utf-8"))
                details["result"]["repo_url"] = self.host["url"]
                self._update_and_complete_submission(details, sub_id)
        elif resp.status_code == 409:
            fullurl = self.host["url"] + "package_show"
            resp = requests.post(fullurl, data={"id": dataset_id})
            # now iterate through resources to get matching name
            resources = json.loads(resp.content.decode("utf-8"))["result"]["resources"]
            fullurl = self.host["url"] + "resource_update"
            # Submission().mark_submission_complete(ObjectId(sub_id))
        else:
            return json.dumps({"status": resp.status_code, "message": resp.reason + " - " + resp.text})

    Submission().mark_submission_complete(ObjectId(sub_id))
    return True