def inherit_metadata(self, reference_target_id):
    """
    using reference_target as the basis, copy metadata across to description targets
    :param reference_target_id:
    :return:
    """
    reference_description = DataFile().get_record(reference_target_id).get("description", dict())

    reference_attributes = reference_description.get("attributes", dict())
    reference_stages = reference_description.get("stages", list())

    for target in self.description_targets:
        # 'focus' on target
        self.set_datafile_id(target["recordID"])

        # use batch stages to update targets
        self.update_datafile_stage(reference_stages)

        # copy across any reference attributes missing from the target
        for k, v in reference_attributes.items():
            if k not in self.get_datafile_attributes():
                self.update_datafile_attributes({'ref': k, 'data': v})

    self.update_targets_datafiles()

    return
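# The attribute-copy step above is a "fill missing keys only" merge: reference values are applied
# to a target only where the target has no value of its own. A minimal standalone sketch of that
# pattern (plain dicts, no COPO data-access calls; the helper name is illustrative, not part of the API):
def fill_missing_attributes(target_attributes, reference_attributes):
    """Return a copy of target_attributes with any keys it lacks filled from reference_attributes."""
    merged = dict(target_attributes)
    for key, value in reference_attributes.items():
        if key not in merged:
            merged[key] = value
    return merged

# e.g. fill_missing_attributes({'stage_1': 'kept'}, {'stage_1': 'ignored', 'stage_2': 'copied'})
# leaves 'stage_1' untouched and copies 'stage_2' across.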
def submit(self, sub_id, dataFile_ids):
    # physically transfer files
    path2library = os.path.join(BASE_DIR, REPOSITORIES['ASPERA']['resource_path'])

    # change these to be collected properly
    user_name = REPOSITORIES['ASPERA']['user_token']
    password = REPOSITORIES['ASPERA']['password']

    # create transfer record
    transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']
    self.submission = Submission().get_record(sub_id)
    self.profile = Profile().get_record(self.submission['profile_id'])
    remote_path = d_utils.get_ena_remote_path(sub_id)

    # get each file in the bundle
    file_path = []
    for idx, f_id in enumerate(dataFile_ids):
        mongo_file = DataFile().get_record(f_id)
        self.d_files.append(mongo_file)
        file_path.append(mongo_file.get("file_location", str()))

    self._do_aspera_transfer(transfer_token=transfer_token,
                             user_name=user_name,
                             password=password,
                             remote_path=remote_path,
                             file_path=file_path,
                             path2library=path2library,
                             sub_id=sub_id)
def hash_upload(request):
    # utility method to create an md5 hash of a given file path
    # open uploaded file
    file_id = request.GET['file_id']
    print('hash started ' + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)
    file_name = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)

    # now hash opened file
    md5 = hashlib.md5()
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)

    file_obj.hash = md5.hexdigest()
    file_obj.save()

    output_dict = {'output_hash': md5.hexdigest(), 'file_id': file_id}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)

    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_hash")] = file_obj.hash

    profile_id = request.session['profile_id']
    component = "datafile"

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component=component,
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(output_dict)
    print('hash complete ' + file_id)
    return HttpResponse(out, content_type='json')
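# hash_upload reads the upload in 8 KB chunks so arbitrarily large files can be hashed without
# loading them into memory. The core pattern, shown here independent of Django/ChunkedUpload
# (the helper name is illustrative only):
import hashlib

def md5_of_file(path, chunk_size=8192):
    """Compute the md5 hex digest of a file by streaming it in fixed-size chunks."""
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()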
def do_sanitise_submissions(self):
    records = self.da_object.get_all_records()

    for submission in records:
        if "bundle_meta" not in submission:
            bundle_meta = list()
            for file_id in submission.get("bundle", list()):
                datafile = DataFile().get_record(file_id)
                if datafile:
                    upload_status = False

                    if str(submission.get("complete", False)).lower() == 'true':
                        upload_status = True

                    bundle_meta.append(
                        dict(file_id=file_id,
                             file_path=datafile.get("file_location", str()),
                             upload_status=upload_status))

            submission["bundle_meta"] = bundle_meta
            submission['target_id'] = str(submission.pop('_id'))
            self.da_object.save_record(dict(), **submission)

    self.context["sanitise_status"] = True

    return self.context
def hash_upload(request):
    # utility method to create an md5 hash of a given file path
    # open uploaded file
    file_id = request.GET['file_id']
    print('hash started ' + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)
    file_name = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)

    # now hash opened file
    md5 = hashlib.md5()
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)

    file_obj.hash = md5.hexdigest()
    file_obj.save()

    output_dict = {'output_hash': md5.hexdigest(), 'file_id': file_id}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)

    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_hash")] = file_obj.hash

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component="datafile",
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(output_dict)
    print('hash complete ' + file_id)
    return HttpResponse(out, content_type='json')
def do_description_summary(self):
    record = DataFile().get_record(self.param_dict.get("target_id"))
    self.context['description'] = htags.resolve_description_data(record.get("description", dict()), dict())

    description_token = record.get('description_token', str())
    self.context['description']['description_record'] = dict()

    if description_token:
        description_record = Description().GET(description_token)
        if description_record:
            if not description_record["name"]:
                description_record["name"] = "N/A"
            self.context['description']['description_record'] = dict(
                name=description_record["name"],
                id=str(description_record["_id"]))

    return self.context
def zip_file(request):
    # need to get a reference to the file to zip
    file_id = request.GET['file_id']
    print("zip started " + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)

    # get the name of the file to zip and change its suffix to .gz
    output_file_location = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)
    output_file_name = file_obj.filename + '.gz'

    # open the file as a gzip archive...set compression level
    temp_name = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()) + '.tmp')
    with gzip.open(temp_name, 'wb', compresslevel=1) as myzip, open(output_file_location, 'rb') as src:
        # write input file to gzip archive in n byte chunks; read in binary mode so
        # non-UTF-8 uploads are compressed byte-for-byte
        n = 100000000
        for chunk in iter(lambda: src.read(n), b''):
            myzip.write(chunk)

    print('zip complete ' + file_id)

    # now need to delete the old file and update the file record with the new file
    new_file_name = output_file_location + '.gz'
    os.rename(temp_name, new_file_name)
    os.remove(output_file_location)

    # calculate new file size
    stats = os.stat(new_file_name)
    new_file_size = stats.st_size / 1000 / 1000

    # update filename
    file_obj.filename = output_file_name
    file_obj.file.name = new_file_name

    # update file size
    file_obj.offset = stats.st_size
    file_obj.save()

    out = {'zipped': True, 'file_name': output_file_name, 'file_size': new_file_size}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)

    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_size")] = u.filesize_toString(file_obj.offset)
    auto_fields[DataFile().get_qualified_field("name")] = output_file_name
    auto_fields[DataFile().get_qualified_field("file_location")] = new_file_name

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component="datafile",
             auto_fields=auto_fields
             ).do_save_edit()

    out = jsonpickle.encode(out)
    return HttpResponse(out, content_type='text/plain')
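# zip_file streams the source into the gzip archive in large chunks rather than reading it whole.
# The standard library can express the same streaming copy with shutil.copyfileobj; a minimal
# sketch of the equivalent operation (paths and helper name are illustrative):
import gzip
import shutil

def gzip_copy(src_path, dest_path, compresslevel=1):
    """Stream src_path into a gzip archive at dest_path without loading it all into memory."""
    with open(src_path, 'rb') as src, gzip.open(dest_path, 'wb', compresslevel=compresslevel) as dest:
        shutil.copyfileobj(src, dest, length=1024 * 1024)  # copy in 1 MB chunks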
def submit(self, sub_id, dataFile_ids):
    submission_record = Submission().get_record(sub_id)

    # bundle_meta, if present, should provide a better picture of what datafiles need to be uploaded
    if "bundle_meta" in submission_record:
        pending_files = [x["file_id"] for x in submission_record['bundle_meta'] if not x["upload_status"]]
        dataFile_ids = pending_files

    # physically transfer files
    path2library = os.path.join(BASE_DIR, REPOSITORIES['ASPERA']['resource_path'])

    # change these to be collected properly
    user_name = REPOSITORIES['ASPERA']['user_token']
    password = REPOSITORIES['ASPERA']['password']

    # create transfer record
    transfer_token = RemoteDataFile().create_transfer(sub_id)['_id']
    self.submission = Submission().get_record(sub_id)
    self.profile = Profile().get_record(self.submission['profile_id'])
    remote_path = d_utils.get_ena_remote_path(sub_id)

    # get each file in the bundle
    file_path = []
    for idx, f_id in enumerate(dataFile_ids):
        mongo_file = DataFile().get_record(ObjectId(f_id))
        self.d_files.append(mongo_file)
        file_path.append(mongo_file.get("file_location", str()))

    case = self._do_aspera_transfer(transfer_token=transfer_token,
                                    user_name=user_name,
                                    password=password,
                                    remote_path=remote_path,
                                    file_path=file_path,
                                    path2library=path2library,
                                    sub_id=sub_id)
    return case
def resolve_deposition_context(self):
    """
    this returns an inferred deposition destination for a datafile. we assume here that the
    target destination of the file can be inferred based on its type
    :param:
    :return string destination:
    """
    # get file details
    datafile = DataFile().get_record(self.datafile_id)
    ft = datafile.get("file_type", "unknown")

    if ft == '':
        ft = 'unknown'

    deposition_context = 'default'

    # match against documented destinations
    for k, v in lkup.REPO_FILE_EXTENSIONS.items():
        if ft in v:
            deposition_context = k
            break

    return deposition_context
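# resolve_deposition_context assumes lkup.REPO_FILE_EXTENSIONS maps a repository key to the file
# types it accepts; the first repository whose list contains the datafile's type wins, otherwise
# 'default' is returned. A minimal illustration of that lookup (the mapping and function below are
# made up for the example, not the real COPO configuration):
REPO_FILE_EXTENSIONS_EXAMPLE = {
    'ena': ['fastq', 'bam', 'cram'],
    'figshare': ['csv', 'pdf'],
}

def resolve_context_example(file_type, extensions_map=REPO_FILE_EXTENSIONS_EXAMPLE):
    for repo, accepted_types in extensions_map.items():
        if file_type in accepted_types:
            return repo
    return 'default'

# resolve_context_example('bam') -> 'ena'; resolve_context_example('tiff') -> 'default'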
def extract_repo_fields(self, datafile_id=str(), repo=str()):
    """
    given a datafile id, and repository type function returns a list of dictionaries of fields matching the repo
    :param datafile_id:
    :param repo:
    :return:
    """
    from dal.copo_da import DataFile, CGCore
    from dal.copo_base_da import DataSchemas

    if not repo:  # no repository to filter by
        return list()

    repo_type_option = lkup.DROP_DOWNS["REPO_TYPE_OPTIONS"]
    repo_type_option = [x for x in repo_type_option if x["value"].lower() == repo.lower()]

    if not repo_type_option:
        return list()

    repo_type_option = repo_type_option[0]

    cg_schema = DataSchemas("COPO").get_ui_template_node('cgCore')

    # filter schema items by repo
    cg_schema = [x for x in cg_schema if
                 x.get("target_repo", str()).strip() != str() and
                 repo_type_option.get("abbreviation", str()) in
                 [y.strip() for y in x.get("target_repo").split(',')]]

    record = DataFile().get_record(datafile_id)
    description = record.get("description", dict())

    attributes = description.get("attributes", dict())
    stages = description.get("stages", list())

    schema_df = pd.DataFrame(cg_schema)
    schema_df.id = schema_df.id.str.lower().str.split(".").str[-1]
    schema_df.index = schema_df.id
    schema_df = schema_df[['ref', 'id', 'prefix']]
    schema_df = schema_df[~schema_df['ref'].isna()]

    # get all stage items
    all_items = [item for st in stages for item in st.get("items", list())]

    # filter stage items - stage items should conform to specifications of the repo
    schema_ids = list(schema_df.id)
    items = {item.get("id", str()).lower().split(".")[-1]: st.get("ref", "").lower()
             for st in stages for item in st.get("items", list())
             if item.get("id", str()).lower().split(".")[-1] in schema_ids}

    # ...also, account for any filtering performed by client agents (e.g., dependencies in COPO Wizard),
    # within the context of the target repo
    schema_df = schema_df[schema_df.index.isin(items.keys())]

    # obtain attributes for filtered stage items
    target_stages = list(set(items.values()))
    datafile_attributes = [v for k, v in attributes.items() if k in target_stages]

    new_dict = dict()
    for d in datafile_attributes:
        new_dict.update(d)

    new_dict_series = pd.Series(new_dict)
    new_dict_series.index = new_dict_series.index.str.lower()
    schema_df['vals'] = new_dict_series
    schema_df['vals'] = schema_df['vals'].fillna('')

    schema_df = schema_df[['ref', 'id', 'vals', 'prefix']]

    # get composite attributes
    composite_attrib = [x for x in all_items if x["id"] in list(schema_df.id) and x.get("create_new_item", False)]

    # expand composite attributes
    for cattrib in composite_attrib:
        comp_series = schema_df.loc[cattrib["id"]]
        schema_df = schema_df[~schema_df.id.isin([cattrib["id"]])]
        children_schemas = [x for x in cg_schema if
                            x.get("dependency", str()).lower() == comp_series.ref.lower()]

        accessions = comp_series.vals
        if isinstance(accessions, str):
            accessions = accessions.split(",")

        object_ids = [ObjectId(x) for x in accessions if x.strip()]

        records = list()
        if len(object_ids):
            records = cursor_to_list(CGCore().get_collection_handle().find({"_id": {"$in": object_ids}}))

        attr_list = list()
        for child in children_schemas:
            child_dict = dict(ref=child["ref"], id=child["id"].split(".")[-1], prefix=child["prefix"], vals=[])
            attr_list.append(child_dict)
            for rec in records:
                child_dict["vals"].append(rec.get(child_dict["id"], str()))

        if attr_list:
            attr_df = pd.DataFrame(attr_list)
            attr_df.index = attr_df.id
            schema_df = pd.concat([schema_df, attr_df])

    schema_df.rename(index=str, columns={"ref": "dc", "id": "copo_id"}, inplace=True)

    dc_list = schema_df.to_dict('records')

    return dc_list
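# extract_repo_fields leans on pandas index alignment: assigning a Series to a DataFrame column
# matches values by index label, and rows with no matching label get NaN (hence the fillna('')).
# A small self-contained illustration of that behaviour (toy data only):
import pandas as pd

schema_example = pd.DataFrame({'ref': ['dc.title', 'dc.creator'], 'id': ['title', 'creator']})
schema_example.index = schema_example.id

values_example = pd.Series({'title': 'My dataset'})   # no value supplied for 'creator'

schema_example['vals'] = values_example               # aligned on the 'title'/'creator' index labels
schema_example['vals'] = schema_example['vals'].fillna('')
# -> the 'title' row gets 'My dataset', the 'creator' row gets ''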
def zip_file(request):
    # need to get a reference to the file to zip
    file_id = request.GET['file_id']
    print("zip started " + file_id)
    file_obj = ChunkedUpload.objects.get(pk=file_id)

    # get the name of the file to zip and change its suffix to .gz
    output_file_location = os.path.join(settings.MEDIA_ROOT, file_obj.file.name)
    output_file_name = file_obj.filename + '.gz'

    # open the file as a gzip archive...set compression level
    temp_name = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()) + '.tmp')
    with gzip.open(temp_name, 'wb', compresslevel=1) as myzip, open(output_file_location, 'rb') as src:
        # write input file to gzip archive in n byte chunks; read in binary mode so
        # non-UTF-8 uploads are compressed byte-for-byte
        n = 100000000
        for chunk in iter(lambda: src.read(n), b''):
            myzip.write(chunk)

    print('zip complete ' + file_id)

    # now need to delete the old file and update the file record with the new file
    new_file_name = output_file_location + '.gz'
    os.rename(temp_name, new_file_name)
    os.remove(output_file_location)

    # calculate new file size
    stats = os.stat(new_file_name)
    new_file_size = stats.st_size / 1000 / 1000

    # update filename
    file_obj.filename = output_file_name
    file_obj.file.name = new_file_name

    # update file size
    file_obj.offset = stats.st_size
    file_obj.save()

    out = {'zipped': True, 'file_name': output_file_name, 'file_size': new_file_size}

    # update record in mongo
    record_object = DataFile().get_by_file_id(file_id)

    auto_fields = dict()
    auto_fields[DataFile().get_qualified_field("file_size")] = u.filesize_toString(file_obj.offset)
    auto_fields[DataFile().get_qualified_field("name")] = output_file_name
    auto_fields[DataFile().get_qualified_field("file_location")] = new_file_name

    profile_id = request.session['profile_id']
    component = "datafile"

    BrokerDA(target_id=str(record_object.get("_id", str())),
             component=component,
             auto_fields=auto_fields).do_save_edit()

    out = jsonpickle.encode(out)
    return HttpResponse(out, content_type='json')