def refactor_ena_study_samples(self, ena_collection_id, study_id):
    """Assemble the refactored sample entries for every sample of a study.

    For each sample returned by ``get_study_samples`` whose details can be
    fetched with ``get_ena_sample``, a list of entry dicts is built from the
    "studySamples" section of the ENA UI template.

    :param ena_collection_id: identifier passed through to the sample lookups
    :param study_id: identifier of the study whose samples are processed
    :return: a list holding one list of entry dicts per resolved sample
    """
    normalised_ranked_list = self.get_normalised_ranked_list(
        [],
        DataSchemas("ENA").get_ui_template()["studies"]["study"]["studySamples"],
    )

    study_samples = []

    for sample in self.get_study_samples(ena_collection_id, study_id) or []:
        sample_details = self.get_ena_sample(ena_collection_id, sample["id"])
        if not sample_details:
            continue

        study_sample = []

        # the ranked list is rebuilt for every sample because its entries are
        # stripped of their helper keys in place below
        for entry in self.get_modified_ranked_list(normalised_ranked_list):
            # pull the target value for this entry out of the sample, if present
            ref = entry["ref"]
            if ref in sample_details:
                entry["value"] = sample_details[ref]

            # the structure of the first item is taken to stand for all items
            items = entry["items"]
            structure = items[0]["structure"].replace(" ", "").lower() if items else ""

            # helper keys are no longer needed once the value has been resolved
            del entry["ref"]
            del entry["items"]
            study_sample.append(entry)

            # a second entry carries the structured items, when the sample has them
            if structure and structure in sample_details:
                items_entry = d_utils.get_isajson_refactor_type(structure)
                items_entry["items"] = sample_details[structure]
                study_sample.append(items_entry)

        study_samples.append(study_sample)

    return study_samples
def get_modified_ranked_list(self, ranked_list):
    """Group ranked elements into base nodes with their structured items attached.

    Every element of ``ranked_list`` is a dict with an "id" (dotted path) and a
    "ref"; elements that are neither structured nor comments are also read for
    "control". Elements whose ref label is one of the structured labels
    ("characteristics", "factor value", "parameter value") do not become nodes
    of their own: they are attached, as ``{"id", "structure", "term"}`` dicts,
    to the "items" list of the base node that precedes them.

    An element whose parent segment differs from the document context (the
    last dotted segment shared by the start of every id) additionally
    contributes a "protocol" node, added once per distinct parent.

    :param ranked_list: ordered list of element dicts as described above
    :return: list of node dicts, each carrying "ref" and "items" keys; an
        empty list when ``ranked_list`` is empty
    """
    if not ranked_list:
        return []

    # Document context: compare the ids segment by segment and keep the shared
    # leading path. The previous character-level difflib comparison cut the
    # diff at the first "-", so "+ x" records emitted before it (or a diff
    # with no "-" at all) leaked into the context and no element base could
    # ever match it.
    shared_segments = ranked_list[0]["id"].split(".")
    for elem_dict in ranked_list[1:]:
        matched = 0
        for ours, theirs in zip(shared_segments, elem_dict["id"].split(".")):
            if ours != theirs:
                break
            matched += 1
        shared_segments = shared_segments[:matched]
    context = shared_segments[-1] if shared_segments else ""

    structured_labels = ("characteristics", "factor value", "parameter value")
    base_nodes_gap = []  # index in ranked_list at which each base node starts
    modified_ranked_list = []

    for indx, elem_dict in enumerate(ranked_list):
        element_id = elem_dict["id"]
        ref = elem_dict["ref"]
        label, _, term = ref.partition("[")
        label = label.lower()

        # grab key elements: i.e., non-structured nodes
        element_base = element_id.rsplit(".", 2)[1]
        leaf = element_id.rsplit(".", 1)[1]

        if element_base != context:
            # possible target for a protocol node; add it if not already added
            protocol = d_utils.get_isajson_refactor_type("protocol")
            # camelCase -> "camel case"; raw strings keep "\g" a regex group
            # reference instead of an invalid string escape
            protocol["value"] = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", element_base).lower()
            protocol["ref"] = element_base
            protocol["items"] = []
            if protocol not in modified_ranked_list:
                base_nodes_gap.append(indx)
                modified_ranked_list.append(protocol)

        if label in structured_labels:
            # structured elements are attached to their parent node below
            continue

        if label == "comment":
            # handles for comment elements
            entry_dict = d_utils.get_isajson_refactor_type("comment")
            entry_dict["commentTerm"] = term.strip("]")
        elif elem_dict["control"].lower() == "file":
            # handles for files
            entry_dict = d_utils.get_isajson_refactor_type("file")
            entry_dict["name"] = ref
        else:
            # handles for very specific elements, falling back to the generic type
            entry_dict = d_utils.get_isajson_refactor_type(leaf.lower())
            if not entry_dict:
                entry_dict = d_utils.get_isajson_refactor_type("generic")
            entry_dict["name"] = ref

        entry_dict["ref"] = leaf
        entry_dict["items"] = []
        if entry_dict not in modified_ranked_list:
            base_nodes_gap.append(indx)
            modified_ranked_list.append(entry_dict)

    # closing boundary so the last node's interval reaches the end of the list
    base_nodes_gap.append(len(ranked_list))

    # now attach structured nodes as items to their respective "parents";
    # node i owns the elements in [base_nodes_gap[i], base_nodes_gap[i + 1])
    for indx, node in enumerate(modified_ranked_list):
        for gap in range(base_nodes_gap[indx], base_nodes_gap[indx + 1]):
            candidate = ranked_list[gap]
            structure, _, term = candidate["ref"].partition("[")
            if structure.lower() in structured_labels:
                node["items"].append(
                    {"id": candidate["id"],
                     "structure": structure,
                     "term": term.strip("]")}
                )

    return modified_ranked_list
def refactor_ena_study_assays(self, ena_collection_id, study_id):
    """Build the assay structure of a study, one table row per file/sample pair.

    The assay template comes from ``d_utils.get_db_template("ENA")``; the
    layout of each row comes from the "assaysTable" section of the ENA UI
    template selected by the study's "studyType".

    :param ena_collection_id: identifier passed through to the study, datafile
        and sample lookups
    :param study_id: identifier of the study whose assays are built
    :return: the assay template dict with its "assaysTable" key set to the
        list of rows
    """
    study_assay = d_utils.get_db_template("ENA")['studies'][0]['study']['assays'][0]

    # the study type decides which assaysTable layout is used
    study_type = self.get_ena_study(study_id, ena_collection_id)["studyCOPOMetadata"]["studyType"]

    normalised_ranked_list = self.get_normalised_ranked_list(
        [],
        DataSchemas("ENA").get_ui_template()["studies"]["study"]["assays"]["assaysTable"][study_type],
    )

    assays_table = []

    for datafile in self.get_study_datafiles(ena_collection_id, study_id) or []:
        # every sample attached to a file gets its own row in the assays table
        for sample_id in datafile["samples"] or []:
            record = self.get_ena_sample(ena_collection_id, sample_id)
            if not record:
                continue

            # the fetched sample record itself is extended with the data file
            # name and the attributes captured for the file
            record["rawDataFile"] = ChunkedUpload.objects.get(id=int(datafile["fileId"])).file.name
            record["attributes"] = datafile["attributes"]

            assay = []

            for node in self.get_modified_ranked_list(normalised_ranked_list):
                ref = node["ref"]
                if ref in record:
                    node["value"] = record[ref]

                items = node["items"]

                # remove redundant fields
                del node["ref"]
                del node["items"]
                assay.append(node)

                if not items:
                    continue

                # spin off a second entry for the structured items; the structure
                # of the first item is taken to stand for all of them
                structure = items[0]["structure"].replace(" ", "").lower()
                items_node = d_utils.get_isajson_refactor_type(structure)

                # delete the blank entry in items
                del items_node["items"][0]

                for item in items:
                    # only the first attribute answering this item is used
                    attribute = next(
                        (candidate for candidate in record["attributes"]
                         if candidate["question"] == item["id"]),
                        None,
                    )
                    if attribute is None:
                        continue

                    # get the template, and sort out this entry
                    items_entry = d_utils.get_isajson_refactor_type(structure)["items"][0]
                    if "parameter" in structure:
                        answer = attribute["answer"]
                        items_entry["parameterTerm"] = item["term"]
                        items_entry["parameterValue"] = answer["value"]
                        items_entry["termAccessionNumber"] = answer["termAccessionNumber"]
                        items_entry["termSourceREF"] = answer["termSourceREF"]
                    # "characteristics" and "factor" structures are left as the
                    # untouched template entry

                    items_node["items"].append(items_entry)

                assay.append(items_node)

            assays_table.append(assay)

    study_assay["assaysTable"] = assays_table
    return study_assay