def get_lookup_type(self):
    projection = dict(label=1, accession=1, description=1)
    filter_by = dict(type=self.data_source)
    records = list()

    if self.accession or self.search_term:
        if self.accession:
            bn = list()
            if isinstance(self.accession, str):
                bn.append(self.accession)
            else:
                bn.extend(self.accession)
            filter_by["accession"] = {'$in': bn}
        elif self.search_term:
            filter_by["label"] = {'$regex': self.search_term, "$options": 'i'}

        records = cursor_to_list(Lookups.find(filter_by, projection))

        if not records and self.search_term:
            del filter_by["label"]
            records = cursor_to_list(Lookups.find(filter_by, projection))

    return records
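# Note: {'$regex': <term>, '$options': 'i'} is a case-insensitive substring
# match on 'label'. When it matches nothing, the label constraint is dropped
# and all lookup entries of the requested type are returned as a fallback.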
def get_all_records(self, sort_by='_id', sort_direction=-1, **kwargs):
    doc = dict(deleted=data_utils.get_not_deleted_flag())
    if self.profile_id:
        doc["profile_id"] = self.profile_id

    return cursor_to_list(self.get_collection_handle().find(doc).sort(
        [[sort_by, sort_direction]]))
def get_all_records_columns_server(self, sort_by='_id', sort_direction=-1,
                                   projection=None, filter_by=None,
                                   search_term=str(), limit=0, skip=0):
    # avoid mutable default arguments: filter_by is mutated below, so a shared
    # default dict would leak state between calls
    projection = projection if projection is not None else dict()
    filter_by = filter_by if filter_by is not None else dict()

    filter_by["deleted"] = data_utils.get_not_deleted_flag()

    # 'name' seems to be the only reasonable field to restrict searching on;
    # other fields are resolved
    filter_by["name"] = {'$regex': search_term, "$options": 'i'}

    if self.profile_id:
        filter_by["profile_id"] = self.profile_id

    if skip > 0:
        records = self.get_collection_handle().find(filter_by, projection).sort(
            [[sort_by, sort_direction]]).skip(skip).limit(limit)
    else:
        records = self.get_collection_handle().find(filter_by, projection).sort(
            [[sort_by, sort_direction]]).limit(limit)

    return cursor_to_list(records)
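# How skip/limit map onto page-based access (standalone arithmetic; the
# 1-based page numbering is an assumption about the caller):
page, page_size = 3, 10
skip = page_size * (page - 1)  # 20 -> rows 21 to 30 of the sorted result
limit = page_size
# A hypothetical call site (class and field names assumed, not confirmed):
# rows = DataFile(profile_id=profile_id).get_all_records_columns_server(
#     sort_by='date_created', sort_direction=-1,
#     projection={'name': 1, 'date_created': 1},
#     search_term='fastq', limit=limit, skip=skip)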
def get_all_records_columns(self, sort_by='_id', sort_direction=-1,
                            projection=dict(), filter_by=dict()):
    return cursor_to_list(
        self.DescriptionCollection.find(filter_by, projection).sort(
            [[sort_by, sort_direction]]))
def get_repos_for_group_info(self, uid, group_id):
    g = CopoGroup().get_record(ObjectId(group_id))
    docs = cursor_to_list(Repository().Repository.find({'users.uid': uid}))

    # flag repositories already attached to the group
    for d in docs:
        d['selected'] = d['_id'] in g['repo_ids']

    return list(docs)
def get_profiles_for_group_info(self, group_id):
    p_list = cursor_to_list(Profile().get_for_user(data_utils.get_user_id()))
    group = CopoGroup().get_record(ObjectId(group_id))

    # flag profiles already shared with the group
    for p in p_list:
        p['selected'] = p['_id'] in group['shared_profile_ids']

    return p_list
def get_all_records_columns(self, sort_by='_id', sort_direction=-1,
                            projection=None, filter_by=None):
    # avoid mutable default arguments: filter_by is mutated below
    projection = projection if projection is not None else dict()
    filter_by = filter_by if filter_by is not None else dict()

    filter_by["deleted"] = data_utils.get_not_deleted_flag()
    if self.profile_id:
        filter_by["profile_id"] = self.profile_id

    return cursor_to_list(self.get_collection_handle().find(
        filter_by, projection).sort([[sort_by, sort_direction]]))
def set_targets_datafiles(self):
    targets_datafiles = dict()
    object_list = [ObjectId(target["recordID"]) for target in self.description_targets]

    datafiles = cursor_to_list(DataFile().get_collection_handle().find(
        {"_id": {"$in": object_list}}))

    # index each datafile record by its string id for quick lookup
    for df in datafiles:
        targets_datafiles[str(df["_id"])] = df

    return targets_datafiles
def get_terms_for_user_by_dataset(self, uid):
    docs = self.get_collection_handle().aggregate([
        {"$match": {"uid": uid}},
        {"$group": {"_id": "$file_id", "annotations": {"$push": "$$ROOT"}}}
    ])

    return cursor_to_list(docs)
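# Illustrative shape of each aggregated document (field values invented):
# {"_id": <file_id>, "annotations": [<annotation doc>, <annotation doc>, ...]}
# i.e. one group per file_id, with every matching record pushed in via $$ROOT.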
def get_elapsed_time_dataframe(self):
    # compute, per description record, the days elapsed since creation;
    # $subtract yields milliseconds, hence the division by 1000 * 60 * 60 * 24
    pipeline = [{
        "$project": {
            "_id": 1,
            "diff_days": {
                "$divide": [
                    {"$subtract": [data_utils.get_datetime(), "$created_on"]},
                    1000 * 60 * 60 * 24
                ]
            }
        }
    }]

    description_df = pd.DataFrame(
        cursor_to_list(self.DescriptionCollection.aggregate(pipeline)))

    return description_df
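# A standalone sketch of the same day-difference computation the
# $subtract/$divide stages perform; the dates are invented for illustration.
from datetime import datetime

created_on = datetime(2020, 1, 1)
now = datetime(2020, 1, 15, 12, 0)

elapsed_ms = (now - created_on).total_seconds() * 1000  # $subtract yields milliseconds
diff_days = elapsed_ms / (1000 * 60 * 60 * 24)  # 14.5 days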
def get_file_level_metadata_for_sheet(self, file_id, sheetname):
    docs = self.get_collection_handle().aggregate([
        {"$match": {"_id": ObjectId(file_id)}},
        {"$unwind": "$file_level_annotation"},
        {"$match": {"file_level_annotation.sheet_name": sheetname}},
        {"$project": {"file_level_annotation": 1, "_id": 0}},
        {"$sort": {"file_level_annotation.column_idx": 1}}
    ])

    return cursor_to_list(docs)
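# Pipeline behaviour, step by step: the first $match narrows to the one file
# document, $unwind emits one document per element of the file_level_annotation
# array, the second $match keeps only the requested sheet, and $sort orders the
# surviving documents by column index.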
def get_all_samples(self):
    doc = {'profile_id': self.profile_id}
    return cursor_to_list(self.SampleCollection.find(doc))
def get_all_people(self):
    doc = {'profile_id': self.profile_id, 'deleted': "0"}
    return cursor_to_list(self.PersonCollection.find(doc))
def get_sra_samples(self, submission_location=str()):
    """
    Retrieves study samples and presents them in a format suitable for
    building an SRA sample set.
    :param submission_location:
    :return:
    """
    sra_samples = list()

    # get datafiles
    datafiles = cursor_to_list(ghlper.get_datafiles_handle().find(
        {
            "description_token": self.description_token,
            'deleted': data_utils.get_not_deleted_flag()
        },
        {
            '_id': 1,
            'file_location': 1,
            "description.attributes": 1,
            "name": 1,
            "file_hash": 1
        }))

    if not len(datafiles):
        self.__converter_errors.append("No datafiles found in submission!")
        return sra_samples

    df = pd.DataFrame(datafiles)
    df['file_id'] = df._id.astype(str)
    df['file_path'] = df['file_location'].fillna('')
    df['upload_status'] = False

    df = df[['file_id', 'file_path', 'upload_status']]
    bundle = list(df.file_id)
    bundle_meta = df.to_dict('records')

    submission_record = dict(bundle=bundle, bundle_meta=bundle_meta)
    ghlper.get_submission_handle().update(
        {"_id": ObjectId(self.submission_id)}, {'$set': submission_record})

    samples_id = list()
    df_attributes = []  # datafiles attributes

    for datafile in datafiles:
        datafile_attributes = [
            v for k, v in datafile.get("description", dict()).get(
                "attributes", dict()).items()
        ]
        new_dict = dict()
        for d in datafile_attributes:
            new_dict.update(d)

        new_dict['datafile_id'] = str(datafile['_id'])
        new_dict['datafile_name'] = datafile.get('name', str())
        new_dict['datafile_hash'] = datafile.get('file_hash', str())
        new_dict['datafile_location'] = datafile.get('file_location', str())

        df_attributes.append(new_dict)

    # process datafiles attributes
    df_attributes_df = pd.DataFrame(df_attributes)
    df_columns = df_attributes_df.columns

    # replace null values
    for k in df_columns:
        df_attributes_df[k].fillna('', inplace=True)

    if 'study_samples' in df_columns:
        df_attributes_df['study_samples'] = df_attributes_df['study_samples'].apply(
            lambda x: x[0] if isinstance(x, list) else x.split(",")[-1])
        samples_id = list(df_attributes_df['study_samples'].unique())
        samples_id = [x for x in samples_id if x]

    if not samples_id:
        self.__converter_errors.append("No samples associated with datafiles!")
        return sra_samples

    file_path = os.path.join(submission_location, "datafiles.csv")
    df_attributes_df.to_csv(path_or_buf=file_path, index=False)

    samples_id_object_list = [ObjectId(sample_id) for sample_id in samples_id]
    sample_records = ghlper.get_samples_handle().find(
        {"_id": {"$in": samples_id_object_list}})

    # get sources
    sources = ghlper.get_sources_handle().find({
        "profile_id": self.profile_id,
        'deleted': data_utils.get_not_deleted_flag()
    })

    sra_sources = dict()

    for source in sources:
        sra_source = dict()
        sra_sources[str(source["_id"])] = sra_source

        sra_source["name"] = source["name"]
        sra_source["taxon_id"] = source.get("organism", dict()).get(
            'termAccession', str())
        if 'NCBITaxon_' in sra_source["taxon_id"]:
            sra_source["taxon_id"] = sra_source["taxon_id"].split('NCBITaxon_')[-1]

        sra_source["scientific_name"] = source.get("organism", dict()).get(
            'annotationValue', str())
        sra_source['attributes'] = self.get_attributes(
            source.get("characteristics", list()))
        sra_source['attributes'] = sra_source['attributes'] + self.get_attributes(
            source.get("factorValues", list()))

    for sample in sample_records:
        sra_sample = dict()
        sra_sample['sample_id'] = str(sample['_id'])
        sra_sample['name'] = sample['name']
        sra_sample['attributes'] = self.get_attributes(
            sample.get("characteristics", list()))
        sra_sample['attributes'] = sra_sample['attributes'] + self.get_attributes(
            sample.get("factorValues", list()))

        # retrieve sample source
        source_id = sample.get("derivesFrom", list())
        source_id = source_id[0] if source_id else ''

        sample_source = sra_sources.get(source_id, dict())

        if sample_source:
            sra_sample['attributes'].append(
                dict(tag="Source Name", value=sample_source.get("name", str())))
        else:
            self.__converter_errors.append(
                "Sample: " + sample['name'] + " has no source information")

        if sample_source.get("taxon_id", str()):
            sra_sample['taxon_id'] = sample_source.get("taxon_id", str())
        else:
            self.__converter_errors.append(
                "Sample: " + sample['name'] +
                " has no TAXON_ID. Please make sure an organism has "
                "been set for the source of this sample from the NCBITAXON ontology.")

        if sample_source.get("scientific_name", str()):
            sra_sample['scientific_name'] = sample_source.get("scientific_name", str())
        else:
            self.__converter_errors.append(
                "Sample: " + sample['name'] +
                " has no SCIENTIFIC_NAME. Please make sure an organism has "
                "been set for the source of this sample from an ontology.")

        if sample_source.get("attributes", list()):
            sra_sample['attributes'] = sra_sample['attributes'] + \
                sample_source.get("attributes", list())

        sra_samples.append(sra_sample)

    return sra_samples
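# Illustrative shape of one entry in the returned sra_samples list
# (values invented):
# {
#     'sample_id': '5f2b...', 'name': 'sample_1',
#     'taxon_id': '9606', 'scientific_name': 'Homo sapiens',
#     'attributes': [{'tag': 'Source Name', 'value': 'source_1'}, ...]
# }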
def get_by_ids(self, uids):
    doc = list()
    if uids:
        oids = [ObjectId(x) for x in uids]
        doc = self.get_collection_handle().find({"_id": {"$in": oids}})

    return cursor_to_list(doc)
def get_allsamples(self):
    """
    lookup for all samples irrespective of sample type
    :return:
    """
    import web.apps.web_copo.templatetags.html_tags as htags

    df = pd.DataFrame()

    if self.accession:
        if isinstance(self.accession, str):
            self.accession = self.accession.split(",")

        object_ids = [ObjectId(x) for x in self.accession if x.strip()]
        records = cursor_to_list(Sample().get_collection_handle().find(
            {"_id": {"$in": object_ids}}))

        if records:
            df = pd.DataFrame(records)
            df['accession'] = df._id.astype(str)
            df['label'] = df['name']
            df['desc'] = df['accession'].apply(
                lambda x: htags.generate_attributes("sample", x))
            df['description'] = df['desc'].apply(
                lambda x: self.format_description(x))
            df['server-side'] = True  # ...to request callback to server for resolving item description
    elif self.search_term:
        projection = dict(name=1)
        filter_by = dict()
        filter_by["name"] = {'$regex': self.search_term, "$options": 'i'}
        sort_by = 'name'
        sort_direction = -1

        records = Sample(profile_id=self.profile_id).get_all_records_columns(
            filter_by=filter_by,
            projection=projection,
            sort_by=sort_by,
            sort_direction=sort_direction)

        if not records:  # try getting all records
            del filter_by['name']
            records = Sample(profile_id=self.profile_id).get_all_records_columns(
                filter_by=filter_by,
                projection=projection,
                sort_by=sort_by,
                sort_direction=sort_direction)

        if records:
            df = pd.DataFrame(records)
            df['accession'] = df._id.astype(str)
            df['label'] = df['name']
            df['description'] = ''
            df['server-side'] = True  # ...to request callback to server for resolving item description

    result = list()

    if not df.empty:
        df = df[['accession', 'label', 'description', 'server-side']]
        result = df.to_dict('records')

    return result
def cg_dependency_lookup(self):
    """
    lookup for cgcore dependent components
    :return:
    """
    import web.apps.web_copo.templatetags.html_tags as htags

    result = list()
    df = pd.DataFrame()
    dependent_record_label = 'copo_name'

    if self.accession:
        if isinstance(self.accession, str):
            self.accession = self.accession.split(",")

        object_ids = [ObjectId(x) for x in self.accession if x.strip()]
        records = cursor_to_list(CGCore().get_collection_handle().find(
            {"_id": {"$in": object_ids}}))

        if records:
            for record in records:
                referenced_field = record.get("dependency_id", str())
                kwargs = dict()
                kwargs["referenced_field"] = referenced_field
                schema = CGCore().get_component_schema(**kwargs)

                label = record.get(dependent_record_label, str())

                # modify schema before generating description
                schema = [
                    x for x in schema
                    if 'dependency' in x and x['dependency'] == referenced_field
                    and x.get("show_in_table", True)
                ]
                resolved = htags.resolve_display_data(schema, record)
                description = self.format_description(resolved)

                item_dict = dict(accession=str(record["_id"]),
                                 label=label,
                                 description=description)
                item_dict['server-side'] = True  # ...to request callback to server for resolving item description

                result.append(item_dict)
    elif self.search_term:
        referenced_field = self.referenced_field
        filter_name = dependent_record_label
        projection = {filter_name: 1}
        filter_by = dict(dependency_id=referenced_field)
        filter_by[filter_name] = {'$regex': self.search_term, "$options": 'i'}
        sort_by = filter_name
        sort_direction = -1

        records = CGCore(profile_id=self.profile_id).get_all_records_columns(
            filter_by=filter_by,
            projection=projection,
            sort_by=sort_by,
            sort_direction=sort_direction)

        if not records:  # try getting all records
            del filter_by[filter_name]
            records = CGCore(profile_id=self.profile_id).get_all_records_columns(
                filter_by=filter_by,
                projection=projection,
                sort_by=sort_by,
                sort_direction=sort_direction)

        if records:
            df = pd.DataFrame(records)
            df['accession'] = df._id.astype(str)
            df['label'] = df[filter_name]
            df['description'] = ''
            df['server-side'] = True  # ...to request callback to server for resolving item description

        if not df.empty:
            df = df[['accession', 'label', 'description', 'server-side']]
            result = df.to_dict('records')

    return result
def get_all_records(self):
    doc = {
        'profile_id': self.profile_id,
        'deleted': data_utils.get_not_deleted_flag()
    }
    return cursor_to_list(self.RemoteFileCollection.find(doc))
def view_groups(request):
    profile_list = cursor_to_list(Profile().get_for_user())
    group_list = cursor_to_list(CopoGroup().get_by_owner(request.user.id))

    return render(request, 'copo/copo_group.html', {
        'request': request,
        'profile_list': profile_list,
        'group_list': group_list
    })
def get_by_datafile(self, datafile_id):
    doc = {
        'datafile_id': ObjectId(datafile_id),
        'deleted': data_utils.get_not_deleted_flag()
    }
    return cursor_to_list(self.RemoteFileCollection.find(doc))
def execute_query(self, query_dict=None):
    # avoid a mutable default argument: query_dict may be mutated below
    query_dict = query_dict if query_dict is not None else dict()

    if self.profile_id:
        query_dict["profile_id"] = self.profile_id

    return cursor_to_list(self.get_collection_handle().find(query_dict))
def extract_repo_fields(self, datafile_id=str(), repo=str()):
    """
    Given a datafile id and a repository type, returns a list of dictionaries
    of fields matching the repo.
    :param datafile_id:
    :param repo:
    :return:
    """
    from dal.copo_da import DataFile, CGCore
    from dal.copo_base_da import DataSchemas

    if not repo:  # no repository to filter by
        return list()

    repo_type_option = lkup.DROP_DOWNS["REPO_TYPE_OPTIONS"]
    repo_type_option = [
        x for x in repo_type_option if x["value"].lower() == repo.lower()
    ]

    if not repo_type_option:
        return list()

    repo_type_option = repo_type_option[0]

    cg_schema = DataSchemas("COPO").get_ui_template_node('cgCore')

    # filter schema items by repo
    cg_schema = [
        x for x in cg_schema
        if x.get("target_repo", str()).strip() != str()
        and repo_type_option.get("abbreviation", str()) in
        [y.strip() for y in x.get("target_repo").split(',')]
    ]

    record = DataFile().get_record(datafile_id)
    description = record.get("description", dict())

    attributes = description.get("attributes", dict())
    stages = description.get("stages", list())

    schema_df = pd.DataFrame(cg_schema)
    schema_df.id = schema_df.id.str.lower().str.split(".").str[-1]
    schema_df.index = schema_df.id
    schema_df = schema_df[['ref', 'id', 'prefix']]
    schema_df = schema_df[~schema_df['ref'].isna()]

    # get all stage items
    all_items = [item for st in stages for item in st.get("items", list())]

    # filter stage items - stage items should conform to specifications of the repo
    schema_ids = list(schema_df.id)
    items = {
        item.get("id", str()).lower().split(".")[-1]: st.get("ref", "").lower()
        for st in stages for item in st.get("items", list())
        if item.get("id", str()).lower().split(".")[-1] in schema_ids
    }

    # ...also, account for any filtering performed by client agents (e.g.,
    # dependencies in COPO Wizard), within the context of the target repo
    schema_df = schema_df[schema_df.index.isin(items.keys())]

    # obtain attributes for filtered stage items
    target_stages = list(set(items.values()))
    datafile_attributes = [v for k, v in attributes.items() if k in target_stages]

    new_dict = dict()
    for d in datafile_attributes:
        new_dict.update(d)

    new_dict_series = pd.Series(new_dict)
    new_dict_series.index = new_dict_series.index.str.lower()
    schema_df['vals'] = new_dict_series
    schema_df['vals'] = schema_df['vals'].fillna('')

    schema_df = schema_df[['ref', 'id', 'vals', 'prefix']]

    # get composite attributes
    composite_attrib = [
        x for x in all_items
        if x["id"] in list(schema_df.id) and x.get("create_new_item", False)
    ]

    # expand composite attributes
    for cattrib in composite_attrib:
        comp_series = schema_df.loc[cattrib["id"]]
        schema_df = schema_df[~schema_df.id.isin([cattrib["id"]])]
        children_schemas = [
            x for x in cg_schema
            if x.get("dependency", str()).lower() == comp_series.ref.lower()
        ]

        accessions = comp_series.vals
        if isinstance(accessions, str):
            accessions = accessions.split(",")

        object_ids = [ObjectId(x) for x in accessions if x.strip()]

        records = list()
        if len(object_ids):
            records = cursor_to_list(CGCore().get_collection_handle().find(
                {"_id": {"$in": object_ids}}))

        attr_list = list()
        for child in children_schemas:
            child_dict = dict(ref=child["ref"],
                              id=child["id"].split(".")[-1],
                              prefix=child["prefix"],
                              vals=[])
            attr_list.append(child_dict)

            for rec in records:
                child_dict["vals"].append(rec.get(child_dict["id"], str()))

        if attr_list:
            attr_df = pd.DataFrame(attr_list)
            attr_df.index = attr_df.id
            schema_df = pd.concat([schema_df, attr_df])

    schema_df.rename(index=str, columns={"ref": "dc", "id": "copo_id"}, inplace=True)

    dc_list = schema_df.to_dict('records')

    return dc_list
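# Illustrative shape of an entry in the returned dc_list; the 'dc' value shown
# is an assumption about what the schema's 'ref' field holds:
# {'dc': 'dcterms:title', 'copo_id': 'title', 'vals': 'my dataset title', 'prefix': 'dc'}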
def get_all_datafiles(self):
    doc = {'profile_id': self.profile_id, 'deleted': "0"}
    return cursor_to_list(self.DataFileCollection.find(doc))
def get_by_datafile(self, datafile_id):
    doc = {
        'datafile_id': ObjectId(datafile_id),
        'deleted': data_utils.get_not_deleted_flag()
    }
    return cursor_to_list(self.RemoteFileCollection.find(doc))
def get_all_descriptions(self):
    return cursor_to_list(self.DescriptionCollection.find())
def get_all_publications(self):
    doc = {'profile_id': self.profile_id, 'deleted': "0"}
    return cursor_to_list(self.PubCollection.find(doc))
def get_all_records(self, sort_by='_id', sort_direction=-1):
    doc = dict(deleted=data_utils.get_not_deleted_flag())
    if self.profile_id:
        doc["profile_id"] = self.profile_id

    return cursor_to_list(self.get_collection_handle().find(doc).sort(
        [[sort_by, sort_direction]]))
def get_terms_for_user_alphabetical(self, uid):
    docs = self.get_collection_handle().find({
        "uid": uid
    }).sort("label", pymongo.ASCENDING)
    return cursor_to_list(docs)
def get_all_records(self):
    doc = {
        'profile_id': self.profile_id,
        'deleted': data_utils.get_not_deleted_flag()
    }
    return cursor_to_list(self.RemoteFileCollection.find(doc))
def get_terms_for_user_ranked(self, uid):
    docs = self.get_collection_handle().find({
        "uid": uid
    }).sort("count", pymongo.DESCENDING)
    return cursor_to_list(docs)
def perform_datafile_pairing(self, next_stage_index):
    """
    stage callback function: determines if the pairing of datafiles should be
    performed, given the 'library_layout'
    :param next_stage_index:
    :return:
    """
    description = Description().GET(self.__wzh.description_token)
    stages = description["stages"]
    attributes = description["attributes"]
    meta = description.get("meta", dict())

    # validate stage
    stage = dict()
    if next_stage_index < len(stages):
        stage = stages[next_stage_index]

    # first, target repository
    relevant_repos = ["ena"]  # add a repo to this list if it requires datafile pairing

    target_repository = attributes.get("target_repository", dict()).get(
        "deposition_context", str())

    if target_repository not in relevant_repos:
        # no items to pair, clear any previous pairing information
        self.remove_pairing_info(stage["ref"], attributes, meta)
        return False

    # get records in bundle
    records = cursor_to_list(DataFile().get_collection_handle().find(
        {
            "$and": [{
                "description_token": self.__wzh.description_token,
                'deleted': d_utils.get_not_deleted_flag()
            }, {
                'description.attributes': {"$exists": True}
            }]
        }, {
            'description.attributes': 1,
            'name': 1
        }))

    if not records:
        # no items to pair, clear any previous pairing information
        self.remove_pairing_info(stage["ref"], attributes, meta)
        return False

    for rec in records:
        datafile_attributes = [
            v for k, v in rec['description'].get('attributes', dict()).items()
        ]
        new_dict = dict()
        for d in datafile_attributes:
            new_dict.update(d)

        rec['attributes'] = new_dict
        rec['pairing'] = rec['attributes'].get('library_layout', '').upper()

    df = pd.DataFrame(records)
    df._id = df['_id'].astype(str)
    df.index = df._id

    df = df[df['pairing'] == 'PAIRED']

    if not len(df):
        # no items to pair, clear any previous pairing information
        self.remove_pairing_info(stage["ref"], attributes, meta)
        return False

    # remove extraneous columns
    df = df.drop(columns=['description'])

    if not len(df) % 2 == 0:
        stage["error"] = "Pairing requires an even number of datafiles!"
        stage["refresh_wizard"] = True
    else:
        # get previous pairing candidates
        paired_candidates_old = meta.get(stage["ref"] + "_paired_candidates", list())
        paired_candidates = list(df.index)

        paired_candidates_old.sort()
        paired_candidates.sort()

        if not paired_candidates_old == paired_candidates:
            stage["refresh_wizard"] = True

        # if there's a valid stored map, use it
        stage_data = list()
        saved_copy = attributes.get(stage["ref"], list())

        if saved_copy:
            stored_pairs_df = pd.DataFrame(saved_copy)
            stored_pairs_list = list(stored_pairs_df._id) + list(stored_pairs_df._id2)
            stored_pairs_list.sort()

            if stored_pairs_list == paired_candidates:
                df_dict = df.to_dict()
                df_dict = df_dict["name"]

                stored_pairs_df["name"] = stored_pairs_df['_id'].apply(
                    lambda x: str(df_dict[x]))
                stored_pairs_df["name2"] = stored_pairs_df['_id2'].apply(
                    lambda x: str(df_dict[x]))

                df_result = stored_pairs_df[['name', 'name2']]
                df_result.columns = ['file1', 'file2']
                stage_data = df_result.to_dict('records')

        if not stage_data:
            # define fresh pairing map:
            # sort by file name to reflect pairing
            df = df.sort_values(by=['name'])

            s_even = df._id.iloc[1::2]
            s_odd = df._id.iloc[::2]

            df_odd = df[df.index.isin(s_odd)].copy()
            df_even = df[df.index.isin(s_even)].copy()

            df_even['_id2'] = df_even['_id']
            df_even['name2'] = df_even['name']

            df_even = df_even[['_id2', 'name2']]
            df_odd = df_odd[['_id', 'name']]

            df_odd.index = range(0, len(df_odd))
            df_even.index = range(0, len(df_even))

            df_result = pd.concat([df_odd, df_even], axis=1).reindex(df_odd.index)

            saved_copy = df_result[['_id', '_id2']].to_dict('records')

            df_result = df_result[['name', 'name2']]
            df_result.columns = ['file1', 'file2']
            stage_data = df_result.to_dict('records')

        stage["data"] = stage_data

        # save state
        attributes[stage["ref"]] = saved_copy
        meta[stage["ref"] + "_paired_candidates"] = paired_candidates

        save_dict = dict(attributes=attributes, meta=meta)
        Description().edit_description(self.__wzh.description_token, save_dict)

    stage["message"] = self.__wzh.wiz_message["datafiles_pairing_message"]["text"]

    return stage
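# Standalone sketch of the fresh pairing map built above: after sorting by
# name, rows 0, 2, 4, ... pair with rows 1, 3, 5, ... The file names below are
# invented and follow a typical paired-end R1/R2 naming convention.
import pandas as pd

df = pd.DataFrame({
    '_id': ['a', 'b', 'c', 'd'],
    'name': ['run1_R1.fastq', 'run1_R2.fastq', 'run2_R1.fastq', 'run2_R2.fastq'],
})
df = df.sort_values(by=['name'])

df_odd = df.iloc[::2].reset_index(drop=True)[['_id', 'name']]
df_even = df.iloc[1::2].reset_index(drop=True).rename(
    columns={'_id': '_id2', 'name': 'name2'})[['_id2', 'name2']]

pairs = pd.concat([df_odd, df_even], axis=1)
# pairs rows: (run1_R1.fastq, run1_R2.fastq) and (run2_R1.fastq, run2_R2.fastq)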