def test_dataframe_from_template(self):
    template = PrepTemplate(1)
    obs = template.to_dataframe()
    # 27 samples
    self.assertEqual(len(obs), 27)
    # note: these were assertTrue(set(...), {...}) calls, which only check
    # the truthiness of the first argument; assertEqual does the real check
    self.assertEqual(set(obs.index), {
        u'SKB1.640202', u'SKB2.640194', u'SKB3.640195', u'SKB4.640189',
        u'SKB5.640181', u'SKB6.640176', u'SKB7.640196', u'SKB8.640193',
        u'SKB9.640200', u'SKD1.640179', u'SKD2.640178', u'SKD3.640198',
        u'SKD4.640185', u'SKD5.640186', u'SKD6.640190', u'SKD7.640191',
        u'SKD8.640184', u'SKD9.640182', u'SKM1.640183', u'SKM2.640199',
        u'SKM3.640197', u'SKM4.640180', u'SKM5.640177', u'SKM6.640187',
        u'SKM7.640188', u'SKM8.640201', u'SKM9.640192'})
    self.assertEqual(set(obs.columns), {
        u'tot_org_carb', u'common_name', u'has_extracted_data',
        u'required_sample_info_status', u'water_content_soil',
        u'env_feature', u'assigned_from_geo', u'altitude', u'env_biome',
        u'texture', u'has_physical_specimen', u'description_duplicate',
        u'physical_location', u'latitude', u'ph', u'host_taxid',
        u'elevation', u'description', u'collection_timestamp',
        u'taxon_id', u'samp_salinity', u'host_subject_id', u'sample_type',
        u'season_environment', u'temp', u'country', u'longitude',
        u'tot_nitro', u'depth', u'anonymized_name', u'target_subfragment',
        u'sample_center', u'samp_size', u'run_date', u'experiment_center',
        u'pcr_primers', u'center_name', u'barcodesequence', u'run_center',
        u'run_prefix', u'library_construction_protocol', u'emp_status',
        u'linkerprimersequence', u'experiment_design_description',
        u'target_gene', u'center_project_name', u'illumina_technology',
        u'sequencing_meth', u'platform', u'experiment_title',
        u'study_center'})
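# --- Usage sketch (not from the original source): reading values out of
# the DataFrame that to_dataframe() returns. Assumes the same test
# database state as the test above (prep template 1 with 27 samples).
template = PrepTemplate(1)
df = template.to_dataframe()

# rows are keyed by sample id, columns by metadata category
platform = df.loc['SKB1.640202', 'platform']

# subsetting to a few columns keeps later updates targeted; the patch
# code further below uses the same pattern
subset = df[['run_prefix', 'platform', 'target_gene']]
print(subset.head())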
def prep_template_get_req(prep_id, user_id):
    """Gets the json of the full prep template

    Parameters
    ----------
    prep_id : int
        PrepTemplate id to get info for
    user_id : str
        User requesting the prep template info

    Returns
    -------
    dict of objects
        {'status': status,
         'message': message,
         'template': {sample: {column: value, ...}, ...}}
    """
    exists = _check_prep_template_exists(int(prep_id))
    if exists['status'] != 'success':
        return exists

    prep = PrepTemplate(int(prep_id))
    access_error = check_access(prep.study_id, user_id)
    if access_error:
        return access_error

    df = prep.to_dataframe()
    return {'status': 'success',
            'message': '',
            'template': df.to_dict(orient='index')}
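# --- Usage sketch (not from the original source): calling
# prep_template_get_req and walking the returned payload. The prep id
# and user email are illustrative values.
resp = prep_template_get_req(1, 'test@foo.bar')
if resp['status'] == 'success':
    # 'template' maps sample ids to {column: value} dicts
    for sample_id, metadata in resp['template'].items():
        print(sample_id, metadata.get('run_prefix'))
else:
    print(resp['message'])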
def prep_template_summary_get_req(prep_id, user_id):
    """Get the summarized prep template data for each metadata column

    Parameters
    ----------
    prep_id : int
        PrepTemplate id to get info for
    user_id : str
        User requesting the prep template info

    Returns
    -------
    dict of objects
        Dictionary object where the keys are the metadata categories
        and the values are lists of tuples. Each tuple is an observed
        value in the category and the number of times it's seen.
        Format {'status': status,
                'message': message,
                'num_samples': value,
                'category': [(val1, count1), (val2, count2), ...],
                'editable': bool}
    """
    exists = _check_prep_template_exists(int(prep_id))
    if exists['status'] != 'success':
        return exists

    prep = PrepTemplate(int(prep_id))
    access_error = check_access(prep.study_id, user_id)
    if access_error:
        return access_error

    editable = Study(prep.study_id).can_edit(User(user_id))
    df = prep.to_dataframe()
    out = {'num_samples': df.shape[0],
           'summary': [],
           'status': 'success',
           'message': '',
           'editable': editable}

    cols = sorted(list(df.columns))
    for column in cols:
        counts = df[column].value_counts(dropna=False)
        out['summary'].append(
            (str(column), [(str(key), counts[key])
                           for key in natsorted(counts.index)]))
    return out
def prep_template_summary_get_req(prep_id, user_id):
    """Get the summarized prep template data for each metadata column

    Parameters
    ----------
    prep_id : int
        PrepTemplate id to get info for
    user_id : str
        User requesting the prep template info

    Returns
    -------
    dict of objects
        Dictionary object where the keys are the metadata categories
        and the values are lists of tuples. Each tuple is an observed
        value in the category and the number of times it's seen.
        Format {'status': status,
                'message': message,
                'num_samples': value,
                'category': [(val1, count1), (val2, count2), ...],
                'editable': bool}
    """
    exists = _check_prep_template_exists(int(prep_id))
    if exists['status'] != 'success':
        return exists

    prep = PrepTemplate(int(prep_id))
    access_error = check_access(prep.study_id, user_id)
    if access_error:
        return access_error

    editable = Study(prep.study_id).can_edit(User(user_id))
    df = prep.to_dataframe()
    out = {'num_samples': df.shape[0],
           'summary': [],
           'status': 'success',
           'message': '',
           'editable': editable}

    cols = sorted(list(df.columns))
    for column in cols:
        counts = df[column].value_counts()
        out['summary'].append(
            (str(column), [(str(key), counts[key])
                           for key in natsorted(counts.index)]))
    return out
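# --- Usage sketch (not from the original source): rendering the
# per-column summary returned by prep_template_summary_get_req. The
# prep id and user email are illustrative values.
resp = prep_template_summary_get_req(1, 'test@foo.bar')
if resp['status'] == 'success':
    print('%d samples (editable: %s)'
          % (resp['num_samples'], resp['editable']))
    for column, counts in resp['summary']:
        # counts is a natsorted list of (value, occurrences) tuples
        print(column, counts[:3])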
    st_df = st.to_dataframe()[columns]
    # converting to datetime
    for col in columns:
        st_df[col] = st_df[col].apply(transform_date)
    st.update(st_df)

if cols_prep:
    with TRN:
        # a few notes: just getting the preps with duplicated values;
        # ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
                 FROM information_schema.columns
                 WHERE column_name IN %s
                    AND table_name LIKE 'prep_%%'
                    AND table_name NOT IN (
                        'prep_template', 'prep_template_sample')
                 GROUP BY table_name"""
        # note that we are looking for those columns with duplicated
        # names in the headers
        TRN.add(sql, [tuple(set(cols_prep))])

        for table, columns in dict(TRN.execute_fetchindex()).items():
            # [1] the format is table_# so taking the #
            pt = PrepTemplate(int(table.split('_')[1]))
            # getting just the columns of interest
            pt_df = pt.to_dataframe()[columns]
            # converting to datetime
            for col in columns:
                pt_df[col] = pt_df[col].apply(transform_date)
            pt.update(pt_df)
def study_files_get_req(user_id, study_id, prep_template_id, artifact_type):
    """Returns the uploaded files for the study id categorized by
    artifact_type

    It retrieves the files uploaded for the given study and tries to
    guess how those files should be added to the artifact of the given
    type. Uses information on the prep template to make a better guess.

    Parameters
    ----------
    user_id : str
        The id of the user making the request
    study_id : int
        The study id
    prep_template_id : int
        The prep template id
    artifact_type : str
        The artifact type

    Returns
    -------
    dict of {str: object}
        A dict of the form {'status': str,
                            'message': str,
                            'remaining': list of str,
                            'file_types': list of (str, bool, list of str),
                            'num_prefixes': int}
        where 'status' is a string specifying whether the query was
        successful, 'message' is a human-readable description of the
        error (optional), 'remaining' is the list of files that could
        not be categorized, 'file_types' is a list of the available
        filetypes, whether each is required, and the list of categorized
        files for the given artifact type, and 'num_prefixes' is the
        number of different run prefix values in the given prep template.
    """
    supp_file_types = supported_filepath_types(artifact_type)
    selected = []
    remaining = []
    message = []

    pt = PrepTemplate(prep_template_id)
    if pt.study_id != study_id:
        raise IncompetentQiitaDeveloperError(
            "The requested prep id (%d) doesn't belong to the study "
            "(%d)" % (pt.study_id, study_id))

    uploaded = get_files_from_uploads_folders(study_id)
    pt = pt.to_dataframe()
    ftypes_if = (ft.startswith('raw_') for ft, _ in supp_file_types
                 if ft != 'raw_sff')
    if any(ftypes_if) and 'run_prefix' in pt.columns:
        prep_prefixes = tuple(set(pt['run_prefix']))
        num_prefixes = len(prep_prefixes)
        # sorting prefixes by length to avoid collisions like:
        # 100 1002 10003
        prep_prefixes = sorted(prep_prefixes, key=len, reverse=True)
        # group files by prefix
        sfiles = defaultdict(list)
        for p in prep_prefixes:
            to_remove = []
            for fid, f, _ in uploaded:
                if f.startswith(p):
                    sfiles[p].append(f)
                    to_remove.append((fid, f))
            uploaded = [x for x in uploaded if x not in to_remove]

        inuse = [y for x in sfiles.values() for y in x]
        remaining.extend([f for _, f, _ in uploaded if f not in inuse])
        supp_file_types_len = len(supp_file_types)

        for k, v in sfiles.items():
            len_files = len(v)
            # if the number of files in the k group is larger than the
            # available columns, add them to the remaining group;
            # otherwise put them in the selected group
            if len_files > supp_file_types_len:
                remaining.extend(v)
                message.append("'%s' has %d matches." % (k, len_files))
            else:
                v.sort()
                selected.append(v)
    else:
        num_prefixes = 0
        remaining = [f for _, f, _ in uploaded]

    # get file_types, format: filetype, required, list of files
    file_types = [(t, req, [x[i] for x in selected if i + 1 <= len(x)])
                  for i, (t, req) in enumerate(supp_file_types)]

    # Create a list of artifacts that the user has access to, in case
    # they want to import the files from another artifact
    user = User(user_id)
    artifact_options = []
    user_artifacts = user.user_artifacts(artifact_type=artifact_type)
    study = Study(study_id)
    if study not in user_artifacts:
        user_artifacts[study] = study.artifacts(
            artifact_type=artifact_type)
    for study, artifacts in user_artifacts.items():
        study_label = "%s (%d)" % (study.title, study.id)
        for a in artifacts:
            artifact_options.append(
                (a.id, "%s - %s (%d)" % (study_label, a.name, a.id)))

    message = ('' if not message
               else '\n'.join(['Check these run_prefix:'] + message))

    return {'status': 'success',
            'message': message,
            'remaining': sorted(remaining),
            'file_types': file_types,
            'num_prefixes': num_prefixes,
            'artifacts': artifact_options}
def study_files_get_req(user_id, study_id, prep_template_id, artifact_type):
    """Returns the uploaded files for the study id categorized by
    artifact_type

    It retrieves the files uploaded for the given study and tries to
    guess how those files should be added to the artifact of the given
    type. Uses information on the prep template to make a better guess.

    Parameters
    ----------
    user_id : str
        The id of the user making the request
    study_id : int
        The study id
    prep_template_id : int
        The prep template id
    artifact_type : str
        The artifact type

    Returns
    -------
    dict of {str: object}
        A dict of the form {'status': str,
                            'message': str,
                            'remaining': list of str,
                            'file_types': list of (str, bool, list of str),
                            'num_prefixes': int}
        where 'status' is a string specifying whether the query was
        successful, 'message' is a human-readable description of the
        error (optional), 'remaining' is the list of files that could
        not be categorized, 'file_types' is a list of the available
        filetypes, whether each is required, and the list of categorized
        files for the given artifact type, and 'num_prefixes' is the
        number of different run prefix values in the given prep template.
    """
    supp_file_types = supported_filepath_types(artifact_type)
    selected = []
    remaining = []
    message = []

    pt = PrepTemplate(prep_template_id)
    if pt.study_id != study_id:
        raise IncompetentQiitaDeveloperError(
            "The requested prep id (%d) doesn't belong to the study "
            "(%d)" % (pt.study_id, study_id))

    uploaded = get_files_from_uploads_folders(study_id)
    pt = pt.to_dataframe()
    ftypes_if = (ft.startswith('raw_') for ft, _ in supp_file_types
                 if ft != 'raw_sff')
    if any(ftypes_if) and 'run_prefix' in pt.columns:
        prep_prefixes = tuple(set(pt['run_prefix']))
        num_prefixes = len(prep_prefixes)
        # sorting prefixes by length to avoid collisions like:
        # 100 1002 10003
        prep_prefixes = sorted(prep_prefixes, key=len, reverse=True)
        # group files by prefix
        sfiles = defaultdict(list)
        for p in prep_prefixes:
            to_remove = []
            for fid, f in uploaded:
                if f.startswith(p):
                    sfiles[p].append(f)
                    to_remove.append((fid, f))
            uploaded = [x for x in uploaded if x not in to_remove]

        inuse = [y for x in sfiles.values() for y in x]
        remaining.extend([f for _, f in uploaded if f not in inuse])
        supp_file_types_len = len(supp_file_types)

        for k, v in viewitems(sfiles):
            len_files = len(v)
            # if the number of files in the k group is larger than the
            # available columns, add them to the remaining group;
            # otherwise put them in the selected group
            if len_files > supp_file_types_len:
                remaining.extend(v)
                message.append("'%s' has %d matches." % (k, len_files))
            else:
                v.sort()
                selected.append(v)
    else:
        num_prefixes = 0
        remaining = [f for _, f in uploaded]

    # get file_types, format: filetype, required, list of files
    file_types = [(t, req, [x[i] for x in selected if i+1 <= len(x)])
                  for i, (t, req) in enumerate(supp_file_types)]

    # Create a list of artifacts that the user has access to, in case
    # they want to import the files from another artifact
    user = User(user_id)
    artifact_options = []
    user_artifacts = user.user_artifacts(artifact_type=artifact_type)
    study = Study(study_id)
    if study not in user_artifacts:
        user_artifacts[study] = study.artifacts(
            artifact_type=artifact_type)
    for study, artifacts in viewitems(user_artifacts):
        study_label = "%s (%d)" % (study.title, study.id)
        for a in artifacts:
            artifact_options.append(
                (a.id, "%s - %s (%d)" % (study_label, a.name, a.id)))

    message = ('' if not message
               else '\n'.join(['Check these run_prefix:'] + message))

    return {'status': 'success',
            'message': message,
            'remaining': sorted(remaining),
            'file_types': file_types,
            'num_prefixes': num_prefixes,
            'artifacts': artifact_options}
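# --- Standalone sketch (not from the original source) of the
# prefix-grouping step above, with made-up file names. Sorting prefixes
# longest-first is the key trick: without it, a short prefix like '100'
# would also claim files that belong to '1002' or '10003'.
from collections import defaultdict

prefixes = ['100', '1002', '10003']
files = ['100_R1.fastq.gz', '1002_R1.fastq.gz', '10003_R1.fastq.gz']

sfiles = defaultdict(list)
for p in sorted(prefixes, key=len, reverse=True):
    matched = [f for f in files if f.startswith(p)]
    sfiles[p].extend(matched)
    files = [f for f in files if f not in matched]

print(dict(sfiles))
# {'10003': ['10003_R1.fastq.gz'], '1002': ['1002_R1.fastq.gz'],
#  '100': ['100_R1.fastq.gz']}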
    st_df = st.to_dataframe()[columns]
    # converting to datetime
    for col in columns:
        st_df[col] = st_df[col].apply(transform_date)
    st.update(st_df)

if cols_prep:
    with TRN:
        # a few notes: just getting the preps with duplicated values;
        # ignoring column 'sample_id' and tables 'study_sample',
        # 'prep_template', 'prep_template_sample'
        sql = """SELECT table_name, array_agg(column_name::text)
                 FROM information_schema.columns
                 WHERE column_name IN %s
                    AND table_name LIKE 'prep_%%'
                    AND table_name NOT IN (
                        'prep_template', 'prep_template_sample')
                 GROUP BY table_name"""
        # note that we are looking for those columns with duplicated
        # names in the headers
        TRN.add(sql, [tuple(set(cols_prep))])

        for table, columns in viewitems(dict(TRN.execute_fetchindex())):
            # [1] the format is table_# so taking the #
            pt = PrepTemplate(int(table.split('_')[1]))
            # getting just the columns of interest
            pt_df = pt.to_dataframe()[columns]
            # converting to datetime
            for col in columns:
                pt_df[col] = pt_df[col].apply(transform_date)
            pt.update(pt_df)
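# --- Hypothetical sketch (not from the original source): transform_date
# is defined elsewhere in the patch; this is one plausible shape for it.
# The idea: try a few known timestamp layouts and normalize to a single
# canonical string, passing through values that don't parse.
from datetime import datetime

def transform_date(value):
    formats = ['%m/%d/%y %H:%M:%S', '%m/%d/%y %H:%M', '%m/%d/%y',
               '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', '%Y-%m-%d']
    for fmt in formats:
        try:
            return datetime.strptime(str(value), fmt).strftime(
                '%Y-%m-%d %H:%M:%S')
        except ValueError:
            continue
    # leave unparseable values untouched rather than guessing
    return value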