def slice_data_files(dir, factor_selection=None):
    """Get the samples found in an ISA-Tab directory and their data files.

    Every study (``s_*``) and assay (``a_*``) table under ``dir`` is scanned
    for sample names, optionally keeping only the rows whose factor value
    equals a requested value. Each selected sample is then looked up in the
    assay tables to collect its data file names.

    :param dir: Directory containing the ISA-Tab table files
    :param factor_selection: Optional dict of factor name -> required factor
        value. A sample is selected as soon as it matches any single entry.
        When None, every sample is selected.
    :return: A list of dicts, one per distinct sample, with the keys
        'sample' and 'data_files' (plus 'query_used' when factor_selection
        was given)
    """
    results = []

    def _collect(sample_names, **extra):
        # Append one entry per sample name that has not been recorded yet.
        for sample_name in sample_names:
            if any(r['sample'] == sample_name for r in results):
                continue
            results.append(
                dict({"sample": sample_name, "data_files": []}, **extra))

    # first collect matching samples
    # ('[as]' is the glob character class for "a or s")
    for table_file in glob.iglob(os.path.join(dir, '[as]_*')):
        log.info("Loading {}".format(table_file))
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        if factor_selection is None:
            _collect(df['Sample Name'])
            continue
        for factor_name, factor_value in factor_selection.items():
            column = 'Factor Value[{}]'.format(factor_name)
            if column in df.columns:
                matching_rows = df.loc[df[column] == factor_value]
                _collect(matching_rows['Sample Name'],
                         query_used=factor_selection)

    if not results:
        return results

    # now collect the data files relating to the samples; each assay table
    # is parsed once here instead of once per sample
    assay_tables = []
    for table_file in glob.iglob(os.path.join(dir, 'a_*')):
        with open(table_file, encoding='utf-8') as fp:
            assay_tables.append(isatab.load_table(fp))

    for result in results:
        sample_name = result['sample']
        data_files = []
        for df in assay_tables:
            if 'Raw Spectral Data File' in df.columns:
                data_column = 'Raw Spectral Data File'
            elif 'Free Induction Decay Data File' in df.columns:
                data_column = 'Free Induction Decay Data File'
            else:
                continue
            sample_rows = df.loc[df['Sample Name'] == sample_name]
            # Accumulate across assay tables so that a table which does not
            # mention the sample cannot wipe files found in another table.
            data_files.extend(
                i for i in sample_rows[data_column] if str(i) != 'nan')
        result['data_files'] = data_files
    return results
def insert_distinct_parameter(table_fp, protocol_ref_to_unpool):
    """Fill a '... Name' column with distinct short hashes to undo pooling.

    The column whose first row equals ``protocol_ref_to_unpool`` is located,
    then the first header at or after that position ending in " Name" is
    overwritten (after interactive confirmation) with one random 8-character
    value per row, and the table is written back to ``table_fp``.

    :param table_fp: A seekable, writable file-like object holding a
        tab-separated table
    :param protocol_ref_to_unpool: Protocol name identifying which
        Protocol REF column to 'unpool'
    :return: None
    :raises IndexError: If the protocol name does not identify exactly one
        column
    """
    reader = csv.reader(table_fp, dialect="excel-tab")
    headers = next(reader)  # get column headings
    table_fp.seek(0)
    df = load_table(table_fp)
    # find protocol ref column by index
    protocol_ref_indices = [
        x for x, y in enumerate(df.columns)
        if df[y][0] == protocol_ref_to_unpool
    ]
    if len(protocol_ref_indices) != 1:
        raise IndexError(
            "Could not find Protocol REF with provided value {}".format(
                protocol_ref_to_unpool))
    protocol_ref_index = protocol_ref_indices[0]
    name_header = next(
        (h for h in headers[protocol_ref_index:] if h.endswith(" Name")),
        None)
    if name_header is None:
        print("Could not find appropriate column to fill with hashes")
        return
    print(
        "Are you sure you want to add a column of hash values in {}? Y/(N)"
        .format(name_header))
    confirm = input()
    if confirm != "Y":
        return
    df[name_header] = [str(uuid.uuid4())[:8] for _ in range(len(df.index))]
    table_fp.seek(0)
    df.to_csv(table_fp, index=None, header=headers, sep="\t")
    # The rewritten table may be shorter than the original content; drop
    # whatever is left after the new end so no stale rows survive.
    table_fp.truncate()
def get_factor_values(mtbls_study_id, factor_name):
    """
    This function gets the factor values of a factor in a MetaboLights study

    :param mtbls_study_id: Accession number of the MetaboLights study
    :param factor_name: The factor name for which values are being queried
    :return: A set of factor values associated with the factor and study

    Example usage:
        factor_values = get_factor_values('MTBLS1', 'genotype')
    """
    tmp_dir = get(mtbls_study_id)
    from isatools import isatab
    column = 'Factor Value[{}]'.format(factor_name)
    pattern = os.path.join(tmp_dir, '[as]_*')
    fvs = set()
    try:
        # glob yields paths that already include tmp_dir
        for table_file in glob.iglob(pattern):
            with open(table_file, encoding='utf-8') as fp:
                df = isatab.load_table(fp)
            if column not in df.columns:
                continue
            # Series.items() replaces iteritems(), removed in pandas 2.0
            for _, match in df[column].items():
                try:
                    match = match.item()  # unwrap numpy scalars
                except AttributeError:
                    pass
                if isinstance(match, (str, int, float)) \
                        and str(match) != 'nan':
                    fvs.add(match)
    finally:
        # remove the study directory even when a table fails to load
        shutil.rmtree(tmp_dir)
    return fvs
def get_factor_names(mtbls_study_id):
    """
    This function gets the factor names used in a MetaboLights study

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A set of factor names used in the study

    Example usage:
        factor_names = get_factor_names('MTBLS1')
    """
    # NOTE(review): unlike get_factor_values, the directory returned by
    # get() is not removed here - confirm whether that is intentional.
    tmp_dir = get(mtbls_study_id)
    table_files = [
        f for f in os.listdir(tmp_dir) if f.startswith(("a_", "s_"))
    ]
    from isatools import isatab
    import re
    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    # Compiled once instead of once per header.
    factor_header = re.compile(r"Factor Value\[(.*?)\]")
    factors = set()
    for table_file in table_files:
        df = isatab.load_table(os.path.join(tmp_dir, table_file))
        for header in df.columns.values:
            if factor_header.match(header):
                # strip the leading 'Factor Value[' and the trailing ']'
                factors.add(header[13:-1])
    return factors
def get_factor_values(mtbls_study_id, factor_name):
    """
    This function gets the factor values of a factor in a MetaboLights study

    :param mtbls_study_id: Accession number of the MetaboLights study
    :param factor_name: The factor name for which values are being queried
    :return: A set of factor values associated with the factor and study

    Example usage:
        factor_values = get_factor_values('MTBLS1', 'genotype')
    """
    tmp_dir = get(mtbls_study_id)
    table_files = [
        f for f in os.listdir(tmp_dir) if f.startswith(("a_", "s_"))
    ]
    from isatools import isatab
    column = "Factor Value[{}]".format(factor_name)
    fvs = set()
    try:
        for table_file in table_files:
            df = isatab.load_table(os.path.join(tmp_dir, table_file))
            if column not in df.columns:
                continue
            for _, match in df[column].items():
                # skip missing values, which stringify as 'nan'
                if isinstance(match, (str, int, float)) \
                        and str(match) != "nan":
                    fvs.add(match)
    finally:
        # remove the study directory even when a table fails to load
        shutil.rmtree(tmp_dir)
    return fvs
def get_factor_values(mtbls_study_id, factor_name):
    """
    This function gets the factor values of a factor in a MetaboLights study

    :param mtbls_study_id: Accession number of the MetaboLights study
    :param factor_name: The factor name for which values are being queried
    :return: A set of factor values associated with the factor and study

    Example usage:
        factor_values = get_factor_values('MTBLS1', 'genotype')
    """
    tmp_dir = get(mtbls_study_id)
    table_files = [
        f for f in os.listdir(tmp_dir) if f.startswith(('a_', 's_'))
    ]
    from isatools import isatab
    fv_header = 'Factor Value[{}]'.format(factor_name)
    fvs = set()
    try:
        for table_file in table_files:
            df = isatab.load_table(os.path.join(tmp_dir, table_file))
            if fv_header in df.columns:
                fvs.update(
                    value for _, value in df[fv_header].items()
                    # keep plain scalars only; missing cells read as 'nan'
                    if isinstance(value, (str, int, float))
                    and str(value) != 'nan')
    finally:
        # the study directory must not outlive the call, even when a table
        # fails to load
        shutil.rmtree(tmp_dir)
    return fvs
def get_factor_names(mtbls_study_id):
    """
    This function gets the factor names used in a MetaboLights study

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A set of factor names used in the study

    Example usage:
        factor_names = get_factor_names('MTBLS1')
    """
    # NOTE(review): the directory returned by get() is left on disk here,
    # while get_factor_values removes it - confirm which is intended.
    tmp_dir = get(mtbls_study_id)
    table_files = [
        f for f in os.listdir(tmp_dir) if f.startswith(('a_', 's_'))
    ]
    from isatools import isatab
    import re
    # Raw string avoids the invalid '\[' escape; the pattern is compiled
    # once rather than for every column header.
    rx_factor_value = re.compile(r'Factor Value\[(.*?)\]')
    factors = set()
    for table_file in table_files:
        df = isatab.load_table(os.path.join(tmp_dir, table_file))
        factors.update(
            # drop the 'Factor Value[' prefix and the closing ']'
            header[13:-1] for header in df.columns.values
            if rx_factor_value.match(header))
    return factors
def get_factor_names(mtbls_study_id):
    """
    Collect the names of the factors declared in a MetaboLights study.

    The study is fetched with get(), and every table file matched by the
    '[a|s]_*' pattern is inspected for 'Factor Value[...]' column headers.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A set of factor names used in the study

    Example usage:
        factor_names = get_factor_names('MTBLS1')
    """
    tmp_dir = get(mtbls_study_id)
    table_pattern = os.path.join(tmp_dir, '[a|s]_*')
    factors = set()
    for table_path in glob.iglob(table_pattern):
        full_path = os.path.join(tmp_dir, table_path)
        with open(full_path, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
            for column in list(df.columns.values):
                if not _RX_FACTOR_VALUE.match(column):
                    continue
                # keep the text between 'Factor Value[' and the last char
                factors.add(column[13:-1])
    return factors
def get_filtered_df_on_factors_list(mtbls_study_id):
    """
    Build one pandas query per study group and print the matching rows.

    Each study group returned by get_study_group_factors() becomes a query
    string over sanitised column names (spaces and square brackets replaced
    by underscores). Every table of the study is then filtered with each
    query and the matching sample names, source names and raw spectral data
    files are printed.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: The list of query strings, one per study group
    """
    def _underscored(text):
        return text.replace(' ', '_').replace('[', '_').replace(']', '_')

    queries = []
    for group in get_study_group_factors(mtbls_study_id=mtbls_study_id):
        clauses = []
        for k, v in group.items():
            k = _underscored(k)
            if isinstance(v, str):
                v = _underscored(v)
                clauses.append("{k} == '{v}' and ".format(k=k, v=v))
        # drops the trailing 'and ' of the last clause
        queries.append(''.join(clauses)[:-4])

    tmp_dir = get(mtbls_study_id)
    for table_file in glob.iglob(os.path.join(tmp_dir, '[a|s]_*')):
        with open(os.path.join(tmp_dir, table_file), encoding='utf-8') as fp:
            df = isatab.load_table(fp)
            df.columns = df.columns.map(
                lambda x: _underscored(x) if isinstance(x, str) else x)
        has_samples = 'Sample_Name' in df.columns
        has_sources = 'Source_Name' in df.columns
        has_raw_files = 'Raw_Spectral_Data_File' in df.columns
        for query in queries:
            # query uses pandas.eval, which evaluates
            # queries like pure Python notation
            df2 = df.query(query)
            if has_samples:
                print('Group: {query} / Sample_Name: {sample_name}'.format(
                    query=query, sample_name=list(df2['Sample_Name'])))
            if has_sources:
                print('Group: {} / Sources_Name: {}'.format(
                    query, list(df2['Source_Name'])))
            if has_raw_files:
                print('Group: {query} / Raw_Spectral_Data_File: {filename}'
                      .format(
                          query=query[13:-2],
                          filename=list(df2['Raw_Spectral_Data_File'])))
    return queries
def get_factor_names_m(directory):
    """Get the factor names used in the ISA-Tab tables of a directory.

    :param directory: Directory containing the study (``s_*``) and assay
        (``a_*``) table files
    :return: A set of factor names found in 'Factor Value[...]' headers
    """
    factors = set()
    # '[as]' is the glob character class for "a or s"
    for table_file in glob.iglob(os.path.join(directory, '[as]_*')):
        # glob already yields paths prefixed with ``directory``; joining the
        # directory on again points at a non-existent nested path whenever
        # ``directory`` is relative.
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        factors.update(
            # strip the leading 'Factor Value[' and the trailing ']'
            header[13:-1] for header in df.columns.values
            if _RX_FACTOR_VALUE.match(header))
    return factors
def get_study_group_factors(mtbls_study_id):
    """
    Get the distinct factor value combinations of a MetaboLights study.

    Each table of the study is checked for columns starting with
    'Factor Value'; the unique rows over those columns are returned as
    records. When several tables carry factor columns, the records of the
    table visited last are the ones returned.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: A list of dicts mapping factor column header to value, or an
        empty list when no table has factor columns
    :raises FileNotFoundError: If get() returns None for the study
    """
    tmp_dir = get(mtbls_study_id)
    if tmp_dir is None:
        raise FileNotFoundError("Could not download {}".format(mtbls_study_id))
    study_groups = []
    for table_path in glob.iglob(os.path.join(tmp_dir, '[a|s]_*')):
        with open(os.path.join(tmp_dir, table_path), encoding='utf-8') as fp:
            table = isatab.load_table(fp)
            factor_columns = []
            for column in table.columns:
                if column.startswith("Factor Value"):
                    factor_columns.append(column)
            if not factor_columns:
                continue
            unique_rows = table[factor_columns].drop_duplicates()
            study_groups = unique_rows.to_dict(orient='records')
    return study_groups
def insert_distinct_parameter(table_fp, protocol_ref_to_unpool):
    """A curation function to fix pooling problems

    The column whose first row equals ``protocol_ref_to_unpool`` is located,
    then the first header at or after that position ending in ' Name' is
    overwritten (after interactive confirmation) with one random 8-character
    value per row, and the table is written back to ``table_fp``.

    :param table_fp: A seekable, writable file-like buffer object pointing
        to a table file
    :param protocol_ref_to_unpool: Reference to which protocol REF column
        to 'unpool'
    :return: None
    :raises IndexError: If the reference does not identify exactly one
        column
    """
    reader = csv.reader(table_fp, dialect='excel-tab')
    headers = next(reader)  # get column headings
    table_fp.seek(0)
    df = isatab.load_table(table_fp)
    # find protocol ref column by index
    protocol_ref_indices = [
        x for x, y in enumerate(df.columns)
        if df[y][0] == protocol_ref_to_unpool
    ]
    if len(protocol_ref_indices) != 1:
        raise IndexError(
            'Could not find Protocol REF with provided value {}'.format(
                protocol_ref_to_unpool))
    protocol_ref_index = protocol_ref_indices[0]
    name_header = None
    for header in headers[protocol_ref_index:]:
        if header.endswith(' Name'):
            name_header = header
            break
    if name_header is None:
        print('Could not find appropriate column to fill with hashes')
        return
    print('Are you sure you want to add a column of hash values in {}? '
          'Y/(N)'.format(name_header))
    confirm = input()
    if confirm == 'Y':
        df[name_header] = [
            str(uuid.uuid4())[:8] for _ in range(len(df.index))
        ]
        table_fp.seek(0)
        df.to_csv(table_fp, index=None, header=headers, sep='\t')
        # Cut off anything left over from the previous, possibly longer,
        # content so the file holds only the rewritten table.
        table_fp.truncate()
def get_filtered_df_on_factors_list(mtbls_study_id):
    """
    Build one pandas query per study group and print the matching rows.

    Each study group returned by get_study_group_factors() becomes a query
    string over sanitised column names (spaces and square brackets replaced
    by underscores). Every table of the study is filtered with each query
    and the matching sample names, source names and raw spectral data files
    are printed. Tables lacking a queried column are skipped for that query.

    :param mtbls_study_id: Accession number of the MetaboLights study
    :return: The list of query strings, one per study group
    """
    # UndefinedVariableError has moved between pandas releases; the original
    # 'pandas.computation.ops' location no longer exists, so try the public
    # location first and fall back to the older ones.
    try:
        from pandas.errors import UndefinedVariableError
    except ImportError:
        try:
            from pandas.core.computation.ops import UndefinedVariableError
        except ImportError:
            from pandas.computation.ops import UndefinedVariableError

    def _sanitise(text):
        return text.replace(' ', '_').replace('[', '_').replace(']', '_')

    factors_list = get_study_group_factors(mtbls_study_id=mtbls_study_id)
    queries = []
    for item in factors_list:
        clauses = []
        for k, v in item.items():
            k = _sanitise(k)
            if isinstance(v, str):
                clauses.append("{0} == '{1}' and ".format(k, _sanitise(v)))
        # drops the trailing 'and ' of the last clause
        queries.append(''.join(clauses)[:-4])

    tmp_dir = get(mtbls_study_id)
    # glob yields paths that already include tmp_dir
    for table_file in glob.iglob(os.path.join(tmp_dir, '[as]_*')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        df.columns = df.columns.map(
            lambda x: _sanitise(x) if isinstance(x, str) else x)
        for query in queries:
            try:
                # query uses pandas.eval, which evaluates queries like pure
                # Python notation
                df2 = df.query(query)
            except UndefinedVariableError:
                # this table has no column for one of the queried factors
                continue
            if "Sample_Name" in df.columns:
                print("Group: {} / Sample_Name: {}".format(
                    query, list(df2["Sample_Name"])))
            if "Source_Name" in df.columns:
                print("Group: {} / Sources_Name: {}".format(
                    query, list(df2["Source_Name"])))
            if "Raw_Spectral_Data_File" in df.columns:
                print("Group: {} / Raw_Spectral_Data_File: {}".format(
                    query[13:-2], list(df2["Raw_Spectral_Data_File"])))
    return queries
def slice_data_files(dir, factor_selection=None):
    """
    This function gets a list of samples and related data file names from
    the ISA-Tab tables in a directory, optionally filtered by factor value
    (currently by matching on exactly 1 factor value)

    :param dir: Directory containing the ISA-Tab study (s_*) and assay
        (a_*) table files
    :param factor_selection: Selected factor values to filter on samples,
        as a dict of factor name -> value
    :return: A list of dicts {sample_name, list of data_files} containing
        sample names with associated data filenames

    Example usage:
        samples_and_data = slice_data_files('/path/to/MTBLS1',
                                            {'Gender': 'Male'})

    TODO:  Need to work on more complex filters e.g.:
        {"gender": ["male", "female"]} selects samples matching "male" or
        "female" factor value
        {"age": {"equals": 60}} selects samples matching age 60
        {"age": {"less_than": 60}} selects samples matching age less than 60
        {"age": {"more_than": 60}} selects samples matching age more than 60

        To select samples matching "male" and age less than 60:
        {
            "gender": "male",
            "age": {
                "less_than": 60
            }
        }
    """
    results = []
    extra = {}
    if factor_selection is not None:
        extra["query_used"] = factor_selection

    # first collect matching samples
    # ('[as]' is the glob character class for "a or s")
    for table_file in glob.iglob(os.path.join(dir, '[as]_*')):
        logger.info("Loading {}".format(table_file))
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
        if factor_selection is None:
            sample_names = list(df['Sample Name'])
        else:
            sample_names = []
            for factor_name, factor_value in factor_selection.items():
                column = 'Factor Value[{}]'.format(factor_name)
                if column in df.columns:
                    matching_rows = df.loc[df[column] == factor_value]
                    sample_names.extend(matching_rows['Sample Name'])
        for sample_name in sample_names:
            # record each sample only once
            if not any(r['sample'] == sample_name for r in results):
                results.append(
                    dict(sample=sample_name, data_files=[], **extra))

    if not results:
        return results

    # now collect the data files relating to the samples; each assay table
    # is parsed once here instead of once per sample
    assay_tables = []
    for table_file in glob.iglob(os.path.join(dir, 'a_*')):
        with open(table_file, encoding='utf-8') as fp:
            assay_tables.append(isatab.load_table(fp))

    for result in results:
        sample_name = result['sample']
        data_files = []
        for df in assay_tables:
            if 'Raw Spectral Data File' in df.columns:
                data_column = 'Raw Spectral Data File'
            elif 'Free Induction Decay Data File' in df.columns:
                data_column = 'Free Induction Decay Data File'
            else:
                continue
            sample_rows = df.loc[df['Sample Name'] == sample_name]
            # Accumulate across assay tables so that a table which does not
            # mention the sample cannot wipe files found in another table.
            data_files.extend(
                i for i in sample_rows[data_column] if str(i) != 'nan')
        result['data_files'] = data_files
    return results
def generate_study_design_report(self, get_num_study_groups=True,
                                 get_factors=True, get_num_levels=True,
                                 get_levels=True, get_study_groups=True):
    """Generates a study design report

    Loads the ISA-Tab at ``self.path`` and, for every study, merges the
    study table with each assay table on 'Sample Name' to summarise the
    study groups (distinct combinations of factor values) per assay.

    The five ``get_*`` keyword parameters are accepted but are not
    referenced anywhere in the body.

    :return: A list with one dict per study holding 'study_key',
        'total_sources', 'total_samples' and 'assays'; each assay entry
        holds 'assay_key', 'num_sources', 'num_samples' and, when the merged
        table has factor columns, 'total_study_groups',
        'factors_and_levels' and 'group_summary'
    """
    isa = isatab.load(self.path, skip_load_tables=False)
    study_design_report = []
    # label / column-name prefixes that identify raw data files
    raw_data_file_prefix = ('Raw', 'Array', 'Free Induction Decay')
    for study in isa.studies:
        # fall back to the file name when the study has no identifier
        study_key = study.identifier if study.identifier != '' \
            else study.filename
        study_design_report.append({
            'study_key': study_key,
            'total_sources': len(study.sources),
            'total_samples': len(study.samples),
            'assays': []
        })
        with open(os.path.join(self.path, study.filename)) as s_fp:
            s_df = isatab.load_table(s_fp)
            for assay in study.assays:
                assay_key = '/'.join([
                    assay.filename, assay.measurement_type.term,
                    assay.technology_type.term, assay.technology_platform
                ])
                # NOTE(review): 'num_sources' counts assay.samples and
                # 'num_samples' counts raw data files - the key names do
                # not match what is counted; confirm this is intended.
                assay_report = {
                    'assay_key': assay_key,
                    'num_sources': len(assay.samples),
                    'num_samples': len([
                        x for x in assay.data_files
                        if x.label.startswith(raw_data_file_prefix)
                    ])
                }
                with open(os.path.join(self.path, assay.filename)) as a_fp:
                    a_df = isatab.load_table(a_fp)
                    merged_df = pd.merge(s_df, a_df, on='Sample Name')
                    factor_cols = [
                        x for x in merged_df.columns
                        if x.startswith("Factor Value")
                    ]
                    if len(factor_cols) > 0:
                        # add branch to get all if no FVs
                        # one row per distinct factor value combination
                        study_group_factors_df = \
                            merged_df[factor_cols].drop_duplicates()
                        # strip 'Factor Value[' and the trailing ']'
                        factors_list = [
                            x[13:-1] for x in study_group_factors_df.columns
                        ]
                        queries = []
                        # factor name -> set of values seen (as strings)
                        factors_and_levels = {}
                        for i, row in study_group_factors_df.iterrows():
                            fvs = []
                            for x, y in zip(factors_list, row):
                                fvs.append(' == '.join([x, str(y)]))
                                try:
                                    factor_and_levels = \
                                        factors_and_levels[x]
                                except KeyError:
                                    factors_and_levels[x] = set()
                                    factor_and_levels = \
                                        factors_and_levels[x]
                                factor_and_levels.add(str(y))
                            # e.g. 'factor a == x and factor b == y'
                            queries.append(' and '.join(fvs))
                        assay_report['total_study_groups'] = len(queries)
                        assay_report['factors_and_levels'] = []
                        assay_report['group_summary'] = []
                        for k, v in factors_and_levels.items():
                            assay_report['factors_and_levels'].append({
                                'factor': k,
                                'num_levels': len(v),
                            })
                        for query in queries:
                            try:
                                # rename factor columns so they can be
                                # referenced from a pandas query expression
                                columns = merged_df.columns
                                columns = recast_columns(columns=columns)
                                for i, column in enumerate(columns):
                                    columns[i] = pyvar(column) if \
                                        column.startswith(
                                            'Factor Value[') else column
                                merged_df.columns = columns
                                # rebuild the readable group description as
                                # a query over the renamed columns
                                qlist = query.split(' and ')
                                fmt_query = []
                                for factor_query in qlist:
                                    factor_value = \
                                        factor_query.split(' == ')
                                    fmt_query_part = \
                                        "Factor_Value_{0}_ == '{1}'"\
                                        .format(pyvar(factor_value[0]),
                                                factor_value[1])
                                    fmt_query.append(fmt_query_part)
                                fmt_query = ' and '.join(fmt_query)
                                log.debug(
                                    'running query: {}'.format(fmt_query))
                                df2 = merged_df.query(fmt_query)
                                # first column that looks like a raw data
                                # file column; IndexError if there is none
                                # (caught by the handler below)
                                data_column = [
                                    x for x in merged_df.columns
                                    if x.startswith(raw_data_file_prefix)
                                    and x.endswith('Data File')
                                ][0]
                                assay_report['group_summary'].append(
                                    dict(study_group=query,
                                         sources=len(
                                             list(df2['Source Name'].
                                                  drop_duplicates())),
                                         samples=len(
                                             list(df2['Sample Name'].
                                                  drop_duplicates())),
                                         raw_files=len(
                                             list(df2[data_column].
                                                  drop_duplicates()))))
                            except Exception as e:
                                # any failure for one study group is only
                                # printed; that group gets no summary entry
                                print("error in query, {}".format(e))
                study_design_report[-1]['assays'].append(assay_report)
    return study_design_report
def get_data_files(mtbls_study_id, factor_selection=None):
    """
    This function gets a list of samples and related data file URLs for a
    given MetaboLights study, optionally filtered by factor value (currently
    by matching on exactly 1 factor value)

    :param mtbls_study_id: Study identifier for MetaboLights study to get,
        as a str (e.g. MTBLS1)
    :param factor_selection: Selected factor values to filter on samples,
        as a dict of factor name -> value
    :return: A list of dicts {sample_name, list of data_files} containing
        sample names with associated data filenames
    :raises IOError: If the study could not be retrieved

    Example usage:
        samples_and_data = mtbls.get_data_files('MTBLS1', {'Gender': 'Male'})

    TODO:  Need to work on more complex filters e.g.:
        {"gender": ["male", "female"]} selects samples matching "male" or
        "female" factor value
        {"age": {"equals": 60}} selects samples matching age 60
        {"age": {"less_than": 60}} selects samples matching age less than 60
        {"age": {"more_than": 60}} selects samples matching age more than 60

        To select samples matching "male" and age less than 60:
        {
            "gender": "male",
            "age": {
                "less_than": 60
            }
        }
    """
    tmp_dir = get(mtbls_study_id)
    if tmp_dir is None:
        raise IOError(
            "There was a problem retrieving study {}. Does it exist?".format(
                mtbls_study_id))
    table_files = [
        f for f in os.listdir(tmp_dir) if f.startswith(("a_", "s_"))
    ]
    from isatools import isatab
    results = []
    extra = {}
    if factor_selection is not None:
        extra["query_used"] = factor_selection
    try:
        # first collect matching samples; assay tables are kept so that
        # they are parsed once rather than again for every sample
        assay_tables = []
        for table_file in table_files:
            df = isatab.load_table(os.path.join(tmp_dir, table_file))
            if table_file.startswith("a_"):
                assay_tables.append(df)
            if factor_selection is None:
                sample_names = list(df["Sample Name"])
            else:
                sample_names = []
                for factor_name, factor_value in factor_selection.items():
                    column = "Factor Value[{}]".format(factor_name)
                    if column in df.columns:
                        matching_rows = df.loc[df[column] == factor_value]
                        sample_names.extend(matching_rows["Sample Name"])
            for sample_name in sample_names:
                # record each sample only once
                if not any(r["sample"] == sample_name for r in results):
                    results.append(
                        dict(sample=sample_name, data_files=[], **extra))

        # now collect the data files relating to the samples
        for result in results:
            sample_name = result["sample"]
            data_files = []
            for df in assay_tables:
                if "Raw Spectral Data File" in df.columns:
                    data_column = "Raw Spectral Data File"
                elif "Free Induction Decay Data File" in df.columns:
                    data_column = "Free Induction Decay Data File"
                else:
                    continue
                sample_rows = df.loc[df["Sample Name"] == sample_name]
                # Accumulate across assay tables so that a table which does
                # not mention the sample cannot wipe files found elsewhere.
                data_files.extend(
                    i for i in sample_rows[data_column] if str(i) != "nan")
            result["data_files"] = data_files
    finally:
        # remove the study directory even when a table fails to load
        shutil.rmtree(tmp_dir)
    return results
def slice_data_files(dir, factor_selection=None):
    """
    This function gets a list of samples and related data file names from
    the ISA-Tab tables in a directory, optionally filtered by factor values.

    :param dir: Directory containing the ISA-Tab study (s_*) and assay
        (a_*.txt) table files
    :param factor_selection: Selected factor values to filter on samples.
        The body calls ``factor_selection.items()``, so a dict of
        factor name -> value is expected; all entries are combined with
        'and' into a single query.
    :return: A list of dicts with the keys 'sample', 'data_files' and
        'query_used'

    Example usage:
        samples_and_data = slice_data_files('/path/to/MTBLS1',
                                            {'Gender': 'Male'})

    TODO:  Need to work on more complex filters e.g.:
        {"gender": ["male", "female"]} selects samples matching "male" or
        "female" factor value
        {"age": {"equals": 60}} selects samples matching age 60
        {"age": {"less_than": 60}} selects samples matching age less than 60
        {"age": {"more_than": 60}} selects samples matching age more than 60

        To select samples matching "male" and age less than 60:
        {
            "gender": "male",
            "age": {
                "less_than": 60
            }
        }
    """
    results = []
    # first collect matching samples
    # NOTE(review): `results` is reassigned (not extended) for each table,
    # so the outcome reflects the last table whose query succeeded - confirm
    # this is intended.
    for table_file in glob.iglob(os.path.join(dir, '[a|s]_*')):
        log.info('Loading {table_file}'.format(table_file=table_file))
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
            # keep only the sample name and factor value columns
            df = df[[
                x for x in df.columns
                if 'Factor Value' in x or 'Sample Name' in x
            ]]
            df.columns = [
                'sample' if 'Sample Name' in x else x for x in df.columns
            ]
            # 'Factor Value[name]' -> 'name'
            df.columns = [
                x[13:-1] if 'Factor Value' in x else x for x in df.columns
            ]
            # spaces are replaced so names can be used in a query expression
            df.columns = [x.replace(' ', '_') for x in df.columns]
            # build query
            sample_names_series = df['sample'].drop_duplicates()
            if factor_selection is None:
                results = sample_names_series.apply(lambda x: {
                    'sample': x,
                    'data_files': [],
                    'query_used': ''
                }).tolist()
            else:
                factor_query = ''
                for factor_name, factor_value in factor_selection.items():
                    factor_name = factor_name.replace(' ', '_')
                    factor_query += '{factor_name}=="{factor_value}" and '.format(
                        factor_name=factor_name,
                        factor_value=factor_value)
                # drop the trailing ' and '
                factor_query = factor_query[:-5]
                try:
                    query_results = df.query(
                        factor_query)['sample'].drop_duplicates()
                    results = query_results.apply(
                        lambda x: {
                            'sample': x,
                            'data_files': [],
                            'query_used': factor_selection
                        }).tolist()
                # NOTE(review): presumably raised when this table lacks a
                # queried factor column; the table is then skipped silently.
                # Verify this attribute path exists in the pandas version
                # in use.
                except pd.core.computation.ops.UndefinedVariableError:
                    pass
    # now collect the data files relating to the samples
    for table_file in glob.iglob(os.path.join(dir, 'a_*.txt')):
        with open(table_file, encoding='utf-8') as fp:
            df = isatab.load_table(fp)
            # keep only the sample name and the '...File' columns
            df = df[[
                x for x in df.columns if 'File' in x or 'Sample Name' in x
            ]]
            df.columns = [
                'sample' if 'Sample Name' in x else x for x in df.columns
            ]
            for result in results:
                sample_name = result['sample']
                sample_rows = df.loc[df['sample'] == sample_name]
                # NOTE(review): 'data_files' is overwritten for every file
                # column and every assay table, so only the last one
                # visited is kept - confirm this is intended.
                for data_col in [
                        x for x in sample_rows.columns if 'File' in x
                ]:
                    data_files = sample_rows[data_col]
                    result['data_files'] = [
                        i for i in data_files if str(i) != 'nan'
                    ]
    return results