def test_reading(files_source):
    """Read every mwtab file from ``files_source`` and check the ids found.

    :param files_source: source handed unchanged to ``mwtab.read_files``
        (presumably supplied by a pytest parametrization or fixture that is
        not visible here).
    """
    parsed_files = list(mwtab.read_files(files_source))

    # Collapse to sets so ordering and duplicates do not matter.
    study_ids = {entry.study_id for entry in parsed_files}
    analysis_ids = {entry.analysis_id for entry in parsed_files}

    assert study_ids == {"ST000001", "ST000002"}
    assert analysis_ids == {"AN000001", "AN000002"}
def test_from_local_file():
    """Read two local example files and check their ids in reading order."""
    reader = mwtab.read_files(
        "tests/example_data/mwtab_files/ST000001_AN000001.txt",
        "tests/example_data/mwtab_files/ST000002_AN000002.txt",
    )

    # Pull exactly the first two entries off the generator, in order.
    first = next(reader)
    second = next(reader)

    assert (first.study_id, second.study_id) == ("ST000001", "ST000002")
    assert (first.analysis_id, second.analysis_id) == ("AN000001", "AN000002")
def test_converter_module(from_path, to_path, from_format, to_format):
    """Convert with ``Converter`` and check the ids readable from the output.

    :param from_path: input location passed to ``Converter``.
    :param to_path: output location passed to ``Converter``; read back
        afterwards with ``mwtab.read_files``.
    :param from_format: input format passed to ``Converter``.
    :param to_format: output format passed to ``Converter``.
    """
    Converter(
        from_path=from_path,
        to_path=to_path,
        from_format=from_format,
        to_format=to_format,
    ).convert()

    converted_files = list(mwtab.read_files(to_path))
    study_ids = {entry.study_id for entry in converted_files}
    analysis_ids = {entry.analysis_id for entry in converted_files}

    # Subset rather than equality: the output may hold only some of the
    # known example files.
    assert study_ids <= {"ST000001", "ST000002"}
    assert analysis_ids <= {"AN000001", "AN000002"}
def mwtab_to_df(path, id_mapping='pubchem_id'):
    '''
    Parse the first mwtab file found at ``path`` into a DataFrame.

    The result has one row per sample that has an entry in the file's
    subject/sample factors, a leading ``labels`` column, and one column per
    metabolite. Empty-string cells are treated as missing, and every column
    containing a missing value is dropped.

    :param path: path of mwtab file (forwarded to ``mwtab.read_files``)
    :param id_mapping: key looked up in each METABOLITES record to rename
        metabolites (default ``'pubchem_id'``); metabolites without a
        non-empty value for that key are skipped. A falsy value such as
        ``None`` keeps the original metabolite names.
        NOTE(review): the set of valid keys depends on the mwtab file
        schema -- confirm against the files in use.
    :return: pandas DataFrame as described above
    '''
    mwfile = next(mwtab.read_files(path))

    # Sample id -> label. Method blanks are labelled 'healthy'; otherwise
    # the label is the text after the first '-' in the factors string.
    sample_labels = {}
    for sample in mwfile['SUBJECT_SAMPLE_FACTORS']['SUBJECT_SAMPLE_FACTORS']:
        factors = sample['factors']
        if factors.startswith('Source:Method Blanks'):
            label = 'healthy'
        else:
            label = factors.split('-')[1].strip()
        sample_labels[sample['local_sample_id']] = label

    # Metabolite name -> external id, only where a non-empty id is present.
    name_to_id = {
        record['metabolite_name']: record[id_mapping]
        for record in mwfile['METABOLITES']['METABOLITES_START']['DATA']
        if id_mapping in record and record[id_mapping]
    }

    measurements = {}
    for row in mwfile['MS_METABOLITE_DATA']['MS_METABOLITE_DATA_START']['DATA']:
        # Remove the name so only per-sample values remain in the row.
        name = row.pop('metabolite_name')
        if not id_mapping:
            measurements[name] = row
        elif name in name_to_id:
            measurements[name_to_id[name]] = row

    df = pd.DataFrame(measurements, dtype=float)

    # Keep only the rows (samples) that have a known label.
    has_label = [sample_id in sample_labels for sample_id in df.index]
    df = df[has_label]

    # Look up each label, normalising a misspelling present in the data.
    labels = []
    for sample_id in df.index:
        label = sample_labels[sample_id]
        labels.append('Adenocarcinoma' if label == 'Adenocarcnoma' else label)
    df.insert(0, 'labels', labels)

    # Replace the sample-id index with a plain positional one.
    df = df.reset_index().drop(columns='index')

    cleaned = df.replace('', np.nan)
    return cleaned.dropna(axis=1, how='any')
def test_from_analysis_id():
    """Read the source ``"5"`` via ``mwtab.read_files`` and check its ids."""
    # Only the first entry yielded by the reader is checked.
    entry = next(mwtab.read_files("5"))

    assert entry.study_id == "ST000004"
    assert entry.analysis_id == "AN000005"