unfold_dict[label].extend([v] * n)

    # clean values (try to convert to float and remove leading ;)
    for k in unfold_dict:
        unfold_dict[k] = [_clean_value(v) for v in unfold_dict[k]]

    return pd.DataFrame(unfold_dict, columns=df.columns)


###################################################################

if __name__ == "__main__":
    from metabolinks.datasets import demo_dataset

    # Manual demo: load the labelled 'demo2' sample, report on it, then
    # show the result of each fill helper applied to the same table.
    print('\nLoad demo2: data with labels ------------\n')
    demo2 = demo_dataset('demo2')
    data, y = demo2.data, demo2.target

    # The info reports are produced from the transposed table: once with
    # default arguments and once with all_data=True.
    info_reports = (
        ('-- info --------------', {}),
        ('-- global info---------', {'all_data': True}),
    )
    for banner, info_kwargs in info_reports:
        print(banner)
        print(data.transpose().cdl.info(**info_kwargs))
    print('-----------------------')
    print(data)

    # Each entry pairs a banner with the fill to run; the lambdas defer the
    # call so that the banner is printed before the helper is invoked.
    fill_demos = (
        ('\n--- fillna_zero ----------',
         lambda table: fillna_zero(table)),
        ('--- fillna_value  10 ----------',
         lambda table: fillna_value(table, value=10)),
    )
    for banner, fill in fill_demos:
        print(banner)
        new_data = fill(data)
        print(new_data)
Beispiel #2
0
                                    right_index=True)
    return annotations


if __name__ == '__main__':

    from metabolinks import datasets

    # Manual demo: load the local databases, pull the KEGG identifiers out
    # of the 'masstrix_output' demo table and translate them.
    dbs = load_local_dbs()

    # Debug aid (disabled): dump every loaded database.
    # for k, v in dbs._asdict().items():
    #     print(f'----- {k}')
    #     print(v)
    #     print('*'*30)

    star_rule = '************************'

    demo = datasets.demo_dataset('masstrix_output')
    df = demo.data
    print("Demo data:\n")

    df.info()
    print(star_rule)
    print(df.KEGG_cid)
    print(star_rule)

    # A cell may hold several identifiers joined by '#'; split them and
    # give each identifier a row of its own.
    id_lists = df.KEGG_cid.str.split('#')
    cids = id_lists.explode()
    print('\n\n----- Identifier list')
    print(cids)
    print('-----------------------------------')

    # Build the identifier translation table (tracing switched off).
    identifiers = get_identifiers(cids, dbs, trace=False)
Beispiel #3
0
        else:
            comps.append('other')
    return comps


if __name__ == '__main__':
    # Manual demo of the element-composition helpers; it runs only when
    # this module is executed as a script, never on import.
    from io import StringIO
    from metabolinks import datasets
    from metabolinks.dataio import read_MassTRIX

    print('------ test element_composition() ------')
    # Print the composition computed for three sample formulae.
    # NOTE(review): the last formula contains 'R', which is not an element
    # symbol -- presumably a deliberate edge case; confirm.
    for test in 'C11H24NO7P', 'C13H19ClN2O2', 'C12H21O11R':
        print(test, '->', element_composition(test))

    print('\n------ test insert_element_counts() ------')
    # Show the demo table before and after insert_element_counts().
    df = datasets.demo_dataset('table_with_formulae').data
    print(df)
    print('+++++ after insertion ++++++')
    dfi = insert_element_counts(df)
    print(dfi)

    print('\n------ test element_composition_series ------')
    # Disabled alternative: read the demo file from a "data" directory
    # next to this module instead of from the in-memory demo text.
    # file_name = "MassTRIX_output.tsv"
    # import os
    # _THIS_DIR, _ = os.path.split(os.path.abspath(__file__))
    # testfile_name = os.path.join(_THIS_DIR, "data", file_name)

    # Wrap the demo text in a StringIO so read_MassTRIX receives a
    # file-like object rather than a path on disk.
    df = read_MassTRIX(
        StringIO(datasets.create_demo('masstrix_output').as_str()))

    def cleanup_cols(df, isotopes=True, uniqueID=True, columns=None):
Beispiel #4
0
            # empty subset, skip
            continue
        else:
            # common features with exactely len(t) ocorrences
            if len(t) == 1:
                all_feats = objects[t[0]]
            else:
                all_feats = common([objects[i] for i in t])
            features = all_feats[all_feats.isin(count_groups[len(t)-1])]
            subset_names = tuple([names[i] for i in t])
            res[subset_names] = features
    return res

if __name__ == "__main__":
    # Manual demo: report on the transposed 'demo2' table, compute its
    # similarity measures, then start building the overlap example data.
    print('Demo data with labels------------\n')
    demo_table = datasets.demo_dataset('demo2').data
    dataset = demo_table.transpose()
    print(dataset)

    # Each banner is printed before the corresponding report is computed.
    print('-- info --------------')
    default_report = dataset.cdl.info()
    print(default_report)
    print('-- global info---------')
    global_report = dataset.cdl.info(all_data=True)
    print(global_report)

    print('\n***** SIMILARITY MEASURES ****')
    similarities = mz_similarity(dataset, has_labels=True)
    print(similarities)

    print('\n\n***** FEATURE OVERLAP (AND VENN DIAGRAM CALCULATIONS ****')
    print('--- example data sets')
    # First example table: three text columns, one missing 'Name' entry,
    # re-indexed by its 'Bucket label' column.
    s1_columns = {
        'Bucket label': ['A0', 'A1', 'A2', 'A3'],
        'Name': ['B0', np.nan, 'B2', 'B3'],
        'Formula': ['C0', 'C1', 'C2', 'C3'],
    }
    s1_frame = pd.DataFrame(s1_columns, index=[0, 1, 2, 3])
    s1 = s1_frame.set_index('Bucket label')
def load_demo2():
    """Return the result of ``demo_dataset('demo2')``.

    Convenience shortcut so callers do not have to spell out the data set
    name themselves.
    """
    dataset_name = 'demo2'
    return demo_dataset(dataset_name)