Beispiel #1
0
def configure():
    """Return a MultiFactory with a species run and a Zebra-filter sweep.

    Both factories use the "unpaired" pair strategy and
    ``Normalization.NONE``.  The second factory is built on the "none" table
    and receives one ``ZebraFilter`` per cutoff listed below, all reading
    "../zebra.csv".

    NOTE(review): ``metadata_filepath`` is not assigned inside this function,
    so it has to come from the enclosing module -- confirm it is defined.
    """
    zebra_csv = "../zebra.csv"
    cutoffs = (
        .00, .10, .25, .50, .75, .90, .95,
        .98, .99, .995, .998, .999, .9999,
    )

    species_factory = AnalysisFactory(
        [BiomTable("species")], metadata_filepath, "species")
    species_factory = species_factory.with_pair_strategy("unpaired")
    species_factory = species_factory.with_normalization(Normalization.NONE)

    zebra_factory = AnalysisFactory([BiomTable("none")], metadata_filepath)
    zebra_factory = zebra_factory.with_pair_strategy("unpaired")
    zebra_factory = zebra_factory.with_normalization(Normalization.NONE)
    # One filter instance per cutoff, in ascending order.
    zebra_factory = zebra_factory.with_feature_filter(
        [ZebraFilter(cutoff, zebra_csv) for cutoff in cutoffs])

    return MultiFactory([species_factory, zebra_factory])
Beispiel #2
0
def configure():
    """Return a MultiFactory with a species run and a Zebra-filter sweep.

    Both factories use the "paired_subtract_sex_balanced" pair strategy; the
    species factory receives it wrapped in a list, the Zebra factory as a
    plain string, exactly as before.  The Zebra factory is built on the
    "none" table with one ``ZebraFilter`` per cutoff.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    zebra_csv = "../zebra.csv"
    cutoffs = (
        .00, .10, .25, .50, .75, .90, .95,
        .98, .99, .995, .998, .999, .9999,
    )

    species_factory = AnalysisFactory(
        [BiomTable("species")], metadata_filepath)
    species_factory = species_factory.with_pair_strategy(
        ["paired_subtract_sex_balanced"])

    zebra_factory = AnalysisFactory([BiomTable("none")], metadata_filepath)
    zebra_factory = zebra_factory.with_pair_strategy(
        "paired_subtract_sex_balanced")
    zebra_factory = zebra_factory.with_feature_filter(
        [ZebraFilter(cutoff, zebra_csv) for cutoff in cutoffs])

    return MultiFactory([species_factory, zebra_factory])
Beispiel #3
0
def configure():
    """Return a MultiFactory with HUMAnN2 pathway tables and a species table.

    The same tab-separated file is wrapped in two ``CSVTable`` objects that
    differ only in their name and ``on_load_transform`` callback
    (``keep_pathways_only`` vs. ``keep_species_specific_pathways_only``).
    Both factories use the "paired_subtract_sex_balanced" pair strategy and
    the same three normalizations.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    csv_filepath = "./dataset/csv/humann2_pathabundance_subject.txt"

    def normalizations():
        # Build a fresh list for every factory so the two factories never
        # share the list object or the constructed Normalization instances.
        return [
            Normalization("None", "none"),
            Normalization.DEFAULT,
            Normalization("CLR", "CLR"),
        ]

    # Parenthesised chains replace the backslash continuations; the original
    # ended its last chain with a dangling backslash that silently joined the
    # expression to the following blank line.
    pathways = (
        AnalysisFactory(
            [
                CSVTable(
                    csv_filepath,
                    table_name="Humann2-PathAbundance",
                    on_load_transform=keep_pathways_only,
                    sep="\t",
                ),
                CSVTable(
                    csv_filepath,
                    table_name="Humann2-SpeciesPathAbundance",
                    on_load_transform=keep_species_specific_pathways_only,
                    sep="\t",
                ),
            ],
            metadata_filepath,
            "Humann2PathAbundance",
        )
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization(normalizations())
    )

    raw = (
        AnalysisFactory(BiomTable("species"), metadata_filepath, "Species")
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization(normalizations())
    )

    return MultiFactory([pathways, raw])
def configure():
    """Return a MultiFactory of species analyses, one per training set.

    Each factory is limited to the feature set with the matching index from
    ``FeatureSet.build_feature_sets``, uses that index as its training set,
    applies CLR normalization and runs both the "paired_subtract" and
    "paired_subtract_sex_balanced" pair strategies.  Only index 0 is
    currently generated.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    # NOTE(review): fset1 is not used below (it was only referenced by debug
    # prints).  The call is kept because it may read the TSV file -- confirm
    # before removing it.
    fset1 = FeatureSet.build_feature_set(
        "Test0",
        "./dataset/feature_sets/fixed_training_set_MS_associated_species_AST_fdr0.05.tsv")
    fsets = FeatureSet.build_feature_sets(
        "./dataset/feature_sets/MS_associated_species_fdr0.05_in_10_training_set.csv")

    training_set_count = 1
    facts = [
        AnalysisFactory(
            [BiomTable("species")],
            metadata_filepath,
            "TestSet{}".format(idx),
        )
        .with_feature_set(fsets[idx])
        .with_training_set(idx)
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy(["paired_subtract", "paired_subtract_sex_balanced"])
        for idx in range(training_set_count)
    ]

    # A plain, unrestricted "species" factory used to be appended here; it
    # is disabled.
    return MultiFactory(facts)
Beispiel #5
0
def configure():
    """Return a MultiFactory with an LDA analysis of HUMAnN2 pathways.

    The pathway table is loaded once up front only to obtain its column
    names, which become the "Pathways" feature set.  The factory is then run
    with that feature set plus every set returned by its
    ``create_univariate_sets()``.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    csv_filepath = "./dataset/csv/humann2_pathabundance_subject.txt"

    # Load the table eagerly just to discover the available columns.
    column_source = CSVTable(
        csv_filepath, on_load_transform=keep_pathways_only, sep="\t")
    pathway_columns = column_source.load_dataframe().columns.tolist()
    fset_pathways = FeatureSet("Pathways", pathway_columns)

    pathway_table = CSVTable(
        csv_filepath,
        table_name="Humann2-PathAbundance",
        on_load_transform=keep_pathways_only,
        sep="\t",
    )
    pathways = (
        AnalysisFactory([pathway_table], metadata_filepath)
        .with_lda(1)
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization([Normalization("CLR", "CLR")])
        .with_feature_set(
            [fset_pathways] + fset_pathways.create_univariate_sets())
    )

    return MultiFactory([pathways])
Beispiel #6
0
def configure():
    """Return a MultiFactory with an LDA analysis of the VFDB table.

    The table is loaded once up front only to obtain its column names, which
    become the "VirulenceFactor" feature set.  The factory is then run with
    that feature set plus every set returned by its
    ``create_univariate_sets()``.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    csv_filepath = "./dataset/csv/vfdb_imsms_576pairs.txt"

    # Load the table eagerly just to discover the available columns.
    column_source = CSVTable(
        csv_filepath, on_load_transform=fix_input_table, sep="\t")
    vfdb_columns = column_source.load_dataframe().columns.tolist()
    fset = FeatureSet("VirulenceFactor", vfdb_columns)

    vfdb_table = CSVTable(
        csv_filepath,
        table_name="VFDB",
        on_load_transform=fix_input_table,
        sep="\t",
    )
    pathways = (
        AnalysisFactory([vfdb_table], metadata_filepath)
        .with_lda(1)
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization([Normalization.CLR])
        .with_feature_set([fset] + fset.create_univariate_sets())
    )

    return MultiFactory([pathways])
Beispiel #7
0
def configure():
    """Return a MultiFactory comparing Probstel genera with a sex-only run.

    ``raw`` restricts the genus table to the "Probstel" feature set with CLR
    normalization.  ``meta_only`` uses an empty feature set together with a
    ``MetaEncoder`` on the "sex" column, encoding "M" as 0 and anything else
    as 1.  Both run the "paired_subtract" and "paired_subtract_sex_balanced"
    pair strategies.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    probstel = FeatureSet.build_feature_set(
        "Probstel",
        "./dataset/feature_sets/literature_review_Probstel_Baranzini_2018.tsv")

    # A with_meta_encoders([None, MetaEncoder("sex", ...)]) step on this
    # factory is currently disabled.
    raw = (
        AnalysisFactory(BiomTable("genus"), metadata_filepath, "Probstel")
        .with_feature_set(probstel)
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy(["paired_subtract", "paired_subtract_sex_balanced"])
    )

    meta_only = (
        AnalysisFactory(BiomTable("genus"), metadata_filepath, "Meta(sex)")
        .with_pair_strategy(["paired_subtract", "paired_subtract_sex_balanced"])
        .with_feature_set(FeatureSet("Empty", []))
        .with_meta_encoders(
            MetaEncoder("sex", lambda x: 0 if x == "M" else 1))
    )

    return MultiFactory([raw, meta_only])
def configure():
    """Return a MultiFactory with merged SNP-cluster tables and species.

    Every regular ``.csv`` file directly inside "./plots/snp_clustermaps" is
    wrapped in a ``CSVTable`` (read with ``dtype=str``) and the tables are
    combined through ``MergeTable``, which also receives the sorted file
    names.  Both factories use the "paired_subtract_sex_balanced" pair
    strategy and the same three normalizations.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    csv_root = "./plots/snp_clustermaps"

    # Test the suffix first so isfile() only touches the filesystem for
    # candidate CSV names; the selected files are unchanged.
    onlyfiles = sorted(
        f for f in listdir(csv_root)
        if f.endswith(".csv") and isfile(join(csv_root, f))
    )

    tables = [
        CSVTable(
            join(csv_root, f),
            table_name=f,
            on_load_transform=fix_input_table,
            dtype=str,
        )
        for f in onlyfiles
    ]

    def normalizations():
        # Build a fresh list for every factory so the two factories never
        # share the list object or the constructed Normalization instances.
        return [
            Normalization("None", "none"),
            Normalization.DEFAULT,
            Normalization("CLR", "CLR"),
        ]

    # Parenthesised chains replace the backslash continuations; the original
    # ended its last chain with a dangling backslash that silently joined the
    # expression to the following blank line.
    snp_clusters = (
        AnalysisFactory(
            [MergeTable(tables, onlyfiles)],
            metadata_filepath,
            "SNP Clusters",
        )
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization(normalizations())
    )

    raw = (
        AnalysisFactory(BiomTable("species"), metadata_filepath, "Species")
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization(normalizations())
    )

    return MultiFactory([snp_clusters, raw])
Beispiel #9
0
def configure():
    """Return a MultiFactory over the genus, species and "none" tables.

    The single factory uses the "paired_subtract_sex_balanced" pair
    strategy, passed as a one-element list.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # The phylum, class, order and family tables are currently disabled.
    active_levels = ("genus", "species", "none")
    tables = [BiomTable(level) for level in active_levels]

    woltka_levels = AnalysisFactory(tables, metadata_filepath)
    # The plain "paired_subtract" strategy is currently disabled.
    woltka_levels = woltka_levels.with_pair_strategy(
        ["paired_subtract_sex_balanced"])

    # A second factory over the none/kegg/enzrxn2reaction/pathway2class/
    # reaction2pathway tables is currently disabled.
    return MultiFactory([woltka_levels])
def configure():
    """Return a MultiFactory comparing all species with top-scorer combos.

    The first factory runs on the full species table.  The second runs on
    the same table once per feature set returned by
    ``create_all_combos()`` on the four "TopScorers" features.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # How the top scorers were chosen:
    #   * The data was split into 10 overlapping 50/50 train/test sets.
    #   * Linear regression was run on every training set (see phyloseq).
    #   * Top hits passing the fdr threshold were collected per set.
    #   * Species were ranked by how often they appeared in those lists;
    #     the four most frequent appeared in 8 out of 10 lists.
    # See dataset/feature_sets/MS_associated_species_fdr0.05_in_10_training_set.csv
    #   Ruthenibacterium lactatiformans
    #   Peptococcus niger
    #   Coprococcus comes
    #   Dorea longicatena
    top_scorer_ids = ["1550024", "2741", "410072", "88431"]
    fset_top_scorers = FeatureSet("TopScorers", top_scorer_ids)
    fset_combos = fset_top_scorers.create_all_combos()

    all_species = AnalysisFactory(
        [BiomTable("species")], metadata_filepath, "species")

    # TODO FIXME HACK:  There is no held out test set that these top scorers
    #  haven't seen before, so this may be cheating.  If this shows promise,
    #  redo the train set generation to ensure there is a set that is
    #  completely held out from all training sets.
    combo_species = AnalysisFactory(
        [BiomTable('species')], metadata_filepath)
    combo_species = combo_species.with_feature_set(fset_combos)

    return MultiFactory([all_species, combo_species])
def configure():
    """Return a MultiFactory with one unpaired run on the "none" table.

    The factory uses the "unpaired" pair strategy and
    ``Normalization.NONE``.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    woltka_transforms = AnalysisFactory(
        [BiomTable("none")], metadata_filepath)
    woltka_transforms = woltka_transforms.with_pair_strategy("unpaired")
    woltka_transforms = woltka_transforms.with_normalization(
        Normalization.NONE)

    return MultiFactory([woltka_transforms])
Beispiel #12
0
def configure():
    """Return a MultiFactory sweeping downsampling levels on species.

    The downsampling values are 25 through 250 in steps of 25, followed by
    ``None``.  The factory uses ``Normalization.CLR`` and both the
    "paired_subtract" and "paired_subtract_sex_balanced" pair strategies.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # 25, 50, ..., 250, then None as the final entry.
    downsample_levels = list(range(25, 251, 25)) + [None]

    woltka_levels = AnalysisFactory([BiomTable("species")], metadata_filepath)
    woltka_levels = woltka_levels.with_downsampling(downsample_levels)
    woltka_levels = woltka_levels.with_normalization(Normalization.CLR)
    woltka_levels = woltka_levels.with_pair_strategy(
        ["paired_subtract", "paired_subtract_sex_balanced"])

    return MultiFactory([woltka_levels])
Beispiel #13
0
def configure():
    """Return a MultiFactory with a single default species analysis.

    The factory is built from the "species" table with no further options.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    woltka_levels = AnalysisFactory(
        [BiomTable("species")],
        metadata_filepath
    )

    # An ``alleles`` factory built from the table name "Dump" used to be
    # constructed here but was never passed to MultiFactory; the dead
    # construction has been removed.
    return MultiFactory([woltka_levels])
Beispiel #14
0
def configure():
    """Return a MultiFactory with all species plus network transforms.

    ``woltka_transforms`` applies four ``NetworkTransformer`` variants to
    the species table and is run both without a metadata filter (``None``)
    and with the "Off Treatment" filter, which is given the
    "treatment_status" column and the values "Off" and "Control".
    ``woltka_species`` is the untransformed species run labelled
    "All Species".  Both use CLR normalization and the
    "paired_subtract_sex_balanced" pair strategy.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # (transformer name, feature-set file) in the order they are applied.
    network_files = (
        ("ms.off", "./dataset/feature_sets/ms_off.tsv"),
        ("ms.treated", "./dataset/feature_sets/ms_treated.tsv"),
        ("hc.off", "./dataset/feature_sets/hc_off.tsv"),
        ("hc.treated", "./dataset/feature_sets/hc_treated.tsv"),
    )

    # A shuffled "MBP30_Shuffle<seed>" FeatureTransformer experiment on the
    # "none" table is currently disabled.
    woltka_transforms = (
        AnalysisFactory([BiomTable("species")], metadata_filepath)
        .with_feature_transform(
            [NetworkTransformer(label, path) for label, path in network_files])
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_metadata_filter([
            None,
            MetadataFilter(
                "Off Treatment",
                "treatment_status",
                ["Off", "Control"],
            ),
        ])
    )

    woltka_species = (
        AnalysisFactory(
            [BiomTable("species")], metadata_filepath, "All Species")
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy("paired_subtract_sex_balanced")
    )

    return MultiFactory([woltka_species, woltka_transforms])
Beispiel #15
0
def configure():
    """Return a MultiFactory with an unpaired run on a gene-count table.

    The tab-separated counts file is wrapped in a ``CSVTable`` named
    "CDD94772.1-GeneCount" and passed through ``fix_input_table`` on load.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    csv_filepath = "../akkermansia_stuff/counts_CBIA010000065.1_5057-6325.csv"

    gene_counts = CSVTable(
        csv_filepath,
        table_name="CDD94772.1-GeneCount",
        on_load_transform=fix_input_table,
        sep="\t",
    )

    # Parentheses replace the backslash chain; the original ended with a
    # dangling backslash that silently joined the expression to the
    # following blank line.
    pathways = (
        AnalysisFactory([gene_counts], metadata_filepath)
        .with_pair_strategy("unpaired")
    )

    return MultiFactory([pathways])
Beispiel #16
0
def configure():
    """Return a MultiFactory comparing all genera with Probstel features.

    ``all_genera`` runs on the full genus table.  ``probstel_features`` runs
    once per set returned by ``create_univariate_sets("Univariate-")`` and
    finally on the complete "Probstel" feature set.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    probstel = FeatureSet.build_feature_set(
        "Probstel",
        "./dataset/feature_sets/literature_review_Probstel_Baranzini_2018.tsv"
    )

    all_genera = AnalysisFactory(
        BiomTable("genus"), metadata_filepath, "All-Genera")

    probstel_features = AnalysisFactory(BiomTable("genus"), metadata_filepath)
    # Univariate sets first, the full set last.
    probstel_features = probstel_features.with_feature_set(
        probstel.create_univariate_sets("Univariate-") + [probstel])

    return MultiFactory([all_genera, probstel_features])
def configure():
    """Return a MultiFactory with LDA runs on species and Probstel genera.

    ``raw`` runs on the species table with a "SPECIES" feature set built
    from the biom metadata (index as string ids, "Name" column as names),
    followed by its univariate sets and the training-set feature sets.
    ``probstel_lda`` runs on the genus table with the "Probstel" feature
    set followed by its univariate sets.  Both use ``with_lda(1)``, the
    "paired_subtract_sex_balanced" pair strategy and CLR normalization.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    probstel = FeatureSet.build_feature_set(
        "Probstel",
        "./dataset/feature_sets/literature_review_Probstel_Baranzini_2018.tsv")

    # Removed dead code: a ``probstel_features`` factory (built from the bare
    # string "genus") and the mbp_table25/30/35 paths were assigned but never
    # used by the returned factories.  Disabled UMAP, PCA, per-training-set
    # LDA and MBP-transform experiments were dropped for the same reason.

    fsets = FeatureSet.build_feature_sets(
        "./dataset/feature_sets/MS_associated_species_fdr0.05_in_10_training_set.csv"
    )

    species_meta = BiomTable("species").read_biom_metadata()
    fset_species = FeatureSet("SPECIES",
                              species_meta.index.astype(str).tolist(),
                              species_meta["Name"].tolist())

    raw = (
        AnalysisFactory(BiomTable("species"), metadata_filepath, "Species")
        .with_lda(1)
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization([Normalization("CLR", "CLR")])
        .with_feature_set(
            [fset_species] + fset_species.create_univariate_sets() + fsets)
    )

    probstel_lda = (
        AnalysisFactory(BiomTable("genus"), metadata_filepath)
        .with_lda(1)
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization([Normalization("CLR", "CLR")])
        .with_feature_set([probstel] + probstel.create_univariate_sets())
    )

    return MultiFactory([raw, probstel_lda])
Beispiel #18
0
def configure():
    """Return a MultiFactory applying MBP feature transforms to "none".

    The factory applies the MBP25/MBP30/MBP35 ``FeatureTransformer``
    variants with CLR normalization and the "paired_subtract" pair strategy.
    It is run both without a metadata filter (``None``) and with the
    "DRB1_1501" filter, which is given the "household" column and the
    household ids listed below with their fourth character (the dash)
    removed.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    mbp25 = "./dataset/feature_transforms/mbp_table25.csv"
    mbp30 = "./dataset/feature_transforms/mbp_table30.csv"
    mbp35 = "./dataset/feature_transforms/mbp_table35.csv"
    # The MOG tables are not referenced by the factory below.
    mog25 = "./dataset/feature_transforms/mog_table25.csv"
    mog30 = "./dataset/feature_transforms/mog_table30.csv"
    mog35 = "./dataset/feature_transforms/mog_table35.csv"

    hla_drb1_1501_households = [
        '714-0049', '714-0072', '714-0075', '714-0078', '714-0079',
        '714-0086', '714-0094', '714-0101', '714-0102', '714-0107',
        '714-0110', '714-0111', '714-0118', '714-0119', '714-0122',
        '714-0123', '714-0128', '714-0133', '714-0135', '714-0148',
        '714-0149', '714-0157', '714-0161', '714-0162', '714-0165',
        '714-0167', '714-0172', '714-0176', '714-0184', '714-0189',
        '714-0190', '714-0201', '714-0210', '714-0212', '714-0224',
        '714-0254', '714-0255', '716-0009', '716-0015', '716-0020',
        '716-0031', '716-0032', '716-0035', '716-0039', '716-0052',
        '716-0076', '716-0082', '716-0095', '716-0101', '716-0110',
        '716-0137', '716-0141', '716-0143', '716-0160',
    ]

    # Drop the character at index 3, e.g. '714-0049' -> '7140049'.
    meta_households = [
        household[:3] + household[4:]
        for household in hla_drb1_1501_households
    ]

    # A shuffled "MBP30_Shuffle<seed>" FeatureTransformer experiment is
    # currently disabled.
    mbp_transforms = [
        FeatureTransformer("MBP25", mbp25),
        FeatureTransformer("MBP30", mbp30),
        FeatureTransformer("MBP35", mbp35),
    ]
    woltka_transforms = AnalysisFactory([BiomTable("none")], metadata_filepath)
    woltka_transforms = woltka_transforms.with_feature_transform(
        mbp_transforms)
    woltka_transforms = woltka_transforms.with_normalization(
        Normalization("CLR", "CLR"))
    woltka_transforms = woltka_transforms.with_pair_strategy("paired_subtract")
    woltka_transforms = woltka_transforms.with_metadata_filter([
        None,
        MetadataFilter("DRB1_1501", "household", meta_households),
    ])

    # An untransformed species/"none" factory with the same DRB1_1501 filter
    # is currently disabled.
    return MultiFactory([woltka_transforms])