def configure():
    """Return a MultiFactory pairing a species factory with a zebra-filtered one.

    Both factories read the same metadata file and use the
    "paired_subtract_sex_balanced" pair strategy.  The second factory is
    built on BiomTable("none") and is given one ZebraFilter per threshold
    value, every filter pointing at "../zebra.csv".
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    zebra_csv = "../zebra.csv"
    # First argument handed to each ZebraFilter, in sweep order.
    thresholds = (
        .00, .10, .25, .50, .75, .90, .95,
        .98, .99, .995, .998, .999, .9999,
    )

    species_factory = AnalysisFactory([BiomTable("species")], metadata_filepath)
    species_factory = species_factory.with_pair_strategy(
        ["paired_subtract_sex_balanced"]
    )

    zebra_factory = AnalysisFactory([BiomTable("none")], metadata_filepath)
    zebra_factory = zebra_factory.with_pair_strategy("paired_subtract_sex_balanced")
    zebra_factory = zebra_factory.with_feature_filter(
        [ZebraFilter(cutoff, zebra_csv) for cutoff in thresholds]
    )

    return MultiFactory([species_factory, zebra_factory])
def configure():
    """Return a MultiFactory holding one factory over three Biom tables.

    The factory covers the "genus", "species" and "none" tables and uses
    only the "paired_subtract_sex_balanced" pair strategy.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # The coarser tables (phylum, class, order, family) are currently disabled.
    enabled_tables = ("genus", "species", "none")
    tables = [BiomTable(table_name) for table_name in enabled_tables]

    # The plain "paired_subtract" strategy is currently disabled.
    pair_strategies = ["paired_subtract_sex_balanced"]

    levels_factory = AnalysisFactory(tables, metadata_filepath)
    levels_factory = levels_factory.with_pair_strategy(pair_strategies)

    # A second factory over the none / kegg / enzrxn2reaction / pathway2class /
    # reaction2pathway tables is currently disabled and not part of the result.
    return MultiFactory([levels_factory])
def configure():
    """Return a MultiFactory of an unpaired species factory and a zebra sweep.

    Both factories use the "unpaired" pair strategy and Normalization.NONE.
    The second factory is built on BiomTable("none") and receives one
    ZebraFilter per threshold value, each pointing at "../zebra.csv".
    """
    # NOTE(review): metadata_filepath is not assigned inside this function; it
    # is presumably a module-level name defined elsewhere in the file - confirm.
    zebra_csv = "../zebra.csv"
    # First argument handed to each ZebraFilter, in sweep order.
    thresholds = (
        .00, .10, .25, .50, .75, .90, .95,
        .98, .99, .995, .998, .999, .9999,
    )

    species_factory = AnalysisFactory(
        [BiomTable("species")],
        metadata_filepath,
        "species",
    )
    species_factory = species_factory.with_pair_strategy("unpaired")
    species_factory = species_factory.with_normalization(Normalization.NONE)

    zebra_factory = AnalysisFactory([BiomTable("none")], metadata_filepath)
    zebra_factory = zebra_factory.with_pair_strategy("unpaired")
    zebra_factory = zebra_factory.with_normalization(Normalization.NONE)
    zebra_factory = zebra_factory.with_feature_filter(
        [ZebraFilter(cutoff, zebra_csv) for cutoff in thresholds]
    )

    return MultiFactory([species_factory, zebra_factory])
def configure():
    """Return a MultiFactory comparing all species against top-scorer combos.

    The first factory uses the species table as-is; the second restricts the
    species table to every combination produced by
    FeatureSet.create_all_combos() on the four "TopScorers" features.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # How the four top scorers were chosen:
    #   * The data was split into ten overlapping 50/50 train/test sets.
    #   * Linear regression was run on every training set (see phyloseq).
    #   * Hits passing the FDR threshold were kept for each training set.
    #   * Species were ranked by how many of those ten lists they appear in;
    #     the four most frequent appear in 8 out of 10 lists.
    # See dataset/feature_sets/MS_associated_species_fdr0.05_in_10_training_set.csv
    #   Ruthenibacterium lactatiformans
    #   Peptococcus niger
    #   Coprococcus comes
    #   Dorea longicatena
    top_scorer_ids = ["1550024", "2741", "410072", "88431"]
    top_scorers = FeatureSet("TopScorers", top_scorer_ids)
    top_scorer_combos = top_scorers.create_all_combos()

    all_species = AnalysisFactory(
        [BiomTable("species")], metadata_filepath, "species"
    )

    # TODO FIXME HACK: No held-out test set exists that the top scorers have
    # not already seen, so this comparison may be cheating.  If the result
    # looks promising, regenerate the training sets so that one set is held
    # out from all of them.
    combo_species = AnalysisFactory(
        [BiomTable('species')], metadata_filepath
    ).with_feature_set(top_scorer_combos)

    return MultiFactory([all_species, combo_species])
def configure():
    """Return a MultiFactory of a Probstel genus factory and a sex-only factory.

    The first factory restricts the genus table to the "Probstel" feature set
    with CLR normalization.  The second uses an empty feature set plus a
    MetaEncoder on the "sex" metadata column (0 for "M", 1 otherwise).  Both
    use the "paired_subtract" and "paired_subtract_sex_balanced" strategies.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    pair_strategies = ("paired_subtract", "paired_subtract_sex_balanced")

    probstel = FeatureSet.build_feature_set(
        "Probstel",
        "./dataset/feature_sets/literature_review_Probstel_Baranzini_2018.tsv")

    # Metadata encoders are currently disabled for this factory.
    probstel_factory = (
        AnalysisFactory(BiomTable("genus"), metadata_filepath, "Probstel")
        .with_feature_set(probstel)
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy(list(pair_strategies))
    )

    sex_only_factory = (
        AnalysisFactory(BiomTable("genus"), metadata_filepath, "Meta(sex)")
        .with_pair_strategy(list(pair_strategies))
        .with_feature_set(FeatureSet("Empty", []))
        .with_meta_encoders(
            MetaEncoder("sex", lambda x: 0 if x == "M" else 1)
        )
    )

    return MultiFactory([probstel_factory, sex_only_factory])
def configure():
    """Return an AnalysisFactory over the genus table with no extra options."""
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    genus_table = BiomTable("genus")
    genus_factory = AnalysisFactory(genus_table, metadata_filepath)
    return genus_factory
def configure():
    """Return a MultiFactory with one species factory per training set.

    For each training-set index ``i`` the factory is named "TestSet<i>",
    restricted to ``fsets[i]`` from the FDR feature-set file, bound to
    training set ``i``, CLR-normalized, and run with both the
    "paired_subtract" and "paired_subtract_sex_balanced" pair strategies.

    Returns:
        MultiFactory wrapping the per-training-set factories, so the result
        has the same shape as the other experiment configurations.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    fsets = FeatureSet.build_feature_sets(
        "./dataset/feature_sets/MS_associated_species_fdr0.05_in_10_training_set.csv")

    # Only the first training set is evaluated; raise this to cover more.
    num_training_sets = 1

    facts = [
        (
            AnalysisFactory(
                [BiomTable("species")],
                metadata_filepath,
                "TestSet" + str(i),
            )
            .with_feature_set(fsets[i])
            .with_training_set(i)
            .with_normalization(Normalization("CLR", "CLR"))
            .with_pair_strategy(["paired_subtract", "paired_subtract_sex_balanced"])
        )
        for i in range(num_training_sets)
    ]

    # Hand the factories back; without this the function would return None.
    return MultiFactory(facts)
def configure():
    """Return a species factory run under three pair strategies.

    The strategies are "unpaired", "paired_concat" and "paired_subtract".
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    species_factory = AnalysisFactory(BiomTable("species"), metadata_filepath)
    pair_strategies = ["unpaired", "paired_concat", "paired_subtract"]
    return species_factory.with_pair_strategy(pair_strategies)
def configure():
    """Return a MultiFactory comparing merged SNP-cluster CSVs with species.

    Every regular file ending in ".csv" directly under
    "./plots/snp_clustermaps" is wrapped in a CSVTable (loaded with
    ``dtype=str`` and passed through ``fix_input_table``), and the tables are
    combined into a single MergeTable.  Both the "SNP Clusters" factory and
    the "Species" factory use the "paired_subtract_sex_balanced" pair
    strategy and the same three normalizations.

    Returns:
        MultiFactory containing the SNP-cluster factory and the species
        factory, in that order.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    csv_root = "./plots/snp_clustermaps"

    # Sorted so the table order is stable regardless of directory order.
    onlyfiles = sorted([
        f for f in listdir(csv_root)
        if isfile(join(csv_root, f)) and f.endswith(".csv")
    ])
    tables = [
        CSVTable(
            join(csv_root, f),
            table_name=f,
            on_load_transform=fix_input_table,
            dtype=str,
        )
        for f in onlyfiles
    ]

    # Parenthesized chains replace backslash continuations: the original
    # ended the last chain with a dangling backslash that only parsed because
    # a blank line happened to follow it.
    snp_clusters = (
        AnalysisFactory(
            [MergeTable(tables, onlyfiles)],
            metadata_filepath,
            "SNP Clusters",
        )
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization([
            Normalization("None", "none"),
            Normalization.DEFAULT,
            Normalization("CLR", "CLR"),
        ])
    )

    raw = (
        AnalysisFactory(BiomTable("species"), metadata_filepath, "Species")
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization([
            Normalization("None", "none"),
            Normalization.DEFAULT,
            Normalization("CLR", "CLR"),
        ])
    )

    return MultiFactory([snp_clusters, raw])
def configure():
    """Return a MultiFactory comparing Humann2 pathway tables with species.

    The same tab-separated Humann2 file is loaded twice as a CSVTable: once
    through ``keep_pathways_only`` and once through
    ``keep_species_specific_pathways_only``.  Both the pathway factory and
    the "Species" factory use the "paired_subtract_sex_balanced" pair
    strategy and the same three normalizations.

    Returns:
        MultiFactory containing the pathway factory and the species factory,
        in that order.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    csv_filepath = "./dataset/csv/humann2_pathabundance_subject.txt"

    # Parenthesized chains replace backslash continuations: the original
    # ended the last chain with a dangling backslash that only parsed because
    # a blank line happened to follow it.
    pathways = (
        AnalysisFactory(
            [
                CSVTable(
                    csv_filepath,
                    table_name="Humann2-PathAbundance",
                    on_load_transform=keep_pathways_only,
                    sep="\t"),
                CSVTable(
                    csv_filepath,
                    table_name="Humann2-SpeciesPathAbundance",
                    on_load_transform=keep_species_specific_pathways_only,
                    sep="\t"),
            ],
            metadata_filepath,
            "Humann2PathAbundance",
        )
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization([
            Normalization("None", "none"),
            Normalization.DEFAULT,
            Normalization("CLR", "CLR"),
        ])
    )

    raw = (
        AnalysisFactory(BiomTable("species"), metadata_filepath, "Species")
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization([
            Normalization("None", "none"),
            Normalization.DEFAULT,
            Normalization("CLR", "CLR"),
        ])
    )

    return MultiFactory([pathways, raw])
def configure():
    """Return a factory on BiomTable("none") with and without a feature set.

    The feature-set options are ``None`` and the "HasData" set loaded from
    the proteome-availability file.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    has_proteome = FeatureSet.build_feature_set(
        "HasData",
        "./dataset/feature_sets/proteome_available.tsv")

    base_factory = AnalysisFactory(BiomTable("none"), metadata_filepath)
    feature_set_options = [None, has_proteome]
    return base_factory.with_feature_set(feature_set_options)
def configure():
    """Return a MultiFactory of all species versus four network transforms.

    The transform factory applies one NetworkTransformer per feature-set
    file (ms.off, ms.treated, hc.off, hc.treated) and is run both without a
    metadata filter and with an "Off Treatment" filter on the
    "treatment_status" column.  Both factories use CLR normalization and the
    "paired_subtract_sex_balanced" pair strategy.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    hc_off = "./dataset/feature_sets/hc_off.tsv"
    hc_treated = "./dataset/feature_sets/hc_treated.tsv"
    ms_off = "./dataset/feature_sets/ms_off.tsv"
    ms_treated = "./dataset/feature_sets/ms_treated.tsv"

    # A shuffled-MBP30 control factory is currently disabled.
    network_factory = (
        AnalysisFactory([BiomTable("species")], metadata_filepath)
        .with_feature_transform([
            NetworkTransformer("ms.off", ms_off),
            NetworkTransformer("ms.treated", ms_treated),
            NetworkTransformer("hc.off", hc_off),
            NetworkTransformer("hc.treated", hc_treated),
        ])
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_metadata_filter([
            None,
            MetadataFilter(
                "Off Treatment",
                "treatment_status",
                ["Off", "Control"],
            ),
        ])
    )

    all_species_factory = (
        AnalysisFactory([BiomTable("species")], metadata_filepath, "All Species")
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy("paired_subtract_sex_balanced")
    )

    return MultiFactory([all_species_factory, network_factory])
def configure():
    """Return a MultiFactory with one unpaired, un-normalized "none" factory."""
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    none_factory = AnalysisFactory([BiomTable("none")], metadata_filepath)
    none_factory = none_factory.with_pair_strategy("unpaired")
    none_factory = none_factory.with_normalization(Normalization.NONE)

    return MultiFactory([none_factory])
def configure():
    """Return a "none"-table factory crossing two strategies and two normalizations.

    Pair strategies: "unpaired" and "paired_subtract_sex_balanced".
    Normalizations: Normalization.CLR and Normalization.DEFAULT.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    none_factory = AnalysisFactory(BiomTable("none"), metadata_filepath)
    none_factory = none_factory.with_pair_strategy(
        ["unpaired", "paired_subtract_sex_balanced"]
    )
    return none_factory.with_normalization(
        [Normalization.CLR, Normalization.DEFAULT]
    )
def configure():
    """Return a MultiFactory of all genera versus Probstel feature sets.

    The second factory sweeps the univariate sets produced by
    ``create_univariate_sets("Univariate-")`` followed by the full
    "Probstel" set.
    """
    probstel = FeatureSet.build_feature_set(
        "Probstel",
        "./dataset/feature_sets/literature_review_Probstel_Baranzini_2018.tsv"
    )
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    genus_baseline = AnalysisFactory(
        BiomTable("genus"), metadata_filepath, "All-Genera"
    )

    probstel_factory = AnalysisFactory(BiomTable("genus"), metadata_filepath)
    # Univariate sets first, the complete set last.
    probstel_sweep = probstel.create_univariate_sets("Univariate-") + [probstel]
    probstel_factory = probstel_factory.with_feature_set(probstel_sweep)

    return MultiFactory([genus_baseline, probstel_factory])
def configure():
    """Return a MultiFactory with a species factory swept over downsampling.

    Downsampling values run from 25 to 250 in steps of 25, followed by
    ``None``.  The factory uses Normalization.CLR and the "paired_subtract"
    and "paired_subtract_sex_balanced" pair strategies.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # 25, 50, ..., 250, then None as the final option.
    downsample_options = list(range(25, 251, 25)) + [None]

    species_factory = (
        AnalysisFactory([BiomTable("species")], metadata_filepath)
        .with_downsampling(downsample_options)
        .with_normalization(Normalization.CLR)
        .with_pair_strategy(["paired_subtract", "paired_subtract_sex_balanced"])
    )

    return MultiFactory([species_factory])
def configure():
    """Return a species factory swept over disease-course metadata filters.

    The options are no filter (``None``) followed by one MetadataFilter per
    disease course (RRMS, PPMS, SPMS) on the "disease_course" column, each
    keeping that course together with "Control".
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    disease_courses = ("RRMS", "PPMS", "SPMS")

    # TODO: The unfiltered entry comes back named None; maybe there is some
    # way to create an identity filter with a specified name?
    filters = [None] + [
        MetadataFilter(course, "disease_course", [course, "Control"])
        for course in disease_courses
    ]

    species_factory = AnalysisFactory(BiomTable("species"), metadata_filepath)
    return species_factory.with_metadata_filter(filters)
def configure():
    """Return a species factory swept over three normalizations.

    The normalizations are CLR, "rarefy" and "divide_total"; the last two
    are constructed with ``target_count=10000``.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    species_factory = AnalysisFactory(BiomTable("species"), metadata_filepath)

    normalizations = [
        Normalization("CLR", "CLR"),
        Normalization("rarefy", "rarefy", target_count=10000),
        Normalization("divide_total", "divide_total", target_count=10000),
    ]
    return species_factory.with_normalization(normalizations)
def configure():
    """Return a "none"-table LDA factory swept over the "Top Mimics" features.

    The feature-set options are the univariate sets produced by
    ``create_univariate_sets()`` on the "Top Mimics" feature set.  The
    factory uses ``with_lda([1])``, the "paired_subtract_sex_balanced" pair
    strategy and Normalization.CLR.

    Returns:
        The configured AnalysisFactory.
    """
    top_mimics = FeatureSet.build_feature_set(
        "Top Mimics",
        "./dataset/feature_sets/top_mimics_genome_ids.tsv")
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # A parenthesized chain replaces backslash continuations: the original
    # expression ended with a dangling backslash, which joins the return
    # statement to whatever line follows the function.
    return (
        AnalysisFactory(BiomTable("none"), metadata_filepath)
        .with_lda([1])
        .with_feature_set(top_mimics.create_univariate_sets())
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization(Normalization.CLR)
    )
def configure():
    """Return a MultiFactory containing a single species-table factory.

    Returns:
        MultiFactory wrapping one AnalysisFactory over BiomTable("species").
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    # Only the species factory is returned.  A second factory that was
    # constructed here but never used or returned has been dropped.
    woltka_levels = AnalysisFactory(
        [BiomTable("species")],
        metadata_filepath,
    )

    return MultiFactory([woltka_levels])
def configure():
    """Return a species factory swept over the "Akkermansia" feature sets.

    The options are the univariate sets produced by
    ``create_univariate_sets()`` followed by the full "Akkermansia" set.  The
    factory uses the "paired_subtract_sex_balanced" pair strategy and
    Normalization.DEFAULT.
    """
    akkermansia = FeatureSet.build_feature_set(
        "Akkermansia",
        "./dataset/feature_sets/just_akkermansia.tsv")
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    species_factory = AnalysisFactory(
        BiomTable("species"), metadata_filepath, "Akkermansia"
    )
    # Univariate sets first, the complete set last.
    feature_sweep = akkermansia.create_univariate_sets() + [akkermansia]

    species_factory = species_factory.with_feature_set(feature_sweep)
    species_factory = species_factory.with_pair_strategy(
        "paired_subtract_sex_balanced"
    )
    return species_factory.with_normalization(Normalization.DEFAULT)
def configure():
    """Return a species LDA factory limited to a two-feature "SPECIES" set.

    The feature set holds ids "239935" and "853", passed together with the
    labels "Akkermansia" and "Faecalibacterium".  The factory uses
    ``with_lda([1])``, the "paired_subtract" pair strategy and CLR
    normalization.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    two_species = FeatureSet(
        "SPECIES",
        ["239935", "853"],
        ["Akkermansia", "Faecalibacterium"],
    )

    return (
        AnalysisFactory(BiomTable("species"), metadata_filepath)
        .with_lda([1])
        .with_pair_strategy(["paired_subtract"])
        .with_normalization([Normalization("CLR", "CLR")])
        .with_feature_set([two_species])
    )
def configure():
    """Return a "none"-table LDA factory swept over "Literature" features.

    The feature-set options are the univariate sets produced by
    ``create_univariate_sets()`` on the "Literature" feature set.  The
    factory uses ``with_lda([1])``, the "paired_subtract_sex_balanced" pair
    strategy and Normalization.CLR.
    """
    literature = FeatureSet.build_feature_set(
        "Literature",
        "./dataset/feature_sets/literature_search.tsv"
    )
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"

    return (
        AnalysisFactory(BiomTable("none"), metadata_filepath)
        .with_lda([1])
        .with_feature_set(literature.create_univariate_sets())
        .with_pair_strategy("paired_subtract_sex_balanced")
        .with_normalization(Normalization.CLR)
    )
def configure():
    """Return a species factory swept over per-site metadata filters.

    The options are no filter (``None``) followed by one MetadataFilter per
    site on the "site" column, each keeping that site together with
    "Control".
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    sites = (
        "San Sebastian",
        "San Francisco",
        "Pittsburgh",
        "New York",
        "Edinburgh",
        "Buenos Aires",
        "Boston",
    )

    filters = [None] + [
        MetadataFilter(site, "site", [site, "Control"]) for site in sites
    ]

    species_factory = AnalysisFactory(BiomTable("species"), metadata_filepath)
    return species_factory.with_metadata_filter(filters)
def configure():
    """Return a MultiFactory of species-level and Probstel genus LDA factories.

    The "Species" factory sweeps, in order: a feature set built from every
    entry in the species table's biom metadata, the univariate sets derived
    from it, and the feature sets loaded from the FDR training-set file.
    The genus factory sweeps the "Probstel" feature set followed by its
    univariate sets.  Both factories use ``with_lda(1)``, the
    "paired_subtract_sex_balanced" pair strategy and CLR normalization.

    Returns:
        MultiFactory containing the species factory and the Probstel genus
        factory, in that order.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    probstel = FeatureSet.build_feature_set(
        "Probstel",
        "./dataset/feature_sets/literature_review_Probstel_Baranzini_2018.tsv")

    fsets = FeatureSet.build_feature_sets(
        "./dataset/feature_sets/MS_associated_species_fdr0.05_in_10_training_set.csv"
    )

    # Build a feature set from the species table's own metadata: the index
    # (cast to str) supplies the feature ids, the "Name" column the labels.
    species_meta = BiomTable("species").read_biom_metadata()
    fset_species = FeatureSet(
        "SPECIES",
        species_meta.index.astype(str).tolist(),
        species_meta["Name"].tolist(),
    )

    raw = (
        AnalysisFactory(BiomTable("species"), metadata_filepath, "Species")
        .with_lda(1)
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization([Normalization("CLR", "CLR")])
        .with_feature_set(
            [fset_species] + fset_species.create_univariate_sets() + fsets
        )
    )

    probstel_lda = (
        AnalysisFactory(BiomTable("genus"), metadata_filepath)
        .with_lda(1)
        .with_pair_strategy(["paired_subtract_sex_balanced"])
        .with_normalization([Normalization("CLR", "CLR")])
        .with_feature_set([probstel] + probstel.create_univariate_sets())
    )

    return MultiFactory([raw, probstel_lda])
def configure():
    """Return a MultiFactory with MBP feature transforms on the "none" table.

    The factory applies one FeatureTransformer per MBP table (MBP25, MBP30,
    MBP35), uses CLR normalization and the "paired_subtract" pair strategy,
    and is run both without a metadata filter and with a "DRB1_1501" filter
    on the "household" column.

    Returns:
        MultiFactory wrapping the single transform factory.
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    mbp25 = "./dataset/feature_transforms/mbp_table25.csv"
    mbp30 = "./dataset/feature_transforms/mbp_table30.csv"
    mbp35 = "./dataset/feature_transforms/mbp_table35.csv"

    hla_drb1_1501_households = [
        '714-0049', '714-0072', '714-0075', '714-0078', '714-0079',
        '714-0086', '714-0094', '714-0101', '714-0102', '714-0107',
        '714-0110', '714-0111', '714-0118', '714-0119', '714-0122',
        '714-0123', '714-0128', '714-0133', '714-0135', '714-0148',
        '714-0149', '714-0157', '714-0161', '714-0162', '714-0165',
        '714-0167', '714-0172', '714-0176', '714-0184', '714-0189',
        '714-0190', '714-0201', '714-0210', '714-0212', '714-0224',
        '714-0254', '714-0255', '716-0009', '716-0015', '716-0020',
        '716-0031', '716-0032', '716-0035', '716-0039', '716-0052',
        '716-0076', '716-0082', '716-0095', '716-0101', '716-0110',
        '716-0137', '716-0141', '716-0143', '716-0160',
    ]
    # Drop the character at index 3 (the "-" in every id listed above) to
    # obtain the values passed to the "household" filter.
    meta_households = [hh[:3] + hh[4:] for hh in hla_drb1_1501_households]

    woltka_transforms = (
        AnalysisFactory([BiomTable("none")], metadata_filepath)
        .with_feature_transform([
            FeatureTransformer("MBP25", mbp25),
            FeatureTransformer("MBP30", mbp30),
            FeatureTransformer("MBP35", mbp35),
        ])
        .with_normalization(Normalization("CLR", "CLR"))
        .with_pair_strategy("paired_subtract")
        .with_metadata_filter([
            None,
            MetadataFilter(
                "DRB1_1501",
                "household",
                meta_households,
            ),
        ])
    )

    return MultiFactory([woltka_transforms])
def configure():
    """Return a species factory swept over every classification algorithm.

    The algorithm names are the keys of ``ClassificationTask.algorithms``
    followed by "RandomForestSVD".
    """
    metadata_filepath = "./dataset/metadata/iMSMS_1140samples_metadata.tsv"
    species_factory = AnalysisFactory([BiomTable("species")], metadata_filepath)

    algorithm_names = list(ClassificationTask.algorithms.keys())
    algorithm_names.append("RandomForestSVD")

    return species_factory.with_algorithm(algorithm_names)