def subsystem_naming():
    """Print the subsystem names that differ between categories and model.

    Collects every subsystem listed in the subsystem categories returned by
    ``DataReader().read_subsystem_categories()`` and compares that set with
    ``model.subsystems()`` of the network model, printing the difference in
    both directions.
    """
    categories = DataReader().read_subsystem_categories()
    # ``set().union(*...)`` also works for an empty mapping and for values
    # that are not sets; ``reduce(set.union, ...)`` without an initial value
    # raises TypeError in both situations.
    subsystems = set().union(*categories.values())

    model = DataReader().read_network_model()
    model_subsystems = model.subsystems()

    print('Diff of cate from model', subsystems.difference(model_subsystems))
    print('Diff of model from cate', model_subsystems.difference(subsystems))
def create_for(cls, dataset_name="recon2"):
    """Build an instance of ``cls`` around the model for ``dataset_name``.

    ``'example'`` and ``'example2'`` select the example models created by
    ``DataReader``; any other name is passed to
    ``DataReader.read_network_model``. The model is handed to ``cls`` as the
    ``description`` keyword argument.
    """
    reader = DataReader()
    if dataset_name == 'example':
        return cls(description=reader.create_example_model())
    if dataset_name == 'example2':
        return cls(description=reader.create_example2_model())
    return cls(description=reader.read_network_model(dataset_name))
def pathifier(disease_name):
    """Score pathway deregulation for a dataset with the R ``pathifier`` package.

    Reads the dataset named ``disease_name``, applies the
    ``'metabolic-standard'`` preprocessing step, groups the measured
    metabolites by model subsystem (subsystems starting with ``'Transport'``
    or ``'Exchange'`` are skipped) and calls
    ``quantify_pathways_deregulation`` through ``robj``/``importr``.

    The per-pathway scores, with the labels in a leading ``'stage'`` column,
    are written to ``../dataset/disease/<disease_name>_regulization.csv``.

    Raises:
        ValueError: if no subsystem contains a measured metabolite.
    """
    model = DataReader().read_network_model()
    X, y = DataReader().read_data(disease_name)
    X = DynamicPreprocessing(['metabolic-standard']).fit_transform(X, y)

    df = pd.DataFrame(X)

    # ``.values`` replaces ``DataFrame.as_matrix()``, which newer pandas
    # releases no longer provide. The transposed values are flattened and
    # handed to R with ``nrow`` set to the number of columns.
    metabolite_fold_changes = robj.r.matrix(
        robj.FloatVector(df.values.T.ravel().tolist()),
        nrow=df.shape[1])
    all_metabolite_ids = robj.StrVector(list(df))

    # Subsystem name -> ids of its metabolites that are columns of ``df``.
    subsystem_metabolite = defaultdict(set)
    for r in model.reactions:
        if r.subsystem and not r.subsystem.startswith(('Transport', 'Exchange')):
            subsystem_metabolite[r.subsystem].update(
                m.id for m in r.metabolites if m.id in df)

    # Only subsystems with at least one measured metabolite are passed on.
    measured = [(name, ids) for name, ids in subsystem_metabolite.items() if ids]
    if not measured:
        raise ValueError('no subsystem contains a measured metabolite')
    pathway_names, pathway_metabolites = zip(*measured)

    pathway_metabolites = robj.r['list'](
        *[robj.StrVector(list(ids)) for ids in pathway_metabolites])
    pathway_names = robj.StrVector(list(pathway_names))
    is_healthy = robj.BoolVector([label == 'h' for label in y])

    # Named differently from the enclosing function to avoid shadowing it.
    pathifier_pkg = importr("pathifier")
    result = pathifier_pkg.quantify_pathways_deregulation(
        metabolite_fold_changes,
        all_metabolite_ids,
        pathway_metabolites,
        pathway_names,
        is_healthy,
        attempts=100,
        min_exp=0,
        min_std=0)

    reg_scores = {
        pathway: list(scores[:])
        for pathway, scores in dict(result.items())['scores'].items()
    }

    df = pd.DataFrame(reg_scores)
    df.insert(0, 'stage', y)
    df.to_csv('../dataset/disease/%s_regulization.csv' % disease_name,
              index=False)
def elimination_tabular():
    """Print F1 scores of metabolite- and reaction-level models per ``k``.

    For ``k`` in steps of 10 up to the number of keys of the first 'BC'
    sample, a metabolite-level pipeline (with ``SelectNotKBest(k)``) and an
    ``FVADiseaseClassifier`` fed from
    ``../dataset/solutions/bc_disease_analysis#k=<k>.json`` are cross
    validated. The collected scores are printed as a DataFrame, either after
    the last ``k`` or as soon as a solution file cannot be loaded.
    """
    (X, y) = DataReader().read_data('BC')
    datasets = {'metabolite': DataReader().read_data('BC')}
    scores = []

    for i in range(1, len(X[0].keys()) + 1, 10):
        clfs = {
            # pipe for compare model with eliminating some features
            'metabolite': Pipeline([
                ('metabolic',
                 DynamicPreprocessing(['naming', 'metabolic-standard'])),
                ('vect', DictVectorizer(sparse=False)),
                ('selector', SelectNotKBest(k=i)),
                ('pca', PCA()),
                ('clf', LogisticRegression(C=0.01, random_state=43)),
            ]),
        }

        path = '../dataset/solutions/bc_disease_analysis#k=%d.json' % i
        try:
            # The context manager closes the file; only the first JSON line
            # is used as the (X, y) dataset.
            with open(path) as f:
                rows = [json.loads(line) for line in f]
            datasets['reaction'] = list(zip(*rows[0]))
        except (OSError, ValueError, IndexError, TypeError):
            # Missing, empty or malformed solution file: report what has been
            # collected so far and stop. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            print(pd.DataFrame(scores))
            return

        clfs['reaction'] = FVADiseaseClassifier()

        # NOTE(review): random_state is set without shuffle=True; newer
        # scikit-learn releases may reject this combination - confirm against
        # the installed version.
        kf = StratifiedKFold(n_splits=10, random_state=43)

        score = {
            name: np.mean(
                cross_val_score(clf,
                                datasets[name][0],
                                datasets[name][1],
                                cv=kf,
                                n_jobs=-1,
                                scoring='f1_micro'))
            for name, clf in clfs.items()
        }
        score['iteration'] = i
        scores.append(score)

    print(pd.DataFrame(scores))
def setUp(self):
    """Fit a SolutionLevelDiseaseClassifier on a split of the small dataset.

    Stores the classifier as ``self.clf`` and the four parts of the
    ``train_test_split`` result (``random_state=0``) as ``self.X_train``,
    ``self.X_test``, ``self.y_train`` and ``self.y_test``.
    """
    self.clf = SolutionLevelDiseaseClassifier()
    samples, labels = DataReader().read_small_data()
    split = train_test_split(samples, labels, random_state=0)
    self.X_train, self.X_test, self.y_train, self.y_test = split
    self.clf.fit(self.X_train, self.y_train)
def setUp(self):
    """Keep the first 'BC' sample labelled ``'h'`` and create the measurement.

    The samples are renamed with ``NamingService('recon')``; the first
    (sample, label) pair whose label is ``'h'`` is stored as one-element
    lists in ``self.X`` and ``self.y``.
    """
    samples, labels = DataReader().read_data('BC')
    samples = NamingService('recon').to(samples)
    first_match = next(
        pair for pair in zip(samples, labels) if pair[1] == 'h')
    self.X = [first_match[0]]
    self.y = [first_match[1]]
    self.fva = FVARangedMeasurement()
def setUp(self):
    """Fit a DummyDiseaseClassifier on a split of the stored solutions.

    Stores the classifier as ``self.clf`` and the four parts of the
    ``train_test_split`` result (``random_state=0``) on ``self``.
    """
    self.clf = DummyDiseaseClassifier()
    solutions, labels = DataReader().read_solutions()
    split = train_test_split(solutions, labels, random_state=0)
    self.X_train, self.X_test, self.y_train, self.y_test = split
    self.clf.fit(self.X_train, self.y_train)
def subsystem_statistics():
    """Print the size of every subsystem category and the overall total."""
    categories = DataReader().read_subsystem_categories()
    running_total = 0
    for name, members in categories.items():
        size = len(members)
        print(name, size)
        running_total = running_total + size
    print('total:', running_total)
def setUp(self):
    """Fit a TrendPredictor on the training part of the sample businesses.

    The split comes from ``Business.train_test_set`` applied to
    ``DataReader().sample_businesses()``; its four parts are stored on
    ``self``.
    """
    self.predictor = TrendPredictor()
    businesses = DataReader().sample_businesses()
    split = Business.train_test_set(businesses)
    self.X_train, self.X_test, self.y_train, self.y_test = split
    self.predictor.fit(self.X_train, self.y_train)
def generate_angular_friendly_model():
    """Convert the recon2 JSON model into an Angular-friendly JSON file.

    Reads ``../dataset/network/recon2.json`` and writes
    ``../outputs/ng-recon.json`` with three top-level keys:

    * ``'metabolites'``: metabolite id -> metabolite entry, extended with a
      ``'reactions'`` list holding the ids of the reactions the network
      model associates with that metabolite;
    * ``'reactions'``: reaction id -> reaction entry without the
      ``'gene_reaction_rule'`` and ``'notes'`` fields;
    * ``'pathways'``: subsystem name (``'NOpathway'`` when the reaction has
      no ``'subsystem'`` key) -> list of reaction ids.
    """
    model = DataReader().read_network_model()

    # Context managers make sure both files are closed (and the output is
    # flushed) even if an error occurs half-way.
    with open('../dataset/network/recon2.json') as source:
        raw_model = json.load(source)
    reactions, metabolites = raw_model['reactions'], raw_model['metabolites']

    model_json = defaultdict(dict)
    model_json['pathways'] = defaultdict(list)

    for m in metabolites:
        m['reactions'] = [
            r.id for r in model.metabolites.get_by_id(m['id']).reactions
        ]
        model_json['metabolites'][m['id']] = m

    for r in reactions:
        # ``pop`` with a default tolerates entries lacking these fields.
        r.pop('gene_reaction_rule', None)
        r.pop('notes', None)

        model_json['reactions'][r['id']] = r
        model_json['pathways'][r.get('subsystem', 'NOpathway')].append(r['id'])

    with open('../outputs/ng-recon.json', 'w') as target:
        json.dump(model_json, target)
def fva_range_analysis_save():
    """Write the FVA ranged measurements of the 'HCC' dataset to a text file.

    The samples are renamed with ``NamingService('recon')``, transformed by
    ``FVARangedMeasurement`` and written to ``../outputs/fva_solutions.txt``,
    one ``"<label> <solution>"`` line per sample.
    """
    # Pass 'BC' instead of 'HCC' to process the other dataset.
    samples, labels = DataReader().read_data('HCC')
    named_samples = NamingService('recon').to(samples)
    solutions = FVARangedMeasurement().fit_transform(named_samples, labels)

    with open('../outputs/fva_solutions.txt', 'w') as out:
        for solution, label in zip(solutions, labels):
            line = '%s %s\n' % (label, solution)
            out.write(line)
def hmdb_disease_analysis_on_server():
    """Send every HMDB disease entry to the API client and print the result.

    Logs in with the credentials hard-coded below, then calls
    ``client.analyze(name, measurements)`` for each item returned by
    ``DataReader().read_hmdb_diseases()``.
    """
    api = MetaboliticsApiClient()
    api.login('email', 'password')

    diseases = DataReader().read_hmdb_diseases()
    for disease_name, disease_measurements in diseases.items():
        response = api.analyze(disease_name, disease_measurements)
        print(response)
def setUp(self):
    """Prepare a fitted vectorizer, one scaled sample and an FVAScaler.

    All data is renamed with ``NamingService('recon')``, vectorized with a
    dense ``DictVectorizer`` (kept as ``self.vect``) and scaled with
    ``MetabolicStandardScaler``. The first scaled row is stored as
    ``self.measured_metabolites``.
    """
    samples, labels = DataReader().read_all()
    named = NamingService('recon').to(samples)

    self.vect = DictVectorizer(sparse=False)
    matrix = self.vect.fit_transform(named, labels)
    scaled = MetabolicStandardScaler().fit_transform(matrix, labels)

    self.measured_metabolites = scaled[0]
    self.scaler = FVAScaler(self.vect)
def setUp(self):
    """Fit a MetaboliteLevelDiseaseClassifier on a split of all data.

    Stores the classifier as ``self.clf`` and the four parts of the
    ``train_test_split`` result (``random_state=0``) on ``self``.
    """
    self.clf = MetaboliteLevelDiseaseClassifier()
    samples, labels = DataReader().read_all()
    split = train_test_split(samples, labels, random_state=0)
    self.X_train, self.X_test, self.y_train, self.y_test = split
    self.clf.fit(self.X_train, self.y_train)
def hmdb_disease_analysis():
    """Run the ``'fva'`` preprocessing step on the HMDB diseases and save it.

    The keys of ``DataReader().read_hmdb_diseases()`` are used as labels and
    the values as samples. The transformed samples are written, keyed by
    label, with ``DataWriter('hmdb_disease_analysis').write_json``.
    """
    # The previously created ``NamingService('recon')`` was never used and
    # has been dropped.
    diseases = DataReader().read_hmdb_diseases()
    y, X = zip(*diseases.items())

    dyn_pre = DynamicPreprocessing(['fva'])
    X_t = dyn_pre.fit_transform(X, y)

    DataWriter('hmdb_disease_analysis').write_json(dict(zip(y, X_t)))
def most_correlated_reactions(top_num_reaction):
    """Print the FVA features with the highest ANOVA F-value.

    The FVA solutions are vectorized, features with variance at or below the
    0.1 threshold are removed and the rest is ranked by ``f_classif``. For
    each of the first ``int(top_num_reaction)`` features the function prints
    the feature name without its last four characters, the matching model
    reaction, the last three characters of the name and the F-value.
    """
    (X, y) = DataReader().read_fva_solutions()

    vectorizer = DictVectorizer(sparse=False)
    matrix = vectorizer.fit_transform(X)
    variance_filter = VarianceThreshold(0.1)
    matrix = variance_filter.fit_transform(matrix)

    f_values, _p_values = f_classif(matrix, y)
    kept_names = np.array(vectorizer.feature_names_)[
        variance_filter.get_support()]

    ranked = sorted(zip(kept_names, f_values),
                    key=lambda pair: pair[1],
                    reverse=True)
    best = ranked[:int(top_num_reaction)]

    model = DataReader().read_network_model()
    separator = '-' * 10
    for feature, f_value in best:
        # The feature name is the reaction id followed by a four-character
        # suffix whose last three characters are printed as 'min-max'.
        reaction_id = feature[:-4]
        print('name:', reaction_id)
        print('reaction:', model.reactions.get_by_id(reaction_id).reaction)
        print('min-max:', feature[-3:])
        print('F:', f_value)
        print(separator)
def most_correlated_pathway(top_num_pathway, num_of_reactions):
    """Print the pathway features with the highest ANOVA F-value.

    The FVA solutions in ``'fva_without.transports.txt'`` are reduced with a
    variance threshold and ``SelectKBest(k=int(num_of_reactions))``, turned
    into pathway scores by ``PathwayFvaScaler`` and ranked with
    ``f_classif``. For the first ``int(top_num_pathway)`` features the
    function prints the name, the 'min-max' suffix, a metabolite statistic
    derived from the 'BC' dataset, the F-value and the p-value.
    """
    (X, y) = DataReader().read_fva_solutions('fva_without.transports.txt')

    # Build three independent vectorizers. ``[obj] * 3`` would repeat a
    # reference to one instance, so every pipeline step would share (and
    # refit) the same object.
    vect = [DictVectorizer(sparse=False) for _ in range(3)]
    vt = VarianceThreshold(0.1)
    skb = SelectKBest(k=int(num_of_reactions))

    X = Pipeline([
        ('vect1', vect[0]),
        ('vt', vt),
        ('inv_vec1', InverseDictVectorizer(vect[0], vt)),
        ('vect2', vect[1]),
        ('skb', skb),
        ('inv_vec2', InverseDictVectorizer(vect[1], skb)),
        ('pathway_scoring', PathwayFvaScaler()),
        ('vect3', vect[2]),
    ]).fit_transform(X, y)

    (F, pval) = f_classif(X, y)

    top_n = sorted(zip(vect[2].feature_names_, F, pval),
                   key=lambda x: x[1],
                   reverse=True)[:int(top_num_pathway)]

    model = DataReader().read_network_model()
    X, y = DataReader().read_data('BC')
    bc = NamingService('recon').to(X)

    # Subsystem name -> ids of all metabolites of its reactions.
    subsystem_metabolite = defaultdict(set)
    for r in model.reactions:
        subsystem_metabolite[r.subsystem].update(m.id for m in r.metabolites)

    # Per subsystem, the number of its metabolites present in each sample,
    # summed over all samples.
    subsystem_counts = defaultdict(float)
    for sample in bc:
        for s, v in subsystem_metabolite.items():
            subsystem_counts[s] += len(v.intersection(sample.keys()))

    # NOTE(review): the sum is divided by the number of subsystems, not by
    # the number of samples - confirm this is the intended normalization.
    subsystem_counts = {
        i: v / len(subsystem_counts)
        for i, v in subsystem_counts.items()
    }

    for n, v, p in top_n:
        print('name:', n[:-4])
        print('min-max:', n[-3:])
        print('metabolites:%s' % subsystem_counts[n[:-4]])
        print('F:', v)
        print('p:', p)
        print('-' * 10)
def healties_model():
    """Fit the ``['fva', 'flux-diff']`` preprocessing model and pickle it.

    The data returned by ``DataReader().read_healthy('BC')`` is first passed
    through the ``'naming'`` and ``'basic-fold-change-scaler'`` steps; the
    fitted model is written to ``../outputs/api_model.p``.
    """
    samples, labels = DataReader().read_healthy('BC')

    scaling = DynamicPreprocessing(['naming', 'basic-fold-change-scaler'])
    scaled = scaling.fit_transform(list(samples), labels)

    flux_model = DynamicPreprocessing(['fva', 'flux-diff'])
    flux_model.fit(scaled, labels)

    with open('../outputs/api_model.p', 'wb') as out:
        pickle.dump(flux_model, out)
def naming_issue():
    """Report how the 'HCC', 'BC' and naming-service names match each other.

    Column names of both datasets are lower-cased and stripped before being
    compared with ``report_matching``: first against each other, then each
    against the keys of ``NamingService('recon')._names``.
    """
    human_names = set(NamingService('recon')._names.keys())

    reader = DataReader()
    bc_names = {column.lower().strip() for column in reader.read_columns('BC')}
    hcc_names = {
        column.lower().strip() for column in reader.read_columns('HCC')
    }

    report_matching(hcc_names, bc_names, 'hcc', 'bc')

    banner = '-' * 10
    print(banner, 'human', banner)
    for names, tag in ((hcc_names, 'hcc'), (bc_names, 'bc')):
        report_matching(names, human_names, tag, '')
def hmdb_disease_analysis_pathway_level():
    """Turn the stored HMDB disease solutions into pathway-level scores.

    Loads the pickled model from ``../models/api_model.p``, applies its
    ``'flux-diff'`` step to the ``'hmdb_disease_analysis'`` solutions, runs
    the ``'pathway-scoring'`` and ``'transport-elimination'`` steps and
    writes the result, keyed by label, with
    ``DataWriter('hmdb_disease_analysis_pathway_level').write_json``.
    """
    solutions, labels = DataReader().read_solution('hmdb_disease_analysis')

    # The pickle is a locally produced model file; never load untrusted data
    # this way.
    with open('../models/api_model.p', 'rb') as model_file:
        reaction_scaler = pickle.load(model_file)

    pathway_steps = DynamicPreprocessing(
        ['pathway-scoring', 'transport-elimination'])

    flux_diff = reaction_scaler._model.named_steps['flux-diff']
    flux_changes = flux_diff.transform(solutions)
    pathway_scores = pathway_steps.fit_transform(flux_changes, labels)

    writer = DataWriter('hmdb_disease_analysis_pathway_level')
    writer.write_json(dict(zip(labels, pathway_scores)))
def _normalized_fold_change(v):
    """Map a ratio ``v`` to a value clamped to [-100, 100], rounded to 3 places."""
    if v >= 1:
        return round(min(v - 1, 100), 3)
    if v == 0:
        # ``0 ** -1`` raises ZeroDivisionError; ``1 - 1/v`` tends to minus
        # infinity as ``v`` approaches 0 from above, so use the lower clamp.
        return -100
    return round(max(1 - v**-1, -100), 3)


def hmdb_disease_normalization():
    """Normalize the HMDB disease measurements and write them as JSON.

    For every disease and category the measurements are renamed with
    ``NamingService('hmdb')``; groups with fewer than 10 named measurements
    are skipped. Each value ``v`` becomes ``v - 1`` (capped at 100) when
    ``v >= 1`` and ``1 - 1/v`` (floored at -100) otherwise, rounded to three
    decimals. A value of 0 maps to -100 instead of raising
    ``ZeroDivisionError``. The result, keyed by ``"<disease> <category>"``,
    is written with ``DataWriter('normalization_hmdb').write_json``.
    """
    dataset = DataReader().read_hmdb_diseases()
    naming = NamingService('hmdb')

    nor_data = {}
    for dis, categories in dataset.items():
        for cat, measurements in categories.items():
            named_measurements = naming.to(dict(measurements))
            if len(named_measurements) >= 10:
                nor_data['%s %s' % (dis, cat)] = {
                    k: _normalized_fold_change(v)
                    for k, v in named_measurements.items()
                }

    DataWriter('normalization_hmdb').write_json(nor_data)
def solution_for_dataset():
    """Compute a solution per sample and append it to a JSON-lines file.

    All data is vectorized, scaled with ``MetabolicChangeScaler`` and
    converted with ``MetabolicSolutionScaler(vect).to_ecolin``. The number of
    lines already present in ``../output/solution_for_dataset.json`` is taken
    as the number of samples done, so a re-run continues after them. A
    missing file counts as zero samples done.
    """
    (X, y) = DataReader().read_all()

    vect = DictVectorizer(sparse=False)
    X = vect.fit_transform(X, y)
    X = MetabolicChangeScaler().fit_transform(X, y)
    X = MetabolicSolutionScaler(vect).to_ecolin(X)

    solution_service = SolutionService()
    file_path = '../output/solution_for_dataset.json'

    # Count the existing lines; on the very first run there is no file yet.
    try:
        with open(file_path) as done:
            calculated_samples = sum(1 for _ in done)
    except FileNotFoundError:
        calculated_samples = 0

    # The context manager closes (and thereby flushes) the file even when a
    # solution fails, so finished samples are not lost.
    with open(file_path, 'a') as f:
        for x in X[calculated_samples:]:
            solution = solution_service.get_solution(x)
            f.write('%s\n' % json.dumps(solution))
def healty_for_heatmap(num_of_reactions):
    """Write the pathway scores of the ``'h'``-labelled samples as heatmap JSON.

    The FVA solutions in ``'fva_without.transports.txt'`` are passed through
    ``ReactionDiffScaler`` and ``PathwayFvaScaler``; rows whose label is
    ``'h'`` are collected in a DataFrame. The output
    ``../outputs/healties_heatmap.json`` holds the column names without their
    last four characters as ``'x'``, the values as ``'z'`` and
    ``'type': 'heatmap'``.

    Args:
        num_of_reactions: not used by this function; kept for interface
            compatibility.
    """
    (X, y) = DataReader().read_fva_solutions('fva_without.transports.txt')

    X = Pipeline([
        ('flux-diff-scaler', ReactionDiffScaler()),
        ('pathway_scoring', PathwayFvaScaler()),
    ]).fit_transform(X, y)

    df = pd.DataFrame([ix for ix, iy in zip(X, y) if iy == 'h'])

    hjson = {
        'x': [i[:-4] for i in df],
        'z': df.values.tolist(),
        'type': 'heatmap'
    }

    # Close the output file deterministically so the JSON is fully written.
    with open('../outputs/healties_heatmap.json', 'w') as out:
        json.dump(hjson, out)
def svr_trend_prediction():
    """Train, save and evaluate a TrendPredictor, logging the results.

    The ``'trend-prediction'`` logger is set to INFO and gets a file handler
    for ``../logs/trend_prediction.log``. The predictor is fitted on the
    training part of ``Business.train_test_set(DataReader().businesses())``
    and saved; then the predictor itself, its mean squared error and its r2
    score on the test part are logged.
    """
    log = logging.getLogger('trend-prediction')
    log.setLevel(logging.INFO)
    log.addHandler(logging.FileHandler('../logs/trend_prediction.log'))

    model = TrendPredictor()
    split = Business.train_test_set(DataReader().businesses())
    X_train, X_test, y_train, y_test = split

    model.fit(X_train, y_train)
    model.save()

    log.info(model)
    mse = model.mean_squared_error(X_test, y_test)
    log.info('mean squared error: %s ' % mse)
    r2 = model.r2_score(X_test, y_test)
    log.info('r2 score: %s ' % r2)
def solution_config_generator():
    """Generate configurations for the first 'glycan' category and time it.

    Only the first category whose name starts with ``'glycan'`` is handed to
    ``generate_category_config``; the collected configurations are counted
    and the elapsed time is printed.
    """
    # An alternative model source is DataReader().read_network_model().
    model = BaseFVA.create_for()
    categories = DataReader().read_subsystem_categories()

    started_at = datetime.datetime.now()
    configurations = []

    for category, subsystems in categories.items():
        # A size filter (more than 9 and fewer than 13 subsystems) can be
        # used here instead of the name prefix.
        if not category.startswith('glycan'):
            continue
        print(category, len(subsystems))
        print(subsystems)
        generate_category_config(model, subsystems, configurations)
        break

    # NOTE(review): ``total`` and ``feasible`` are not defined in this
    # function - presumably module-level counters; confirm.
    print(total, feasible)

    finished_at = datetime.datetime.now()
    elapsed = finished_at - started_at
    print('the number of valid configurations:', len(configurations))
    print(elapsed)
def fva_range_with_basic_analysis_save():
    """Fit the naming/fold-change/FVA preprocessing on 'BC' and stream it out.

    Every measurement is rounded in place to three decimals before fitting.
    The fitted ``transform`` and the samples are then passed to
    ``DataWriter('fva_solution_with_basic_fold_change').write_json_stream``.
    """
    X, y = DataReader().read_data('BC')

    # Round the values of each sample in place; the keys stay untouched.
    for sample in X:
        for name in list(sample):
            sample[name] = round(sample[name], 3)

    steps = ['naming', 'basic-fold-change-scaler', 'fva']
    preproc = DynamicPreprocessing(steps).fit(X, y)
    print('model trained...')

    writer = DataWriter('fva_solution_with_basic_fold_change')
    writer.write_json_stream(preproc.transform, X)
def eliminate_best_k():
    """Write FVA results of the 'BC' data for a range of ``SelectNotKBest`` sizes.

    For ``k`` in steps of 10 up to the number of keys of the first sample,
    the data runs through naming/standardization, vectorization,
    ``SelectNotKBest(k)``, the inverse vectorizer and the ``'fva'`` step. Each
    result is written with
    ``DataWriter('bc_disease_analysis#k=<k>').write_json_dataset``.
    """
    (X, y) = DataReader().read_data('BC')
    feature_count = len(X[0].keys())

    for k in range(1, feature_count + 1, 10):
        vectorizer = DictVectorizer(sparse=False)
        not_k_best = SelectNotKBest(k=k)

        # Pipeline for comparing the model after eliminating some features.
        steps = [
            ('metabolic',
             DynamicPreprocessing(['naming', 'metabolic-standard'])),
            ('vect', vectorizer),
            ('selector', not_k_best),
            ('inv_vect', InverseDictVectorizer(vectorizer, not_k_best)),
            ('fva', DynamicPreprocessing(['fva'])),
        ]
        X_result = Pipeline(steps).fit_transform(X, y)

        writer = DataWriter('bc_disease_analysis#k=%s' % k)
        writer.write_json_dataset(X_result, y)
def setUpData(self):
    """Return the 'BC' dataset as read by ``DataReader.read_data``."""
    reader = DataReader()
    return reader.read_data('BC')
def setUpData(self):
    """Return the 'BC_regulization' dataset as read by ``DataReader.read_data``."""
    reader = DataReader()
    return reader.read_data('BC_regulization')
def __init__(self):
    """Initialize the parent class and load the network model into ``self.model``."""
    super().__init__()
    reader = DataReader()
    self.model = reader.read_network_model()