def bat_processing_csv(self):
    """Convert user-selected CSV files, four channel files at a time.

    The output directory is taken from the ``saveDirLineEdit`` widget and
    the input files from a file-open dialog.  Every consecutive run of four
    selected files is handed to ``read_data``; group ``i`` is written to
    ``<save_dir>/<i>.mat`` and/or ``<save_dir>/<i>.npz`` depending on which
    save options are checked.

    A warning box is shown and nothing is processed when no save format is
    checked, when the directory field is empty, or when fewer than four
    files were selected.  Cancelling the dialog does nothing.
    """
    widgets = self.ui
    save_dir = widgets.saveDirLineEdit.text()

    if not (widgets.isSavematRaido.isChecked()
            or widgets.isSavenpz.isChecked()):
        QMessageBox.warning(None, 'error', 'please chose save format',
                            QMessageBox.Yes)
        return

    if not save_dir:
        QMessageBox.warning(None, 'error', 'please input save dir',
                            QMessageBox.Yes)
        return

    file_names, _ = QFileDialog.getOpenFileNames(
        self, 'open files', './', 'data (*.csv)')
    if not file_names:
        return

    group_number = len(file_names) // 4
    if group_number <= 0:
        QMessageBox.warning(
            None, 'error',
            'please ensure there exist at least four channels',
            QMessageBox.Yes)
        return

    # Files left over after the last complete group of four are ignored.
    for ith_group in range(group_number):
        first = 4 * ith_group
        ch1, ch2, ch3, ch4 = file_names[first:first + 4]
        out_prefix = save_dir + '/' + str(ith_group)
        if widgets.isSavematRaido.isChecked():
            read_data(ch1, ch2, ch3, ch4, None, out_prefix + '.mat', False)
        if widgets.isSavenpz.isChecked():
            read_data(ch1, ch2, ch3, ch4, out_prefix + '.npz', None, True)
def main():
    """Refresh the data, render one page per tracked locality, send e-mails.

    Steps, in order: download the CSV, load localities and recipients from
    the database, build the locality objects, read the data, create the
    HTML for every tracked locality, mail each recipient the object for
    their locality, and finally send the admin e-mail.
    """
    csv.dowload_csv()

    localities = pd.get_locality_list_from_db()
    recipients = pd.get_recipient_list_from_db()
    pd.create_localities(localities)
    pd.read_data()

    for locality in pd.tracking_localities:
        html.create_html(locality, 7)

    for recipient in recipients:
        es.send_mail(pd.return_locality_obj(recipient.locality),
                     recipient.email)

    es.send_admin_email("*****@*****.**")
def main(_):
    """Prepare the data, configure FLAGS and train/evaluate a MemN2N model.

    The training CSV is read and parsed, vocabularies are built from the
    parsed data, and the data is split 80/20 into train and test portions
    before being converted with ``read_and_process_data``.  ``FLAGS`` is
    then filled in (padding index, vocabulary size, memory size and the
    pre-trained embeddings) and the model is built and run inside a
    TensorFlow session.

    The single positional argument is ignored.
    """
    train_file = 'data/data_1_train.csv'

    parsed_data = process_data.parse_data(process_data.read_data(train_file))
    source_word2idx, target_word2idx = create_vocab(parsed_data)

    train_split, test_split = process_data.split_data(parsed_data, 80, 20)
    train_data = process_data.read_and_process_data(
        train_split, source_word2idx, target_word2idx)
    test_data = process_data.read_and_process_data(
        test_split, source_word2idx, target_word2idx)

    FLAGS.pad_idx = source_word2idx['<pad>']
    FLAGS.nwords = len(source_word2idx)
    # Memory size is the larger of the values at index 4 of the two
    # processed datasets.
    FLAGS.mem_size = max(train_data[4], test_data[4])

    pp.pprint(flags.FLAGS.__flags)

    print('loading pre-trained word vectors...')
    FLAGS.pre_trained_context_wt = init_word_embeddings(source_word2idx)
    FLAGS.pre_trained_target_wt = init_word_embeddings(target_word2idx)

    with tf.Session() as sess:
        model = MemN2N(FLAGS, sess)
        model.build_model()
        model.run(train_data, test_data)
# add_observables(model): add one Observable per antibody to `model`.
#
# Data is loaded with process_data.read_data() and turned into an
# antibody-name -> agents map.  For each agent, the monomer with the agent's
# name is looked up in model.monomers; agents without a monomer are skipped.
# If the agent has modifications, only the first one (mods[0]) is used: the
# candidate site names are 'phospho', the residue, and (when a position is
# given) residue + position, and a pattern with that site set to 'p' is added
# for every candidate the monomer accepts.  Any exception raised while
# building such a pattern is swallowed, so unsupported site names are
# ignored silently.  Agents without modifications contribute a pattern of the
# bare monomer.
#
# The observable is only created when at least one pattern was collected.
# Its name is the antibody name, with '_obs' appended when model.monomers
# already has an entry under that name; names that fail the identifier regex
# get '-' replaced by '_' and, if still failing, a 'p' prefix.
#
# NOTE(review): this line ends with an opening ''' whose closing quotes are
# not visible here -- presumably the start of a disabled block; confirm.
def add_observables(model): data = process_data.read_data() ab_map = process_data.get_antibody_map(data) for ab_name, agents in ab_map.items(): patterns = [] for agent in agents: try: monomer = model.monomers[agent.name] except KeyError: continue if agent.mods: mc = agent.mods[0] site_names = ['phospho', mc.residue] if mc.position is not None: site_names.append(mc.residue + mc.position) for site_name in site_names: try: pattern = monomer(**{site_name: 'p'}) patterns.append(ComplexPattern([pattern], None)) except Exception: pass else: patterns.append(ComplexPattern([monomer()], None)) if patterns: if model.monomers.get(ab_name) is not None: obs_name = ab_name + '_obs' else: obs_name = ab_name if not re.match(r'[_a-z][_a-z0-9]*\Z', obs_name, re.IGNORECASE): obs_name = obs_name.replace('-', '_') if not re.match(r'[_a-z][_a-z0-9]*\Z', obs_name, re.IGNORECASE): obs_name = 'p' + obs_name o = Observable(obs_name, ReactionPattern(patterns)) model.add_component(o) '''
def main():
    """Fit an RBF-kernel SVC on the Numerai training data and save it.

    The training CSV is read, scaled and passed through ``get_c1_dummies``
    before being split into train and validation parts.  The fitted model
    returned by ``create_test_model`` is stored with
    ``process_data.save_model``.
    """
    training_csv = 'numerai_datasets/numerai_training_data.csv'
    model_path = "models/svc/svc_model_c_100.pkl"

    frame = process_data.get_c1_dummies(
        process_data.scale_data(process_data.read_data(training_csv)))
    x_train, y_train, x_val, y_val = process_data.split_data(frame)

    classifier = SVC(probability=True, C=100, kernel='rbf')
    svc_model = create_test_model(classifier, x_train, y_train, x_val, y_val)
    process_data.save_model(svc_model, model_path)
def main():
    """Reload the movie database from the source data.

    Prints ``1``, reads the data, connects to the database, deletes
    everything stored there and then repopulates it from the freshly read
    data.
    """
    print(1)

    # Load the source records before touching the database.
    records = read_data(file, cols)

    # Open the connection and wipe whatever an earlier run stored.
    database = connectdb()
    database.delete_all()

    # Populate the now-empty database.
    create_database(database, records)
def main():
    """Run the ANOVA on the rating groups, with a post hoc test if needed.

    Only the rating frame (second item returned by
    ``process_data.read_data``) is used.  ``check_test`` is run on all four
    series from ``get_data``; the ANOVA and the post hoc Tukey test use the
    first three.  The post hoc test runs only when the ANOVA p-value is
    below 0.05.
    """
    _, rating_df, _, _ = process_data.read_data()
    seaborn.set()

    (audience_average, critic_average,
     audience_percent, critic_percent) = get_data(rating_df)

    check_test(audience_average, critic_average,
               audience_percent, critic_percent)

    pvalue = do_anova(audience_average, critic_average, audience_percent)
    # Written as a negated "<" so that a NaN p-value also skips the post hoc.
    if not (pvalue < 0.05):
        return

    print(" \n ")
    print("Do post hoc Tukey test")
    do_post_hoc(audience_average, critic_average, audience_percent)
def main():
    """Compare audience and critic ratings with three analyses.

    Only the rating frame (second item returned by
    ``process_data.read_data``) is used.  A T-test, a U-test and a
    regression are run in that order on the two rating series, each
    preceded by a blank line and a banner.
    """
    _, rating_df, _, _ = process_data.read_data()
    audience_rating, critic_rating = get_rating(rating_df)
    seaborn.set()

    analyses = (
        ("----- T-test -----", t_test),
        ("----- U-test -----", u_test),
        ("----- Regression -----", regression),
    )
    for banner, analyse in analyses:
        print("\n")
        print(banner)
        analyse(audience_rating, critic_rating)
def main():
    """Fit a logistic regression on a stratified 80/20 split and report it.

    The model is trained on the plain (non-oversampled) training split;
    the oversampled variant is computed but only used by the disabled code
    noted below.
    """
    features, yfill = proc.features_yfill(proc.read_data())

    split_options = dict(test_size=0.20, random_state=42, stratify=yfill)
    X_train, X_test, y_train, y_test = train_test_split(
        features, yfill, **split_options)

    # Kept from the original flow; the result feeds only the disabled
    # experiment below.
    X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)

    # Disabled experiments:
    #   plot_roc(X_train, y_train, 'LogisticRegression',
    #            LogisticRegression(C=1e5,penalty='l2'))
    #   model_over = runLR(X_train_over, X_test, y_train_over, y_test)
    #   test_results(model_over, X_test, y_test)

    fitted = runLR(X_train.values, X_test, y_train.values, y_test)
    test_results(fitted, X_test, y_test)
def run(dec_thresh=-1, inc_thresh=1):
    """Build statements from the data and pickle the preassembled result.

    Args:
        dec_thresh: first element of the ``thresh`` pair given to
            ``make_stmts``.
        inc_thresh: second element of the ``thresh`` pair given to
            ``make_stmts``.

    Returns:
        The ``(stmts, values)`` tuple exactly as returned by ``make_stmts``.
        The preassembled statements are only written to ``data_stmts.pkl``
        (pickle protocol 2, together with ``values``); they are not
        returned.
    """
    data = pd.read_data(pd.data_file)
    ab_agents = pd.get_antibody_map(data)

    # Use None here instead to filter on the thresholds alone.
    drug_ab_combs = get_eval_drug_ab_combs(data)

    thresholds = [dec_thresh, inc_thresh]
    stmts, values = make_stmts(data, ab_agents,
                               drug_ab_combs=drug_ab_combs,
                               thresh=thresholds)

    # Preassembly removes duplicate statements before they are stored.
    pa_dict = preassemble_stmts(stmts)
    with open('data_stmts.pkl', 'wb') as f:
        pickle.dump((pa_dict, values), f, protocol=2)

    return stmts, values
# NOTE(review): this line starts inside a plotting routine whose header is not
# visible here.  The visible tail saves an accuracy figure and then plots and
# saves precision and recall against num_trees, with file names that embed
# graphid.
#
# The __main__ section reads the data with proc.read_data(), derives
# features/yfill, makes a stratified 80/20 split (random_state=1) and
# oversamples the training split (r=0.3).  The random forest is fitted on the
# oversampled training data (num_est=50, cls_w='balanced_subsample'), while
# set_threshold and print_threshold receive the original, non-oversampled
# training split.  feature_importance is called last.  The "bits" experiment
# is commented out.
plt.savefig('Accuracy_vs_numtrees_{}.png'.format(graphid)) plt.close() plt.figure() plt.plot(num_trees, precision) #plt.ylim((0.8, 1)) plt.savefig('precision_vs_numtrees_{}.png'.format(graphid)) plt.close() plt.figure() plt.plot(num_trees, recall) #plt.ylim((0.8, 1)) plt.savefig('recall_vs_numtrees_{}.png'.format(graphid)) plt.close() if __name__ == '__main__': data = proc.read_data() # bits, yfill = bits_yfill(data) # X_train, X_test, y_train, y_test = train_test_split(bits, yfill, test_size=0.20, random_state=42, stratify =yfill) # for num in range(10): # rffit = RandomForestClass(X_train, X_test, y_train, y_test) # feature_importance(bits, rffit) # plot_features(bits, rffit, 20, 'bits', num) features, yfill = proc.features_yfill(data) X_train, X_test, y_train, y_test = train_test_split(features, yfill, test_size=0.20, random_state=1, stratify =yfill) X_train_over, y_train_over = proc.oversample(X_train,y_train, r = 0.3) rffit, y_predict = randomforest(X_train_over, X_test, y_train_over, y_test, num_est=50, cls_w = 'balanced_subsample') precision, recall, median_recall_index, medianrecall_threshold = set_threshold(rffit, X_train, X_test, y_train, y_test) print_threshold(rffit, X_train, X_test, y_train, y_test, medianrecall_threshold) feature_importance(features, rffit)
# NOTE(review): this line starts inside a function whose header is not visible
# here.  The visible tail fills json_dict as drug -> ab -> path index -> list
# of statement uuids, resolving each (rule_name, sign) pair with
# _stmt_from_rule; the last element of every path is left out (path[:-1]).
#
# The __main__ section reads the data, collects gene names and the antibody
# map, and obtains data statements from make_stmts.run(dec_thresh=0.5,
# inc_thresh=1.5).  all_data_stmts is flattened through two levels of
# itertools.chain.from_iterable -- presumably because data_stmts is a
# dict of dicts of lists; confirm.  It then prints every drug with its
# antibodies.
if json_dict.get(drug) is None: json_dict[drug] = {} if json_dict[drug].get(ab) is None: json_dict[drug][ab] = {} for idx, path in enumerate(paths): path_stmts = [] for rule_name, sign in path[:-1]: stmt = _stmt_from_rule(model, rule_name, stmts) path_stmts.append(stmt.uuid) json_dict[drug][ab][idx] = path_stmts return json_dict if __name__ == '__main__': print("Processing data") data = process_data.read_data(process_data.data_file) data_genes = process_data.get_all_gene_names(data) ab_map = process_data.get_antibody_map(data) print('Loading data statements.') data_stmts, data_values = make_stmts.run(dec_thresh=0.5, inc_thresh=1.5) all_data_stmts = [values.values() for values in data_stmts.values()] all_data_stmts = itertools.chain.from_iterable(all_data_stmts) all_data_stmts = list(itertools.chain.from_iterable(all_data_stmts)) print('We will check the following drug-ab combinations:\n============') for drug, stmtd in data_stmts.items(): print(drug) for ab in stmtd.keys(): print('-'+ ab)
# NOTE(review): this line starts inside a function whose header is not visible
# here.  The visible tail pairs nine scores (SVC / LR / NB for all features,
# PCA features and feature selection) with their descriptions, rounds each
# score to 4 decimals and prints the pairs sorted from highest to lowest
# score (ties fall back to the description text, also descending).  It then
# calls sb.set() and show_distribution(fs_df).
#
# The __main__ section reads the four data frames and calls predict_profit
# with the wiki movie and rating frames.
score_list = [ SVC_score_all, LR_score_all, NB_score_all, \ SVC_score_pca, LR_score_pca, NB_score_pca, \ SVC_score_fs, LR_score_fs, NB_score_fs ] method_list = [ "The accuracy of model with all rating features by SVM Classifier", "The accuracy of model with all rating features by Logistic Regression", "The accuracy of model with all rating features by Naive bayes Classifier", "The accuracy of model with PCA transformed features by SVM Classifier", "The accuracy of model with PCA transformed features by Logistic Regression", "The accuracy of model with PCA transformed features by Naive Bayes Classifier", "The accuracy of model with Top-2 important features by SVM Classifier", "The accuracy of model with Top-2 important features by Logistic Regression", "The accuracy of model with Top-2 important features by Naive Bayes Classifier", ] for k, v in sorted(zip(map(lambda x: round(x, 4), score_list), method_list), reverse=True): print(v + ': ' + str(k)) sb.set() show_distribution(fs_df) if __name__ == "__main__": wiki_movie_df, rating_df, genres_df, wiki_genres_df = process_data.read_data() predict_profit(wiki_movie_df, rating_df) # print(wiki_movie_df)
# -*- coding: utf-8 -*-
import os
import numpy as np
from process_data import read_data
from protein_feature_signal import discretize
from sklearn.ensemble import RandomForestRegressor
from protein_feature_preparation_linear import ProteinFeaturePreparationLinear

# Load the signals from the parent of the current working directory and pair
# every one of them with the same protein sequence.
project_directory = os.path.dirname(os.getcwd())
file_data = read_data(project_directory)
protein_data = ['DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA'] * len(file_data)

print("********* Model: Random Forest ")

for window_size in range(2, 6):
    print("#############################")
    print("********* Using Window Size:", window_size)
    params = ProteinFeaturePreparationLinear(window_size)

    # Training set for this window size: features come from the sequence,
    # targets from the discretized signal.
    train_signal = []
    train_features = []
    for i, prot_seq in enumerate(protein_data):
        signal = file_data[i]
        features = params.get_feature(prot_seq)
        discrete_signal = discretize(signal, len(prot_seq), window_size)
        train_signal.extend(discrete_signal)
        train_features.extend(features)

    train_features = np.array(train_features)
def _preprocess(self):
    """Load the words, build the vocabulary and index the words.

    Reads from ``self.file_path`` and stores the results on the instance as
    ``dictionary``, ``invert_dict`` and ``index_words``.  The vocabulary is
    built with ``self.vocab_size``.
    """
    corpus = read_data(self.file_path)
    vocabulary = build_vocab(corpus, self.vocab_size)
    self.dictionary, self.invert_dict = vocabulary
    self.index_words = convert_words_to_index(corpus, self.dictionary)
# NOTE(review): this line starts inside a plotting routine whose header is not
# visible here.  The visible tail saves an accuracy figure and then plots and
# saves precision and recall against num_trees, with file names that embed
# graphid.
#
# The __main__ section reads the data with proc.read_data(), derives
# features/yfill and makes a stratified 80/20 split (random_state=1).  The
# oversampling and random-forest fitting steps are commented out in the
# visible text.
plt.savefig('Accuracy_vs_numtrees_{}.png'.format(graphid)) plt.close() plt.figure() plt.plot(num_trees, precision) #plt.ylim((0.8, 1)) plt.savefig('precision_vs_numtrees_{}.png'.format(graphid)) plt.close() plt.figure() plt.plot(num_trees, recall) #plt.ylim((0.8, 1)) plt.savefig('recall_vs_numtrees_{}.png'.format(graphid)) plt.close() if __name__ == '__main__': df = proc.read_data() #df = pd.read_csv(os.path.join(app.config['UPLOAD_FOLDER'], filename)) # use all features and yfill (no NaNs, filled with 0) features, yfill = proc.features_yfill(df) #train test split at 20% X_train, X_test, y_train, y_test = train_test_split(features, yfill, test_size=0.20, random_state=1, stratify=yfill) #Optional: oversampling of minority class for training purposes #X_train_over, y_train_over = proc.oversample(X_train,y_train, r = 0.3) #rffit, y_predict = rf.randomforest(X_train_over, X_test, y_train_over, y_test, num_est=50, cls_w = 'balanced_subsample') #fit the Random Forest classifier: would like to add in a grid search
def read_sources():
    """Return the statements from the TRIPS, Sparser and R3 sources combined.

    Each source is read with its own ``read_stmts`` function and the three
    results are concatenated in that order.
    """
    trips_stmts = process_trips.read_stmts(process_trips.base_folder)
    sparser_stmts = process_sparser.read_stmts(process_sparser.base_folder)
    r3_stmts = process_r3.read_stmts(process_r3.active_forms_file)
    return trips_stmts + sparser_stmts + r3_stmts


def get_prior_genes(fname):
    """Get the list of prior genes.

    Args:
        fname: path of a text file with one gene name per line.

    Returns:
        The gene names in file order, with surrounding whitespace removed.
        Blank lines are skipped, so an empty file gives an empty list
        rather than a list holding one empty string.
    """
    with open(fname, 'rt') as fh:
        stripped = (line.strip() for line in fh)
        return [gene for gene in stripped if gene]


if __name__ == '__main__':
    outf = 'output/'
    data = process_data.read_data(process_data.data_file)
    data_genes = process_data.get_all_gene_names(data)
    # Set to True to rebuild the statements from the prior and reading
    # pickles instead of loading the preassembled ones.
    reassemble = False
    if not reassemble:
        stmts = ac.load_statements(pjoin(outf, 'preassembled.pkl'))
        #stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
    else:
        #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.map_grounding(prior_stmts,
                                       save=pjoin(outf, 'gmapped_prior.pkl'))
        reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
        reading_stmts = ac.map_grounding(reading_stmts,
                                         save=pjoin(outf, 'gmapped_reading.pkl'))
        stmts = prior_stmts + reading_stmts