def MLP_test():
    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False, encode=True, normalize=True,
                        predict_missing=True, k_predict=3)
    [test, validate, train] = partition_data(X, y)

    print('fitting model... ')
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(hidden_layer_sizes=(1000, 2000, 1000, 100, 50), verbose=False)
    model.fit(train[0], train[1])

    valid_prob = model.predict_proba(validate[0])
    print(valid_prob[0:5])
    print(validate[1][0:5])

    from cross_entropy import cross_entropy
    print(valid_prob.shape, validate[1].shape)
    print('cross entropy:', cross_entropy(validate[1], valid_prob))

    from risk import empirical_risk
    print('mse:', empirical_risk('mse', valid_prob, validate[1]))

    from sklearn.metrics import accuracy_score
    print('accuracy', accuracy_score(validate[1], model.predict(validate[0])))
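# The snippet above imports a local `cross_entropy` helper that is not shown.
# A minimal sketch of what such a helper might compute, assuming integer or
# one-hot labels and a probability matrix from predict_proba; the project's
# actual implementation may differ.
import numpy as np

def cross_entropy(y_true, y_prob, eps=1e-12):
    """Mean cross-entropy between labels and predicted class probabilities."""
    y_prob = np.clip(np.asarray(y_prob), eps, 1.0)   # avoid log(0)
    y_true = np.asarray(y_true)
    if y_true.ndim == 1:                             # integer labels: pick the true-class column
        picked = y_prob[np.arange(len(y_true)), y_true]
    else:                                            # one-hot labels: keep the true-class probability
        picked = np.sum(y_true * y_prob, axis=1)
    return -np.mean(np.log(picked))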
def data_corr():
    # Get data according to analysis above
    train_path = pr.process_data('train.csv')
    train_data = pd.read_csv(train_path, index_col=0)

    sn.set(style="white")
    sn.set(font_scale=0.7)

    # Compute the correlation matrix
    corr = train_data.corr()

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sn.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True,
               linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    plt.show()
def save_data(filename):
    """
    Save the Kaggle JSON data file called 'filename' to 'processed_<filename>'
    after pre-processing it.
    """
    print("Creating processed datafile for '{}'".format(filename))
    data_path = os.path.join(PROJECT_ROOT, 'data', filename)
    proc_path = os.path.join(PROJECT_ROOT, 'data', "processed_" + filename)
    try:
        pre = os.path.basename(data_path)
        post = os.path.basename(proc_path)

        print("\nOpening '{}'...".format(pre))
        with open(data_path, 'r') as datafile:
            data = pd.read_json(datafile)

        data = process_data(data)

        print("Writing processed data to '{}'...".format(post))
        with open(proc_path, "w") as processed_file:
            data.to_json(processed_file)

        print("Finished processing '{}' into '{}'.".format(pre, post))
    except IOError as ioe:
        print("Failed to process {} to file.".format(data_path))
        print(ioe)
def learn(basepath, features_file, labels_file):
    # Load the data
    print('Loading data...')
    features_data = pd.read_msgpack(load_data(basepath, features_file)['data'])
    labels_data = pd.read_msgpack(load_data(basepath, labels_file)['data'])
    df = pd.concat([features_data, labels_data], axis=1)

    # Process features
    samples, labels = preprocess.process_data(df)

    # How many samples are we going to leave out for the test set?
    nb_test = int(len(labels) * 0.2)
    split = len(labels) - nb_test

    # Prepare training and test sets
    X_train = np.array(samples[:split])
    y_train = labels[:split]
    X_test = np.array(samples[split + 1:])
    y_test = labels[split + 1:]
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    # How many classes?
    num_classes = np.max(labels) + 1
    print(num_classes, 'classes')

    # Train Model
    train_and_save(X_train, X_test, y_train, y_test, num_classes, basepath)
def test_preprocess_ud(self):
    'Test loading in a ud data file.'
    infile = "testdata/test_wo_ud_input.txt"
    input_text: List[str] = preprocess.process_data(infile, "ud")
    expected = ("Ndax ku jëkk a jël dakkantal boobu, di Decce Fu Njogu "
                "FAAL, dañu naan damm na li Kajoor seqante woon ak Jolof, "
                "ndax Jolof moo nangu woon Kajoor.")
    self.assertEqual(input_text[0], expected)
def test_load_missing_file(self):
    'Test loading in a file that does not exist.'
    infile = ""
    try:
        input_text: List[str] = preprocess.process_data(infile, "ud")
    except Exception:
        input_text = None
    expected = None
    self.assertEqual(input_text, expected)
def test_end_to_end_with_file(self):
    'Test loading a file and normalizing it.'
    infile = "testdata/test_zu_lcc_input.tsv"
    input_text: List[str] = preprocess.process_data(infile, "lcc")
    normalized_text = norm.token_normalizer(input_text[0])
    expected = ("iningizimu afrika iyizwe elisezansi ezwenikazi "
                "lase-afrika yaziwa ngokusemthethweni ngokuthi "
                "iriphabhuliki yaseningizimu afrika")
    self.assertEqual(normalized_text, expected)
def initialize_data():
    global gh_scraper, generator, logger

    # scraping COVID-19 data
    gh_scraper.scrape()
    reports, countries = gh_scraper.cache, gh_scraper.valid_countries
    dates = process_dates(reports)
    data = process_data(reports, countries)
    generator = DataGenerator(dates, data, countries)
def get_inputs_labels_embedding_matrix(dataPath, emb_flag):
    reviews, labels = pp.process_data(dataPath)
    tokens = []
    indexes = []
    segments = []
    review_input = np.zeros(
        (len(reviews), config.MAX_SENTS, config.MAX_SENT_LENGTH), dtype='int32')

    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            new_indexed_tokens = []
            if j < config.MAX_SENTS:
                tokenized_text, indexed_tokens, segments_ids = bp.get_tokenized_text(sent)
                tokens += tokenized_text
                indexes += indexed_tokens
                segments += segments_ids
                if len(indexed_tokens) < config.MAX_SENT_LENGTH:
                    new_indexed_tokens += [0] * (config.MAX_SENT_LENGTH - len(indexed_tokens))
                    review_input[i][j] = indexed_tokens + new_indexed_tokens
                else:
                    review_input[i][j] = indexed_tokens[:config.MAX_SENT_LENGTH]

    if emb_flag == 1:
        del reviews
        indexes = torch.tensor(indexes)
        segments = torch.tensor(segments)
        bert_dataset = TensorDataset(indexes, segments)
        batch_size = 512
        train_dataloader = DataLoader(bert_dataset, batch_size=batch_size, drop_last=True)

        emb_matrix = torch.tensor(
            np.random.random((len(tokenizer.vocab) + 1, config.embedding_dim))).to(device)
        for batch in tqdm(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_input_ids = b_input_ids.view((1, len(b_input_ids))).to(device)
            b_input_mask = b_input_mask.view((1, len(b_input_mask))).to(device)
            emb_matrix = bp.get_embeddings(
                bp.get_encoded_layers(b_input_ids, b_input_mask),
                emb_matrix, b_input_ids).to(device)

        emb_matrix = emb_matrix.cpu()
        emb_matrix = emb_matrix.numpy()[:, :]
        np.save('embeddings.npy', emb_matrix)
    else:
        emb_matrix = np.load("./embeddings.npy")

    return review_input, labels, emb_matrix
def test_preprocess_oscar(self):
    'Test loading in an oscar data file.'
    infile = "testdata/test_af_oscar_input.txt"
    input_text: List[str] = preprocess.process_data(infile, "oscar")
    expected = ("Nadat dit duidelik geword het dat die Regering die "
                "aangeleentheid nie verder sou voer nie, het die "
                "Volksraad die ANC-regering se miskenning van "
                "internasionaal-aanvaarde regte en verpligtinge en die "
                "vergrype teen ons volk, onder die aandag van die "
                "internasionale gemeenskap gebring.")
    self.assertEqual(input_text[0], expected)
def test_preprocess_lcc(self):
    'Test loading in an lcc data file.'
    infile = "testdata/test_so_lcc_input.txt"
    input_text: List[str] = preprocess.process_data(infile, "lcc")
    expected = ("Dhamaha iyo madhamaha التام والناقص Falalka "
                "madhamaha waxay ku jirtaa weerta magaca ah iyadoo ka "
                "dhigaysa mid waqti cayiman dhacay ama qaab cayiman u "
                "dhacay, wa xayna u dhexeysaa falalka dhamaha iyo qodobada "
                """"xarfaha macnayaalka" ((أحرف المعاني)).""")
    self.assertEqual(input_text[0], expected)
def analyze(basepath, filename):
    global LOOKBACK, MAX_MIDPOINT_DELTA

    # Load the data & model
    df = load_data(basepath, filename)
    model = load_model(basepath)

    # Process data
    samples, labels = preprocess.process_data(df)

    # Test model and print results
    print("Running analysis...")
    predictions = model.predict_classes(samples, batch_size=32)
    print_results(labels, predictions)
def main():
    # read data file
    raw_df = pd.read_csv("filtered_sentences.tsv", sep="\t")

    # preprocess
    data = preprocess.process_data(raw_df)
    raw_sentences = [x["SENTENCE"] for x in data.values()]
    processed_sentences = [x["PROCESSED"] for x in data.values()]
    labels = [x["LABEL"] for x in data.values()]

    # split train and test datasets
    test_split = 0.3
    num_test_items = int(math.ceil(test_split * len(processed_sentences)))
    train_sentences = processed_sentences[:-num_test_items]
    train_labels = labels[:-num_test_items]
    test_sentences = processed_sentences[-num_test_items:]
    test_labels = labels[-num_test_items:]

    # get tfidf features
    Xtrn, Xtst, Ytrn, Ytst = get_tfidf_features(train_sentences, test_sentences,
                                                train_labels, test_labels)
def main():
    all_res_genes = []
    genes, data = process_data(in_file)

    for i in range(len(data)):
        res_idx = get_res_genes(data[i])
        # map the resistant-gene indices back to their gene names
        res_genes = [genes[idx] for idx in res_idx]
        print("resistant genes found: {}".format(len(res_genes)))
        all_res_genes.append(res_genes)

    # keep only the genes that appear as resistant in every sample
    res_gene_list = all_res_genes[0]
    for i in range(len(all_res_genes)):
        res_gene_list = [x for x in res_gene_list if x in all_res_genes[i]]

    print(res_gene_list)
    print(len(res_gene_list))
def cross_validate():
    # single-element sizes need a trailing comma to stay tuples
    net_hidden_layers = [
        (100,),
        (1000,),
        (100, 100),
        (1000, 1000),
        (1000, 100),
        (1000, 100, 100),
        (1000, 1000, 100),
        (1000, 1000, 100, 100),
        (1000, 2000, 100, 500, 100),
        (2000, 1000, 500, 100, 50),
    ]
    models = [FFNN(h) for h in net_hidden_layers]

    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False, encode=True, normalize=True,
                        predict_missing=True, k_predict=3)

    from cross_validation import cross_validation
    r = cross_validation(X, y, models)
    print(r)

    i = np.argmin(r)
    print('best model...', net_hidden_layers[i])

    model = FFNN(net_hidden_layers[i])
    partitioned_data = partition_data(X, y, partitions=[0.2, 0.8])
    train = partitioned_data[1]
    valid = partitioned_data[0]
    model.fit(train[0], train[1])
    p = model.predict(valid[0])

    from evaluate import evaluate
    print(valid[1].shape, p.shape)
    evaluate(valid[1], p)
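# The `cross_validation` helper imported above is not shown. A minimal k-fold
# sketch of what it might return (one score per candidate model, lower is
# better, consistent with the np.argmin call above); the real helper's loss
# function and fold count are assumptions here.
import numpy as np

def cross_validation(X, y, models, k=5):
    """Return the mean validation error of each model across k folds."""
    X, y = np.asarray(X), np.asarray(y)
    folds = np.array_split(np.arange(len(X)), k)
    scores = []
    for model in models:
        fold_errors = []
        for i in range(k):
            valid_idx = folds[i]
            train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
            model.fit(X[train_idx], y[train_idx])
            preds = model.predict(X[valid_idx])
            fold_errors.append(np.mean(preds != y[valid_idx]))  # misclassification rate
        scores.append(np.mean(fold_errors))
    return scores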
def run(subject, steps):
    # ===========================PROCESSING START===============================
    processing_start = time.time()

    X_Data, Y_Data = process_data(subject, steps)

    enc = preprocessing.OneHotEncoder(categories='auto')
    enc.fit(Y_Data)
    Y_Data = enc.transform(Y_Data).toarray()

    X_train, X_test, Y_train, Y_test = train_test_split(X_Data, Y_Data, test_size=0.25)

    processing_end = time.time()
    print("Preprocessed data in: %.2f sec\n" % (processing_end - processing_start))

    # ===========================TRAINING START===============================
    # train_start = time.time()

    model = train(X_train, Y_train)
    # save_model(model, "model_roi1_6steps.json")
    # model = load_model("model_roi1_5steps.json")

    f1, acc = evaluate(model, X_test, Y_test)

    # train_end = time.time()
    # print("Train and test in: %.2f sec" % (train_end - train_start))
    # ===========================TRAINING END===============================

    # result = {
    #     "processing_start": processing_start,
    #     "processing_end": processing_end,
    #     "train_start": train_start,
    #     "train_end": train_end
    # }
    #
    # with open("timestamps.json", "w") as outfile:
    #     json.dump(result, outfile)

    return subject, steps, f1, acc
pat_reprs = []
pat_reprs_dict = {}
pat_dict = {}
count = 0
####################################################################
for file_l in range(len(file_lis)):
    data1, data2, data3, label_train, train_len, test_len, \
        elmo_model, categ, key = ld.load_mimic(file_lis[file_l], elmo_fname)

    data_l, data_r, data_z, data_c, embedding_matrix, idf_sent_1 = prs.process_data(
        data1, data2, data3, categ, elmo_model, dimx=dimx, dimy=dimy,
        vocab_size=vocab_size, embedding_dim=embedding_dim)

    (X_train_l, X_test_l, X_dev_l, X_train_r, X_test_r, X_dev_r,
     X_train_z, X_test_z, X_dev_z, X_train_c, X_test_c, X_dev_c,
     idf_train_l, idf_test_l, idf_dev_l) = ld.prepare_train_tests(
        data_l, data_r, data_z, data_c, train_len, test_len, idf_sent_1)

    np_dist = np.load(ssm_np[file_l])
    np_dist = np_dist[0]
    np_dist = np_dist[0:63, 0:63]
    ssm_np_train = X_train_r.shape[0] * [np_dist]
    ssm_np_train = np.asarray(ssm_np_train)
    np_shape = np_dist.shape[0]
def test():
    genes, data = process_data(in_file)
    print(genes)
    print(data)
def main(argv):
    """Normalizes text by all steps in the text normalizer.

    If given an input string and a language flag, will normalize that string
    using the language's config. If given the language flag and data source
    flag, will normalize the file listed for that data source in the
    language's config file, saving the output to a new tsv file.
    """
    try:
        language = importlib.import_module("config." + FLAGS.language)
    except:
        raise app.UsageError("Needs a value for the language flag.")

    norm = normalizer_lib.NormalizerLib(FLAGS.language)

    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    if FLAGS.string_to_normalize is not None:
        input_text: str = FLAGS.string_to_normalize
        print("TOKEN_BASED:\t" + norm.token_normalizer(FLAGS.string_to_normalize))
        print("SENTENCE_BASED:\t" + norm.sentence_normalizer(FLAGS.string_to_normalize))
    else:
        data_source: str = FLAGS.data_source
        if data_source == "ud":
            infile = language.UD
        elif data_source == "um":
            infile = language.UM
        elif data_source == "ac":
            infile = language.AC
        elif data_source == "oscar":
            infile = language.OSCAR
        elif data_source == "lcc":
            infile = language.LCC

        try:
            input_text: List[str] = preprocess.process_data(infile, FLAGS.data_source)
        except Exception:
            print(f"No data file from '{data_source}' for '{FLAGS.language}'.")
            return

        experiment_dir: str = "./output/" + FLAGS.experiment
        if not os.path.exists(experiment_dir):
            os.makedirs(experiment_dir)

        condition: str = ("language=" + FLAGS.language + "_" +
                          "datasource=" + data_source + "_" +
                          "passvalid=" + FLAGS.pass_valid)
        outfile_human_readable: str = ("./output/" + FLAGS.experiment + "/" +
                                       condition + "_" + "humanreadable.tsv")
        outfile_unnormalized: str = ("./output/" + FLAGS.experiment + "/" +
                                     condition + "_" + "unnormalized.p")
        outfile_normalized: str = ("./output/" + FLAGS.experiment + "/" +
                                   condition + "_" + "normalized.p")

        unnormalized_data_for_lm = []
        normalized_data_for_lm = []

        human_readable_output = open(outfile_human_readable, "w")
        human_readable_output.write("SENTENCE_ID\tUNNORMALIZED_TEXT\tNORMALIZED_TEXT\n")

        i = 0
        for line in tqdm(input_text):
            sentence_id: str = str(i)
            sentence_text: str = line.strip()
            if FLAGS.pass_valid == "token":
                normalized_text: str = norm.token_normalizer(sentence_text)
            elif FLAGS.pass_valid == "sentence":
                normalized_text: str = norm.sentence_normalizer(sentence_text)
            newline = (sentence_id + "\t" + sentence_text + "\t" +
                       normalized_text + "\n")
            human_readable_output.write(newline)
            unnormalized_data_for_lm.append(sentence_text.split(" "))
            normalized_data_for_lm.append(normalized_text.split(" "))
            i += 1

            # files pickled here after each line so there's data in case the
            # process ends before normalizing the entire data file
            pickle.dump(unnormalized_data_for_lm, open(outfile_unnormalized, "wb"))
            pickle.dump(normalized_data_for_lm, open(outfile_normalized, "wb"))
import numpy as np
import preprocess as pp
from keras.models import model_from_json

question1, question2 = pp.extract_data("quora-question-pairs/test.csv", 'test')
question1_word_sequences, question2_word_sequences, word_index = pp.tokenize(
    question1, question2)
embeddings_index = pp.get_embeddings("glove.840B.300d/glove.840B.300d.txt")
nb_words, word_embedding_matrix = pp.get_embedding_matrix(word_index, embeddings_index)
q1_data, q2_data, word_embedding_matrix, nb_words = pp.process_data(
    question1_word_sequences, question2_word_sequences, word_embedding_matrix,
    nb_words, 'test')

X_train = np.stack((q1_data, q2_data), axis=1)
Q1_train = X_train[:, 0]
Q2_train = X_train[:, 1]

json_file = open('best_weights/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
model.load_weights("best_weights/weights.h5")
print("Loaded model from disk")

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
score = model.predict([Q1_train, Q2_train])
print(score)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from model import modelfit
from preprocess import process_data
from model_report import model_report
from save_results import save_results
import load_data

gamma1_filename = load_data.gamma1_filename
neutron1_filename = load_data.neutron1_filename

(X_train, X_test, Y_train, Y_test) = process_data(gamma1_filename, neutron1_filename)

# Create model here given constraints in the problem
model = Sequential()
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.30))
model.add(Dense(600))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(1))
#model.add(Activation('softmax'))
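# The snippet above stops before the model is compiled or trained. A minimal
# sketch of how it might continue with plain Keras calls; the sigmoid output,
# binary_crossentropy loss, and epoch/batch settings are assumptions here and
# are not taken from the project's `modelfit` helper.
model.add(Activation('sigmoid'))  # assumed binary gamma-vs-neutron output
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, validation_split=0.1, epochs=20, batch_size=256)
test_preds = (model.predict(X_test).ravel() > 0.5).astype(int)
print("test accuracy:", accuracy_score(Y_test, test_preds))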
def train(args):
    KB_file = 'data/2H-kb.txt'
    data_file = 'data/2H.txt'

    start = time.time()
    Q, A, P, S, Triples, args.query_size, word2id, ent2id, rel2id = process_data(
        KB_file, data_file)
    args.path_size = len(P[0])
    args.nhop = args.path_size // 2
    print("read data cost %f seconds" % (time.time() - start))

    args.nwords = len(word2id)
    args.nrels = len(rel2id)
    args.nents = len(ent2id)

    trainQ, testQ, trainA, testA, trainP, testP, trainS, testS = train_test_split(
        Q, A, P, S, test_size=.1, random_state=123)
    trainQ, validQ, trainA, validA, trainP, validP, trainS, validS = train_test_split(
        trainQ, trainA, trainP, trainS, test_size=.11, random_state=0)

    n_train = trainQ.shape[0]
    n_test = testQ.shape[0]
    n_val = validQ.shape[0]
    print(trainQ.shape, trainA.shape, trainP.shape, trainS.shape)

    # Find the position (index) of the answer in each answer vector
    train_labels = np.argmax(trainA, axis=1)
    test_labels = np.argmax(testA, axis=1)
    valid_labels = np.argmax(validA, axis=1)

    batches = list(
        zip(range(0, n_train - args.batch_size, args.batch_size),
            range(args.batch_size, n_train, args.batch_size)))
    pre_batches = list(
        zip(range(0, Triples.shape[0] - args.batch_size, args.batch_size),
            range(args.batch_size, Triples.shape[0], args.batch_size)))

    model = IRN(args)
    optimizer = optim.Adam(model.parameters(), args.init_lr, weight_decay=1e-5)

    pre_val_preds = model.predict(Triples, validQ, validP)
    pre_test_preds = model.predict(Triples, testQ, testP)

    for t in range(args.nepoch):
        np.random.shuffle(batches)
        for i in range(args.inner_nepoch):
            np.random.shuffle(pre_batches)
            pre_total_cost = 0.0
            for s, e in pre_batches:
                pretrain_loss = model.batch_pretrain(
                    Triples[s:e], trainQ[0:args.batch_size],
                    trainA[0:args.batch_size],
                    np.argmax(trainA[0:args.batch_size], axis=1),
                    trainP[0:args.batch_size])
                optimizer.zero_grad()
                pretrain_loss.backward()
                optimizer.step()

        total_cost = 0.0
        for s, e in batches:
            total_cost = model(Triples[s:e], trainQ[s:e], trainA[s:e],
                               np.argmax(trainA[s:e], axis=1), trainP[s:e])
            optimizer.zero_grad()
            total_cost.backward()
            optimizer.step()

        if t % 1 == 0:
            train_preds = model.predict(Triples, trainQ, trainP)
            train_acc = MultiAcc(trainP, train_preds, model._path_size)
            train_true_acc = InSet(trainP, trainS, train_preds)

            val_preds = model.predict(Triples, validQ, validP)
            val_acc = MultiAcc(validP, val_preds, model._path_size)
            val_true_acc = InSet(validP, validS, val_preds)

            print('-----------------------')
            print('Epoch', t)
            print('Train Accuracy:', train_true_acc)
            print('Validation Accuracy:', val_true_acc)
            print('-----------------------')
from train_test import train, evaluate, save_model, load_model

args = sys.argv
if len(args) >= 3:
    roi = int(args[1])
    steps = int(args[2])
else:
    roi = 1
    steps = 6

print("\nRun pipeline with ROI #%d and %d time steps.\n" % (roi, steps))

# ===========================PROCESSING START===============================
processing_start = time.time()

X_Data, Y_Data = process_data(roi, steps)

enc = preprocessing.OneHotEncoder(categories='auto')
enc.fit(Y_Data)
Y_Data = enc.transform(Y_Data).toarray()

X_train, X_test, Y_train, Y_test = train_test_split(X_Data, Y_Data, test_size=0.2)

processing_end = time.time()
print("Preprocessed data in: %.2f sec\n" % (processing_end - processing_start))

# ===========================TRAINING START===============================
train_start = time.time()
def main(args):
    """Main function."""
    # Basic settings
    best_ci = 0
    best_epoch = 0
    best_train_loss = 10000
    rounds = args.rounds

    # Set CUDA device
    cuda_name = "cuda:" + str(args.cudanum)
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")

    # Modeling...
    modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model]
    model_st = modeling.__name__
    print(model_st)
    model = modeling().to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  # Adam

    # Load data
    train_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv")
    val_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv")
    test_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv")

    train_set = process_data(train_data)
    val_set = process_data(val_data)
    test_set = process_data(test_data)

    train_generator = TestbedDataset(root='dataset', dataset='BindingDB_train',
                                     groups=train_set[0], xd=train_set[1], xt=train_set[2],
                                     y=train_set[3], smile_graph=train_set[4])
    val_generator = TestbedDataset(root='dataset', dataset='BindingDB_val',
                                   groups=val_set[0], xd=val_set[1], xt=val_set[2],
                                   y=val_set[3], smile_graph=val_set[4])
    test_generator = TestbedDataset(root='dataset', dataset='BindingDB_test',
                                    groups=test_set[0], xd=test_set[1], xt=test_set[2],
                                    y=test_set[3], smile_graph=test_set[4])

    # Make mini-batch processing
    train_loader = DataLoader(train_generator, batch_size=args.batchsize, shuffle=True)
    val_loader = DataLoader(val_generator, batch_size=args.batchsize, shuffle=False)
    test_loader = DataLoader(test_generator, batch_size=args.batchsize, shuffle=False)

    # Training...
    print("Training.....")
    for epoch in range(args.epochs):
        print("===============Go for Training===============")
        train_loss = train(model, device, train_loader, optimizer, epoch + 1)

        # Validation...
        G, P, group_li = predicting(model, device, val_loader)
        val_ci = ci(G, P)

        # Get length of validation set
        result = {}
        for gl in group_li:
            if result.get(gl) is None:
                result[gl] = 1
            else:
                result[gl] += 1
        lens = []
        lens.extend(result.values())

        # Skip len=1 data
        k = 0
        new_G, new_P, new_lens = [], [], []
        for ll in lens:
            if ll == 1:
                k += 1
            else:
                new_G.extend(G[k:k + ll])
                new_P.extend(P[k:k + ll])
                new_lens.append(ll)
                k += ll
        new_G, new_P = np.array(new_G), np.array(new_P)

        # Calculate Weighted CI, Average CI of validation set
        s = 0
        w_ci, a_ci = [], []
        for l in new_lens:
            try:
                w_ci.append(l * ci(new_G[s:s + l], new_P[s:s + l]))
                a_ci.append(ci(new_G[s:s + l], new_P[s:s + l]))
            except:
                pass
            s += l
        weight_ci, average_ci = np.sum(w_ci) / np.sum(new_lens), np.mean(a_ci)

        print("===============Go for Validation===============")
        print("Weighted CI:", weight_ci)
        print("Average CI:", average_ci)
        print("Overall CI:", val_ci)

        files = open("bestResult/GraphDTA_" + model_st + "_BindingDB_ki_result" +
                     str(args.rounds) + ".txt", 'a')
        files.write("val_averageCI: " + str(average_ci) + ", val_weightedCI: " +
                    str(weight_ci) + ", val_overallCI: " + str(val_ci) +
                    ", train_loss: " + str(train_loss) + '\n')

        model_name = "bestModel/GraphDTA_" + model_st + "_BindingDB_ki_" + str(rounds) + ".model"

        # Save the best result
        if average_ci > best_ci:
            best_ci = average_ci
            best_epoch = epoch
            best_train_loss = train_loss

            # Save best model
            print("Saving the best model...")
            torch.save(model.state_dict(), model_name)

            print("===============Go for Testing===============")
            # Load the model
            model.load_state_dict(torch.load(model_name))

            # Testing...
            test_G, test_P, test_group_li = predicting(model, device, test_loader)
            test_CI, test_MSE = ci(test_G, test_P), mse(test_G, test_P)

            # Get length of testing set
            t_result = {}
            for t_gl in test_group_li:
                if t_result.get(t_gl) is None:
                    t_result[t_gl] = 1
                else:
                    t_result[t_gl] += 1
            t_lens = []
            t_lens.extend(t_result.values())

            # Skip len=1 data
            t_k = 0
            t_new_G, t_new_P, t_new_lens = [], [], []
            for t_ll in t_lens:
                if t_ll == 1:
                    t_k += 1
                else:
                    t_new_G.extend(test_G[t_k:t_k + t_ll])
                    t_new_P.extend(test_P[t_k:t_k + t_ll])
                    t_new_lens.append(t_ll)
                    t_k += t_ll
            t_new_G, t_new_P = np.array(t_new_G), np.array(t_new_P)

            # Calculate Weighted CI, Average CI of testing set
            t_s = 0
            t_w_ci, t_a_ci = [], []
            for t_l in t_new_lens:
                try:
                    t_w_ci.append(t_l * ci(t_new_G[t_s:t_s + t_l], t_new_P[t_s:t_s + t_l]))
                    t_a_ci.append(ci(t_new_G[t_s:t_s + t_l], t_new_P[t_s:t_s + t_l]))
                except:
                    pass
                t_s += t_l
            test_weight_ci, test_average_ci = (np.sum(t_w_ci) / np.sum(t_new_lens),
                                               np.mean(t_a_ci))

            # Save the testing result
            files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" +
                        str(test_average_ci) + ", test_weightedCI:" + str(test_weight_ci) +
                        ", test_overallCI:" + str(test_CI) + "\n")
            files.write("best_epoch:" + str(best_epoch + 1) + ", best_train_loss:" +
                        str(best_train_loss) + "\n")
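# The `ci` function used above comes from the project's utilities and is not
# shown. The concordance index commonly reported in drug-target affinity work
# is the fraction of correctly ordered prediction pairs; a minimal O(n^2)
# sketch follows (ties on predictions count as half), which may differ in
# detail from the project's implementation.
import numpy as np

def concordance_index(y_true, y_pred):
    """Fraction of pairs with distinct true values whose predictions are ordered correctly."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    concordant, comparable = 0.0, 0
    for i in range(len(y_true)):
        for j in range(i + 1, len(y_true)):
            if y_true[i] == y_true[j]:
                continue                      # only pairs with different true values are comparable
            comparable += 1
            diff_true = y_true[i] - y_true[j]
            diff_pred = y_pred[i] - y_pred[j]
            if diff_true * diff_pred > 0:
                concordant += 1.0             # predictions ordered the same way as the true values
            elif diff_pred == 0:
                concordant += 0.5             # tied predictions count as half a concordant pair
    return concordant / comparable if comparable else 0.0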
    driver.quit()

def _rename(self, country):
    """
    Rename countries to lower case, and perform some common renamings.
    """
    if country == "United States":
        country = "US"
    if country == "The Mainland of China":
        country = "China"
    if country == "Korea (Rep.)":
        country = "Korea, South"
    return country.lower()

if __name__ == "__main__":
    # TESTING ONLY
    scraper = GithubScraper(None)
    scraper.download_reports()
    reports = scraper.cache
    valid_countries = scraper.valid_countries
    data = process_data(reports, valid_countries)
    dates = process_dates(reports)
    confirmed = data[CONFIRMED]
    deaths = data[DEATHS]
    generator = DataGenerator(dates, data, valid_countries)
    top_movers = generator.top_movers()
    countries, top10 = generator.top_contributors()
def test_preprocess_ac(self):
    'Test loading in an ac data file.'
    infile = "testdata/test_mg_ac_input.txt"
    input_text: List[str] = preprocess.process_data(infile, "ac")
    expected = "amin'ny habakabaka"
    self.assertEqual(input_text[0], expected)
def test_preprocess_um(self):
    'Test loading in a um data file.'
    infile = "testdata/test_zu_um_input.txt"
    input_text: List[str] = preprocess.process_data(infile, "um")
    expected = "ubuntu"
    self.assertEqual(input_text[0], expected)
# get rid of first line
file_train_data = file_train_data[1:]
file_test_data = file_test_data[1:]

# must do
preprocess.clean_data_isalpha(file_train_data)
preprocess.clean_data_isalpha(file_test_data)

######### Balance the data set #########
#tag, info_dic_train = preprocess.clean_data_helper_get_info(file_train_data)
#file_train_data = preprocess.clean_data_balance(file_train_data, tag)
######### Balance the data set #########

# store the data into separate arrays
article_number, text_data, article_topic = preprocess.process_data(file_train_data)
test_number, test_data, test_topic = preprocess.process_data(file_test_data)

(unique, counts) = np.unique(article_topic, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The frequency of different article types (training set)")
print(frequencies)

(unique, counts) = np.unique(test_topic, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The frequency of different article types (testing set)")
print(frequencies)
print()

# start to preprocess
train_data = np.array(text_data)
def get_hero_race(history):
    clean_text = process_data([history])
    vector = vectorizer.transform(clean_text)
    result = model.predict(vector)
    print(result)
    return result[0].strip()
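# `vectorizer` and `model` above are module-level globals that are not shown.
# A minimal sketch of one way they could be produced; TfidfVectorizer,
# LogisticRegression, and the train_texts/train_labels names are assumptions,
# not the project's actual training pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

def build_classifier(train_texts, train_labels):
    """Fit a bag-of-words classifier compatible with get_hero_race()."""
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(train_texts)   # sparse document-term matrix
    model = LogisticRegression(max_iter=1000)
    model.fit(X, train_labels)
    return vectorizer, model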
    model.fit(train[0], train[1])
    p = model.predict(valid[0])

    from evaluate import evaluate
    print(valid[1].shape, p.shape)
    evaluate(valid[1], p)


if __name__ == '__main__':
    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False, encode=True, normalize=True,
                        predict_missing=True, k_predict=3)
    partitioned_data = partition_data(X, y, partitions=[0.2, 0.8])
    train = partitioned_data[1]
    valid = partitioned_data[0]

    #model = FFNN((1000, 100, 100), num_iterations=500)
    model = FFNN((1000, 10000, 1000, 100, 50), num_iterations=500)
    model.fit(train[0], train[1])

    from evaluate import evaluate
    evaluate(train[1], model.predict(train[0]))