def get_oof(clfs, raw_texts, raw_labels, test_data, word2index, attr_dict):
    # Collect out-of-fold predictions from each fold's saved checkpoint and
    # average the per-fold test-set logits for stacking.
    NFOLDS = len(clfs)
    n_train = len(raw_texts)
    n_test = len(test_data.sentences)
    class_num = 10
    oof_train = np.zeros((n_train, class_num))
    oof_train_y = np.zeros((n_train, class_num))
    oof_test = np.zeros((n_test, class_num))
    oof_test_skf = np.zeros((NFOLDS, n_test, class_num))

    kf = 0
    for (train_index, test_index), checkpoint in zip(kfold_split(n_train, NFOLDS), clfs):
        print(checkpoint)
        clf = torch.load(checkpoint)
        kf += 1
        print("FOLD:", kf)
        print("TRAIN:", str(len(train_index)), "TEST:", str(len(test_index)))
        # train_index, test_index = train_index.tolist(), test_index.tolist()
        dev_texts, dev_labels = [raw_texts[i] for i in test_index], [raw_labels[i] for i in test_index]
        dev_data = Data((dev_texts, dev_labels), word2index, attr_dict, args)
        if args.use_elmo != 0:
            dev_elmo = load_elmo(dev_texts)
            dev_data.add_feature(dev_elmo)

        with torch.no_grad():
            dev_predict, oof_dev = train.predict_with_logit(clf, dev_data, args)
        pred_acc_p = score(dev_predict, dev_data.labels)
        print("[p:%.4f, r:%.4f, f:%.4f] acc:%.4f" %
              (pred_acc_p[0], pred_acc_p[1], pred_acc_p[2], pred_acc_p[3]))
        # label_prf = label_analysis(dev_predict, dev_data.labels)
        # for i in range(len(label_prf)):
        #     print("%s : [%.4f, %.4f, %.4f] %.4f" %
        #           (list(attr_dict.keys())[i], label_prf[i][0], label_prf[i][1], label_prf[i][2], label_prf[i][3]))

        oof_train[test_index] = oof_dev
        dev_y = [l[0].detach().numpy() for l in dev_data.labels]
        oof_train_y[test_index] = dev_y
        _, oof_test_skf[kf - 1, :, :] = train.predict_with_logit(clf, test_data, args)

    oof_test[:] = oof_test_skf.mean(axis=0)

    dir = os.path.dirname(clfs[0])
    if not os.path.exists(os.path.join(dir, 'npy')):
        os.mkdir(os.path.join(dir, 'npy'))
    print(dir)
    np.save(os.path.join(dir, 'npy', "oof_train"), oof_train)
    np.save(os.path.join(dir, 'npy', "oof_train_y"), oof_train_y)
    np.save(os.path.join(dir, 'npy', "oof_test"), oof_test)
    return oof_train, oof_train_y, oof_test
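# get_oof persists its three arrays under <checkpoint_dir>/npy, and stacking()
# below reloads them through a load_oof helper that is not shown in this
# excerpt. A minimal sketch of that helper, assuming it simply mirrors the
# np.save calls above (it reuses the module's existing os / np imports):
def load_oof(dir):
    npy_dir = os.path.join(dir, 'npy')
    oof_train = np.load(os.path.join(npy_dir, "oof_train.npy"))
    oof_train_y = np.load(os.path.join(npy_dir, "oof_train_y.npy"))
    oof_test = np.load(os.path.join(npy_dir, "oof_test.npy"))
    return oof_train, oof_train_y, oof_test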
def test():
    model = AttributeClassifier()
    check_point = "checkpoint_AttA3_0.8810.pt"
    model.load_model(check_point)

    test_file = "data/attribute_test.txt"
    test_texts = load_test_data(test_file)
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    W, word2index = load_w2v(f_w2v)
    f_dict = "../dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict)

    test_data = Data((test_texts, None), word2index)
    test_predict = train.predict(model.classifier, test_data, args)
    print(test_predict)

    fw = codecs.open("test_predict.txt", 'w', encoding='utf-8')
    for p in test_predict:
        attributes = []
        for i, l in enumerate(p):
            if l != 0:
                attributes.append(attr_list[i])
        fw.write('|'.join(attributes) + '\n')
    fw.close()
def predict(self, rnn, test_raw_data, word2index, args):
    test_texts = test_raw_data[0]
    test_t = test_raw_data[1]
    test_ow = test_raw_data[2]
    test_input_s = [self.to_tensor(s, word2index) for s in test_texts]
    # print(train_input_s[0])

    test_elmo = []
    if args.use_elmo:
        import h5py
        elmo_dict = h5py.File('data/%s/elmo_layers.hdf5' % args.ds, 'r')
        for i, current_sentence in enumerate(test_texts):
            current_sentence = ' '.join(current_sentence)
            embeddings = torch.from_numpy(np.asarray(elmo_dict[current_sentence]))
            test_elmo.append(embeddings)
        elmo_dict.close()
    # print(train_input_s[0])

    test_input_t = [torch.LongTensor(t) for t in test_t]
    test_y_tensor = [torch.LongTensor(y) for y in test_ow]
    test_data = Data(args, test_input_s, test_input_t, test_y_tensor, features=test_elmo)

    with torch.no_grad():
        test_predict = predict(rnn, test_data, args)
    pred_acc_t = score(test_predict, test_data.labels)
    print("p:%.4f, r:%.4f, f:%.4f" % (pred_acc_t[0], pred_acc_t[1], pred_acc_t[2]))
    return test_predict
def dev():
    model = AttributeClassifier()
    check_point = "checkpoints5/checkpoint_AttA3_0.8666.pt"
    model.load_model(check_point)

    f_train = "data/attribute_data.txt"
    # f_test = "data/test_attr2.txt"
    f_w2v = "../embedding/embedding_all_merge_300.txt"
    f_dict = "../dataset/attribute.json"
    print(f_w2v)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    W, word2index = load_w2v(f_w2v)
    attr_list, attr_dict = parse_json(f_dict)

    kf = 0
    _, test_index = kfold_split(len(raw_texts), args.folds)[2]
    test_texts, test_labels = [raw_texts[i] for i in test_index], [raw_labels[i] for i in test_index]
    test_data = Data((test_texts, test_labels), word2index, attr_dict, args)
    test_predict = train.predict(model.classifier, test_data, args)
    pred_acc_t = score(test_predict, test_data.labels)
    print(pred_acc_t)
def train_from_data(self, train_raw_data, test_raw_data, W, word2index, attr_dict, args, Fold=0):
    word_embed_dim = W.shape[1]
    hidden_size = args.n_hidden
    vocab_size = len(W)
    output_size = len(attr_dict)

    if args.model == 'LSTM':
        self.classifier = networks.LSTM(word_embed_dim, output_size, vocab_size, args)
    elif args.model == 'Fasttext':
        self.classifier = networks.Fasttext(word_embed_dim, output_size, vocab_size, args)
    elif args.model == 'Average_LSTM2':
        self.classifier = networks.Average_LSTM2(word_embed_dim, output_size, vocab_size, args)
    elif args.model == 'AttA3':
        self.classifier = networks.AttA3(word_embed_dim, output_size, vocab_size, args)
        aspect_e_l = []
        for a in attr_dict:
            # print(a)
            if a == '舒适性':
                a = '舒适'
            a_e = torch.FloatTensor(W[word2index[a]])
            aspect_e_l.append(a_e)
        aspect_embeds = torch.cat(aspect_e_l, 0)
        # print(aspect_embeds)
        # print(attr_dict)
        self.classifier.AE.weight = torch.nn.Parameter(aspect_embeds)
    elif args.model == 'Binary_LSTM':
        self.classifier = networks.Binary_LSTM(word_embed_dim, output_size, vocab_size, args)
    elif args.model == 'CNN':
        self.classifier = networks.CNN(word_embed_dim, output_size, vocab_size, args)
    elif args.model == 'Attn_LSTM':
        self.classifier = networks.Attn_LSTM(word_embed_dim, output_size, vocab_size, args)

    train_elmo, test_elmo = [], []
    if args.use_elmo != 0:
        import h5py
        elmo_dict = h5py.File('../embedding/embeddings_elmo_ly-1.hdf5', 'r')
        for s in train_raw_data[0]:
            sentence = '\t'.join(s)
            sentence = sentence.replace('.', '$period$')
            sentence = sentence.replace('/', '$backslash$')
            # print(sentence)
            embeddings = torch.from_numpy(np.asarray(elmo_dict[sentence]))
            train_elmo.append(embeddings)
        for s in test_raw_data[0]:
            sentence = '\t'.join(s)
            sentence = sentence.replace('.', '$period$')
            sentence = sentence.replace('/', '$backslash$')
            embeddings = torch.from_numpy(np.asarray(elmo_dict[sentence]))
            test_elmo.append(embeddings)
        elmo_dict.close()
        print("finish elmo")

    train_data = Data(train_raw_data, word2index, attr_dict, args)
    # if args.use_dev:
    #     dev_data = Data(args, dev_input_s, dev_input_t, dev_y_tensor)
    # else:
    #     dev_data = None
    test_data = Data(test_raw_data, word2index, attr_dict, args)
    if args.use_elmo != 0:
        train_data.add_feature(train_elmo)
        test_data.add_feature(test_elmo)

    best_dict, max_acc = train.train(self.classifier, train_data, test_data, test_data,
                                     attr_dict, W, args=args)
    best_model = "%s/checkpoint_%s_%.6f_%d.pt" % (args.check_dir, args.model, max_acc, Fold)
    if args.save != 0:
        torch.save(best_dict, best_model)
def stacking():
    # Stack per-model out-of-fold predictions and fit one logistic-regression
    # meta-classifier per attribute, then write the ensembled test predictions.
    saved = True if args.saved != 0 else False
    f_train = "../data/train.txt"
    test_file = "../data/test.txt"
    test_texts = load_test_data(test_file)
    raw_texts, raw_labels = load_attr_data(filename=f_train)
    word2index = pickle.load(open("../data/vocabulary.pkl", 'rb'))
    f_dict = "../dataset/attribute.json"
    attr_list, attr_dict = parse_json(f_dict)

    paths = args.test_dir.split('#')
    models_files = []
    for path in paths:
        models_files.append([
            os.path.join(path, f) for f in os.listdir(path)
            if os.path.isfile(os.path.join(path, f))
        ])

    test_data = Data((test_texts, None), word2index)
    if args.use_elmo != 0:
        test_elmo = load_elmo(test_texts)
        test_data.add_feature(test_elmo)

    x_train = []
    y_train = []  # TODO replace
    x_test = []
    for dir, checkpoints_per_model in zip(paths, models_files):
        print(dir, checkpoints_per_model)
        if saved == 1 and os.path.isfile(os.path.join(dir, 'npy', "oof_train.npy")):
            oof_train, oof_train_y, oof_test = load_oof(dir)
        else:
            NFOLDS = len(checkpoints_per_model)
            print(NFOLDS)
            assert NFOLDS == args.folds
            clfs = [None for i in range(NFOLDS)]
            for cp in checkpoints_per_model:
                fold = int(cp.replace('_', '.').split('.')[-2])
                print(fold)
                clfs[fold - 1] = cp
            oof_train, oof_train_y, oof_test = get_oof(clfs, raw_texts, raw_labels,
                                                       test_data, word2index, attr_dict)
        x_train.append(oof_train)
        if y_train == []:
            y_train = oof_train_y
        else:
            assert (y_train == oof_train_y).all()
        x_test.append(oof_test)

    x_train = np.stack(x_train, axis=2)
    x_test = np.stack(x_test, axis=2)
    print(x_train.shape)
    num_train = x_train.shape[0]
    num_test = x_test.shape[0]

    test_predict = []
    for c in range(x_train.shape[1]):
        x_train_c = x_train[:, c, :].reshape(num_train, -1)
        x_test_c = x_test[:, c, :].reshape(num_test, -1)
        meta_clf_c = LogisticRegression()
        y_train_c = y_train[:, c]
        meta_clf_c.fit(x_train_c, y_train_c)
        test_predict_c = meta_clf_c.predict_proba(x_test_c)[:, 1]
        test_predict.append(test_predict_c)
    test_predict = np.stack(test_predict, axis=1)
    print(test_predict.shape)

    fw = codecs.open("../data/test_predict_aspect_ensemble.txt", 'w', encoding='utf-8')
    for prob in test_predict:
        attributes = []
        voted = [0 for a in range(len(attr_list))]
        for i in range(len(prob)):
            p = prob[i]
            # print(p)
            if p > args.threshold:
                voted[i] = 1
                # categories.append(attrC[i])
        if sum(voted) == 0:
            voted[prob.argmax()] = 1
        for i, l in enumerate(voted):
            if l != 0:
                attributes.append(attr_list[i])
        fw.write('|'.join(attributes) + '\n')

    time_stamp = time.asctime().replace(':', '_').split()
    fw.close()
    shutil.copy2(
        "../data/test_predict_aspect_ensemble.txt",
        "../data/backup/test_predict_aspect_ensemble_%s.txt" % time_stamp)
# TODO: we should look at the metadata and discard wrong samples
# IDEAS:
'''
- extract the bacteria that are present in most of the samples and try to do the analysis on them
- a second option would be to select bacteria whose amount varies a lot between the samples
- high in absolute value (negative or positive) correlation between the presence of a drug and the presence of the resistant bacteria
- cluster the samples based on the presence of bacteria (and their amount); we could use a number of clusters equal to num_drugs*2 or *3 (should still be readable) => use colors for countries
- linear regression (input: drugs and their doses => amount of each bacterium), see the sketch below
'''

# Load the data files
data = Data()
cummulative_per_country = data.get_cummulatives()

'''
Plots of the cumulative number of resistant bacteria per sample,
grouped by country.
2 variants - absolute & normalized
'''
plots.cummulative(cummulative_per_country)

'''
Binned cumulative number of resistant bacteria per sample,
grouped by country.
2 variants - absolute & normalized
'''
# plots.binned_cummulative(cummulative_per_country, bins=1)
# plots.binned_cummulative(cummulative_per_country, bins=2)
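# A minimal sketch of the last idea above (drugs and doses -> per-bacterium
# abundance via linear regression). The metadata / gene_counts_df fields are
# borrowed from the clustering script, and drug_columns is a hypothetical list
# of per-drug dose columns; adapt both to the real schema.
from sklearn.linear_model import LinearRegression

def fit_drug_to_bacteria_regression(metadata, gene_counts_df, drug_columns):
    # Rows: one sample per individual; columns: dose of each drug.
    X = metadata[drug_columns].values
    # Rows: the same samples; columns: abundance of each bacterium.
    Y = gene_counts_df[metadata.index].T.values
    reg = LinearRegression().fit(X, Y)
    # coef_ has shape (n_bacteria, n_drugs): the fitted effect of each drug
    # on each bacterium's abundance.
    return reg.coef_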
from utils.Data import Data
from factories.NetworkFactory import NetworkFactory
from utils.Tags import NetworkTags
from utils.Functions import ActivaionFunction
from utils.NetworkHelper import NetworkHelper

data = Data("resources/IrisDataTrain.xls", 125, shufle_data=True)
data.label_iris_dat()
normalized_data = data.normalize_data()

# Pre-train an autoencoder on the normalized inputs.
layers = [data.def_input_neurons(), 3, data.def_input_neurons()]
encoder = NetworkFactory.create(NetworkTags.AutoEncoder, layers)
encoder.train(normalized_data, normalized_data, repetitions=500,
              activation_function=ActivaionFunction.HYPERBOLIC_TANGENT)
encoder.remove_unneeded_layers()

# Build an MLP, graft the trained encoder onto it, and fine-tune the result.
layers = [data.def_input_neurons(), 15, 7, data.def_output_neurons()]
mlp = NetworkFactory.create(NetworkTags.MLPWithEachLayerConnection, layers)
merged = NetworkHelper.merge_autoencoder_to_mlp(encoder, mlp)
finnal = NetworkHelper.add_neurons_to_first_hidden_layer(merged, 6)
finnal.update_id()
finnal.train(data.normalize_data(), data.label_iris_dat(), repetitions=500,
             activation_function=ActivaionFunction.HYPERBOLIC_TANGENT)

predict_data = Data("resources/IrisData.xls", 150, shufle_data=False)
# Assumption: the predict arguments mirror the plain MLP script
# (normalized inputs, labels, and the activation used for training).
returned_value = finnal.predict(predict_data.normalize_data(),
                                predict_data.label_iris_dat(),
                                activation_function=ActivaionFunction.HYPERBOLIC_TANGENT)
from utils.Data import Data
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split  # cross_validation was renamed to model_selection
from collections import Counter
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import MaxAbsScaler
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

data = Data()

# Re-index the metadata by sample code and cast the index to strings
data.metadata.set_index('sample_code', inplace=True)
data.metadata.index = data.metadata.index.map(str)

data.gene_counts_df.iloc[1:, 1:] = data.gene_counts_df.iloc[1:, 1:].astype(float)
# data.gene_counts_df.loc[:, data.metadata.index].astype(float, copy=False)

small_cluster_individuals = []

# Normalize each sample with respect to its number of bacteria read pairs
for individual in data.metadata.index:
    data.gene_counts_df.loc[data.gene_counts_df.index[1:], individual] = \
        data.gene_counts_df.loc[data.gene_counts_df.index[1:], individual].apply(
            lambda x: np.divide(x, data.metadata.loc[individual, 'norm_Bacteria_pairs']))

scaler = MaxAbsScaler(copy=False)
# Normalize by max value
data.gene_counts_df[data.gene_counts_df.columns[1:]] = scaler.fit_transform(
    data.gene_counts_df[data.gene_counts_df.columns[1:]])

# Remove false poultry from BG, bad label DE, low reads DE
from utils.Data import Data
from factories.NetworkFactory import NetworkFactory
from utils.Tags import NetworkTags
from utils.WinnerHolder import WinnerHolder

data = Data("resources/IrisDataTrain.xls", 125, shufle_data=True)
data.label_iris_dat()
normalized_data = data.normalize_data()

layers = [data.def_input_neurons(), 4, 4, 4, 4]
som = NetworkFactory.create(NetworkTags.SOM, layers)
# som.print()
som.train(data.get_raw_data(), data.label_iris_dat(), 2000)
# som.print()
# print("=============================================")
# print(WinnerHolder.get_winner())
# print("distance")
# print(som.layers_list[1].neuron_vector[0])
from utils.Data import Data
from factories.NetworkFactory import NetworkFactory
from utils.Tags import NetworkTags
from utils.Functions import ActivaionFunction
import time

time_start = time.time()

data = Data("resources/IrisDataTrain.xls", 125, shufle_data=True)
data.label_iris_dat()
normalized_data = data.normalize_data()
print(normalized_data[0])

layers = [data.def_input_neurons(), 15, 10, data.def_output_neurons()]
mlp = NetworkFactory.create(NetworkTags.MLPWithContiguousConnection, layers)
mlp.train(data.normalize_data(), data.label_iris_dat(), repetitions=500,
          activation_function=ActivaionFunction.SIGMOID)

predict_data = Data("resources/IrisData.xls", 150, shufle_data=False)
returned_value = mlp.predict(predict_data.normalize_data(),
                             predict_data.label_iris_dat(),
                             activation_function=ActivaionFunction.SIGMOID)

labels = predict_data.label_iris_dat()
counter = 0
for i in range(len(returned_value)):
    print("label: {0}".format(labels[i]))
    print("output: {0}".format(returned_value[i]))
def _normalize_dataset(kms, prices):
    normalized_kms = map(lambda km: normalize(km, kms), kms)
    normalized_prices = map(lambda price: normalize(price, prices), prices)
    return [Data(x, y) for x, y in zip(normalized_kms, normalized_prices)]
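# _normalize_dataset relies on a normalize() helper that is not shown here.
# A minimal sketch, assuming plain min-max scaling over the whole series
# (the real helper may differ):
def normalize(value, series):
    lo, hi = min(series), max(series)
    return (value - lo) / (hi - lo) if hi != lo else 0.0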