def train_protest_classifier():
    # load protest ("wicked") tweets
    X1, y1 = load_data_from_sqlite("Tweets_Protest_Random.db", "ProtestTweets",
                                   MAX_LENGTH, MIN_LENGTH, NTWEETS, random=True)
    assert X1
    assert y1
    print(len(X1), 'wicked samples loaded')

    # load random tweets
    X2, y2 = load_data_from_sqlite("Tweets_Protest_Random.db", "RandomTweets",
                                   MAX_LENGTH, MIN_LENGTH, NTWEETS, random=True)
    assert X2
    assert y2
    print(len(X2), 'random samples loaded')

    # merge data
    X = X1 + X2
    y = y1 + y2
    print(len(X), 'samples total')

    # split data
    assert len(X) == len(y)
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)

    # run Tweet2Vec NN
    # train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
def train_CS_news_recommender():
    # load CS tweets
    X1, y1 = load_data_from_mongo_balanced("communityTweets", "cs_conferences",
                                           x_field="clean_text", y_value="CS", limit=30000)
    assert X1
    assert y1
    print(len(X1), 'CS samples loaded')

    # load random tweets
    X2, y2 = load_data_from_mongo_balanced("tweets", "sample_04_12_2017",
                                           x_field="clean_text", y_value="random", limit=30000)
    assert X2
    assert y2
    print(len(X2), 'random samples loaded')

    # merge data
    X = X1 + X2
    y = y1 + y2
    print(len(X), 'samples total')

    # split data
    assert len(X) == len(y)
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)

    # run Tweet2Vec NN
    train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
def get_labeled_data():
    X, y = load_labeled_data_from_mongo_balanced("communityTweets", "cs_conferences",
                                                 x_field="clean_text", y_field="label", limit=2000)
    assert X
    assert y
    print(len(X), 'samples loaded')

    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)
    train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
def test_run():
    # toy fruit/veggy samples (misspellings and casing noise left as-is)
    fruit = ["berry", "apple", "apple-plum", "plum", "I love berry",
             "banana in the fruitbasket", "my applejuice", "apple_ and banana",
             "berry", "wild berry", "little berries that are really really sour",
             "my other pear", "pears are better than apples", "banana", "pear",
             "plum with ebrry", "pear ice cream", "what is your favourite fruit",
             "favourite FRUIT", "oh know this must be a pluM",
             "oh this tastes good - a FRUIT",
             "speaking of fruits, what about vegetables?", "is an apple a fruit ",
             "what's wrong with berries", "apple juice sweet",
             "sweet berry ice cream", "berry with sugar", "sweet apples",
             "ice-cream without sugar", "fruit salad without sugar..."]
    veggy = ["tomato", "pepper", "chilli", "tomato with chilli",
             "chilli with rice", "sour veggies taste good",
             "is tomato a fruit or a veggy", "favourite veggy?",
             "little tomato vs big tomato", "red pepper or green pepper",
             "favourite color of chilli?", "plum or tomato? is my VEGGY concept",
             "pears with salad and appe and chili", "my tomatojuice",
             "juice made of veggy and sugar is VEGGY", "veggy juie",
             "veggy ice cream is ALWAYS VEGGY", "tomato salad",
             "tomato salad with some chilli added", "tomato pepper",
             "pepper tomato", "chilli with more chilli", "sour chilli",
             "veggy for veggies", "green red for veggy",
             "tomato sauce with noodles", "noodles or rice for tomato sauce?",
             "sauce out of chilli", "sandwich with veggies",
             "nice veggy salad you have!!"]
    print(len(fruit))
    print(len(veggy))

    # take the first NTWEETS-1 samples per class (slices are end-exclusive)
    # and build matching label lists
    X1 = fruit[0:NTWEETS - 1]
    X2 = veggy[0:NTWEETS - 1]
    y1 = ["fruit"] * len(X1)
    y2 = ["veggy"] * len(X2)
    assert X1
    assert y1
    assert X2
    assert y2
    assert len(X1) == len(y1)
    print(len(X1), 'fruit samples loaded')
    print(len(X2), 'veggy samples loaded')

    # merge data
    X = X1 + X2
    y = y1 + y2
    print(len(X), 'samples total')

    # split data
    assert len(X) == len(y)
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)

    train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
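# The three-way `split_dataset` used above is defined elsewhere. A minimal
# sketch, assuming a 70/15/15 train/validate/test split with joint shuffling
# (the project's actual ratios and shuffling strategy are not shown here).
import random

def split_dataset(X, y, train_frac=0.70, validate_frac=0.15):
    # shuffle jointly so the class-wise concatenated samples are mixed
    pairs = list(zip(X, y))
    random.shuffle(pairs)
    X, y = map(list, zip(*pairs))
    n_train = int(len(X) * train_frac)
    n_val = int(len(X) * validate_frac)
    return ((X[:n_train], y[:n_train]),
            (X[n_train:n_train + n_val], y[n_train:n_train + n_val]),
            (X[n_train + n_val:], y[n_train + n_val:]))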
M = padding(L, nb_users, nb_orders, nb_categories)
print(M)

# One-hot encode after padding: takes the padded matrix and the number of
# categories found as parameters.
X_onehot = one_hot_post_padding(M, nb_categories)

### Dataset selection ###
# The last time step of each sequence is the target; the rest is the input.
Y = X_onehot[:, -1, :]
X = X_onehot[:, :-1, :]

x_train, y_train, x_test, y_test = split_dataset(X, Y, 0.8)
print('xtrain :', x_train.shape)
print('ytrain :', y_train.shape)

### Model training ###
inpt = (x_train.shape[1], x_train.shape[2])  # (timesteps, features)
outp = y_train.shape[1]                      # number of output categories
model = lstm_model(outp, inpt)
model.summary()
train_model(x_train, y_train, (x_test, y_test), model, 'mse', 'adam', ['accuracy'])

# Training can be monitored in TensorBoard: run "tensorboard --logdir trainings"
# in a terminal; "trainings" is the directory where the training logs are saved.
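# `lstm_model` is defined elsewhere; a minimal sketch of what it might look
# like, assuming a Keras backend. The 64-unit layer size is an illustrative
# guess, not the project's actual architecture.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def lstm_model(output_dim, input_shape):
    # one LSTM layer over (timesteps, features), then a softmax over categories
    model = Sequential([
        LSTM(64, input_shape=input_shape),
        Dense(output_dim, activation='softmax'),
    ])
    return model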
from preprocessing import preprocessing
from train import feature_extractor, train, split_dataset
from predict import predict
from sklearn.ensemble import RandomForestRegressor

##### Feature extraction #####
X, y = preprocessing("./datasets/entreprise_1/")
X = feature_extractor(X)

##### Train step #####
# Split data (equivalent to train_test_split(X, y, test_size=0.3))
X_train, X_test, y_train, y_test = split_dataset(X, y)

# Train the model; the trained model is saved and loaded back
train(X_train, y_train,
      RandomForestRegressor(n_estimators=30, warm_start=True,
                            oob_score=True, n_jobs=-1))

##### Prediction and results #####
print(predict(X_train, X_test, y_train, y_test))
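# `split_dataset` here returns sklearn-style arrays. A plausible sketch,
# assuming it simply wraps train_test_split with the 0.3 test fraction hinted
# at in the comment above; the real helper in train.py may differ.
from sklearn.model_selection import train_test_split

def split_dataset(X, y, test_size=0.3, seed=42):
    # returns (X_train, X_test, y_train, y_test), sklearn's argument order
    return train_test_split(X, y, test_size=test_size, random_state=seed)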
def main():
    print("Arguments:")
    print(vars(args))
    device = utils.get_device()
    base_fpr = np.linspace(0, 1, 1001)
    base_recall = np.linspace(1, 0, 1001)

    def evaluate(ctonly, ckpt_pattern):
        """Evaluate the checkpoints trained with seeds 1000-1009 and return
        interpolated TPR/precision curves plus per-seed AUROC/AUPR values."""
        net = mobilenet_v2(task='classification', moco=args.moco, ctonly=ctonly).to(device)
        dataset = COVID19DataSet(root=args.datapath, ctonly=ctonly)  # load dataset
        tprs, precisions, aurocs, auprs = [], [], [], []
        for seed in range(1000, 1010):
            print(seed)
            utils.set_seed(seed, device)  # set random seed
            trainset, testset = split_dataset(dataset=dataset, root=args.datapath, logger=None)
            testloader = torch.utils.data.DataLoader(testset, batch_size=args.bstest,
                                                     shuffle=False, num_workers=args.nworkers)
            net.load_state_dict(torch.load(ckpt_pattern % seed))
            net.eval()
            probs, gts = [], []
            with torch.no_grad():
                for imgs, lungsegs, labels in testloader:
                    imgs = imgs.to(device)
                    lungsegs = lungsegs.to(device)
                    logits = net(imgs, lungsegs)
                    probs.append(torch.sigmoid(logits))
                    gts.append(labels)
            probs = torch.cat(probs, dim=0).cpu().numpy()
            gts = torch.cat(gts, dim=0).cpu().numpy()
            precision, recall, _ = precision_recall_curve(gts, probs)
            fpr, tpr, _ = roc_curve(gts, probs)
            aurocs.append(auc(fpr, tpr))
            auprs.append(auc(recall, precision))
            # resample onto common grids so curves can be averaged across seeds
            precisions.append(np.interp(base_recall, recall[::-1], precision[::-1]))
            tpr = np.interp(base_fpr, fpr, tpr)
            tpr[0] = 0.0
            tprs.append(tpr)
        return np.array(tprs), np.array(precisions), aurocs, auprs

    # model with CT image only
    tprs_ctonly, precisions_ctonly, aurocs_ctonly, auprs_ctonly = \
        evaluate(ctonly=True, ckpt_pattern="./chpt/mobilenet_ctonly_%d.pth")
    # model with CT image + lung segmentation
    tprs, precisions, aurocs, auprs = \
        evaluate(ctonly=False, ckpt_pattern="./chpt/mobilenet_%d.pth")

    # pointwise means and stds across seeds
    mean_tprs_ctonly = tprs_ctonly.mean(axis=0)
    std_tprs_ctonly = tprs_ctonly.std(axis=0)
    mean_tprs = tprs.mean(axis=0)
    std_tprs = tprs.std(axis=0)
    mean_precisions_ctonly = precisions_ctonly.mean(axis=0)
    std_precisions_ctonly = precisions_ctonly.std(axis=0)
    mean_precisions = precisions.mean(axis=0)
    std_precisions = precisions.std(axis=0)

    mean_auroc_ctonly = auc(base_fpr, mean_tprs_ctonly)
    mean_auroc = auc(base_fpr, mean_tprs)
    std_auroc_ctonly = np.std(aurocs_ctonly)
    std_auroc = np.std(aurocs)
    mean_aupr_ctonly = auc(base_recall, mean_precisions_ctonly)
    mean_aupr = auc(base_recall, mean_precisions)
    std_aupr_ctonly = np.std(auprs_ctonly)
    std_aupr = np.std(auprs)

    # half-std confidence bands, clipped to [0, 1]
    tprs_upper_ctonly = np.minimum(mean_tprs_ctonly + 0.5 * std_tprs_ctonly, 1.)
    tprs_lower_ctonly = mean_tprs_ctonly - 0.5 * std_tprs_ctonly
    tprs_upper = np.minimum(mean_tprs + 0.5 * std_tprs, 1.)
    tprs_lower = mean_tprs - 0.5 * std_tprs
    precisions_upper_ctonly = np.minimum(mean_precisions_ctonly + 0.5 * std_precisions_ctonly, 1.)
    precisions_lower_ctonly = mean_precisions_ctonly - 0.5 * std_precisions_ctonly
    precisions_upper = np.minimum(mean_precisions + 0.5 * std_precisions, 1.)
    precisions_lower = mean_precisions - 0.5 * std_precisions

    # ROC curve
    plt.figure(1, figsize=(12, 9))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(base_fpr, mean_tprs_ctonly, 'b', alpha=0.9,
             label=r"CT image (AUC = %0.4f$\pm$%0.4f)" % (mean_auroc_ctonly, std_auroc_ctonly))
    plt.fill_between(base_fpr, tprs_lower_ctonly, tprs_upper_ctonly, color='blue', alpha=0.1)
    plt.plot(base_fpr, mean_tprs, 'tab:orange', alpha=0.9,
             label=r"CT image + Lung segmentation (AUC = %0.4f$\pm$%0.4f)" % (mean_auroc, std_auroc))
    plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='tab:orange', alpha=0.1)
    plt.xlabel('False positive rate', fontsize='x-large')
    plt.ylabel('True positive rate', fontsize='x-large')
    plt.title('ROC curve', fontsize='x-large')
    plt.legend(loc='lower right', fontsize='x-large')
    plt.ylim([-0.01, 1.01])
    plt.xlim([-0.01, 1.01])
    plt.show()

    # PR curve
    plt.figure(2, figsize=(12, 9))
    plt.step(base_recall, mean_precisions_ctonly, 'b', where='post', alpha=0.9,
             label=r"CT image (AUC = %0.4f$\pm$%0.4f)" % (mean_aupr_ctonly, std_aupr_ctonly))
    plt.fill_between(base_recall, precisions_lower_ctonly, precisions_upper_ctonly, color='blue', alpha=0.1)
    plt.step(base_recall, mean_precisions, 'tab:orange', where='post', alpha=0.9,
             label=r"CT image + Lung segmentation (AUC = %0.4f$\pm$%0.4f)" % (mean_aupr, std_aupr))
    plt.fill_between(base_recall, precisions_lower, precisions_upper, color='tab:orange', alpha=0.1)
    plt.xlabel('Recall', fontsize='x-large')
    plt.ylabel('Precision', fontsize='x-large')
    plt.title('PR curve', fontsize='x-large')
    plt.legend(loc='lower left', fontsize='x-large')
    plt.ylim([-0.01, 1.01])
    plt.xlim([-0.01, 1.01])
    plt.show()
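# For reference, the curve-averaging trick above in isolation: each seed's ROC
# is resampled onto a common FPR grid with np.interp so that the pointwise mean
# and std are well-defined. The toy curves below are illustrative only.
import numpy as np

base_fpr = np.linspace(0, 1, 1001)
curves = []
for fpr, tpr in [([0.0, 0.2, 1.0], [0.0, 0.8, 1.0]),
                 ([0.0, 0.5, 1.0], [0.0, 0.9, 1.0])]:
    t = np.interp(base_fpr, fpr, tpr)  # piecewise-linear resampling
    t[0] = 0.0                         # pin the curve to the origin
    curves.append(t)
mean_tpr = np.mean(curves, axis=0)     # pointwise mean across "seeds"
print(mean_tpr[::250])                 # a few sampled points of the averaged curve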