def train_protest_classifier():
    # load wicked tweets
    X1, y1 = load_data_from_sqlite("Tweets_Protest_Random.db", "ProtestTweets", MAX_LENGTH, MIN_LENGTH, NTWEETS, random=True)

    assert X1
    assert y1
    print(len(X1), 'wicked samples loaded')

    # load random tweets
    X2, y2 = load_data_from_sqlite("Tweets_Protest_Random.db", "RandomTweets", MAX_LENGTH, MIN_LENGTH, NTWEETS, random=True)
    assert X2
    assert y2
    print(len(X2), 'random samples loaded')

    # merge data
    X = X1 + X2
    y = y1 + y2
    print(len(X), 'samples total')

    # split data
    assert len(X) == len(y)
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)

    # run Tweet2Vec NN (training left commented out; only evaluation runs here)
    # train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
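
# Note: split_dataset is not defined anywhere in this listing. A minimal sketch
# of the three-way signature used above, assuming a shuffled 80/10/10
# train/validate/test split (the ratios and seed are assumptions, not from the
# original code):
import random

def split_dataset(X, y, train=0.8, validate=0.1, seed=42):
    # shuffle X and y together so sample/label pairs stay aligned
    pairs = list(zip(X, y))
    random.Random(seed).shuffle(pairs)
    X, y = zip(*pairs)
    n_train = int(len(X) * train)
    n_val = n_train + int(len(X) * validate)
    return ((list(X[:n_train]), list(y[:n_train])),
            (list(X[n_train:n_val]), list(y[n_train:n_val])),
            (list(X[n_val:]), list(y[n_val:])))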
Example #2
def train_CS_news_recommender():
    # load CS tweets
    X1, y1 = load_data_from_mongo_balanced("communityTweets", "cs_conferences",
                                         x_field="clean_text", y_value="CS", limit=30000)
    assert X1
    assert y1
    print(len(X1), 'CS samples loaded')

    # load random tweets
    X2, y2 = load_data_from_mongo_balanced("tweets", "sample_04_12_2017",
                                         x_field="clean_text", y_value="random", limit=30000)
    assert X2
    assert y2
    print(len(X2), 'random samples loaded')

    # merge data
    X = X1 + X2
    y = y1 + y2
    print(len(X), 'samples total')

    # split data
    assert len(X) == len(y)
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)
    
    # run Tweet2Vec NN
    train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
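
# Note: load_data_from_mongo_balanced is not shown in this listing. A plausible
# pymongo sketch, assuming it reads x_field from each document and attaches the
# constant y_value as the label (the localhost connection is an assumption):
from pymongo import MongoClient

def load_data_from_mongo_balanced(db_name, collection, x_field, y_value, limit):
    # fetch up to `limit` documents that carry x_field, and label them all y_value
    client = MongoClient("localhost", 27017)
    docs = client[db_name][collection].find({x_field: {"$exists": True}}).limit(limit)
    X = [doc[x_field] for doc in docs]
    y = [y_value] * len(X)
    return X, y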
Example #3
def get_labeled_data():
    X, y = load_labeled_data_from_mongo_balanced("communityTweets", "cs_conferences",
                                         x_field="clean_text", y_field="label", limit=2000)
    assert X
    assert y
    print(len(X), 'samples loaded')

    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)
    train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
def test_run():
    # toy phrases used as labeled training samples
    fruit = ["berry", "apple", "apple-plum", "plum", "I love berry",
             "banana in the fruitbasket", "my applejuice", "apple_ and banana",
             "berry", "wild berry", "little berries that are really really sour",
             "my other pear", "pears are better than apples", "banana", "pear",
             "plum with ebrry", "pear ice cream", "what is your favourite fruit",
             "favourite FRUIT", "oh know this must be a pluM",
             "oh this tastes good - a FRUIT",
             "speaking of fruits, what about vegetables?", "is an apple a fruit ",
             "what's wrong with berries", "apple juice sweet",
             "sweet berry ice cream", "berry with sugar", "sweet apples",
             "ice-cream without sugar", "fruit salad without sugar..."]
    veggy = ["tomato", "pepper", "chilli", "tomato with chilli",
             "chilli with rice", "sour veggies taste good",
             "is tomato a fruit or a veggy", "favourite veggy?",
             "little tomato vs big tomato", "red pepper or green pepper",
             "favourite color of chilli?", "plum or tomato? is my VEGGY concept",
             "pears with salad and appe and chili", "my tomatojuice",
             "juice made of veggy and sugar is VEGGY", "veggy juie",
             "veggy ice cream is ALWAYS VEGGY", "tomato salad",
             "tomato salad with some chilli added", "tomato pepper",
             "pepper tomato", "chilli with more chilli", "sour chilli",
             "veggy for veggies", "green red for veggy",
             "tomato sauce with noodles", "noodles or rice for tomato sauce?",
             "sauce out of chilli", "sandwich with veggies",
             "nice veggy salad you have!!"]
    print(len(fruit))
    print(len(veggy))

    # take the first NTWEETS-1 phrases from each list and label them to match
    X1 = fruit[:NTWEETS-1]
    y1 = ["fruit"] * len(X1)
    X2 = veggy[:NTWEETS-1]
    y2 = ["veggy"] * len(X2)

    assert X1
    assert y1
    assert X2
    assert y2
    assert len(X1) == len(y1)
    print(len(X1), 'fruit samples loaded')
    print(len(X2), 'veggy samples loaded')

    # merge data
    X = X1 + X2
    y = y1 + y2

    print(len(X), 'samples total')

    # split data
    assert len(X) == len(y)
    (X_train, y_train), (X_validate, y_validate), (X_test, y_test) = split_dataset(X, y)
    train_model(X_train, y_train, X_validate, y_validate)
    test_model(X_test, y_test)
Example #5
M = padding(L, nb_users, nb_orders, nb_categories)
print(M)
''' Apply one-hot encoding after the padding. Takes the matrix after padding
and max_categories_found as parameters. '''
X_onehot = one_hot_post_padding(M, nb_categories)

### Dataset selection ###
Y = X_onehot[:, -1, :]
X = X_onehot[:, :-1, :]


x_train, y_train, x_test, y_test = split_dataset(X, Y, 0.8)
print('xtrain :', x_train.shape)
print('ytrain :', y_train.shape)

### Model training ###
''' Train the model. '''
inpt = (x_train.shape[1], x_train.shape[2])  #inpt = (2,22)
outp = y_train.shape[1]  #outp = 22
model = lstm_model(outp, inpt)
model.summary()

train_model(x_train, y_train, (x_test, y_test), model, 'mse', 'adam',
            ['accuracy'])
''' Training can be monitored in TensorBoard: run "tensorboard --logdir trainings"
in a terminal. "trainings" is the directory where the training logs are saved. '''
Example #6
from preprocessing import preprocessing
from train import feature_extractor, train, split_dataset
from predict import predict

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

##### Feature extraction #####

X, y = preprocessing("./datasets/entreprise_1/")
X = feature_extractor(X)

##### Train step #####

# Split data (sklearn's train_test_split(X, y, test_size=0.3) is an alternative)
X_train, X_test, y_train, y_test = split_dataset(X, y)

# Save trained model and load it
train(
    X_train, y_train,
    RandomForestRegressor(n_estimators=30,
                          warm_start=True,
                          oob_score=True,
                          n_jobs=-1))

##### Prediction and results #####
print(predict(X_train, X_test, y_train, y_test))
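
# Note: the train helper ("Save trained model and load it") is not shown. A
# plausible sketch assuming joblib persistence; the default path is an
# assumption:
import joblib

def train(X_train, y_train, model, path="model.joblib"):
    # fit the estimator, persist it, and reload it to verify the round trip
    model.fit(X_train, y_train)
    joblib.dump(model, path)
    return joblib.load(path)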

import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_curve, auc

# mobilenet_v2, COVID19DataSet, utils, and args come from the surrounding
# repository; a sketch of the split_dataset used here appears after this example.
def main():
    print("Arguments:")
    print(vars(args))
    device = utils.get_device()

    base_fpr = np.linspace(0,1,1001)
    base_recall = np.linspace(1,0,1001)

    # load model for ctonly
    net = mobilenet_v2(task = 'classification', moco = args.moco, ctonly = True).to(device)
    dataset = COVID19DataSet(root = args.datapath, ctonly = True) # load dataset
    
    precisions_ctonly = []
    tprs_ctonly = []
    aurocs_ctonly = []
    auprs_ctonly = []
    for seed in range(1000,1010):
        print(seed)
        utils.set_seed(seed, device) # set random seed
        trainset, testset = split_dataset(dataset = dataset, root = args.datapath, logger = None)
        testloader = torch.utils.data.DataLoader(testset, batch_size=args.bstest, shuffle=False, num_workers = args.nworkers)
        state = torch.load("./chpt/mobilenet_ctonly_%d.pth"%(seed))
        net.load_state_dict(state)
        net.eval()
        probs = []
        gts = []
        with torch.no_grad():
            for batch_idx, (imgs, lungsegs, labels) in enumerate(testloader):
                imgs = imgs.to(device)
                lungsegs = lungsegs.to(device)
                logits = net(imgs, lungsegs)
                probs.append(torch.sigmoid(logits))
                gts.append(labels)

        probs = torch.cat(probs, dim=0).cpu().numpy()
        gts = torch.cat(gts, dim=0).cpu().numpy()
        precision, recall, thresholds = precision_recall_curve(gts, probs)
        fpr, tpr, thresholds = roc_curve(gts, probs)
        auroc = auc(fpr, tpr)
        aupr = auc(recall, precision)
        precision = np.interp(base_recall, recall[::-1], precision[::-1])
        tpr = np.interp(base_fpr, fpr, tpr)
        tpr[0] = 0
        tprs_ctonly.append(tpr)
        precisions_ctonly.append(precision)
        aurocs_ctonly.append(auroc)
        auprs_ctonly.append(aupr)


    # load model for ct + lungseg
    net = mobilenet_v2(task = 'classification', moco = args.moco, ctonly = False).to(device)
    dataset = COVID19DataSet(root = args.datapath, ctonly = False) # load dataset

    precisions = []
    tprs = []
    aurocs = []
    auprs = []
    for seed in range(1000,1010):
        print(seed)
        utils.set_seed(seed, device) # set random seed
        trainset, testset = split_dataset(dataset = dataset, root = args.datapath, logger = None)
        testloader = torch.utils.data.DataLoader(testset, batch_size=args.bstest, shuffle=False, num_workers = args.nworkers)
        state = torch.load("./chpt/mobilenet_%d.pth"%(seed))
        net.load_state_dict(state)
        net.eval()
        probs = []
        gts = []
        with torch.no_grad():
            for batch_idx, (imgs, lungsegs, labels) in enumerate(testloader):
                imgs = imgs.to(device)
                lungsegs = lungsegs.to(device)
                logits = net(imgs, lungsegs)
                probs.append(torch.sigmoid(logits))
                gts.append(labels)

        probs = torch.cat(probs, dim=0).cpu().numpy()
        gts = torch.cat(gts, dim=0).cpu().numpy()
        precision, recall, thresholds = precision_recall_curve(gts, probs)
        fpr, tpr, thresholds = roc_curve(gts, probs)
        auroc = auc(fpr, tpr)
        aupr = auc(recall, precision)
        precision = np.interp(base_recall, recall[::-1], precision[::-1])
        tpr = np.interp(base_fpr, fpr, tpr)
        tpr[0] = 0
        tprs.append(tpr)
        precisions.append(precision)
        aurocs.append(auroc)
        auprs.append(aupr)

    tprs_ctonly = np.array(tprs_ctonly)
    tprs = np.array(tprs)
    mean_tprs_ctonly = tprs_ctonly.mean(axis=0)
    std_tprs_ctonly = tprs_ctonly.std(axis=0)
    mean_tprs = tprs.mean(axis=0)
    std_tprs = tprs.std(axis=0)

    precisions_ctonly = np.array(precisions_ctonly)
    precisions = np.array(precisions)
    mean_precisions_ctonly = precisions_ctonly.mean(axis=0)
    std_precisions_ctonly = precisions_ctonly.std(axis=0)
    mean_precisions = precisions.mean(axis=0)
    std_precisions = precisions.std(axis=0)
    
    mean_auroc_ctonly = auc(base_fpr, mean_tprs_ctonly)
    mean_auroc = auc(base_fpr, mean_tprs)
    std_auroc_ctonly = np.std(aurocs_ctonly)
    std_auroc = np.std(aurocs)

    mean_aupr_ctonly = auc(base_recall, mean_precisions_ctonly)
    mean_aupr = auc(base_recall, mean_precisions)
    std_aupr_ctonly = np.std(auprs_ctonly)
    std_aupr = np.std(auprs)

    tprs_upper_ctonly = np.minimum(mean_tprs_ctonly+0.5*std_tprs_ctonly, 1.)
    tprs_lower_ctonly = mean_tprs_ctonly-0.5*std_tprs_ctonly
    tprs_upper = np.minimum(mean_tprs+0.5*std_tprs, 1.)
    tprs_lower = mean_tprs-0.5*std_tprs

    precisions_upper_ctonly = np.minimum(mean_precisions_ctonly+0.5*std_precisions_ctonly, 1.)
    precisions_lower_ctonly = mean_precisions_ctonly-0.5*std_precisions_ctonly
    precisions_upper = np.minimum(mean_precisions+0.5*std_precisions, 1.)
    precisions_lower = mean_precisions-0.5*std_precisions

    
    # ROC curve
    plt.figure(1,figsize=(12,9))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(base_fpr, mean_tprs_ctonly, 'b', alpha=0.9,
             label=r"CT image (AUC = %0.4f$\pm$%0.4f)" % (mean_auroc_ctonly, std_auroc_ctonly))
    plt.fill_between(base_fpr, tprs_lower_ctonly, tprs_upper_ctonly, color='blue', alpha=0.1)
    plt.plot(base_fpr, mean_tprs, 'tab:orange', alpha=0.9,
             label=r"CT image + Lung segmentation (AUC = %0.4f$\pm$%0.4f)" % (mean_auroc, std_auroc))
    plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='tab:orange', alpha=0.1)
    plt.xlabel('False positive rate', fontsize='x-large')
    plt.ylabel('True positive rate', fontsize='x-large')
    plt.title('ROC curve', fontsize='x-large')
    plt.legend(loc='lower right', fontsize='x-large')
    plt.ylim([-0.01,1.01])
    plt.xlim([-0.01,1.01])
    plt.show()

    # PR curve
    plt.figure(2,figsize=(12,9))
    plt.step(base_recall, mean_precisions_ctonly, 'b', where='post', alpha=0.9,
             label=r"CT image (AUC = %0.4f$\pm$%0.4f)" % (mean_aupr_ctonly, std_aupr_ctonly))
    plt.fill_between(base_recall, precisions_lower_ctonly, precisions_upper_ctonly, color='blue', alpha=0.1)
    plt.step(base_recall, mean_precisions, 'tab:orange', where='post', alpha=0.9,
             label=r"CT image + Lung segmentation (AUC = %0.4f$\pm$%0.4f)" % (mean_aupr, std_aupr))
    plt.fill_between(base_recall, precisions_lower, precisions_upper, color='tab:orange', alpha=0.1)

    plt.xlabel('Recall', fontsize='x-large')
    plt.ylabel('Precision', fontsize='x-large')
    plt.title('PR curve', fontsize='x-large')
    plt.legend(loc='lower left', fontsize='x-large')
    plt.ylim([-0.01,1.01])
    plt.xlim([-0.01,1.01])
    plt.show()
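
# Note: the split_dataset called in main() is not part of this listing. A
# minimal sketch with torch.utils.data.random_split; the 80/20 ratio is an
# assumption, and root/logger are kept only to match the call sites above.
# Per-seed reproducibility relies on utils.set_seed seeding torch's global RNG:
import torch

def split_dataset(dataset, root=None, logger=None, train_fraction=0.8):
    # partition the Dataset into train/test Subsets
    n_train = int(len(dataset) * train_fraction)
    return torch.utils.data.random_split(dataset, [n_train, len(dataset) - n_train])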