Example no. 1
def train():
    data = load_dataset(dataset_path)
    print('Step1: Dataset is loaded successfully!')

    preprocessed_data = preprocessing(data)
    print('Step2: Data preprocessing done successfully!')

    train, test = train_test_split(preprocessed_data)
    print('Step3: Data split into train and test successfully!')

    train_X, train_Y, test_X, test_Y, vectorizer = feature_extraction(
        train, test)

    trained_model = model_training(train_X, train_Y)
    print('Step4: Model trained successfully!')

    accuracy = model_testing(test_X, test_Y, trained_model)

    vec_classifier = Pipeline([('vectorizer', vectorizer),
                               ('classifier', trained_model)])

    save_model(vec_classifier)
    print('Step5: Model is deployed successfully')

    response = {
        'success': True,
        'message': 'Model deployed',
        'accuracy': accuracy
    }
    return response
Example no. 2
def get(self):
    if request.args.get("data_set"):
        m = request.args.get("data_set")
        d = model.load_dataset(m)
        return render_template("data_set.html",
                               methods=methodsmap.get_available_methods(),
                               matrix=d)
    else:
        return render_template("upload.html",
                               methods=methodsmap.get_available_methods())
Example no. 3
def main():
    pred_file_path = 'test.csv'
    load_save_model = True
    lr = 1e-5
    batch_size = 8
    gpu = True
    torch.manual_seed(0)
    device = torch.device('cpu')
    if gpu:
        device = torch.device('cuda')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    _, known_token = load_dataset('TRAIN/Train_reviews.csv',
                                  'TRAIN/Train_labels.csv', tokenizer)
    dataset = load_review_dataset('TRAIN/TEST/Test_reviews.csv')
    dataset = Dataset(list(dataset.items()))
    dataloader = torch_data.DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       collate_fn=test_collate_fn(
                                           tokenizer, known_token))
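    # Convert the pretrained TF BERT checkpoint to PyTorch and build the task model on top of it.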
    bert_pretraining = convert_tf_checkpoint_to_pytorch(
        './publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)

    model = model.to(device)
    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    pred_file = open(pred_file_path, mode='w', encoding='utf-8')

    pbar = tqdm()
    model.eval()
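    # Run inference batch by batch and write (id, aspect, opinion, category, polarity) rows to the prediction file.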
    for step, (batch_X, len_X, mask, batch_idx,
               origin_batch_X) in enumerate(dataloader):
        batch_X = batch_X.to(device)
        mask = mask.to(device)

        scores, gather_idx = model(batch_X, len_X, mask, None)
        (pred_seq_target, pred_match_target,
         pred_single_aspect_category_target, pred_single_opinion_category_target,
         pred_cross_category_target, pred_single_aspect_polarity_target,
         pred_single_opinion_polarity_target,
         pred_cross_polarity_target) = model.infer(scores, mask)

        label = []

        aspect_idx, opinion_idx = gather_idx
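        # For each item in the batch, pair matched aspect/opinion spans and record unmatched spans separately.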
        for b in range(batch_X.shape[0]):
            _aspect_idx, _opinion_idx = aspect_idx[b], opinion_idx[b]
            if len(_aspect_idx) == 0 and len(_opinion_idx) == 0:
                label.append((batch_idx[b], '_', '_', '_', '_'))

            _aspect_cross = [False] * len(_aspect_idx)
            _opinion_cross = [False] * len(_opinion_idx)
            for i in range(len(_aspect_idx)):
                for j in range(len(_opinion_idx)):
                    if pred_match_target[b][i, j] == 1:
                        _aspect_cross[i] = True
                        _opinion_cross[j] = True
                        category = ID2CATEGORY[pred_cross_category_target[b][i, j]]
                        polarity = ID2POLARITY[pred_cross_polarity_target[b][i, j]]
                        aspect = tokenizer.decode(
                            list(origin_batch_X[b, _aspect_idx[i]].cpu().detach().numpy())
                        ).replace(' ', '')
                        opinion = tokenizer.decode(
                            list(origin_batch_X[b, _opinion_idx[j]].cpu().detach().numpy())
                        ).replace(' ', '')
                        # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                        # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[j]].cpu().detach().numpy())).replace(' ', '')
                        aspect_beg = len(tokenizer.decode(
                            list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach().numpy())
                        ).replace(' ', ''))
                        aspect_end = aspect_beg + len(aspect)
                        opinion_beg = len(tokenizer.decode(
                            list(batch_X[b, 1:_opinion_idx[j][0]].cpu().detach().numpy())
                        ).replace(' ', ''))
                        opinion_end = opinion_beg + len(opinion)
                        label.append((batch_idx[b], aspect, opinion, category, polarity))
            for i in range(len(_aspect_idx)):
                if not _aspect_cross[i]:
                    category = ID2CATEGORY[
                        pred_single_aspect_category_target[b][i]]
                    polarity = ID2POLARITY[
                        pred_single_aspect_polarity_target[b][i]]
                    aspect = tokenizer.decode(
                        list(origin_batch_X[b, _aspect_idx[i]].cpu().detach().numpy())
                    ).replace(' ', '')
                    # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    aspect_beg = len(tokenizer.decode(
                        list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach().numpy())
                    ).replace(' ', ''))
                    aspect_end = aspect_beg + len(aspect)
                    label.append((batch_idx[b], aspect, '_', category, polarity))
            for i in range(len(_opinion_idx)):
                if not _opinion_cross[i]:
                    category = ID2CATEGORY[
                        pred_single_opinion_category_target[b][i]]
                    polarity = ID2POLARITY[
                        pred_single_opinion_polarity_target[b][i]]
                    opinion = tokenizer.decode(
                        list(origin_batch_X[b, _opinion_idx[i]].cpu().detach().numpy())
                    ).replace(' ', '')
                    # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    opinion_beg = len(tokenizer.decode(
                        list(batch_X[b, 1:_opinion_idx[i][0]].cpu().detach().numpy())
                    ).replace(' ', ''))
                    opinion_end = opinion_beg + len(opinion)
                    label.append((batch_idx[b], '_', opinion, category, polarity))

        for _label in label:
            _label = ','.join(str(x) for x in _label)
            pred_file.write(_label + '\n')
        pbar.update(batch_size)
        pbar.set_description('step: %d' % step)
    pred_file.close()
    pbar.close()
Example no. 4
def prune(scale=True, pca=False, under=False, over=False):

    filename = "../data/data_final.csv"
    X, y = load_dataset(filename)
    if scale: #perform scale in X
        scaled_features = StandardScaler().fit_transform(X.values)
        X = pd.DataFrame(scaled_features, index = X.index, columns = X.columns)
    
    if pca:  # transform X columns to the corresponding principal components
        n_comp = 18
        columns = []
        for i in range(n_comp):
            columns.append("pca" +str(i+1))
            
        scaled_features = MinMaxScaler().fit_transform(X.values)
        scaled_features_df = pd.DataFrame(scaled_features, index = X.index, columns = X.columns)
        pca = PCA(n_components = n_comp)
        pca.fit(scaled_features_df)
        X = pca.transform(scaled_features_df)
        X = pd.DataFrame(X)
        X.columns = columns
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
    
    if under:  #perform under sampling in training data
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_resample(X_train, y_train)
    
    if over: #perform over sampling in training data
        ros = RandomOverSampler(random_state=0)
        X_train, y_train = ros.fit_resample(X_train, y_train)
        
    clf = [
            [AdaBoostClassifier(), "AdaBoostClassifier"],
            [BaggingClassifier(), "BaggingClassifier"],
            [ExtraTreesClassifier(), "ExtraTreesClassifier"],
            [GradientBoostingClassifier(), "GradientBoostClassifier"],
            [DecisionTreeClassifier(), "DecisionTreeClassifier"],
            [RandomForestClassifier(), "RandomForestClassifier"]
        ]
    
    results = {}
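    # Pre-create an empty results list for each classifier name.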
    for elem in clf:
        name = elem[1]
        results[name] = []
        
    print("AdaBoostClassifier")
    hyperT = dict(n_estimators =[i for i in range(50,1000, 200)], learning_rate = [float(10 ** i)/10000 for i in range(3)])
    gridT = GridSearchCV(AdaBoostClassifier(), hyperT, cv = 3, scoring='f1' )
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["AdaBoostClassifier"].append([gridT.best_params_ , f1])
    print(f1)
    print("**************************************")
    
    print("BaggingClassifier")
    hyperT = dict(n_estimators=[i for i in range(90, 600, 100)], bootstrap=[True, False], bootstrap_features=[True, False])  # max_samples = [i for i in range(1,6)], max_features = [i for i in range(1,6)],
    gridT = GridSearchCV(BaggingClassifier(), hyperT, cv = 3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["BaggingClassifier"].append([gridT.best_params_ , f1])
    print(f1)
    print("***************************************")
    
    print("ExtraTreesClassifier")
    hyperT = dict(n_estimators =[i for i in range(100,900,100)], max_depth = [None]+[i for i in range(1,6)], criterion = ["gini", "entropy"], verbose = [0,1])#,,  min_samples_split = [i for i in range(1,6)], min_samples_leaf=[i for i in range(1,6)],
    gridT = GridSearchCV(ExtraTreesClassifier(), hyperT, cv = 3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["ExtraTreesClassifier"].append([gridT.best_params_ , f1])
    print(f1)
    print("********************************************")
    
    print("GradientBoostClassifier")
    hyperT = dict(n_estimators =[10 ** i for i in range(3,5)], learning_rate = [float(10 ** i)/100 for i in range(2)], max_depth = [i for i in range(3,5)]) #min_samples_split = [i for i in range(1,4)], verbose=[i for i in range(3),  min_samples_leaf=[i for i in range(1,6)] criterion = ["friedman_mse", "friedman_mae"] 
    gridT = GridSearchCV(GradientBoostingClassifier(), hyperT, cv = 3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["GradientBoostClassifier"].append([gridT.best_params_ , f1])
    print(f1)
    print("***********************************************")
    
    print("DecisionTreeClassifier")
    hyperT = dict(criterion = ["gini","entropy"], max_features = ["auto", "sqrt","log2"], max_depth = [None]+[i for i in range(6,20)],  min_samples_leaf = [i for i in range(2,6)]) #,  min_samples_leaf=[i for i in range(1,6)], , min_samples_split = [i for i in range(1,6)] , max_depth = [None]+[i for i in range(6,20)],
    gridT = GridSearchCV(DecisionTreeClassifier(), hyperT, cv = 3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["DecisionTreeClassifier"].append([gridT.best_params_ , f1])
    print(f1)
    print("*************************************************")
    
    print("RandomForestClassifier")
    hyperT = dict(n_estimators=[10 ** i for i in range(2, 4)], criterion=["gini", "entropy"], bootstrap=[True, False], max_depth=[None] + [(10 ** i + 10) for i in range(0, 2)])
    gridT = GridSearchCV(RandomForestClassifier(), hyperT, cv = 3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["RandomForestClassifier"].append([gridT.best_params_ , f1])
    print(f1)
    print("************************************************")

    f = open("optimize.txt", "w") 
    f.write(f"Scale = {scale}, PCA = {pca}, under = {under}, over = {over}")
    f.write("\nGridCV Results: \n")
    
    for classifier in results:
        f.write(f"{classifier}: {results[classifier]}\n") 
               
    f.close()
    
Example no. 5
from model import (load_dataset, preprocessing, train_test_split,
                   model_testing, model_training, load_model, save_model,
                   feature_extraction, predict, append_list_as_row)
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.pipeline import Pipeline

dataset_path = 'Dataset/Customer_data.csv'

try:
    data = load_dataset(dataset_path)
    print('Step1: Dataset is loaded successfully!')

    preprocessed_data = preprocessing(data)
    print('Step2: Data preprocessing done successfully!')

    train, test = train_test_split(preprocessed_data)
    print('Step3: Data split into train and test successfully!')

    train_X, train_Y, test_X, test_Y, vectorizer = feature_extraction(
        train, test)

    trained_model = model_training(train_X, train_Y)
    print('Step4: Model trained successfully!')

    accuracy = model_testing(test_X, test_Y, trained_model)

    vec_classifier = Pipeline([('vectorizer', vectorizer),
                               ('classifier', trained_model)])

    save_model(vec_classifier)
    print('Step5: Model is deployed successfully')
except Exception as error:
    # assumed handler: the original snippet breaks off before its except clause
    print('Pipeline failed:', error)
Example no. 6
import cv2 as cv
import model
from sklearn.model_selection import train_test_split
import numpy as np

cam = cv.VideoCapture(0)

running = True

dataset = model.load_dataset()
X = dataset["HIST"]
y = dataset["CLASSE"]
X_train, _, y_train, _ = train_test_split(
    X, y, train_size=0.8, random_state=13)

pca = model.pca(X_train)
knn = model.model(pca, X_train, y_train)

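# Class IDs mapped to (Portuguese color name, BGR color value) pairs.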
dict_classes = {
    0: ("AMARELO", (0, 255, 255)),
    1: ("VERMELHO", (0, 0, 255)),
    2: ("AZUL", (255, 0, 0)),
    3: ("VERDE", (0, 255, 0)),
    4: ("LARANJA", (0, 165, 255)),
}

while running:
    status, frame = cam.read()