def train():
    """Run the end-to-end pipeline: load, preprocess, split, train, test, deploy."""
    data = load_dataset(dataset_path)
    print('Step 1: Dataset loaded successfully!')

    preprocessed_data = preprocessing(data)
    print('Step 2: Data preprocessing done successfully!')

    train, test = train_test_split(preprocessed_data)
    print('Step 3: Data split into train and test successfully!')

    train_X, train_Y, test_X, test_Y, vectorizer = feature_extraction(train, test)
    trained_model = model_training(train_X, train_Y)
    print('Step 4: Model trained successfully!')

    accuracy = model_testing(test_X, test_Y, trained_model)

    # Bundle the fitted vectorizer and classifier into a single deployable pipeline.
    vec_classifier = Pipeline([('vectorizer', vectorizer),
                               ('classifier', trained_model)])
    save_model(vec_classifier)
    print('Step 5: Model is deployed successfully!')

    response = {
        'success': True,
        'message': 'Model deployed',
        'accuracy': accuracy
    }
    return response
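# A minimal usage sketch for train() (assumption: `dataset_path` and the
# model helpers imported in the script below are in scope in this module).
if __name__ == '__main__':
    result = train()
    print('Deployed with accuracy:', result['accuracy'])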
def get(self):
    if request.args.get("data_set"):
        m = request.args.get("data_set")
        d = model.load_dataset(m)
        return render_template("data_set.html",
                               methods=methodsmap.get_available_methods(),
                               matrix=d)
    else:
        return render_template("upload.html",
                               methods=methodsmap.get_available_methods())
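# A minimal wiring sketch for the view above (assumption: `get` is defined on
# a flask.views.MethodView subclass; the class name `DatasetView` and the
# '/dataset' route are hypothetical, not confirmed by this repository).
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/dataset', view_func=DatasetView.as_view('dataset'))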
import torch
import torch.utils.data as torch_data
from tqdm import tqdm
# Project-local helpers (BertTokenizer, load_dataset, load_review_dataset,
# Dataset, test_collate_fn, convert_tf_checkpoint_to_pytorch, Model,
# ID2CATEGORY, ID2POLARITY) are assumed to be imported elsewhere in this file.


def main():
    pred_file_path = 'test.csv'
    load_save_model = True
    lr = 1e-5  # unused at inference time
    batch_size = 8
    gpu = True

    torch.manual_seed(0)
    device = torch.device('cpu')
    if gpu:
        device = torch.device('cuda')

    tokenizer = BertTokenizer(vocab_file='publish/vocab.txt', max_len=512)
    _, known_token = load_dataset('TRAIN/Train_reviews.csv',
                                  'TRAIN/Train_labels.csv', tokenizer)
    dataset = load_review_dataset('TRAIN/TEST/Test_reviews.csv')
    dataset = Dataset(list(dataset.items()))
    dataloader = torch_data.DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=False,
                                       collate_fn=test_collate_fn(tokenizer, known_token))

    bert_pretraining = convert_tf_checkpoint_to_pytorch(
        './publish/bert_model.ckpt', './publish/bert_config.json')
    model = Model(bert_pretraining.bert)
    model = model.to(device)  # was model.cuda(); now respects the `gpu` flag
    if load_save_model:
        model.load_state_dict(torch.load('./save_model/best.model'))

    pred_file = open(pred_file_path, mode='w', encoding='utf-8')
    pbar = tqdm()
    model.eval()
    for step, (batch_X, len_X, mask, batch_idx, origin_batch_X) in enumerate(dataloader):
        batch_X = batch_X.to(device)
        mask = mask.to(device)
        scores, gather_idx = model(batch_X, len_X, mask, None)
        (pred_seq_target, pred_match_target,
         pred_single_aspect_category_target, pred_single_opinion_category_target,
         pred_cross_category_target, pred_single_aspect_polarity_target,
         pred_single_opinion_polarity_target, pred_cross_polarity_target) = model.infer(scores, mask)

        label = []
        aspect_idx, opinion_idx = gather_idx
        for b in range(batch_X.shape[0]):
            _aspect_idx, _opinion_idx = aspect_idx[b], opinion_idx[b]
            if len(_aspect_idx) == 0 and len(_opinion_idx) == 0:
                # No aspect or opinion was extracted for this review.
                label.append((batch_idx[b], '_', '_', '_', '_'))
            _aspect_cross = [False for i in range(len(_aspect_idx))]
            _opinion_cross = [False for i in range(len(_opinion_idx))]

            # Matched (aspect, opinion) pairs.
            for i in range(len(_aspect_idx)):
                for j in range(len(_opinion_idx)):
                    if pred_match_target[b][i, j] == 1:
                        _aspect_cross[i] = True
                        _opinion_cross[j] = True
                        category = ID2CATEGORY[pred_cross_category_target[b][i, j]]
                        polarity = ID2POLARITY[pred_cross_polarity_target[b][i, j]]
                        aspect = tokenizer.decode(
                            list(origin_batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                        opinion = tokenizer.decode(
                            list(origin_batch_X[b, _opinion_idx[j]].cpu().detach().numpy())).replace(' ', '')
                        # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                        # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[j]].cpu().detach().numpy())).replace(' ', '')
                        aspect_beg = len(tokenizer.decode(
                            list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach().numpy())).replace(' ', ''))
                        aspect_end = aspect_beg + len(aspect)
                        opinion_beg = len(tokenizer.decode(
                            list(batch_X[b, 1:_opinion_idx[j][0]].cpu().detach().numpy())).replace(' ', ''))
                        opinion_end = opinion_beg + len(opinion)
                        label.append((batch_idx[b], aspect, opinion, category, polarity))

            # Aspects not matched to any opinion.
            for i in range(len(_aspect_idx)):
                if not _aspect_cross[i]:
                    category = ID2CATEGORY[pred_single_aspect_category_target[b][i]]
                    polarity = ID2POLARITY[pred_single_aspect_polarity_target[b][i]]
                    aspect = tokenizer.decode(
                        list(origin_batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    # aspect = tokenizer.decode(list(batch_X[b, _aspect_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    aspect_beg = len(tokenizer.decode(
                        list(batch_X[b, 1:_aspect_idx[i][0]].cpu().detach().numpy())).replace(' ', ''))
                    aspect_end = aspect_beg + len(aspect)
                    label.append((batch_idx[b], aspect, '_', category, polarity))

            # Opinions not matched to any aspect.
            for i in range(len(_opinion_idx)):
                if not _opinion_cross[i]:
                    category = ID2CATEGORY[pred_single_opinion_category_target[b][i]]
                    polarity = ID2POLARITY[pred_single_opinion_polarity_target[b][i]]
                    opinion = tokenizer.decode(
                        list(origin_batch_X[b, _opinion_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    # opinion = tokenizer.decode(list(batch_X[b, _opinion_idx[i]].cpu().detach().numpy())).replace(' ', '')
                    opinion_beg = len(tokenizer.decode(
                        list(batch_X[b, 1:_opinion_idx[i][0]].cpu().detach().numpy())).replace(' ', ''))
                    opinion_end = opinion_beg + len(opinion)
                    label.append((batch_idx[b], '_', opinion, category, polarity))

        for _label in label:
            pred_file.write(','.join(map(str, _label)) + '\n')
        pbar.update(batch_size)
        pbar.set_description('step: %d' % step)

    pred_file.close()
    pbar.close()
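# Standard entry point (assumption: this script is run directly for inference).
if __name__ == '__main__':
    main()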
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
# `load_dataset` is a project-local helper assumed to be imported elsewhere.


def prune(scale=True, pca=False, under=False, over=False):
    filename = "../data/data_final.csv"
    X, y = load_dataset(filename)

    if scale:
        # Standardize the features in X.
        scaled_features = StandardScaler().fit_transform(X.values)
        X = pd.DataFrame(scaled_features, index=X.index, columns=X.columns)

    if pca:
        # Project the columns of X onto their principal axes.
        n_comp = 18
        columns = ["pca" + str(i + 1) for i in range(n_comp)]
        scaled_features = MinMaxScaler().fit_transform(X.values)
        scaled_features_df = pd.DataFrame(scaled_features, index=X.index, columns=X.columns)
        pca_model = PCA(n_components=n_comp)  # renamed from `pca` to avoid shadowing the flag
        pca_model.fit(scaled_features_df)
        X = pd.DataFrame(pca_model.transform(scaled_features_df), columns=columns)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    if under:
        # Under-sample the majority class in the training data.
        rus = RandomUnderSampler(random_state=0)
        X_train, y_train = rus.fit_resample(X_train, y_train)
    if over:
        # Over-sample the minority class in the training data.
        ros = RandomOverSampler(random_state=0)
        X_train, y_train = ros.fit_resample(X_train, y_train)

    clf = [
        [AdaBoostClassifier(), "AdaBoostClassifier"],
        [BaggingClassifier(), "BaggingClassifier"],
        [ExtraTreesClassifier(), "ExtraTreesClassifier"],
        [GradientBoostingClassifier(), "GradientBoostClassifier"],
        [DecisionTreeClassifier(), "DecisionTreeClassifier"],
        [RandomForestClassifier(), "RandomForestClassifier"],
    ]
    results = {}
    for elem in clf:
        name = elem[1]
        results[name] = []

    print("AdaBoostClassifier")
    hyperT = dict(n_estimators=[i for i in range(50, 1000, 200)],
                  learning_rate=[float(10 ** i) / 10000 for i in range(3)])
    gridT = GridSearchCV(AdaBoostClassifier(), hyperT, cv=3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["AdaBoostClassifier"].append([gridT.best_params_, f1])
    print(f1)
    print("**************************************")

    print("BaggingClassifier")
    # bootstrap flags must be real booleans; the strings "True"/"False" are both truthy
    hyperT = dict(n_estimators=[i for i in range(90, 600, 100)],
                  bootstrap=[True, False],
                  bootstrap_features=[True, False])
    # max_samples = [i for i in range(1,6)], max_features = [i for i in range(1,6)],
    gridT = GridSearchCV(BaggingClassifier(), hyperT, cv=3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["BaggingClassifier"].append([gridT.best_params_, f1])
    print(f1)
    print("***************************************")

    print("ExtraTreesClassifier")
    hyperT = dict(n_estimators=[i for i in range(100, 900, 100)],
                  max_depth=[None] + [i for i in range(1, 6)],
                  criterion=["gini", "entropy"],
                  verbose=[0, 1])
    # min_samples_split = [i for i in range(1,6)], min_samples_leaf = [i for i in range(1,6)],
    gridT = GridSearchCV(ExtraTreesClassifier(), hyperT, cv=3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["ExtraTreesClassifier"].append([gridT.best_params_, f1])
    print(f1)
    print("********************************************")

    print("GradientBoostClassifier")
    hyperT = dict(n_estimators=[10 ** i for i in range(3, 5)],
                  learning_rate=[float(10 ** i) / 100 for i in range(2)],
                  max_depth=[i for i in range(3, 5)])
    # min_samples_split = [i for i in range(1,4)], verbose = [i for i in range(3)],
    # min_samples_leaf = [i for i in range(1,6)], criterion = ["friedman_mse", "friedman_mae"]
    gridT = GridSearchCV(GradientBoostingClassifier(), hyperT, cv=3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["GradientBoostClassifier"].append([gridT.best_params_, f1])
    print(f1)
    print("***********************************************")

    print("DecisionTreeClassifier")
    hyperT = dict(criterion=["gini", "entropy"],
                  max_features=["auto", "sqrt", "log2"],
                  max_depth=[None] + [i for i in range(6, 20)],
                  min_samples_leaf=[i for i in range(2, 6)])
    # min_samples_split = [i for i in range(1,6)],
    gridT = GridSearchCV(DecisionTreeClassifier(), hyperT, cv=3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["DecisionTreeClassifier"].append([gridT.best_params_, f1])
    print(f1)
    print("*************************************************")

    print("RandomForestClassifier")
    hyperT = dict(n_estimators=[10 ** i for i in range(2, 4)],
                  criterion=["gini", "entropy"],
                  bootstrap=[True, False],
                  max_depth=[None] + [(10 ** i + 10) for i in range(0, 2)])
    gridT = GridSearchCV(RandomForestClassifier(), hyperT, cv=3, scoring='f1')
    bestT = gridT.fit(X_train, y_train)
    y_pred = bestT.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    print(gridT.best_params_)
    results["RandomForestClassifier"].append([gridT.best_params_, f1])
    print(f1)
    print("************************************************")

    # Persist the configuration and the grid-search results.
    f = open("optimize.txt", "w")
    f.write(f"Scale = {scale}, PCA = {pca}, under = {under}, over = {over}")
    f.write("\nGridCV Results: \n")
    for classifier in results:
        f.write(f"{classifier}: {results[classifier]}\n")
    f.close()
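# Example invocations of prune() (hypothetical; each call runs all six grid
# searches on a different preprocessing configuration, so expect long runtimes).
if __name__ == '__main__':
    prune(scale=True)              # standardized features only
    prune(scale=True, under=True)  # standardized + under-sampled training data
    prune(scale=False, pca=True)   # PCA-transformed features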
from model import (load_dataset, preprocessing, train_test_split,
                   model_testing, model_training, load_model, save_model,
                   feature_extraction, predict, append_list_as_row)
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.pipeline import Pipeline

dataset_path = 'Dataset/Customer_data.csv'

try:
    data = load_dataset(dataset_path)
    print('Step 1: Dataset loaded successfully!')

    preprocessed_data = preprocessing(data)
    print('Step 2: Data preprocessing done successfully!')

    train, test = train_test_split(preprocessed_data)
    print('Step 3: Data split into train and test successfully!')

    train_X, train_Y, test_X, test_Y, vectorizer = feature_extraction(train, test)
    trained_model = model_training(train_X, train_Y)
    print('Step 4: Model trained successfully!')

    accuracy = model_testing(test_X, test_Y, trained_model)
    vec_classifier = Pipeline([('vectorizer', vectorizer),
                               ('classifier', trained_model)])
    save_model(vec_classifier)
except Exception as err:  # the original handler is truncated; minimal fallback
    print('Pipeline failed:', err)
import cv2 as cv
import numpy as np
from sklearn.model_selection import train_test_split

import model

# Open the default webcam.
cam = cv.VideoCapture(0)
running = True

# Fit the PCA + kNN color classifier on 80% of the labelled histograms.
dataset = model.load_dataset()
X = dataset["HIST"]
y = dataset["CLASSE"]
X_train, _, y_train, _ = train_test_split(X, y, train_size=0.8, random_state=13)
pca = model.pca(X_train)
knn = model.model(pca, X_train, y_train)

# Class id -> (label, BGR display color).
dict_classes = {
    0: ("AMARELO", (0, 255, 255)),   # yellow
    1: ("VERMELHO", (0, 0, 255)),    # red
    2: ("AZUL", (255, 0, 0)),        # blue
    3: ("VERDE", (0, 255, 0)),       # green
    4: ("LARANJA", (0, 165, 255)),   # orange
}

while running:
    status, frame = cam.read()
    if not status:
        break
    # The original per-frame classification/overlay logic is truncated here;
    # a minimal display loop is kept so the script runs as-is.
    cv.imshow("frame", frame)
    if cv.waitKey(1) & 0xFF == ord('q'):
        running = False

cam.release()
cv.destroyAllWindows()