class Classifier():
    """Text classifier that loads a pre-trained Naive Bayes model once at
    construction time and predicts a label for incoming questions.

    Keyword arguments override the default data/model paths; with no keyword
    arguments a built-in default configuration is used.
    """

    def __init__(self, *args, **kwargs):
        # BUG FIX: ``kwargs`` is always a dict (never None), so the original
        # ``kwargs == None`` test could never succeed and the default
        # configuration below was unreachable — a no-argument construction
        # silently passed an empty config through.  Test emptiness instead.
        if not kwargs:
            config = {
                'text_dir': 'data/dataset/doc',
                'dataset': 'data/matrix',
                'bag_of_words': 'data/bag_of_words',
                'train_model': 'data/model/doc.model',
                'is_unicode': False
            }
        else:
            config = kwargs

        self.ml = MachineLearning(**config)
        # Chosen algorithm: Naive Bayes.  A decision tree is also available:
        # self.algo = self.ml.DecisionTree(criterion='gini', prune='depth',
        #                                  max_depth=50, min_criterion=0.05)
        self.algo = self.ml.NiaveBayes()
        self.prepro = Preprocessing(**config)
        # Load the persisted model once so classify() can reuse it per call.
        self.model = self.algo.load_model()

    def classify(self, question="hello ai"):
        """Return the predicted label for *question*.

        The question is converted to a document-frequency feature vector
        (threshold 1) before being fed to the loaded model.
        """
        mat = self.prepro.loading_single_doc(question, 'doc_freq', 1)
        prediction = self.algo.predict(self.model, [mat])
        label = self.ml.to_label(prediction, 'data/bag_of_words/label_match.pickle')
        print(label)
        return label
def __init__(self, *args, **kwargs):
    """Build the classifier: resolve configuration, pick the algorithm,
    and load the persisted model.

    Keyword arguments override the default data/model paths; with no
    keyword arguments a built-in default configuration is used.
    """
    # BUG FIX: ``kwargs`` is always a dict (never None), so the original
    # ``kwargs == None`` comparison was always False and the defaults
    # below were unreachable.  Use truthiness to detect "no overrides".
    if not kwargs:
        config = {
            'text_dir': 'data/dataset/doc',
            'dataset': 'data/matrix',
            'bag_of_words': 'data/bag_of_words',
            'train_model': 'data/model/doc.model',
            'is_unicode': False
        }
    else:
        config = kwargs

    self.ml = MachineLearning(**config)
    # Chosen algorithm: Naive Bayes.  A decision tree is also available:
    # self.algo = self.ml.DecisionTree(criterion='gini', prune='depth',
    #                                  max_depth=50, min_criterion=0.05)
    self.algo = self.ml.NiaveBayes()
    self.prepro = Preprocessing(**config)
    # Load the persisted model once so later predictions can reuse it.
    self.model = self.algo.load_model()
def classify(config, text):
    """Text classification: predict a label for *text* with each of the
    three available algorithms and return all three answers."""
    # Turn the raw text into a document-frequency feature vector.
    features = Preprocessing(**config).loading_single_doc(
        text, 'doc_freq', config['threshold'])

    # Only 3 algorithms are wired up at the moment.
    engine = MachineLearning(**config)

    # Naive Bayes
    bayes = engine.NiaveBayes()
    bayes_pred = bayes.predict(bayes.load_model(), [features])

    # Artificial neural network
    network = engine.NeuralNetwork(
        hidden_layer_sizes=(250, 100), learning_rate=0.012, momentum=0.5,
        random_state=0, max_iter=200, activation='tanh')
    network_pred = network.predict(network.load_model(), [features])

    # Decision tree — note it is fed a numpy matrix rather than a plain list.
    tree = engine.DecisionTree(criterion='gini', prune='depth',
                               max_depth=30, min_criterion=0.05)
    tree_pred = tree.predict(tree.load_model(), np.array([features]))

    # Map each raw prediction back to its human-readable label and report
    # (1) Naive Bayes (2) Neural Network (3) Decision Tree.
    return {
        'NB': engine.to_label(bayes_pred, config['label_match']),
        'NN': engine.to_label(network_pred, config['label_match']),
        'DT': engine.to_label(tree_pred, config['label_match']),
    }
'is_unicode': False } prepro = Preprocessing(**config) # preposessing dataset_matrix = prepro.loading_data(config['text_dir'], 'doc_freq', 'all', 1) #load dataset from file (feature data) filename = "doc_freq_1.csv" dataset_path = FileUtil.dataset_path(config, filename) dataset_sample = FileUtil.load_csv(dataset_path) prepro_time = time.time() - whole_st ml = MachineLearning(**config) # choose your algorithm nb_algo = ml.NiaveBayes() nn_algo = ml.NeuralNetwork(hidden_layer_sizes=(250, 100), learning_rate=0.012, momentum=0.5, random_state=0, max_iter=200, activation='tanh') dt_algo = ml.DecisionTree(criterion='gini', prune='depth', max_depth=30, min_criterion=0.05) nb_result = perform_algo(ml, nb_algo, dataset_sample) nn_result = perform_algo(ml, nn_algo, dataset_sample)
preposessing """ prepro = Preprocessing(**config) # dataset_matrix = prepro.loading_data(config['text_dir'], 'doc_freq', 'all', 25) #load dataset from file (feature data) filename = "doc_freq_25.csv" dataset_path = FileUtil.dataset_path(config, filename) dataset_sample = FileUtil.load_csv(dataset_path) # dataset_sample = prepro.normalize_dataset(dataset_sample) # use with decision tree only """ end """ """ training """ ml = MachineLearning(**config) # split dataset -> train set, test set training_set, test_set = ml.split_dataset(dataset_sample, 2) # choose your algorithm algo = ml.NiaveBayes() # algo = ml.DecisionTree(criterion='gini', prune='depth', max_depth=30, min_criterion=0.05) # algo = ml.NeuralNetwork(hidden_layer_sizes=(250, 100), learning_rate=0.012, momentum=0.5, random_state=0, max_iter=200, activation='tanh') # train or load model model = algo.train(training_set) # model = algo.load_model() """ end """ """ classify or predict """
def _summarize_results(nb_result, nn_result, dt_result, prepro_time,
                       total_execution_time):
    """Build the flat report dict from the three per-algorithm summaries.

    Each ``*_result`` is expected to carry 'acc', 'acc_train' and
    'exec_time' keys (as produced by MLManager.perform_algo).
    """
    return {
        'com_time': round(total_execution_time, 2),
        'text_extract_time': round(prepro_time, 2),
        'figure_on_testing_data': {
            'NB': nb_result['acc'],
            'NN': nn_result['acc'],
            'DT': dt_result['acc'],
        },
        'figure_on_training_data': {
            'NB': nb_result['acc_train'],
            'NN': nn_result['acc_train'],
            'DT': dt_result['acc_train'],
        },
        'on_testing_data': {
            'NB': {'accuracy': nb_result['acc'], 'time': nb_result['exec_time']},
            'NN': {'accuracy': nn_result['acc'], 'time': nn_result['exec_time']},
            'DT': {'accuracy': dt_result['acc'], 'time': dt_result['exec_time']},
        },
        'on_training_data': {
            'NB': {'accuracy': nb_result['acc_train'], 'time': nb_result['exec_time']},
            'NN': {'accuracy': nn_result['acc_train'], 'time': nn_result['exec_time']},
            'DT': {'accuracy': dt_result['acc_train'], 'time': dt_result['exec_time']},
        },
    }


def get_results(path_textfile, params, config, start_time):
    """This function performs features extraction from client's data source.

    Train model based on extracted features.
    Get Accuracy of each algorithm (e.g: Naive Bayes, Neural Network) based on
    evaluation criteria e.g: LOO, 5 folds or 10 folds.

    Returns a report dict on success; implicitly returns None when feature
    extraction fails (preserved from the original behavior).
    """
    # Normalize the unicode flag: any explicitly supplied value (even a
    # falsy one) turns the flag on.  NOTE(review): this coerces
    # ``is_unicode=False`` to True — looks intentional ("key present means
    # unicode data"), but confirm against the callers.
    config['is_unicode'] = config.get('is_unicode', None) is not None

    #logfile = '/Users/lion/Documents/py-workspare/slash-ml/logfile.log'
    #logging.basicConfig(filename=logfile, level=logging.DEBUG)

    # NOTE(review): purpose of this marker key is unclear from here —
    # presumably consumed downstream; verify before removing.
    config['passion'] = "passion"

    # Perform features extraction
    is_successful_fextract = MLManager.extract_features(path_textfile, config)
    if not is_successful_fextract:
        return None

    whole_st = time.time()

    # Preprocessing: build the feature matrix from the extracted text.
    prepro = Preprocessing(**config)
    params_prepro = params['PR']
    # Called for its side effect: writes the feature CSV loaded below.
    prepro.loading_data(config['text_dir'], params_prepro['method'],
                        'all', params_prepro['threshold'])

    # Remove sub-directory from "data/dataset/text"
    FileUtil.remove_file(config['text_dir'], ignore_errors=True)

    # Load the dataset (feature data) back from the generated file.
    filename = "doc_freq_" + str(params_prepro['threshold']) + ".csv"
    dataset_path = FileUtil.dataset_path(config, filename)
    dataset_sample = FileUtil.load_csv(dataset_path)

    prepro_time = time.time() - whole_st

    ml = MachineLearning(**config)

    # Instantiate the three algorithms from the caller-supplied parameters.
    nb_algo = ml.NiaveBayes()

    params_nn = params['NN']
    nn_algo = ml.NeuralNetwork(
        hidden_layer_sizes=params_nn['hidden_layer_sizes'],
        learning_rate=params_nn['learning_rate'],
        momentum=params_nn['momentum'],
        random_state=params_nn['random_state'],
        max_iter=params_nn['max_iter'],
        activation=params_nn['activation'])

    params_dt = params['DT']
    dt_algo = ml.DecisionTree(
        criterion=params_dt['criterion'], prune='depth',
        max_depth=params_dt['max_depth'],
        min_criterion=params_dt['min_criterion'])

    # Train/evaluate each algorithm on the same sample.
    nb_result = MLManager.perform_algo(ml, nb_algo, dataset_sample)
    nn_result = MLManager.perform_algo(ml, nn_algo, dataset_sample)
    dt_result = MLManager.perform_algo(ml, dt_algo, dataset_sample)
    print(nb_result, nn_result, dt_result)

    total_execution_time = time.time() - whole_st
    return _summarize_results(nb_result, nn_result, dt_result,
                              prepro_time, total_execution_time)
if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--mode',type=str,default ='chat',help='There are two mode (chat, train, train_c, test and none), The defaul value is chat.') # parser.add_argument("--benchmark", help="run benchmark",action="store_true") # parser.add_argument('--mode',type=float,default =0.2,help='There two mode?(train and chat)') args = parser.parse_args() config = { 'text_dir': 'data/dataset/chatbot', 'dataset': 'data/matrix', 'bag_of_words': 'data/bag_of_words', 'train_model': 'data/model/train.model', 'is_unicode': False } ml = MachineLearning(**config) # choose your algorithm # algo = ml.NiaveBayes() algo = ml.DecisionTree(criterion='gini', prune='depth', max_depth=50, min_criterion=0.05) prepro = Preprocessing(**config) # -- mode if args.mode == 'train' : # preposessing dataset_matrix = prepro.loading_data(config['text_dir'], 'doc_freq', 'all', 1) #load dataset from file (feature data) filename = "doc_freq_1.csv" dataset_path = FileUtil.dataset_path(config, filename) dataset_sample = FileUtil.load_csv(dataset_path) # dataset_sample = prepro.normalize_dataset(dataset_sample) # print(dataset_sample)