def get_data_loader(rp: str, data_type: str):
    """
    Builds a PyTorch `DataLoader` of "(features, target)" pairs for the Neural Network.

    1. Reads the coarse and fine class files of `data_type` and creates one label per row by
       concatenating "coarse_class :: fine_class".
    2. Binarizes the labels. For `training` data a new binarizer is fitted and stored with `write_obj`;
       for any other `data_type` the previously stored binarizer is loaded with `read_obj`.
    3. Loads the features using the `get_ft_obj` function and converts them to a dense array.
    4. Gets the number of features used (number of columns of the feature array).
    5. Converts the binarized labels into a PyTorch tensor.
    6. Converts the features (independent variables) into a float PyTorch tensor.
    7. Wraps both tensors in a `TensorDataset` served by a `DataLoader` using the module level `batch_size`.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param data_type: String either `training` or `test`.
    :return:
        feat_size: Number of features which are being used.
        data_loader: Loader object containing the data and labels of `data_type`.
    :raises SystemExit: with code -11 when a class file or the stored label binarizer can not be read.
    """
    crf, coarse = read_file("coarse_classes_{0}".format(data_type), rp)
    frf, fine = read_file("fine_classes_{0}".format(data_type), rp)
    # Check the read flags BEFORE touching the contents, otherwise a failed read blows up while iterating
    # the (missing) lines and the error message below is never shown.
    if not crf:
        print("Error in reading actual ({0}) coarse classes".format(data_type))
        raise SystemExit(-11)
    if not frf:
        print("Error in reading actual ({0}) fine classes".format(data_type))
        raise SystemExit(-11)
    c_lb = [remove_endline_char(c).strip() for c in coarse]
    f_lb = [remove_endline_char(f).strip() for f in fine]
    # Indexing (instead of `zip`) keeps a too short coarse file a loud IndexError rather than silently
    # dropping rows.
    labels_numpy = [c_lb[i] + " :: " + f_label for i, f_label in enumerate(f_lb)]

    model_dir = rp + "/{0}".format(nn_model_str)
    is_training = data_type == "training"
    if is_training:
        # NOTE(review): MultiLabelBinarizer expects an iterable of label collections; given plain strings
        # it treats every character as a label. Confirm this is intended (vs. LabelBinarizer) before
        # changing it, because stored models depend on the resulting target width.
        mlb = MultiLabelBinarizer().fit(labels_numpy)
    else:
        mrf, mlb = read_obj("label_binarizer", model_dir)
        if not mrf:
            print("Error in reading label binarizer for ({0}) data".format(data_type))
            raise SystemExit(-11)
    labels_bin = mlb.transform(labels_numpy)
    if is_training:
        # Only a freshly fitted binarizer needs to be stored; a loaded one is already on disk.
        write_obj(mlb, "label_binarizer", model_dir)
    print("- Labels loading into numpy done.")

    x_ft = get_ft_obj(data_type, rp, "{0}".format(nn_model_str), "coarse").toarray()
    feat_size = x_ft.shape[1]
    print("- Features loading into numpy done.")

    labels = torch.from_numpy(labels_bin)
    data = torch.from_numpy(x_ft).float()
    print("- Features and labels as tensors, done.")

    train_data = TensorDataset(data, labels)
    data_loader = DataLoader(train_data, batch_size=batch_size)
    print("- {0} loader done.".format(data_type))
    return feat_size, data_loader
def train_one_node(rp: str, cat_type: str, ml_algo: str):
    """
    Trains one ml node (one classifier) and serializes the trained object with `write_obj`.

    The estimator is chosen from `ml_algo` first, so an unsupported value is rejected before any
    training data is loaded. The training features come from `get_ft_obj` and the labels from the
    "{cat_type}_classes_training" file.

    :argument:
        :param rp: Absolute path of the root directory of the project.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                                        (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        boolean_flag: True when the model was trained and written successfully.
                      False when `ml_algo` is not supported or the trained model could not be written.
    """
    # -----------------------------------Experimental code--------------------------------------------------------------
    # 1. This is the part where you can experiment and play with the parameters.
    # 2. If you want to add more models or combinations, you just need to add an `elif` condition and
    #    provide the condition value in argument from the shell. e.g `train svm`,
    #    here `svm` will be in the variable {ml_algo}.
    if ml_algo == "svm":
        machine = svm.SVC()
    elif ml_algo == "linear_svm":
        machine = svm.LinearSVC()
    elif ml_algo == "lr":
        machine = linear_model.LogisticRegression(solver="newton-cg")
    else:
        print(
            "- Error while training {0} model. {0} is unexpected ML algorithm".
            format(ml_algo))
        # Without an estimator there is nothing to fit; report the failure instead of
        # crashing on `None.fit(...)`.
        return False
    # Parameter tuning ends here.
    # ------------------------------------------------------------------------------------------------------------------

    x_ft = get_ft_obj("training", rp, ml_algo, cat_type)
    labels = read_file("{0}_classes_training".format(cat_type), rp)[1]
    y_lb = [remove_endline_char(c).strip() for c in labels]

    model = machine.fit(x_ft, y_lb)
    mw_flag = write_obj(model, "{0}_model".format(cat_type),
                        rp + "/{0}".format(ml_algo))
    if mw_flag:
        print("- Training done for {0} model of {1}".format(cat_type, ml_algo))
        return True
    print("- Error in writing trained {0} model of {1}".format(
        cat_type, ml_algo))
    return False
def get_predictions(question_doc: Doc, rp: str, ml_algo: str):
    """
    Classifies a single question into its coarse class and the fine class below it.

    Feature matrices for the coarse node and for each of the six fine nodes are built with
    `get_ft_obj` in `api` mode from the given document. The coarse model predicts first; its answer
    selects which fine model (and matching feature matrix) makes the second prediction. Any coarse
    answer other than `ABBR`, `DESC`, `ENTY`, `HUM` or `LOC` is routed to the `num` node.

    The trained models (`coarse_model`, `abbr_model`, `desc_model`, `enty_model`, `hum_model`,
    `loc_model`, `num_model`) are not loaded here; they are expected to already exist in the
    enclosing scope.

    :argument
        :param question_doc: The question to be classified given as tagged Doc from NLP process.
        :param rp: Absolute path of the root directory of the project.
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        pred: Question class prediction as a list `[coarse_class, fine_class]`.
    """
    node_names = ("coarse", "abbr", "desc", "enty", "hum", "loc", "num")
    # One feature matrix per node, built in the fixed node order above.
    node_features = {
        node: get_ft_obj("api", rp, ml_algo, node, [question_doc]).tocsr()
        for node in node_names
    }
    print("- DataPrep done.")

    coarse_class = coarse_model.predict(node_features["coarse"][0])[0]

    # Coarse class -> (fine model, name of the node whose features it consumes).
    fine_nodes = {
        "ABBR": (abbr_model, "abbr"),
        "DESC": (desc_model, "desc"),
        "ENTY": (enty_model, "enty"),
        "HUM": (hum_model, "hum"),
        "LOC": (loc_model, "loc"),
    }
    fine_model, fine_node = fine_nodes.get(coarse_class, (num_model, "num"))
    fine_class = fine_model.predict(node_features[fine_node][0])[0]
    print("- Predict done.")
    return [coarse_class, fine_class]
def get_predictions(rp: str, ml_algo: str):
    """
    Predicts the coarse and fine class of every test question with the pre-trained [[ml_algo]] models.

    1. Loads the seven serialized models (`coarse` plus the six fine nodes) with `read_obj`.
    2. Builds the `test` feature matrix of every node with `get_ft_obj`.
    3. For each test row the coarse model predicts first and its answer selects the fine model used for
       the second prediction. Any coarse answer other than `ABBR`, `DESC`, `ENTY`, `HUM` or `LOC` is
       routed to the `num` node.
    4. Prints the elapsed time of the whole run.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        pred: List of prediction for each of the test questions (test data),
              one `[coarse_class, fine_class]` entry per row.
    :raises SystemExit: with code -10 when any of the pre-trained models can not be read.
    """
    start_test = datetime.datetime.now()
    print("\n* Testing started - {0} model".format(ml_algo))

    node_names = ("coarse", "abbr", "desc", "enty", "hum", "loc", "num")
    model_dir = rp + "/{0}".format(ml_algo)

    # load all the trained models; `read_obj` answers with a (read_flag, object) pair
    loaded = {
        node: read_obj("{0}_model".format(node), model_dir)
        for node in node_names
    }
    failed_nodes = [node for node in node_names if not loaded[node][0]]
    for node in failed_nodes:
        print("- Error in reading {0} {1} model".format(node, ml_algo))
    if failed_nodes:
        print("- Error in loading the pre-trained model")
        raise SystemExit(-10)
    print("- Loading the {0} models complete".format(ml_algo))
    models = {node: loaded[node][1] for node in node_names}

    features = {
        node: get_ft_obj("test", rp, ml_algo, node).tocsr()
        for node in node_names
    }
    print("- DataPrep for test data done.")

    # Coarse class -> fine node answering for it; everything else falls back to `num`.
    fine_node_of = {
        "ABBR": "abbr",
        "DESC": "desc",
        "ENTY": "enty",
        "HUM": "hum",
        "LOC": "loc",
    }
    coarse_model = models["coarse"]
    coarse_ft = features["coarse"]
    pred = []
    for i in range(coarse_ft.shape[0]):
        c = coarse_model.predict(coarse_ft[i])[0]
        node = fine_node_of.get(c, "num")
        f = models[node].predict(features[node][i])[0]
        pred.append([c, f])

    # Plain arithmetic on the elapsed seconds: no deprecated `utcfromtimestamp`, and the hour count
    # does not wrap around after 24 hours.
    elapsed = int((datetime.datetime.now() - start_test).total_seconds())
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print("- Predicting done : {3} models in {0}h {1}m {2}s".format(
        hours, minutes, seconds, ml_algo))
    return pred