Example #1
# Imports needed by these examples. The project-local helpers (read_file,
# write_obj, read_obj, remove_endline_char, get_ft_obj) and the globals
# nn_model_str and batch_size come from elsewhere in the project; the
# docstrings below place get_ft_obj in qc.dataprep.feature_stack, and
# `Doc` is assumed to be spaCy's token container.
import datetime

import torch
from sklearn import linear_model, svm
from sklearn.preprocessing import MultiLabelBinarizer
from spacy.tokens import Doc
from torch.utils.data import DataLoader, TensorDataset
def get_data_loader(rp: str, data_type: str):
    """
    Reads the labels for the training data and converts it into a tensor
    of "(features, target)" for Neural Network using PyTorch.
    1. Loads the training classes, both coarse and fine and creates label for
    each row by "concatenating coarse_class :: fine_class".
    2. Converts the labels_numpy to labels_bin - Binarized form to be used in NN.
    3. Loads the features using the `get_ft_obj` function in numpy arrays.
    4. Get the number of features used.
    5. Converts label_numpy into PyTorch tensor - labels.
    6. Converts x_ft(features - independent variables) into PyTorch

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param data_type: String either `training` or `test`.

    :return:
        feat_size: Number of features which are being used, so that we can keep the
        data_loader: Loader object containing train data and labels used to train the Neural Network.
    """
    crf, coarse = read_file("coarse_classes_{0}".format(data_type), rp)
    frf, fine = read_file("fine_classes_{0}".format(data_type), rp)
    # Bail out before touching the file contents if either read failed.
    if not crf:
        print("Error in reading actual ({0}) coarse classes".format(data_type))
        exit(-11)
    if not frf:
        print("Error in reading actual ({0}) fine classes".format(data_type))
        exit(-11)
    c_lb = [remove_endline_char(c).strip() for c in coarse]
    f_lb = [remove_endline_char(f).strip() for f in fine]
    # Each row has exactly one combined label; wrap it in a list so that
    # MultiLabelBinarizer treats the whole string as a single label instead
    # of iterating over its characters.
    labels_numpy = [[c + " :: " + f] for c, f in zip(c_lb, f_lb)]
    # Fit the binarizer on the training labels; for test data, reuse the
    # binarizer serialized during training so the label columns line up.
    if data_type == "training":
        mlb = MultiLabelBinarizer().fit(labels_numpy)
        write_obj(mlb, "label_binarizer", rp + "/{0}".format(nn_model_str))
    else:
        mlb = read_obj("label_binarizer", rp + "/{0}".format(nn_model_str))[1]
    labels_bin = mlb.transform(labels_numpy)
    print("- Labels loading into numpy done.")
    x_ft = get_ft_obj(data_type, rp, nn_model_str, "coarse").toarray()
    feat_size = x_ft.shape[1]
    print("- Features loading into numpy done.")
    labels = torch.from_numpy(labels_bin)
    data = torch.from_numpy(x_ft).float()
    print("- Features and labels as tensors, done.")
    train_data = TensorDataset(data, labels)
    data_loader = DataLoader(train_data, batch_size=batch_size)
    print("- {0} loader done.".format(data_type))
    return feat_size, data_loader
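
A minimal usage sketch (the root path is hypothetical; loading `test` data
assumes a binarizer was already serialized during training):

# Hypothetical usage: build the training loader and inspect one batch.
feat_size, train_loader = get_data_loader("/path/to/project", "training")
for batch_data, batch_labels in train_loader:
    print(feat_size, batch_data.shape, batch_labels.shape)
    break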

Example #2
def train_one_node(rp: str, cat_type: str, ml_algo: str):
    """
    Gets data in the form of sparse matrix from `qc.dataprep.feature_stack` module
    which is ready for use in a machine learning model. Using the data trains a ml node
    and serialize the trained object to the secondary memory (hard-disk).

    :argument:
        :param rp: Absolute path of the root directory of the project.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                                        (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        boolean_flag: True for successful operation.
        model: trained SVC model
    """
    x_ft = get_ft_obj("training", rp, ml_algo, cat_type)
    labels = read_file("{0}_classes_training".format(cat_type), rp)[1]
    y_lb = [remove_endline_char(c).strip() for c in labels]
    machine = None
    # -----------------------------------Experimental code--------------------------------------------------------------
    # 1. This is the part where you can experiment and play with the parameters.
    # 2. If you want to add more models or combinations, add an `elif` condition
    #    and pass the condition value as an argument from the shell, e.g. `train svm`;
    #    here `svm` ends up in the variable {ml_algo}. A hypothetical example follows.
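    #    For instance (illustrative only; `rf` is not one of the project's
    #    supported values and would need `from sklearn import ensemble`):
    #        elif ml_algo == "rf":
    #            machine = ensemble.RandomForestClassifier()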

    if ml_algo == "svm":
        machine = svm.SVC()
    elif ml_algo == "linear_svm":
        machine = svm.LinearSVC()
    elif ml_algo == "lr":
        machine = linear_model.LogisticRegression(solver="newton-cg")
    else:
        print(
            "- Error while training {0} model. {0} is an unexpected ML algorithm".
            format(ml_algo))
        # Without a valid model there is nothing to fit; fail fast instead of
        # crashing on `machine.fit` below.
        return False

    # Parameter tuning ends here.
    # ------------------------------------------------------------------------------------------------------------------
    model = machine.fit(x_ft, y_lb)
    mw_flag = write_obj(model, "{0}_model".format(cat_type),
                        rp + "/{0}".format(ml_algo))
    if mw_flag:
        print("- Training done for {0} model of {1}".format(cat_type, ml_algo))
        return True
    else:
        print("- Error in writing trained {0} model of {1}".format(
            cat_type, ml_algo))
        return False
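
A hypothetical driver that trains the coarse node and all six fine-class nodes
(the root path is an illustrative placeholder):

for node in ["coarse", "abbr", "desc", "enty", "hum", "loc", "num"]:
    if not train_one_node("/path/to/project", node, "linear_svm"):
        print("Training failed for {0} node".format(node))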

Example #3
def get_predictions(question_doc: Doc, rp: str, ml_algo: str):
    """
    Gets data in the form of sparse matrix from `qc.dataprep.feature_stack` module
    which is ready for use in a machine learning model. Using the test data in `question-classification/dataset`
    tests [[ml_algo]] model which is pre-trained and saved as serialized pickle files.

    :argument
        :param question_doc The question to be classified given as tagged Doc from NLP process.
        :param rp: Absolute path of the root directory of the project.
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        pred: Question class prediction.
    """

    c_ft = get_ft_obj("api", rp, ml_algo, "coarse", [question_doc]).tocsr()
    a_ft = get_ft_obj("api", rp, ml_algo, "abbr", [question_doc]).tocsr()
    d_ft = get_ft_obj("api", rp, ml_algo, "desc", [question_doc]).tocsr()
    e_ft = get_ft_obj("api", rp, ml_algo, "enty", [question_doc]).tocsr()
    h_ft = get_ft_obj("api", rp, ml_algo, "hum", [question_doc]).tocsr()
    l_ft = get_ft_obj("api", rp, ml_algo, "loc", [question_doc]).tocsr()
    n_ft = get_ft_obj("api", rp, ml_algo, "num", [question_doc]).tocsr()
    print("- DataPrep done.")

    # The per-class models (coarse_model, abbr_model, ...) are not defined in
    # this function, so they must be module-level globals loaded via read_obj,
    # as in the test routine below.
    c = coarse_model.predict(c_ft[0])[0]

    if c == "ABBR":
        f = abbr_model.predict(a_ft[0])[0]
    elif c == "DESC":
        f = desc_model.predict(d_ft[0])[0]
    elif c == "ENTY":
        f = enty_model.predict(e_ft[0])[0]
    elif c == "HUM":
        f = hum_model.predict(h_ft[0])[0]
    elif c == "LOC":
        f = loc_model.predict(l_ft[0])[0]
    else:
        f = num_model.predict(n_ft[0])[0]

    print("- Predict done.")

    return [c, f]
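
A hypothetical call (the spaCy model name and root path are assumptions; the
project may use a different NLP pipeline):

import spacy
# Assumes the per-class models have already been loaded as module globals.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Who wrote Hamlet ?")
coarse, fine = get_predictions(doc, "/path/to/project", "linear_svm")
print(coarse, fine)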

Example #4
def get_predictions(rp: str, ml_algo: str):
    """
    Gets data in the form of sparse matrix from `qc.dataprep.feature_stack` module
    which is ready for use in a machine learning model. Using the test data in `question-classification/dataset`
    tests [[ml_algo]] model which is pre-trained and saved as serialized pickle files.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        pred: List of prediction for each of the test questions (test data).
    """
    start_test = datetime.datetime.now().timestamp()
    print("\n* Testing started - {0} model".format(ml_algo))
    pred = []
    # load all the trained models
    crf, coarse_model = read_obj("coarse_model", rp + "/{0}".format(ml_algo))
    arf, abbr_model = read_obj("abbr_model", rp + "/{0}".format(ml_algo))
    drf, desc_model = read_obj("desc_model", rp + "/{0}".format(ml_algo))
    erf, enty_model = read_obj("enty_model", rp + "/{0}".format(ml_algo))
    hrf, hum_model = read_obj("hum_model", rp + "/{0}".format(ml_algo))
    lrf, loc_model = read_obj("loc_model", rp + "/{0}".format(ml_algo))
    nrf, num_model = read_obj("num_model", rp + "/{0}".format(ml_algo))
    if not crf:
        print("- Error in reading coarse {0} model".format(ml_algo))
    if not arf:
        print("- Error in reading abbr {0} model".format(ml_algo))
    if not drf:
        print("- Error in reading desc {0} model".format(ml_algo))
    if not erf:
        print("- Error in reading enty {0} model".format(ml_algo))
    if not hrf:
        print("- Error in reading hum {0} model".format(ml_algo))
    if not lrf:
        print("- Error in reading loc {0} model".format(ml_algo))
    if not nrf:
        print("- Error in reading num {0} model".format(ml_algo))
    if crf and arf and drf and erf and hrf and lrf and nrf:
        print("- Loading the {0} models complete".format(ml_algo))
    else:
        print("- Error in loading the pre-trained model")
        exit(-10)
    c_ft = get_ft_obj("test", rp, ml_algo, "coarse").tocsr()
    a_ft = get_ft_obj("test", rp, ml_algo, "abbr").tocsr()
    d_ft = get_ft_obj("test", rp, ml_algo, "desc").tocsr()
    e_ft = get_ft_obj("test", rp, ml_algo, "enty").tocsr()
    h_ft = get_ft_obj("test", rp, ml_algo, "hum").tocsr()
    l_ft = get_ft_obj("test", rp, ml_algo, "loc").tocsr()
    n_ft = get_ft_obj("test", rp, ml_algo, "num").tocsr()
    print("- DataPrep for test data done.")
    for i in range(0, c_ft.shape[0]):
        c = coarse_model.predict(c_ft[i])[0]
        if c == "ABBR":
            f = abbr_model.predict(a_ft[i])[0]
        elif c == "DESC":
            f = desc_model.predict(d_ft[i])[0]
        elif c == "ENTY":
            f = enty_model.predict(e_ft[i])[0]
        elif c == "HUM":
            f = hum_model.predict(h_ft[i])[0]
        elif c == "LOC":
            f = loc_model.predict(l_ft[i])[0]
        else:
            f = num_model.predict(n_ft[i])[0]
        row_pred = [c, f]
        pred.append(row_pred)
    end_test = datetime.datetime.now().timestamp()
    # Render the elapsed seconds as h/m/s (valid for runs shorter than 24 hours).
    total_test = datetime.datetime.utcfromtimestamp(end_test - start_test)
    print("- Predicting done : {3} models in {0}h {1}m {2}s".format(
        total_test.hour, total_test.minute, total_test.second, ml_algo))
    return pred
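
A hypothetical evaluation loop, reusing the project's own file helpers to
compare predictions against the actual coarse test labels (the root path is
an illustrative placeholder):

pred = get_predictions("/path/to/project", "linear_svm")
rf, coarse_actual = read_file("coarse_classes_test", "/path/to/project")
actual = [remove_endline_char(c).strip() for c in coarse_actual]
correct = sum(1 for p, a in zip(pred, actual) if p[0] == a)
print("Coarse accuracy: {0:.2f}%".format(100.0 * correct / len(actual)))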