Beispiel #1
0
def get_data_loader(rp: str, data_type: str):
    """
    Build a PyTorch `DataLoader` of "(features, target)" pairs for the Neural Network.

    1. Reads the coarse and fine class files for `data_type`; terminates the
       process with exit status -11 if either read reports failure.
    2. Creates one label per row by concatenating "coarse_class :: fine_class".
    3. Fits a `MultiLabelBinarizer` on the labels when `data_type` is `training`,
       otherwise loads the previously stored `label_binarizer` object.
    4. Converts the labels to their binarized form and writes the binarizer
       under the `nn_model_str` directory.
    5. Loads the features using the `get_ft_obj` function and densifies them
       with `toarray()`.
    6. Converts the binarized labels and the features (cast to float) into
       PyTorch tensors and wraps them in a `TensorDataset` / `DataLoader`
       using the module-level `batch_size`.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param data_type: String either `training` or `test`.

    :return:
        feat_size: Number of feature columns in the loaded feature matrix (`x_ft.shape[1]`).
        data_loader: Loader object containing the features and binarized labels.
    """
    # `exit` is a helper installed by the `site` module for interactive use;
    # `sys.exit` raises the same SystemExit and is always available.
    import sys

    crf, coarse = read_file("coarse_classes_{0}".format(data_type), rp)
    frf, fine = read_file("fine_classes_{0}".format(data_type), rp)
    # Check the read flags BEFORE touching the returned data, so that a failed
    # read produces the intended error message instead of a crash while
    # iterating over an unusable result.
    if not crf:
        print("Error in reading actual ({0}) coarse classes".format(data_type))
        sys.exit(-11)
    if not frf:
        print("Error in reading actual ({0}) fine classes".format(data_type))
        sys.exit(-11)
    c_lb = [remove_endline_char(c).strip() for c in coarse]
    f_lb = [remove_endline_char(f).strip() for f in fine]
    # Indexing (rather than zip) keeps the original behaviour: a coarse list
    # shorter than the fine list fails loudly instead of being truncated.
    labels_numpy = [c_lb[i] + " :: " + f_lb[i] for i in range(len(f_lb))]
    binarizer_dir = rp + "/{0}".format(nn_model_str)
    # NOTE(review): every label here is a plain string, while MultiLabelBinarizer
    # expects an iterable of label collections per sample - it may treat each
    # string as a sequence of characters. Confirm this is intended.
    if data_type == "training":
        mlb = MultiLabelBinarizer().fit(labels_numpy)
    else:
        mlb = read_obj("label_binarizer", binarizer_dir)[1]
    labels_bin = mlb.transform(labels_numpy)
    # The binarizer is written back for both `training` and `test` runs.
    write_obj(mlb, "label_binarizer", binarizer_dir)
    print("- Labels loading into numpy done.")
    x_ft = get_ft_obj(data_type, rp, "{0}".format(nn_model_str),
                      "coarse").toarray()
    feat_size = x_ft.shape[1]
    print("- Features loading into numpy done.")
    labels = torch.from_numpy(labels_bin)
    data = torch.from_numpy(x_ft).float()
    print("- Features and labels as tensors, done.")
    train_data = TensorDataset(data, labels)
    data_loader = DataLoader(train_data, batch_size=batch_size)
    print("- {0} loader done.".format(data_type))
    return feat_size, data_loader
def get_vect(data_type: str, rp: str, prop_type: str, ml_algo: str,
             cat_type: str, text_data):
    """
    Fit (training) or load (test) the Word Vectorizer (CountVectorizer) for the text data.

    For `training` a CountVectorizer with unigrams and bigrams is fitted on
    `text_data` and serialized as `<cat_type>_<prop_type>_vec` under
    `<rp>/<ml_algo>`. For `test` the previously serialized vectorizer is read
    back from the same location; `text_data` is not used in that case.

    :argument:
        :param data_type: String either `training` or `test`.
        :param rp: Absolute path of the root directory of the project.
        :param prop_type:  Natural language property either `word` (from spaCy) or `ner` (from StanfordNER).
        :param ml_algo: Machine algorithm for which the dataprep is running.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                                        (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param text_data: Data on which CountVectorizer is fitted on while training.
    :return:
        boolean_flag: Flag reported by the write (training) or read (test) operation;
                      False when `data_type` is neither `training` nor `test`.
        count_vec: CountVectorizer object, or None when `data_type` is invalid.
    """
    obj_name = "{0}_{1}_vec".format(cat_type, prop_type)
    obj_dir = rp + "/{0}".format(ml_algo)
    # --------------------------------------------Experimental code---------------------------------------------------------
    # Other word embeddings technique can also be tried out - e.g GloVe
    if data_type == "training":
        count_vec = CountVectorizer(ngram_range=(1, 2)).fit(text_data)
        wflag = write_obj(count_vec, obj_name, obj_dir)
        return wflag, count_vec
    if data_type == "test":
        rflag, count_vec = read_obj(obj_name, obj_dir)
        return rflag, count_vec
    print(
        "Error: Wrong `data_type` param to function `dataprep.text.get_vect`"
    )
    # Keep the (flag, vectorizer) shape of the other branches so that callers
    # unpacking the result do not fail with a TypeError on the error path.
    return False, None
def coarse_ann_computations(data_type: str, rp: str):
    """
    Compute the text annotations for the train/test data and store them.

    Runs `com_annotations` for the selected data split and, if it reports
    success, serializes the result with `write_obj` under the name
    `coarse_<split>_doc`. Any `data_type` other than `training` is handled
    as `test`.

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for successful operation, False if either the
                      computation or the write step reports failure.
    """
    split = "test"
    if data_type == "training":
        split = "training"

    computed, annotations = com_annotations(split, rp)
    if not computed:
        print("\n- ERROR: While computing annotations for {0} data.".format(
            split))
        return False

    stored = write_obj(annotations, "coarse_{0}_doc".format(split), rp)
    if not stored:
        print("\n- ERROR: While writing annotations for {0} data.".format(
            split))
        return False

    print("- Computing annotations for {0} data done.".format(split))
    return True
Beispiel #4
0
def fine_prop_separation(data_type: str, rp: str, prop_type: str):
    """
    Split a language property per coarse class and store one object per class.

    Calls `sep_lang_prop` for the selected data split and, if it reports
    success, writes the six returned collections with `write_obj` under the
    names `<class>_<split>_<prop_type>` for the classes
    `abbr` | `desc` | `enty` | `hum` | `loc` | `num`.
    Any `data_type` other than `training` is handled as `test`.

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
        :param prop_type: Natural language property either `doc` (from spaCy) or `ner` (from StanfordNER)
    :return:
        boolean_flag: True for successful operation.
    """
    split = "training" if data_type == "training" else "test"
    ok, abbr, desc, enty, hum, loc, num = sep_lang_prop(split, rp, prop_type)
    if not ok:
        print("\n- ERROR: While computing {1} tags for {0} data.".format(
            split, prop_type))
        return False

    per_class = (
        ("abbr", abbr),
        ("desc", desc),
        ("enty", enty),
        ("hum", hum),
        ("loc", loc),
        ("num", num),
    )
    # A list (not a generator) so that every write is attempted even when an
    # earlier one reports failure.
    write_flags = [
        write_obj(props, "{0}_{1}_{2}".format(prefix, split, prop_type), rp)
        for prefix, props in per_class
    ]
    if all(write_flags):
        print("- Separating {1} tags for {0} data done.".format(
            split, prop_type))
        return True

    print("\n- ERROR: While writing {1} tags for {0} data.".format(
        split, prop_type))
    return False
def fine_prop_separation(data_type: str, rp: str, prop_type: str):
    """
    Generate the separate per-class data objects used by the fine class prediction models.

    The property data returned by `sep_lang_prop` is grouped by coarse class
    (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`); each group is written
    with `write_obj` to its own object named `<class>_<split>_<prop_type>`.
    Any `data_type` other than `training` is handled as `test`.

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
        :param prop_type: Natural language property either `doc` (from spaCy) or `ner` (from StanfordNER)
    :return:
        boolean_flag: True for successful operation, False if the separation
                      or any of the six writes reports failure.
    """
    if data_type == "training":
        split = "training"
    else:
        split = "test"

    (separated_ok, abbr_items, desc_items, enty_items, hum_items, loc_items,
     num_items) = sep_lang_prop(split, rp, prop_type)

    if separated_ok:
        suffix = "_{0}_{1}".format(split, prop_type)
        all_written = True
        # Every class is written even if a previous write failed; the overall
        # result is only successful when none of them failed.
        for class_name, class_items in (("abbr", abbr_items),
                                        ("desc", desc_items),
                                        ("enty", enty_items),
                                        ("hum", hum_items),
                                        ("loc", loc_items),
                                        ("num", num_items)):
            if not write_obj(class_items, class_name + suffix, rp):
                all_written = False

        if not all_written:
            print("\n- ERROR: While writing {1} tags for {0} data.".format(
                split, prop_type))
            return False
        print("- Separating {1} tags for {0} data done.".format(
            split, prop_type))
        return True

    print("\n- ERROR: While computing {1} tags for {0} data.".format(
        split, prop_type))
    return False
def train_one_node(rp: str, cat_type: str, ml_algo: str):
    """
    Train one machine learning node and serialize it to the secondary memory (hard-disk).

    Gets the training features through `get_ft_obj` and the matching labels
    from the `<cat_type>_classes_training` file, fits the model selected by
    `ml_algo` and writes the fitted object as `<cat_type>_model` under
    `<rp>/<ml_algo>`.

    :argument:
        :param rp: Absolute path of the root directory of the project.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                                        (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        boolean_flag: True for successful operation. False when `ml_algo` is
                      not a supported algorithm or when writing the trained
                      model reports failure.
    """
    # -----------------------------------Experimental code--------------------------------------------------------------
    # 1. This is the part where you can experiment and play with the parameters.
    # 2. If you want to add more models or combinations, you just need to add an `elif` condition and
    #    provide the condition value in argument from the shell. e.g `train svm`,
    #    here `svm` will be in the variable {ml_algo}.

    if ml_algo == "svm":
        machine = svm.SVC()
    elif ml_algo == "linear_svm":
        machine = svm.LinearSVC()
    elif ml_algo == "lr":
        machine = linear_model.LogisticRegression(solver="newton-cg")
    else:
        print(
            "- Error while training {0} model. {0} is unexpected ML algorithm".
            format(ml_algo))
        # Stop here: without a model there is nothing to fit, and continuing
        # would fail with an AttributeError on `None.fit`.
        return False

    # Parameter tuning ends here.
    # ------------------------------------------------------------------------------------------------------------------
    # The algorithm is validated before any data is loaded, so an unsupported
    # `ml_algo` is reported without first reading features from disk.
    x_ft = get_ft_obj("training", rp, ml_algo, cat_type)
    labels = read_file("{0}_classes_training".format(cat_type), rp)[1]
    y_lb = [remove_endline_char(c).strip() for c in labels]
    model = machine.fit(x_ft, y_lb)
    mw_flag = write_obj(model, "{0}_model".format(cat_type),
                        rp + "/{0}".format(ml_algo))
    if mw_flag:
        print("- Training done for {0} model of {1}".format(cat_type, ml_algo))
        return True
    print("- Error in writing trained {0} model of {1}".format(
        cat_type, ml_algo))
    return False
Beispiel #7
0
def coarse_ner_computations(data_type: str, rp: str):
    """
    Compute the NER tags for the train/test data and store them.

    Runs `com_ner` for the selected data split and, if it reports success,
    serializes the tags with `write_obj` under the name `coarse_<split>_ner`.
    Any `data_type` other than `training` is handled as `test`.

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for successful operation, False if either the
                      computation or the write step reports failure.
    """
    split = "test"
    if data_type == "training":
        split = "training"

    computed, tags = com_ner(split, rp)
    if not computed:
        print("\n- ERROR: While computing NER tags for {0} data.".format(split))
        return False

    stored = write_obj(tags, "coarse_{0}_ner".format(split), rp)
    if not stored:
        print(
            "\n- ERROR: While writing NER tags for {0} data.".format(split))
        return False

    print("- Computing NER tags for {0} data done.".format(split))
    return True