def get_actual(rp: str):
    """
    Reads the test labels and returns them as a Python list.

    :argument
        :param rp: Absolute path of the root directory of the project.
    :return:
        actual: List of actual labels for each of the test questions (test data).
    """
    actual = []
    crf, coarse = read_file("coarse_classes_test", rp)
    frf, fine = read_file("fine_classes_test", rp)
    if not crf:
        print("Error in reading actual (test) coarse classes")
        exit(-11)
    if not frf:
        print("Error in reading actual (test) fine classes")
        exit(-11)
    c_lb = [remove_endline_char(c).strip() for c in coarse]
    f_lb = [remove_endline_char(f).strip() for f in fine]
    ll = len(c_lb)
    for i in range(0, ll):
        row_lb = [c_lb[i], f_lb[i]]
        actual.append(row_lb)
    return actual
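# A minimal, self-contained sketch (not part of the original module) of the
# [coarse, fine] pairing that get_actual builds; the label values below are made up.
def _demo_actual_label_pairs():
    c_lb = ["ENTY", "HUM"]
    f_lb = ["cremat", "ind"]
    actual = [[c, f] for c, f in zip(c_lb, f_lb)]
    print(actual)  # [['ENTY', 'cremat'], ['HUM', 'ind']]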
def get_data_loader(rp: str, data_type: str):
    """
    Reads the labels for the given data split and converts them, along with the
    features, into "(features, target)" tensors for the Neural Network using PyTorch.
    1. Loads the classes, both coarse and fine, and creates a label for
    each row by concatenating them as "coarse_class :: fine_class".
    2. Converts labels_numpy to labels_bin - binarized form to be used in the NN.
    3. Loads the features using the `get_ft_obj` function into numpy arrays.
    4. Gets the number of features used.
    5. Converts labels_bin into a PyTorch tensor - labels.
    6. Converts x_ft (features - independent variables) into a PyTorch tensor - data.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param data_type: String either `training` or `test`.

    :return:
        feat_size: Number of features being used (needed to size the input layer of the Neural Network).
        data_loader: Loader object containing the data and labels used by the Neural Network.
    """
    labels_numpy = []
    crf, coarse = read_file("coarse_classes_{0}".format(data_type), rp)
    frf, fine = read_file("fine_classes_{0}".format(data_type), rp)
    if not crf:
        print("Error in reading actual ({0}) coarse classes".format(data_type))
        exit(-11)
    if not frf:
        print("Error in reading actual ({0}) fine classes".format(data_type))
        exit(-11)
    # check the read flags before touching the file contents
    c_lb = [remove_endline_char(c).strip() for c in coarse]
    f_lb = [remove_endline_char(f).strip() for f in fine]
    label_len = len(f_lb)
    for i in range(0, label_len):
        labels_numpy.append(c_lb[i] + " :: " + f_lb[i])
    if data_type == "training":
        # fit the binarizer on the training labels and persist it for later use
        mlb = MultiLabelBinarizer().fit(labels_numpy)
        write_obj(mlb, "label_binarizer", rp + "/{0}".format(nn_model_str))
    else:
        # reuse the binarizer fitted during training
        mlb = read_obj("label_binarizer", rp + "/{0}".format(nn_model_str))[1]
    labels_bin = mlb.transform(labels_numpy)
    print("- Labels loading into numpy done.")
    x_ft = get_ft_obj(data_type, rp, "{0}".format(nn_model_str),
                      "coarse").toarray()
    feat_size = x_ft.shape[1]
    print("- Features loading into numpy done.")
    labels = torch.from_numpy(labels_bin)
    data = torch.from_numpy(x_ft).float()
    print("- Features and labels as tensors, done.")
    train_data = TensorDataset(data, labels)
    data_loader = DataLoader(train_data, batch_size=batch_size)
    print("- {0} loader done.".format(data_type))
    return feat_size, data_loader
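# A minimal, self-contained sketch (not part of the original module) of how the
# (feat_size, data_loader) pair returned above could drive a simple feed-forward
# network. The layer sizes, optimizer, and loss below are illustrative assumptions,
# and the data is synthetic so the snippet runs on its own.
def _demo_train_with_data_loader():
    import torch
    from torch import nn
    from torch.utils.data import TensorDataset, DataLoader

    feat_size, n_classes = 300, 56                    # assumed dimensions
    x = torch.randn(256, feat_size)                   # stand-in features
    y = (torch.rand(256, n_classes) > 0.95).float()   # stand-in binarized labels
    data_loader = DataLoader(TensorDataset(x, y), batch_size=32)

    model = nn.Sequential(nn.Linear(feat_size, 128), nn.ReLU(),
                          nn.Linear(128, n_classes))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.BCEWithLogitsLoss()                # multi-label binarized targets

    for _ in range(3):                                # a few illustrative epochs
        for features, target in data_loader:
            optimizer.zero_grad()
            loss = criterion(model(features), target)
            loss.backward()
            optimizer.step()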
def com_annotations(data_type: str, rp: str):
    """
    Computes all the annotations (lemma, POS, IS_STOPWORD, etc.) as spaCy doc containers
    for every row (line) in the text data and collects the doc (container) of each line into a list.
    # doc = spaCy container - [https://spacy.io/api/doc]

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for successful operation.
        all_annotations: List of doc (spaCy containers) of all the lines in the data.
    """
    load_spacy()

    all_annotations = []
    read_flag, file = read_file("raw_sentence_{0}".format(data_type), rp)
    if read_flag:
        for line in file:
            doc = nlp(remove_endline_char(line))
            all_annotations.append(doc)
        file.close()
        return True, all_annotations
    else:
        # keep the return arity consistent with the success path
        return False, None
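# A minimal, self-contained sketch (not part of the original module) of the per-token
# annotations available on the spaCy docs collected above. The model name
# "en_core_web_sm" is an assumption; the project may load a different pipeline.
def _demo_spacy_annotations():
    import spacy

    nlp = spacy.load("en_core_web_sm")
    doc = nlp("What films featured the character Popeye Doyle ?")
    for token in doc:
        print(token.text, token.lemma_, token.pos_, token.is_stop)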
def sep_lang_prop(data_type: str, rp: str, prop_type: str):
    """
    Gets all the pre-computed natural language properties and separates the annotations
    by coarse class, which makes it easier to train our sub-models
    (one model for the fine classes within each coarse class).

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
        :param prop_type: Natural language property either `doc` (from spaCy) or `ner` (from StanfordNER)
    :return:
        boolean_flag: True for successful operation.
        abbr_prop: List of prop (spaCy containers or NER tags) of questions belonging to ABBR coarse class.
        desc_prop: List of prop (spaCy containers or NER tags) of questions belonging to DESC coarse class.
        enty_prop: List of prop (spaCy containers or NER tags) of questions belonging to ENTY coarse class.
        hum_prop: List of prop (spaCy containers or NER tags) of questions belonging to HUM coarse class.
        loc_prop: List of prop (spaCy containers or NER tags) of questions belonging to LOC coarse class.
        num_prop: List of prop (spaCy containers or NER tags) of questions belonging to NUM coarse class.
    """
    abbr_prop = []
    desc_prop = []
    enty_prop = []
    hum_prop = []
    loc_prop = []
    num_prop = []
    read_obj_flag, all_prop_obj = read_obj(
        "coarse_{0}_{1}".format(data_type, prop_type), rp)
    read_file_flag, coarse_classes_file = read_file(
        "coarse_classes_{0}".format(data_type), rp)
    if read_obj_flag and read_file_flag:
        i = 0
        for line in coarse_classes_file:
            coarse_c = remove_endline_char(line).strip()
            if coarse_c == "ABBR":
                abbr_prop.append(all_prop_obj[i])
            elif coarse_c == "DESC":
                desc_prop.append(all_prop_obj[i])
            elif coarse_c == "ENTY":
                enty_prop.append(all_prop_obj[i])
            elif coarse_c == "HUM":
                hum_prop.append(all_prop_obj[i])
            elif coarse_c == "LOC":
                loc_prop.append(all_prop_obj[i])
            elif coarse_c == "NUM":
                num_prop.append(all_prop_obj[i])
            else:
                print("{0} is an unexpected coarse class".format(coarse_c))
            # increment i by one so that the annotation objects stay aligned with the class lines
            i = i + 1
        # validation: every annotation object must have been mapped to a class line
        if i != len(all_prop_obj):
            print("Something went wrong in mapping the processed annotations.")
            return False, None, None, None, None, None, None
        return True, abbr_prop, desc_prop, enty_prop, hum_prop, loc_prop, num_prop
    else:
        # keep the return arity consistent with the success path
        return False, None, None, None, None, None, None
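# A minimal, self-contained illustration (not part of the original module) of the
# per-coarse-class separation performed above, using stand-in annotation objects
# instead of real spaCy docs or NER tag strings.
def _demo_separate_by_coarse_class():
    coarse_classes = ["ENTY", "HUM", "ENTY", "LOC"]
    all_prop_obj = ["doc_0", "doc_1", "doc_2", "doc_3"]
    enty_prop = [p for p, c in zip(all_prop_obj, coarse_classes) if c == "ENTY"]
    hum_prop = [p for p, c in zip(all_prop_obj, coarse_classes) if c == "HUM"]
    print(enty_prop, hum_prop)  # ['doc_0', 'doc_2'] ['doc_1']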
def train_one_node(rp: str, cat_type: str, ml_algo: str):
    """
    Gets data in the form of a sparse matrix from the `qc.dataprep.feature_stack` module,
    ready for use in a machine learning model. Trains an ML node on this data
    and serializes the trained object to secondary memory (hard disk).

    :argument:
        :param rp: Absolute path of the root directory of the project.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                                        (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        boolean_flag: True if the model was trained and written successfully, False otherwise.
                      (The trained model itself is serialized to disk, not returned.)
    """
    x_ft = get_ft_obj("training", rp, ml_algo, cat_type)
    labels = read_file("{0}_classes_training".format(cat_type), rp)[1]
    y_lb = [remove_endline_char(c).strip() for c in labels]
    machine = None
    # -----------------------------------Experimental code--------------------------------------------------------------
    # 1. This is the part where you can experiment and play with the parameters.
    # 2. If you want to add more models or combinations, you just need to add an `elif` condition and
    #    provide the condition value in argument from the shell. e.g `train svm`,
    #    here `svm` will be in the variable {ml_algo}.

    if ml_algo == "svm":
        machine = svm.SVC()
    elif ml_algo == "linear_svm":
        machine = svm.LinearSVC()
    elif ml_algo == "lr":
        machine = linear_model.LogisticRegression(solver="newton-cg")
    else:
        print(
            "- Error while training {0} model. {0} is unexpected ML algorithm".
            format(ml_algo))
        # abort instead of calling .fit() on an undefined model below
        return False

    # Parameter tuning ends here.
    # ------------------------------------------------------------------------------------------------------------------
    model = machine.fit(x_ft, y_lb)
    mw_flag = write_obj(model, "{0}_model".format(cat_type),
                        rp + "/{0}".format(ml_algo))
    if mw_flag:
        print("- Training done for {0} model of {1}".format(cat_type, ml_algo))
        return True
    else:
        print("- Error in writing trained {0} model of {1}".format(
            cat_type, ml_algo))
        return False
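# A minimal, self-contained sketch (not part of the original module) of the
# fit/predict pattern that train_one_node relies on. The tiny sparse matrix and
# labels below are purely illustrative; the real features come from get_ft_obj.
def _demo_svc_fit_predict():
    from sklearn import svm
    from scipy.sparse import csr_matrix

    x_ft = csr_matrix([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # stand-in sparse features
    y_lb = ["ENTY", "HUM", "ENTY"]                            # stand-in coarse labels
    model = svm.SVC().fit(x_ft, y_lb)
    print(model.predict(csr_matrix([[1.0, 0.0]])))            # predicted coarse class, e.g. ['ENTY']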
def read_raw_data(file_key, rp):
    """
    Reads the dataset in its original form and converts it into a structure
    that is easier to use in the subsequent processing steps.

    :argument:
        :param file_key: A string which represents the raw data file, in properties.conf,
                          used for the process (experiment).
        :param rp: Absolute path of the root directory of the project

    :Expects:
        Expected line format "coarse_class:fine_class This is the question string"
    :returns:
        boolean flag: True for successful operation
        coarse_classes_list: List of coarse classes for questions
        fine_classes_list: List of fine classes for questions
        questions_list: List of the questions in the raw data file
    :Example for a single line:
        ENTY:cremat What films featured the character Popeye Doyle ?
        :return: coarse_classes_list = ['ENTY'], fine_classes_list = ['cremat'],
                 questions_list = ['What films featured the character Popeye Doyle ?']
    """
    coarse_classes_list = []
    fine_classes_list = []
    questions_list = []
    flag, file = read_file(file_key, rp)
    if flag:
        for line in file:
            space_separated_row = line.split(" ")
            classes = space_separated_row[0].split(":")
            question = " ".join(space_separated_row[1:])
            coarse_class, fine_class = classes[0], classes[1]
            coarse_classes_list.append(coarse_class)
            fine_classes_list.append(fine_class)
            questions_list.append(question)
        file.close()
        return True, coarse_classes_list, fine_classes_list, questions_list
    else:
        # keep the return arity consistent with the success path
        return False, None, None, None
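# A tiny, self-contained illustration (not part of the original module) of the
# line-splitting logic used above, on the example line from the docstring.
def _demo_parse_raw_line():
    line = "ENTY:cremat What films featured the character Popeye Doyle ?"
    space_separated_row = line.split(" ")
    coarse_class, fine_class = space_separated_row[0].split(":")
    question = " ".join(space_separated_row[1:])
    print(coarse_class, fine_class, question)
    # ENTY cremat What films featured the character Popeye Doyle ?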
def com_ner(data_type: str, rp: str):
    """
    Gets the NER tags of each sentence using the pre-trained sequential model from the Stanford NLP programs.

     :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for successful operation.
        all_ners: List of NER tags of each line
    """
    # initialize the tagger client corresponding to the StanfordNER server
    tagger = Ner(host="localhost", port=9199)
    all_ners = []
    read_flag, file = read_file("raw_sentence_{0}".format(data_type), rp)
    if read_flag:
        for line in file:
            word_tags = tagger.get_entities(line)
            ner_tags = [x[1] for x in word_tags]
            all_ners.append(" ".join(ner_tags))
        file.close()
        return True, all_ners
    else:
        # keep the return arity consistent with the success path
        return False, None
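# A minimal sketch (not part of the original module) of the tag-extraction step above.
# It assumes a Stanford NER server is already running on localhost:9199; the sentence
# and the printed tags are illustrative only.
def _demo_ner_tags():
    from sner import Ner

    tagger = Ner(host="localhost", port=9199)
    word_tags = tagger.get_entities("Where is the Eiffel Tower ?")
    # word_tags is a list of (token, tag) pairs, e.g. ('Eiffel', 'LOCATION')
    print(" ".join(tag for _, tag in word_tags))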