Esempio n. 1
0
def text_ft_arr(data_type: str, rp: str, prop_type: str, ml_algo: str,
                cat_type: str):
    """
    Loads the stored annotations for one natural language property, obtains the matching vectorizer
    through `get_vect` and transforms the annotations into vectorized features.

    :argument:
        :param data_type: String either `training` or `test`.
        :param rp: Absolute path of the root directory of the project.
        :param prop_type: Natural language property either `word` | `lemma` | `pos` | `tag` | `dep` |
                          `shape` | `alpha` | `stop` (from spaCy) or `ner` (from StanfordNER).
        :param ml_algo: Machine algorithm for which the dataprep is running.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                         (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
    :return:
        On success the tuple `(True, text_ft)`, where `text_ft` is the output of `vectorizer.transform`.
        On any failure (invalid `prop_type`, stored object not readable, vectorizer not available)
        the bare boolean `False` - not a tuple. This shape is kept as-is for existing callers.
    """
    list_doc_prop = [
        "word", "lemma", "pos", "tag", "dep", "shape", "alpha", "stop"
    ]
    # Both kinds of property share the same pipeline; only the stored object's suffix differs.
    if prop_type in list_doc_prop:
        obj_suffix = "doc"
    elif prop_type == "ner":
        obj_suffix = "ner"
    else:
        print("- Error: Invalid `prop_type` to function `text_ft_arr`")
        return False

    # Training objects are stored per category; every other data type uses the `coarse` object.
    obj_prefix = cat_type if data_type == "training" else "coarse"
    flag, stored_obj = read_obj(
        "{0}_{1}_{2}".format(obj_prefix, data_type, obj_suffix), rp)
    if not flag:
        return False

    # `doc` objects need the requested property extracted first; `ner` objects are used directly.
    if obj_suffix == "doc":
        text_data = get_info_doc(prop_type, stored_obj)
    else:
        text_data = stored_obj

    vflag, vectorizer = get_vect(data_type, rp, prop_type, ml_algo, cat_type,
                                 text_data)
    if not vflag:
        return False
    return True, vectorizer.transform(text_data)
def get_vect(data_type: str, rp: str, prop_type: str, ml_algo: str,
             cat_type: str, text_data):
    """
    Provides the Word Vectorizer (CountVectorizer) for the given property and category.

    While training, a new CountVectorizer is fitted over `text_data` and written out with `write_obj`.
    While testing, the previously written vectorizer is loaded back with `read_obj`.

    :argument:
        :param data_type: String either `training` or `test`.
        :param rp: Absolute path of the root directory of the project.
        :param prop_type:  Natural language property either `word` (from spaCy) or `ner` (from StanfordNER).
        :param ml_algo: Machine algorithm for which the dataprep is running.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                                        (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param text_data: Data on which CountVectorizer is fitted on while training.
    :return:
        boolean_flag: Flag returned by `write_obj` (training) or `read_obj` (test); `False` for an
                      unknown `data_type`.
        count_vec: CountVectorizer object, or `None` for an unknown `data_type`.
    """
    # --------------------------------------------Experimental code---------------------------------------------------------
    # Other word embeddings technique can also be tried out - e.g GloVe
    vec_name = "{0}_{1}_vec".format(cat_type, prop_type)
    vec_dir = rp + "/{0}".format(ml_algo)
    if data_type == "training":
        count_vec = CountVectorizer(ngram_range=(1, 2)).fit(text_data)
        wflag = write_obj(count_vec, vec_name, vec_dir)
        return wflag, count_vec
    if data_type == "test":
        rflag, count_vec = read_obj(vec_name, vec_dir)
        return rflag, count_vec
    print(
        "Error: Wrong `data_type` param to function `dataprep.text.get_vect`"
    )
    # Callers unpack two values, so the failure path must also be a 2-tuple;
    # a bare `False` would raise TypeError at the call site instead of reporting the error.
    return False, None
Esempio n. 3
0
def text_ft_arr(data_type: str, rp: str, prop_type: str, ml_algo: str,
                cat_type: str, data: list):
    """
    Obtains the list of `doc` objects (read through `read_obj`, or taken from `data` for the `api`
    data type), extracts the requested property with `get_info_doc`, gets the Word Vectorizer from
    `get_vect` and transforms the list of text data to vectorized features.

    :argument:
        :param data_type: String either `training`, `test` or `api`.
        :param rp: Absolute path of the root directory of the project.
        :param prop_type: Natural language property either `word` | `lemma` | `pos` | `tag` | `dep` |
                          `shape` | `alpha` | `stop` | `ner` | (from spaCy)
        :param ml_algo: Machine algorithm for which the dataprep is running.
        :param cat_type: Type of categorical class `coarse` or any of the 6 main classes.
                         (`abbr` | `desc` | `enty` | `hum` | `loc` | `num`)
        :param data: if data_type='api' then provide the list of Docs here (does not read them from file)
    :return:
        On success the tuple `(True, text_ft)`, where `text_ft` is the output of `vectorizer.transform`.
        On any failure (invalid `prop_type`, stored object not readable, vectorizer not available)
        the bare boolean `False` - not a tuple. This shape is kept as-is for existing callers.
    """
    list_doc_prop = [
        "word", "lemma", "pos", "tag", "dep", "shape", "alpha", "stop", "ner"
    ]
    if prop_type not in list_doc_prop:
        print("- Error: Invalid `prop_type` to function `text_ft_arr`")
        return False

    if data_type == "api":
        # The caller hands the Docs over directly, so nothing is read from storage.
        flag, doc_list_obj = True, data
    else:
        # Training objects are stored per category; every other data type uses the `coarse` object.
        obj_prefix = cat_type if data_type == "training" else "coarse"
        flag, doc_list_obj = read_obj(
            "{0}_{1}_doc".format(obj_prefix, data_type), rp)
    if not flag:
        return False

    text_data = get_info_doc(prop_type, doc_list_obj)
    vflag, vectorizer = get_vect(data_type, rp, prop_type, ml_algo, cat_type,
                                 text_data)
    if not vflag:
        return False
    return True, vectorizer.transform(text_data)
def sep_lang_prop(data_type: str, rp: str, prop_type: str):
    """
    Splits the stored natural language properties by coarse class.

    Reads the object `coarse_<data_type>_<prop_type>` and the file `coarse_classes_<data_type>`,
    then walks the class file line by line and appends the annotation at the same position to the
    list of the class named on that line. This makes it easier to train the sub-models
    (model for fine classes given the particular coarse class).

    :argument:
        :param data_type: String either `training` or `test`
        :param rp: Absolute path of the root directory of the project
        :param prop_type: Natural language property either `doc` (from spaCy) or `ner` (from StanfordNER)
    :return:
        On success a 7-tuple:
            boolean_flag: True for successful operation.
            abbr_prop: List of prop of questions belonging to ABBR coarse class.
            desc_prop: List of prop of questions belonging to DESC coarse class.
            enty_prop: List of prop of questions belonging to ENTY coarse class.
            hum_prop: List of prop of questions belonging to HUM coarse class.
            loc_prop: List of prop of questions belonging to LOC coarse class.
            num_prop: List of prop of questions belonging to NUM coarse class.
        On failure (either read failed, or the number of class lines differs from the number of
        annotations) the bare boolean `False`.
    """
    abbr_prop = []
    desc_prop = []
    enty_prop = []
    hum_prop = []
    loc_prop = []
    num_prop = []
    read_obj_flag, all_prop_obj = read_obj(
        "coarse_{0}_{1}".format(data_type, prop_type), rp)
    read_file_flag, coarse_classes_file = read_file(
        "coarse_classes_{0}".format(data_type), rp)
    if not (read_obj_flag and read_file_flag):
        return False

    prop_by_class = {
        "ABBR": abbr_prop,
        "DESC": desc_prop,
        "ENTY": enty_prop,
        "HUM": hum_prop,
        "LOC": loc_prop,
        "NUM": num_prop,
    }
    total_props = len(all_prop_obj)
    line_count = 0
    for line in coarse_classes_file:
        coarse_c = remove_endline_char(line).strip()
        # Compare against None explicitly: the target lists start out empty (falsy).
        target = prop_by_class.get(coarse_c)
        if target is None:
            print("{0} is an unexpected coarse class".format(coarse_c))
        elif line_count < total_props:
            target.append(all_prop_obj[line_count])
        # When there are more class lines than annotations nothing is appended; the
        # mismatch is reported by the validation below instead of raising IndexError here.
        line_count += 1

    # for validation: every class line must correspond to exactly one annotation
    if line_count != total_props:
        print("Something went wrong in mapping the processed annotations.")
        return False
    return True, abbr_prop, desc_prop, enty_prop, hum_prop, loc_prop, num_prop
Esempio n. 5
0
def get_data_loader(rp: str, data_type: str):
    """
    Builds a PyTorch `DataLoader` of "(features, target)" pairs for the Neural Network.

    1. Reads the coarse and fine class files and creates one label per row by
       concatenating "coarse_class :: fine_class".
    2. Fits a `MultiLabelBinarizer` on the labels (training) or loads the stored one (otherwise),
       transforms the labels into binarized form and writes the binarizer out via `write_obj`.
    3. Loads the features using the `get_ft_obj` function and converts them to a dense array.
    4. Gets the number of features used (second dimension of the feature array).
    5. Converts the binarized labels and the features into PyTorch tensors.
    6. Wraps both tensors in a `TensorDataset` and a `DataLoader`.

    Terminates the process through `exit(-11)` when either class file cannot be read.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param data_type: String either `training` or `test`.

    :return:
        feat_size: Number of features which are being used.
        data_loader: Loader object containing the data and labels used by the Neural Network.
    """
    crf, coarse = read_file("coarse_classes_{0}".format(data_type), rp)
    frf, fine = read_file("fine_classes_{0}".format(data_type), rp)
    # Validate the read flags before touching the contents, so that a failed read is
    # reported with the intended message instead of failing while iterating the result.
    if not crf:
        print("Error in reading actual ({0}) coarse classes".format(data_type))
        exit(-11)
    if not frf:
        print("Error in reading actual ({0}) fine classes".format(data_type))
        exit(-11)
    c_lb = [remove_endline_char(c).strip() for c in coarse]
    f_lb = [remove_endline_char(f).strip() for f in fine]

    # The fine labels drive the row count; the coarse label at the same index is prefixed.
    labels_numpy = [
        c_lb[row] + " :: " + fine_label for row, fine_label in enumerate(f_lb)
    ]

    binarizer_dir = rp + "/{0}".format(nn_model_str)
    if data_type == "training":
        # NOTE(review): each label is a plain string, and MultiLabelBinarizer treats every sample
        # as an iterable of labels - confirm that per-character classes are intended here.
        mlb = MultiLabelBinarizer().fit(labels_numpy)
    else:
        mlb = read_obj("label_binarizer", binarizer_dir)[1]
    labels_bin = mlb.transform(labels_numpy)
    write_obj(mlb, "label_binarizer", binarizer_dir)
    print("- Labels loading into numpy done.")

    x_ft = get_ft_obj(data_type, rp, "{0}".format(nn_model_str),
                      "coarse").toarray()
    feat_size = x_ft.shape[1]
    print("- Features loading into numpy done.")

    labels = torch.from_numpy(labels_bin)
    data = torch.from_numpy(x_ft).float()
    print("- Features and labels as tensors, done.")

    train_data = TensorDataset(data, labels)
    data_loader = DataLoader(train_data, batch_size=batch_size)
    print("- {0} loader done.".format(data_type))
    return feat_size, data_loader
def load_models(rp: str, ml_algo: str):
    """
    Load all the trained models into the module-level globals `coarse_model`, `abbr_model`,
    `desc_model`, `enty_model`, `hum_model`, `loc_model` and `num_model`.

    Every model is read with `read_obj` from the directory `<rp>/<ml_algo>`. Each model that fails
    to load is reported individually; if any of them failed the process is terminated through
    `exit(-10)`.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    """
    global coarse_model, abbr_model, desc_model, enty_model, hum_model, loc_model, num_model

    # Build the directory once; formatting the already formatted path a second time
    # would fail for an `ml_algo` that contains braces.
    model_dir = rp + "/{0}".format(ml_algo)
    crf, coarse_model = read_obj("coarse_model", model_dir)
    arf, abbr_model = read_obj("abbr_model", model_dir)
    drf, desc_model = read_obj("desc_model", model_dir)
    erf, enty_model = read_obj("enty_model", model_dir)
    hrf, hum_model = read_obj("hum_model", model_dir)
    lrf, loc_model = read_obj("loc_model", model_dir)
    nrf, num_model = read_obj("num_model", model_dir)

    read_flags = (
        ("coarse", crf),
        ("abbr", arf),
        ("desc", drf),
        ("enty", erf),
        ("hum", hrf),
        ("loc", lrf),
        ("num", nrf),
    )
    for model_name, loaded in read_flags:
        if not loaded:
            print("- Error in reading {0} {1} model".format(model_name, ml_algo))

    if all(loaded for _, loaded in read_flags):
        print("- Loading the {0} models complete".format(ml_algo))
    else:
        print("- Error in loading the pre-trained model")
        exit(-10)
def get_predictions(rp: str, ml_algo: str):
    """
    Predicts the coarse and fine class for every row of the test data.

    Loads the pre-trained [[ml_algo]] models with `read_obj` from `<rp>/<ml_algo>` and the test
    features for every category with `get_ft_obj`. For each row the coarse model predicts the
    coarse class, and the fine model belonging to that class predicts the fine class. Any coarse
    class other than ABBR, DESC, ENTY, HUM or LOC is handled by the NUM model.

    Terminates the process through `exit(-10)` when any model cannot be loaded.

    :argument
        :param rp: Absolute path of the root directory of the project.
        :param ml_algo: The type of machine learning models to be used. (svm | lr | linear_svm)
    :return:
        pred: List of `[coarse_class, fine_class]` predictions, one per row of the test features.
    """
    started_at = datetime.datetime.now()
    print("\n* Testing started - {0} model".format(ml_algo))

    # load all the trained models
    model_dir = rp + "/{0}".format(ml_algo)
    crf, coarse_model = read_obj("coarse_model", model_dir)
    arf, abbr_model = read_obj("abbr_model", model_dir)
    drf, desc_model = read_obj("desc_model", model_dir)
    erf, enty_model = read_obj("enty_model", model_dir)
    hrf, hum_model = read_obj("hum_model", model_dir)
    lrf, loc_model = read_obj("loc_model", model_dir)
    nrf, num_model = read_obj("num_model", model_dir)

    read_flags = (
        ("coarse", crf),
        ("abbr", arf),
        ("desc", drf),
        ("enty", erf),
        ("hum", hrf),
        ("loc", lrf),
        ("num", nrf),
    )
    for model_name, loaded in read_flags:
        if not loaded:
            print("- Error in reading {0} {1} model".format(model_name, ml_algo))
    if all(loaded for _, loaded in read_flags):
        print("- Loading the {0} models complete".format(ml_algo))
    else:
        print("- Error in loading the pre-trained model")
        exit(-10)

    c_ft = get_ft_obj("test", rp, ml_algo, "coarse").tocsr()
    a_ft = get_ft_obj("test", rp, ml_algo, "abbr").tocsr()
    d_ft = get_ft_obj("test", rp, ml_algo, "desc").tocsr()
    e_ft = get_ft_obj("test", rp, ml_algo, "enty").tocsr()
    h_ft = get_ft_obj("test", rp, ml_algo, "hum").tocsr()
    l_ft = get_ft_obj("test", rp, ml_algo, "loc").tocsr()
    n_ft = get_ft_obj("test", rp, ml_algo, "num").tocsr()
    print("- DataPrep for test data done.")

    # Fine model and feature matrix to use for each predicted coarse class.
    fine_by_coarse = {
        "ABBR": (abbr_model, a_ft),
        "DESC": (desc_model, d_ft),
        "ENTY": (enty_model, e_ft),
        "HUM": (hum_model, h_ft),
        "LOC": (loc_model, l_ft),
    }
    num_fallback = (num_model, n_ft)

    pred = []
    for i in range(c_ft.shape[0]):
        c = coarse_model.predict(c_ft[i])[0]
        # Anything that is not one of the five classes above goes to the NUM model.
        fine_model, fine_ft = fine_by_coarse.get(c, num_fallback)
        f = fine_model.predict(fine_ft[i])[0]
        pred.append([c, f])

    # Break the elapsed time down arithmetically; turning a duration into a calendar
    # datetime would wrap the hour count after 24 hours.
    elapsed_seconds = int((datetime.datetime.now() - started_at).total_seconds())
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print("- Predicting done : {3} models in {0}h {1}m {2}s".format(
        hours, minutes, seconds, ml_algo))
    return pred