Example #1
def main():
    random.seed(17)
    global testing
    global questions_yn
    _, doc = read_doc(sys.argv[1])
    doc = preprocess(doc, nlp)
    num_questions = int(sys.argv[2])

    # sys.argv[0] is the script path, never "python", so the original
    # interpreter check could not fire; the optional testing flag is
    # always the third argument
    if len(sys.argv) > 3:
        # bool("False") is True, so compare against the string instead
        testing = sys.argv[3].lower() == "true"

    try:
        yesno_questions(doc)
        questions_yn = fudge_questions(questions_yn)
        evaluate_questions(questions_yn, 1)
        wh_questions(doc)
        evaluate_questions(questions_wh, 2)
        subj_verb_obj_questions(doc)
        evaluate_questions(questions_subj_verb_obj, 3)
    except Exception:
        print(traceback.format_exc())
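
A note on the argument handling above: a minimal sketch of the same CLI
using argparse (the --testing flag name is an assumption for
illustration, not from the original):

import argparse

def parse_args():
    # doc path and question count are positional; the testing flag is
    # opt-in, avoiding the truthiness pitfall of bool("False")
    parser = argparse.ArgumentParser()
    parser.add_argument("doc_path")
    parser.add_argument("num_questions", type=int)
    parser.add_argument("--testing", action="store_true")
    return parser.parse_args()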
Example #2
def answer_questions(doc_text, questions, format_answer):
    nlp = spacy.load("en")
    doc = preprocess(doc_text, nlp)
    A = Answerer(doc, nlp)
    for q in questions:
        if len(q) == 0:
            print(" ")
        else:
            answer = A.get_answer(q, 0, format_answer)
            print(answer)
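
A hypothetical call, assuming preprocess and Answerer come from the
surrounding project (the document text, questions, and the
format_answer value below are made up for illustration):

doc_text = "Alan Turing was a British mathematician. He was born in 1912."
questions = ["Who was Alan Turing?", "", "When was he born?"]
# empty questions print a blank line; the rest go through the Answerer
answer_questions(doc_text, questions, format_answer=True)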
Example #3
def main():
    df = pd.read_csv(config["train"]["data_path"])

    df = preprocess(df)

    label_nbr = len(df[config["train"]["label_column"]].unique())
    label_names = config["train"]["label"]

    y = np.array(df[config["train"]["label_column"]])
    df = df.drop([config["train"]["label_column"]] +
                 config['train']['to_drop'],
                 axis=1)
    X = np.array(df)

    print(X.shape, y.shape)

    try:
        device = torch.device(config["train"]["device"])
    except Exception:
        # fall back to CPU if the configured device string is invalid
        device = torch.device("cpu")

    classifier = Net(input_dim=df.shape[1],
                     hidden_dim=config["train"]["hidden_dim"]).to(device)
    criterion = torch.nn.functional.mse_loss

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.25,
                                                        random_state=42)

    X_train, y_train = torch.tensor(X_train).float(), torch.tensor(
        y_train).float()
    X_test, y_test = torch.tensor(X_test).float(), torch.tensor(y_test).float()

    # create dataloader with specified batch_size
    ds_train = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader_train = torch.utils.data.DataLoader(
        ds_train, batch_size=config["train"]["batch_size"], shuffle=True)

    ds_test = torch.utils.data.TensorDataset(X_test, y_test)
    dataloader_test = torch.utils.data.DataLoader(
        ds_test, batch_size=config["train"]["batch_size"], shuffle=True)

    trainer = Trainer(classifier, device=device, criterion=criterion)
    trainer.train(dataloader_train,
                  dataloader_test,
                  config["train"]["epochs"],
                  config["train"]["log_every"],
                  task="regression")

    # eval step

    metrics = {}
    metrics["mse"] = trainer.metric
    mlflow.log_params(metrics)

    mlflow.pytorch.log_model(
        pytorch_model=classifier,
        artifact_path="model",
        registered_model_name=config["mlflow"]["model_name"])

    api_request_model = get_request_features(df)
    with open("request_model.json", "w") as rmodel:
        json.dump(api_request_model, rmodel, indent=4)

    # check whether a model is already in Production,
    # so we can promote at least one

    model_name = config['mlflow']['model_name']

    try:
        mlflow.pytorch.load_model(f"models:/{model_name}/Production")
    except Exception:
        # nothing in Production yet; promote the latest registered version
        client = MlflowClient()
        version = client.search_model_versions(
            f"name='{model_name}'")[0].version

        client.transition_model_version_stage(name=model_name,
                                              version=version,
                                              stage="Production")
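
The Net referenced above is not shown in the example. For a regression
model trained with mse_loss, a plausible (assumed) shape is a small
two-layer MLP:

import torch

class Net(torch.nn.Module):
    # assumed structure; the real definition is not part of the example
    def __init__(self, input_dim, hidden_dim, output_dim=1):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))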
Example #4
import glob

import lib as lb

# preprocess the text of each document, then tokenize it
processed_text = []
# the documents are expected to be in the same folder as this script
for fname in glob.glob("*.txt"):
    print(fname)
    with open(fname, "r") as f:
        text = f.read().strip()
    processed_text.append(lb.word_tokenize(str(lb.preprocess(text))))

# each doc converted to its unique set of words
unique_text = [set(tokens) for tokens in processed_text]

# document frequency of every word in the collection of docs
n = len(processed_text)  # or len(unique_text)
df = {}
for tokens in unique_text:
    for w in tokens:
        df[w] = df.get(w, 0) + 1

num_vocabs = len(df)
all_vocab = list(df)  # list of all words in the collection of docs
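
With the document frequencies in hand, a natural next step (not part of
the original snippet) is the inverse document frequency of each word:

import math

# rare words get large idf values; ubiquitous words approach zero
idf = {w: math.log(n / df[w]) for w in df}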
Example #5
def main():
    df = pd.read_csv(config["train"]["data_path"])

    df = preprocess(df)

    label_nbr = len(df[config["train"]["labels_column"]].unique())
    label_names = config["train"]["labels"]

    y = np.array(df[config["train"]["labels_column"]])
    df = df.drop([config["train"]["labels_column"]] +
                 config['train']['to_drop'],
                 axis=1)
    X = np.array(df)

    print(X.shape, y.shape)

    try:
        device = torch.device(config["train"]["device"])
    except Exception:
        # fall back to CPU if the configured device string is invalid
        device = torch.device("cpu")

    classifier = Net(input_dim=df.shape[1],
                     output_dim=label_nbr,
                     hidden_dim=config["train"]["hidden_dim"]).to(device)
    criterion = torch.nn.CrossEntropyLoss()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.25,
                                                        random_state=42)

    # CrossEntropyLoss expects integer class indices, so the labels are
    # cast to long rather than float
    X_train, y_train = torch.tensor(X_train).float(), torch.tensor(
        y_train).long()
    X_test, y_test = torch.tensor(X_test).float(), torch.tensor(y_test).long()

    # create dataloader with specified batch_size
    ds_train = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader_train = torch.utils.data.DataLoader(
        ds_train, batch_size=config["train"]["batch_size"], shuffle=True)

    ds_test = torch.utils.data.TensorDataset(X_test, y_test)
    dataloader_test = torch.utils.data.DataLoader(
        ds_test, batch_size=config["train"]["batch_size"], shuffle=True)

    trainer = Trainer(classifier, device=device, criterion=criterion)
    trainer.train(dataloader_train, dataloader_test, config["train"]["epochs"],
                  config["train"]["log_every"])

    # eval step

    y_true, y_pred, scores = get_preds_labels_scores(dataloader_test,
                                                     classifier, device)

    metrics = eval_model_per_class(y_true, y_pred, label_names)
    metrics["accuracy"] = trainer.metric / 100
    mlflow.log_params(metrics)

    mlflow.pytorch.log_model(
        pytorch_model=classifier,
        artifact_path="model",
        registered_model_name=config["mlflow"]["model_name"])

    conf_matrix_fname = save_confusion_matrix(y_true, y_pred, label_names)
    mlflow.log_artifact(conf_matrix_fname)
    os.remove(conf_matrix_fname)

    roc_curve_fname = save_roc_curve(y_true, scores, label_names)
    mlflow.log_artifact(roc_curve_fname)
    os.remove(roc_curve_fname)

    pr_curve_fname = save_pr_curve(y_true, scores, label_names)
    mlflow.log_artifact(pr_curve_fname)
    os.remove(pr_curve_fname)

    eval_fnames = eval_classification_model_predictions_per_feature(
        config["train"]["data_path"],
        classifier,
        config['train']['labels_column'],
        config['train']['labels'],
        config['train']['to_drop'],
        use_torch=True,
        device=device,
        preprocess=preprocess)
    for eval_fname in eval_fnames:
        mlflow.log_artifact(eval_fname)
        os.remove(eval_fname)

    api_request_model = get_request_features(df)
    with open("request_model.json", "w") as rmodel:
        json.dump(api_request_model, rmodel, indent=4)

    # check whether a model is already in Production,
    # so we can promote at least one

    model_name = config['mlflow']['model_name']

    try:
        mlflow.pytorch.load_model(f"models:/{model_name}/Production")
    except Exception:
        # nothing in Production yet; promote the latest registered version
        client = MlflowClient()
        version = client.search_model_versions(
            f"name='{model_name}'")[0].version

        client.transition_model_version_stage(name=model_name,
                                              version=version,
                                              stage="Production")
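
Unlike the regression variant in Example #3, CrossEntropyLoss requires
integer class indices as targets, which is why the labels are cast with
.long() above. A minimal standalone illustration:

import torch

logits = torch.randn(4, 3)            # batch of 4 samples, 3 classes
targets = torch.tensor([0, 2, 1, 0])  # int64 class indices, not floats
loss = torch.nn.CrossEntropyLoss()(logits, targets)
print(loss.item())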
Example #6
def main():
    df = pd.read_csv(config["train"]["data_path"])

    label_names = config["train"]["labels"]

    df = preprocess(df)

    y = np.array(df[config["train"]["labels_column"]])
    df = df.drop([config["train"]["labels_column"]] + config['train']['to_drop'], axis=1)
    X = np.array(df)

    print(X.shape, y.shape)
    print(df.columns)

    classifier = parse_model_option(config["train"]["model"],
                                    config["train"]["model_args"])

    print(classifier)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.25,
                                                        random_state=42)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_true = y_test

    metrics = eval_model_per_class(y_true, y_pred, label_names)
    metrics["accuracy"] = (y_pred == y_true).astype(np.float32).mean()
    mlflow.log_params(metrics)

    mlflow.sklearn.log_model(
        sk_model=classifier,
        artifact_path="model",
        registered_model_name=config["mlflow"]["model_name"]
    )

    conf_matrix_fname = save_confusion_matrix(y_true,
                                              y_pred,
                                              label_names)
    mlflow.log_artifact(conf_matrix_fname)
    os.remove(conf_matrix_fname)

    eval_fnames = eval_classification_model_predictions_per_feature(config["train"]["data_path"],
                                                                    classifier,
                                                                    config['train']['labels_column'],
                                                                    config['train']['labels'],
                                                                    config['train']['to_drop'],
                                                                    preprocess=preprocess)
    for eval_fname in eval_fnames:
        mlflow.log_artifact(eval_fname)
        os.remove(eval_fname)

    api_request_model = get_request_features(df)
    with open("request_model.json", "w") as rmodel:
        json.dump(api_request_model, rmodel, indent=4)

    # check whether a model is already in Production,
    # so we can promote at least one

    model_name = config['mlflow']['model_name']

    try:
        model = mlflow.sklearn.load_model(f"models:/{model_name}/Production")
    except Exception:
        # nothing in Production yet; promote the latest registered version
        client = MlflowClient()
        version = client.search_model_versions(f"name='{model_name}'")[0].version

        client.transition_model_version_stage(
            name=model_name,
            version=version,
            stage="Production"
        )

    print(metrics)
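
Examples #3, #5, and #6 all read from a shared config mapping. Its keys
can be inferred from usage (Example #3 uses the singular label_column /
label variants); the values below are illustrative assumptions only:

config = {
    "train": {
        "data_path": "data/train.csv",        # assumed path
        "labels_column": "label",
        "labels": ["class_a", "class_b"],
        "to_drop": ["id"],
        "model": "RandomForestClassifier",    # Example #6 only
        "model_args": {"n_estimators": 100},
        "hidden_dim": 64,                     # Examples #3 and #5
        "batch_size": 32,
        "epochs": 10,
        "log_every": 100,
        "device": "cuda",
    },
    "mlflow": {"model_name": "my_classifier"},
}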
Example #7

import sys

def test(test_docs, prior, likelihood, classes, vocab):
    results = []
    for test_doc in test_docs:
        results.append(
            test_naive_bayes(test_doc, prior, likelihood, classes, vocab))
    # report against the docs actually tested, not the global
    print_test_results(results, test_docs)


# Prevent running if imported as a module
if __name__ == '__main__':
    classes = [0, 1]
    # perform test on the training data
    sys.stdout = open('output.txt', 'wt')
    vocab, documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')
    _, test_documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')

    prior, likelihood = train_naive_bayes(vocab, documents, classes)

    print("\n")
    print("testing on training data")
    test(documents, prior, likelihood, classes, vocab)

    # perform test on the testing data
    vocab, documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')
    _, test_documents = preprocess('testSet.txt', 'preprocessed_test.txt')

    prior, likelihood = train_naive_bayes(vocab, documents, classes)
    print("\n")
    print("testing on testing data")
    test(test_documents, prior, likelihood, classes, vocab)
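
The test_naive_bayes used throughout is not shown. Sketched under the
assumption that prior maps class -> probability and likelihood maps
class -> word -> smoothed probability, a standard scoring step would be:

import math

def test_naive_bayes(test_doc, prior, likelihood, classes, vocab):
    # pick the class with the highest log posterior; log-space avoids
    # underflow when multiplying many small probabilities
    best_class, best_score = None, float("-inf")
    for c in classes:
        score = math.log(prior[c])
        for word in test_doc:
            if word in vocab:
                score += math.log(likelihood[c][word])
        if score > best_score:
            best_class, best_score = c, score
    return best_class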