def main():
    """Entry point: read a document, generate yes/no, wh-, and
    subject-verb-object questions from it, and evaluate each set.

    Command line: <script> <doc_path> <num_questions> [testing_flag]
    """
    global testing
    global questions_yn

    random.seed(17)  # fixed seed so question selection is reproducible

    _, doc = read_doc(sys.argv[1])
    doc = preprocess(doc, nlp)
    # Parsed for validation; not used further in this excerpt.
    num_questions = int(sys.argv[2])

    # BUG FIX: the original tested sys.argv[0].startswith("python") to decide
    # which argv slot held the testing flag, but argv[0] is the *script path*,
    # never the interpreter name, so the fallback read the flag from argv[2]
    # (the num_questions slot). The optional flag lives at argv[3].
    if len(sys.argv) > 3:
        # NOTE(review): bool(non-empty-string) is always True, so any third
        # argument (even "False") enables testing — preserved from original.
        testing = bool(sys.argv[3])

    try:
        yesno_questions(doc)
        questions_yn = fudge_questions(questions_yn)
        evaluate_questions(questions_yn, 1)
        wh_questions(doc)
        evaluate_questions(questions_wh, 2)
        subj_verb_obj_questions(doc)
        evaluate_questions(questions_subj_verb_obj, 3)
    except Exception:
        # Python 2 "except Exception, e:" replaced with Python 3 syntax;
        # the bound exception was unused, the traceback carries everything.
        print(traceback.format_exc())
def answer_questions(doc_text, questions, format_answer):
    """Print one answer per question, resolved against *doc_text*.

    Empty question strings produce a blank output line. *format_answer*
    is forwarded unchanged to Answerer.get_answer.
    """
    nlp = spacy.load("en")
    doc = preprocess(doc_text, nlp)
    answerer = Answerer(doc, nlp)
    for question in questions:
        # BUG FIX: Python 2 `print " "` statements converted to Python 3
        # print() calls; behavior (a near-blank line for empty questions)
        # is otherwise unchanged.
        if not question:
            print(" ")
        else:
            print(answerer.get_answer(question, 0, format_answer))
def main():
    """Train the regression network, log model and MSE to MLflow, write the
    API request schema, and ensure one model version is in Production."""
    df = pd.read_csv(config["train"]["data_path"])
    df = preprocess(df)

    # Extract labels after preprocessing so they line up with cleaned rows.
    # (The original also extracted y *before* preprocess and immediately
    # overwrote it, and computed unused label_nbr/label_names — removed.)
    y = np.array(df[config["train"]["label_column"]])
    df = df.drop([config["train"]["label_column"]] + config["train"]["to_drop"],
                 axis=1)
    X = np.array(df)
    print(X.shape, y.shape)

    # Fall back to CPU when the configured device string is invalid or the
    # device is unavailable. Narrowed from a bare `except:` which would also
    # have swallowed KeyboardInterrupt/SystemExit.
    try:
        device = torch.device(config["train"]["device"])
    except (RuntimeError, TypeError, ValueError):
        device = torch.device("cpu")

    classifier = Net(input_dim=df.shape[1],
                     hidden_dim=config["train"]["hidden_dim"]).to(device)
    criterion = torch.nn.functional.mse_loss

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.25, random_state=42)
    X_train = torch.tensor(X_train).float()
    y_train = torch.tensor(y_train).float()
    X_test = torch.tensor(X_test).float()
    y_test = torch.tensor(y_test).float()

    # create dataloaders with the configured batch_size
    ds_train = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader_train = torch.utils.data.DataLoader(
        ds_train, batch_size=config["train"]["batch_size"], shuffle=True)
    ds_test = torch.utils.data.TensorDataset(X_test, y_test)
    dataloader_test = torch.utils.data.DataLoader(
        ds_test, batch_size=config["train"]["batch_size"], shuffle=True)

    trainer = Trainer(classifier, device=device, criterion=criterion)
    trainer.train(dataloader_train, dataloader_test,
                  config["train"]["epochs"], config["train"]["log_every"],
                  task="regression")

    # eval step
    metrics = {"mse": trainer.metric}
    mlflow.log_params(metrics)
    mlflow.pytorch.log_model(
        pytorch_model=classifier,
        artifact_path="model",
        registered_model_name=config["mlflow"]["model_name"])

    # Persist the feature schema the serving API expects.
    api_request_model = get_request_features(df)
    with open("request_model.json", "w") as rmodel:
        json.dump(api_request_model, rmodel, indent=4)

    # Check whether a Production-stage model exists; if not, promote the
    # newest registered version so serving always has something to load.
    model_name = config["mlflow"]["model_name"]
    try:
        mlflow.pytorch.load_model(f"models:/{model_name}/Production")
    except Exception:
        # Narrowed from bare `except:`; load_model raises MlflowException
        # subclasses of Exception when no Production version exists.
        client = MlflowClient()
        version = client.search_model_versions(
            f"name='{model_name}'")[0].version
        client.transition_model_version_stage(name=model_name,
                                              version=version,
                                              stage="Production")
import glob

import lib as lb

# --- Preprocess and tokenize every .txt document in this directory --------
# Documents must sit in the same folder as this script.
processed_text = []
for fname in glob.glob("*.txt"):
    print(fname)
    # Context manager replaces the manual open/close pair (and stops
    # shadowing the `file` builtin).
    with open(fname, "r") as fh:
        text = fh.read().strip()
    processed_text.append(lb.word_tokenize(str(lb.preprocess(text))))

# --- Each document reduced to its unique set of words ----------------------
unique_text = [set(tokens) for tokens in processed_text]

# --- Document frequency of every word across the collection ----------------
n = len(processed_text)  # == len(unique_text)
df = {}
for tokens in unique_text:
    for w in tokens:
        # dict.get avoids the double lookup of `if w in df.keys()`.
        df[w] = df.get(w, 0) + 1

num_vocabs = len(df)
all_vocab = list(df)  # list of all words in the collection of docs
def main():
    """Train the classifier, evaluate per-class metrics and curves, log
    everything to MLflow, and ensure one model version is in Production."""
    df = pd.read_csv(config["train"]["data_path"])
    df = preprocess(df)
    label_nbr = len(df[config["train"]["labels_column"]].unique())
    label_names = config["train"]["labels"]

    # Extract labels after preprocessing so they line up with cleaned rows.
    # (The original also extracted y *before* preprocess and immediately
    # overwrote it — removed.)
    y = np.array(df[config["train"]["labels_column"]])
    df = df.drop([config["train"]["labels_column"]] + config["train"]["to_drop"],
                 axis=1)
    X = np.array(df)
    print(X.shape, y.shape)

    # Fall back to CPU when the configured device string is invalid or the
    # device is unavailable. Narrowed from a bare `except:`.
    try:
        device = torch.device(config["train"]["device"])
    except (RuntimeError, TypeError, ValueError):
        device = torch.device("cpu")

    classifier = Net(input_dim=df.shape[1],
                     output_dim=label_nbr,
                     hidden_dim=config["train"]["hidden_dim"]).to(device)
    criterion = torch.nn.CrossEntropyLoss()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.25, random_state=42)
    # NOTE(review): labels are cast to float although CrossEntropyLoss
    # expects integer class indices — presumably Trainer converts them;
    # confirm before changing.
    X_train = torch.tensor(X_train).float()
    y_train = torch.tensor(y_train).float()
    X_test = torch.tensor(X_test).float()
    y_test = torch.tensor(y_test).float()

    # create dataloaders with the configured batch_size
    ds_train = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader_train = torch.utils.data.DataLoader(
        ds_train, batch_size=config["train"]["batch_size"], shuffle=True)
    ds_test = torch.utils.data.TensorDataset(X_test, y_test)
    dataloader_test = torch.utils.data.DataLoader(
        ds_test, batch_size=config["train"]["batch_size"], shuffle=True)

    trainer = Trainer(classifier, device=device, criterion=criterion)
    trainer.train(dataloader_train, dataloader_test,
                  config["train"]["epochs"], config["train"]["log_every"])

    # eval step: per-class metrics plus overall accuracy (Trainer reports %).
    y_true, y_pred, scores = get_preds_labels_scores(dataloader_test,
                                                     classifier, device)
    metrics = eval_model_per_class(y_true, y_pred, label_names)
    metrics["accuracy"] = trainer.metric / 100
    mlflow.log_params(metrics)
    mlflow.pytorch.log_model(
        pytorch_model=classifier,
        artifact_path="model",
        registered_model_name=config["mlflow"]["model_name"])

    # Log evaluation artifacts; each is removed locally once uploaded.
    conf_matrix_fname = save_confusion_matrix(y_true, y_pred, label_names)
    mlflow.log_artifact(conf_matrix_fname)
    os.remove(conf_matrix_fname)

    roc_curve_fname = save_roc_curve(y_true, scores, label_names)
    mlflow.log_artifact(roc_curve_fname)
    os.remove(roc_curve_fname)

    pr_curve_fname = save_pr_curve(y_true, scores, label_names)
    mlflow.log_artifact(pr_curve_fname)
    os.remove(pr_curve_fname)

    eval_fnames = eval_classification_model_predictions_per_feature(
        config["train"]["data_path"],
        classifier,
        config["train"]["labels_column"],
        config["train"]["labels"],
        config["train"]["to_drop"],
        use_torch=True,
        device=device,
        preprocess=preprocess)
    for eval_fname in eval_fnames:
        mlflow.log_artifact(eval_fname)
        os.remove(eval_fname)

    # Persist the feature schema the serving API expects.
    api_request_model = get_request_features(df)
    with open("request_model.json", "w") as rmodel:
        json.dump(api_request_model, rmodel, indent=4)

    # Check whether a Production-stage model exists; if not, promote the
    # newest registered version so serving always has something to load.
    model_name = config["mlflow"]["model_name"]
    try:
        mlflow.pytorch.load_model(f"models:/{model_name}/Production")
    except Exception:
        # Narrowed from bare `except:`.
        client = MlflowClient()
        version = client.search_model_versions(
            f"name='{model_name}'")[0].version
        client.transition_model_version_stage(name=model_name,
                                              version=version,
                                              stage="Production")
def main():
    """Train an sklearn classifier, evaluate it, log model and artifacts to
    MLflow, and ensure one model version is in Production."""
    df = pd.read_csv(config["train"]["data_path"])
    label_names = config["train"]["labels"]
    df = preprocess(df)

    y = np.array(df[config["train"]["labels_column"]])
    df = df.drop([config["train"]["labels_column"]] + config["train"]["to_drop"],
                 axis=1)
    X = np.array(df)
    print(X.shape, y.shape)
    print(df.columns)

    # Model class and hyperparameters come from the config file.
    classifier = parse_model_option(config["train"]["model"],
                                    config["train"]["model_args"])
    print(classifier)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.25, random_state=42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    y_true = y_test

    metrics = eval_model_per_class(y_true, y_pred, label_names)
    metrics["accuracy"] = (y_pred == y_true).astype(np.float32).mean()
    mlflow.log_params(metrics)
    mlflow.sklearn.log_model(
        sk_model=classifier,
        artifact_path="model",
        registered_model_name=config["mlflow"]["model_name"]
    )

    # Log evaluation artifacts; each is removed locally once uploaded.
    conf_matrix_fname = save_confusion_matrix(y_true, y_pred, label_names)
    mlflow.log_artifact(conf_matrix_fname)
    os.remove(conf_matrix_fname)

    eval_fnames = eval_classification_model_predictions_per_feature(
        config["train"]["data_path"],
        classifier,
        config["train"]["labels_column"],
        config["train"]["labels"],
        config["train"]["to_drop"],
        preprocess=preprocess)
    for eval_fname in eval_fnames:
        mlflow.log_artifact(eval_fname)
        os.remove(eval_fname)

    # Persist the feature schema the serving API expects.
    api_request_model = get_request_features(df)
    with open("request_model.json", "w") as rmodel:
        json.dump(api_request_model, rmodel, indent=4)

    # Check whether a Production-stage model exists; if not, promote the
    # newest registered version so serving always has something to load.
    model_name = config["mlflow"]["model_name"]
    try:
        # Loaded model was bound to an unused local in the original; only
        # the success/failure of the load matters here.
        mlflow.sklearn.load_model(f"models:/{model_name}/Production")
    except Exception:
        # Narrowed from bare `except:`.
        client = MlflowClient()
        version = client.search_model_versions(
            f"name='{model_name}'")[0].version
        client.transition_model_version_stage(
            name=model_name, version=version, stage="Production"
        )

    print(metrics)
def test(test_docs, prior, likelihood, classes, vocab):
    """Classify every document in *test_docs* with naive Bayes and print
    the aggregated results."""
    results = [test_naive_bayes(doc, prior, likelihood, classes, vocab)
               for doc in test_docs]
    # BUG FIX: the original passed the *global* `test_documents` here instead
    # of the `test_docs` parameter, so the printed report always reflected
    # whatever the caller had most recently loaded globally.
    print_test_results(results, test_docs)


# Prevent running if imported as a module
if __name__ == '__main__':
    classes = [0, 1]

    # perform test on the training data; all prints go to output.txt
    sys.stdout = open('output.txt', 'wt')
    vocab, documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')
    _, test_documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')
    prior, likelihood = train_naive_bayes(vocab, documents, classes)
    print("\n")
    print("testing on training data")
    test(documents, prior, likelihood, classes, vocab)

    # perform test on the testing data
    vocab, documents = preprocess('trainingSet.txt', 'preprocessed_train.txt')
    _, test_documents = preprocess('testSet.txt', 'preprocessed_test.txt')
    prior, likelihood = train_naive_bayes(vocab, documents, classes)
    print("\n")
    print("testing on testing data")
    # NOTE(review): the visible source ends here without calling
    # test(test_documents, ...); the final evaluation call may continue
    # beyond this excerpt — confirm before adding it.