} train_documents, train_labels, val_documents, val_labels, test_documents, test_labels, end_index = data_handler.load_data( data_info["source"], data_info["path"], data_info["n_samples_train"], data_info["n_samples_val"], data_info["n_samples_test"], data_info["class_labels"], is_balanced=data_info["is_balanced"]) print("end_index", end_index) extractor = data_handler.generate_bag_of_ngrams_extractor( train_documents, classifier_info["nfeatures"], classifier_info["ngrams"]) pickle.dump(extractor, open(PATH_TO_EXTRACTOR, "wb")) train_input = data_handler.generate_input(train_documents, extractor) val_input = data_handler.generate_input(val_documents, extractor) train_label_input = np.array(train_labels) val_label_input = np.array(val_labels) print( "#################################################################### \n") print("TRAINING: LOGISTIC REGRESSION\n") print("####################################################################\n") lr_classifier = logistic_regression.LogisticRegressionClassifier( data_info, classifier_info) lr_classifier.train(train_input, train_label_input, "batch") pickle.dump(lr_classifier, open(PATH_TO_CLASSIFIER, "wb"))
# --- Keras LSTM evaluation script ---
# Reloads the persisted feature extractor and trained Keras LSTM, rebuilds
# the same dataset splits, vectorizes the test set, and prints the run's
# data/architecture configuration.

# FIX: context manager so the extractor pickle handle is closed
# (original passed a bare open(...) to pickle.load and leaked the handle).
# NOTE(review): unpickling is only safe on trusted, locally produced files.
with open(PATH_TO_EXTRACTOR, "rb") as extractor_file:
    extractor = pickle.load(extractor_file)
classifier = lstm_keras.load_keras(PATH_TO_CLASSIFIER, PATH_TO_WRAPPER)
# Recover the configuration the classifier was trained with.
data_info = classifier.data_info
classifier_info = classifier.classifier_info

# Re-load the identical dataset splits used at training time.
train_documents, train_labels, val_documents, val_labels, \
    test_documents, test_labels, end_index = data_handler.load_data(
        data_info["source"],
        data_info["path"],
        data_info["n_samples_train"],
        data_info["n_samples_val"],
        data_info["n_samples_test"],
        data_info["class_labels"],
        is_balanced=data_info["is_balanced"])

# Vectorize the test documents and one-hot encode the labels.
test_input = data_handler.generate_input(test_documents, extractor)
test_labels = np.array(test_labels)
# assumes labels are 1-based class ids, hence the -1 shift -- TODO confirm
test_label_input = keras.utils.to_categorical(
    test_labels - 1, len(data_info["class_labels"]))

print(
    "#################################################################### \n")
print("DATA INFO: \n")
print("Source : ", data_info["source"])
print("Is distribution balanced? ", data_info["is_balanced"])
print("Number of training samples: ", data_info["n_samples_train"])
print("Number of validation samples: ", data_info["n_samples_val"])
print("Number of testing samples: ", data_info["n_samples_test"])
print("\n")
print("ARCHITECTURE INFO: \n")
print("embed_size: ", classifier_info["embed_size"])
"nfeatures" : 1000, "hidden_dim" : 40, "nbatches" : 100, "nepochs" : 1, "alpha" : 0.001 } PATH_TO_GLOVE_EMBEDDINGS = '../data/glove.42B.300d.txt' PATH_TO_EXTRACTOR = "../pickle/pytorch_lstm_extractor.p" train_documents, train_labels, val_documents, val_labels, test_documents, test_labels, end_index = data_handler.load_data(data_info["source"], data_info["path"], data_info["n_samples_train"], data_info["n_samples_val"], data_info["n_samples_test"], data_info["class_labels"], is_balanced=data_info["is_balanced"]) extractor = data_handler.generate_glove_extractor(train_documents, classifier_info["nfeatures"]) embeddings = data_handler.generate_glove_embeddings(extractor, PATH_TO_GLOVE_EMBEDDINGS, classifier_info["nfeatures"], classifier_info["embed_size"]) pickle.dump(extractor, open(PATH_TO_EXTRACTOR, "wb")) train_input = data_handler.generate_input(train_documents, extractor, SHOULD_ADD_NEGATIONS=False) val_input = data_handler.generate_input(val_documents, extractor, SHOULD_ADD_NEGATIONS=False) train_label_input = np.array(train_labels) val_label_input = np.array(val_labels) train_label_class_indices = data_handler.labels_to_indices(train_label_input, data_info["class_labels"]) print("#################################################################### \n") print("TRAINING: LSTM\n") print("#################################################################### \n") lstm_classifier = lstm_pytorch.PyTorchLSTMClassifier(data_info, classifier_info) lstm_classifier.train(train_input, train_label_class_indices, embeddings) pickle.dump(lstm_classifier, open("../pickle/pytorch_lstm_classifier.p", "wb"))
open(AMAZON_PREFIX + "pytorch_lstm_classifier.p", "rb")) extractor = pickle.load(open(AMAZON_PREFIX + "pytorch_lstm_extractor.p", "rb")) data_info = classifier.data_info classifier_info = classifier.classifier_info train_documents, train_labels, val_documents, val_labels, test_documents, test_labels, end_index = data_handler.load_data( data_info["source"], data_info["path"], data_info["n_samples_train"], data_info["n_samples_val"], data_info["n_samples_test"], data_info["class_labels"], is_balanced=data_info["is_balanced"]) test_input = data_handler.generate_input(test_documents, extractor, SHOULD_ADD_NEGATIONS=False) test_label_input = np.array(test_labels) print( "#################################################################### \n") print("DATA INFO: \n") print("Source : ", data_info["source"]) print("Is distribution balanced? ", data_info["is_balanced"]) print("Number of training samples: ", data_info["n_samples_train"]) print("Number of validation samples: ", data_info["n_samples_val"]) print("Number of testing samples: ", data_info["n_samples_test"]) print("\n") print("CLASSIFIER INFO: \n") print("nfeatures: ", classifier_info["nfeatures"]) print("embed_size: ", classifier_info["embed_size"])