Ejemplo n.º 1
0
}

# Load the train / validation / test splits described by data_info.
# load_data returns seven values; end_index is only echoed below.
(train_documents, train_labels,
 val_documents, val_labels,
 test_documents, test_labels,
 end_index) = data_handler.load_data(
    data_info["source"],
    data_info["path"],
    data_info["n_samples_train"],
    data_info["n_samples_val"],
    data_info["n_samples_test"],
    data_info["class_labels"],
    is_balanced=data_info["is_balanced"])
print("end_index", end_index)

# Build the bag-of-n-grams extractor from the training documents and persist
# it. The context manager guarantees the pickle file is flushed and closed
# (the bare open() previously leaked the handle).
extractor = data_handler.generate_bag_of_ngrams_extractor(
    train_documents, classifier_info["nfeatures"], classifier_info["ngrams"])
with open(PATH_TO_EXTRACTOR, "wb") as extractor_file:
    pickle.dump(extractor, extractor_file)

# Turn the raw documents into model inputs using the extractor built above.
train_input = data_handler.generate_input(train_documents, extractor)
val_input = data_handler.generate_input(val_documents, extractor)

train_label_input = np.array(train_labels)
val_label_input = np.array(val_labels)

print(
    "#################################################################### \n")
print("TRAINING: LOGISTIC REGRESSION\n")
print("####################################################################\n")

# Train the classifier (the "batch" argument is forwarded to train()) and
# persist the trained object, again closing the file deterministically.
lr_classifier = logistic_regression.LogisticRegressionClassifier(
    data_info, classifier_info)
lr_classifier.train(train_input, train_label_input, "batch")
with open(PATH_TO_CLASSIFIER, "wb") as classifier_file:
    pickle.dump(lr_classifier, classifier_file)
Ejemplo n.º 2
0
# Restore the pickled feature extractor; the context manager closes the file
# handle that the bare open() previously leaked.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# with extractor files produced by this project.
with open(PATH_TO_EXTRACTOR, "rb") as extractor_file:
    extractor = pickle.load(extractor_file)
classifier = lstm_keras.load_keras(PATH_TO_CLASSIFIER, PATH_TO_WRAPPER)

# Reuse the configuration stored on the loaded classifier so the data is
# loaded with the same settings.
data_info = classifier.data_info
classifier_info = classifier.classifier_info

(train_documents, train_labels,
 val_documents, val_labels,
 test_documents, test_labels,
 end_index) = data_handler.load_data(
    data_info["source"],
    data_info["path"],
    data_info["n_samples_train"],
    data_info["n_samples_val"],
    data_info["n_samples_test"],
    data_info["class_labels"],
    is_balanced=data_info["is_balanced"])

test_input = data_handler.generate_input(test_documents, extractor)
test_labels = np.array(test_labels)
# Labels are shifted down by one before one-hot encoding
# (presumably the class labels are 1-based -- TODO confirm).
test_label_input = keras.utils.to_categorical(test_labels - 1,
                                              len(data_info["class_labels"]))

print(
    "#################################################################### \n")
print("DATA INFO: \n")
print("Source : ", data_info["source"])
print("Is distribution balanced? ", data_info["is_balanced"])
print("Number of training samples: ", data_info["n_samples_train"])
print("Number of validation samples: ", data_info["n_samples_val"])
print("Number of testing samples: ", data_info["n_samples_test"])
print("\n")
print("ARCHITECTURE INFO: \n")
print("embed_size: ", classifier_info["embed_size"])
                   "nfeatures" : 1000,
                   "hidden_dim" : 40,
                   "nbatches" : 100,
                   "nepochs" : 1,
                   "alpha" : 0.001
}

PATH_TO_GLOVE_EMBEDDINGS = '../data/glove.42B.300d.txt'
PATH_TO_EXTRACTOR = "../pickle/pytorch_lstm_extractor.p"

# Load the train / validation / test splits described by data_info.
(train_documents, train_labels,
 val_documents, val_labels,
 test_documents, test_labels,
 end_index) = data_handler.load_data(
    data_info["source"],
    data_info["path"],
    data_info["n_samples_train"],
    data_info["n_samples_val"],
    data_info["n_samples_test"],
    data_info["class_labels"],
    is_balanced=data_info["is_balanced"])

# Build the GloVe-based extractor from the training documents, then the
# embeddings for it from the pre-trained GloVe file.
extractor = data_handler.generate_glove_extractor(
    train_documents, classifier_info["nfeatures"])
embeddings = data_handler.generate_glove_embeddings(
    extractor,
    PATH_TO_GLOVE_EMBEDDINGS,
    classifier_info["nfeatures"],
    classifier_info["embed_size"])
# Persist the extractor; the context manager guarantees the file is flushed
# and closed (the bare open() previously leaked the handle).
with open(PATH_TO_EXTRACTOR, "wb") as extractor_file:
    pickle.dump(extractor, extractor_file)

train_input = data_handler.generate_input(
    train_documents, extractor, SHOULD_ADD_NEGATIONS=False)
val_input = data_handler.generate_input(
    val_documents, extractor, SHOULD_ADD_NEGATIONS=False)

train_label_input = np.array(train_labels)
val_label_input = np.array(val_labels)

# The classifier is trained on class indices rather than on the raw labels.
train_label_class_indices = data_handler.labels_to_indices(
    train_label_input, data_info["class_labels"])

print("#################################################################### \n")
print("TRAINING: LSTM\n")
print("#################################################################### \n")

lstm_classifier = lstm_pytorch.PyTorchLSTMClassifier(data_info, classifier_info)
lstm_classifier.train(train_input, train_label_class_indices, embeddings)
# Persist the trained classifier, closing the file deterministically.
with open("../pickle/pytorch_lstm_classifier.p", "wb") as classifier_file:
    pickle.dump(lstm_classifier, classifier_file)
Ejemplo n.º 4
0
    open(AMAZON_PREFIX + "pytorch_lstm_classifier.p", "rb"))
# Restore the pickled feature extractor; the context manager closes the file
# handle that the bare open() previously leaked.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# with extractor files produced by this project.
with open(AMAZON_PREFIX + "pytorch_lstm_extractor.p", "rb") as extractor_file:
    extractor = pickle.load(extractor_file)

# Reuse the configuration stored on the loaded classifier so the data is
# loaded with the same settings.
data_info = classifier.data_info
classifier_info = classifier.classifier_info

(train_documents, train_labels,
 val_documents, val_labels,
 test_documents, test_labels,
 end_index) = data_handler.load_data(
    data_info["source"],
    data_info["path"],
    data_info["n_samples_train"],
    data_info["n_samples_val"],
    data_info["n_samples_test"],
    data_info["class_labels"],
    is_balanced=data_info["is_balanced"])
test_input = data_handler.generate_input(test_documents,
                                         extractor,
                                         SHOULD_ADD_NEGATIONS=False)
test_label_input = np.array(test_labels)

# Echo the data and classifier configuration in use.
print(
    "#################################################################### \n")
print("DATA INFO: \n")
print("Source : ", data_info["source"])
print("Is distribution balanced? ", data_info["is_balanced"])
print("Number of training samples: ", data_info["n_samples_train"])
print("Number of validation samples: ", data_info["n_samples_val"])
print("Number of testing samples: ", data_info["n_samples_test"])
print("\n")
print("CLASSIFIER INFO: \n")
print("nfeatures: ", classifier_info["nfeatures"])
print("embed_size: ", classifier_info["embed_size"])