Example #1
def main(preprocess_params, classifier_params):
    # Load the train/test splits provided by the project's DatasetLoader.
    dataset_loader = DatasetLoader()
    X_train, y_train = dataset_loader.load_train()
    X_test, y_test = dataset_loader.load_test()

    # TF-IDF features feeding a random forest, wrapped in a single scikit-learn pipeline.
    clf = Pipeline([("preprocessing", TfidfVectorizer(**preprocess_params)),
                    ("classifier",
                     RandomForestClassifier(n_jobs=-1, **classifier_params))])

    clf.fit(X_train, y_train)

    # Evaluate on the test split and persist the results for this experiment run.
    result_storage = ResultStorage(ex, clf)
    result_storage.store_experiment_data(X_test, y_test)
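The two parameter dicts are expanded straight into the TfidfVectorizer and RandomForestClassifier constructors. As a rough illustration only (these values are assumptions, not the project's actual experiment configuration), they might look like:

preprocess_params = {"ngram_range": (1, 2), "max_features": 50000}
classifier_params = {"n_estimators": 300, "max_depth": 50}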
Example #2
def main(classifier_params):
    dataset_loader = DatasetLoader()
    x_train, y_train = dataset_loader.load_train()
    # Turn each raw sentence into a fixed-size vector using the pre-trained Word2Vec model.
    w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
    X_train = np.array(
        [get_sentence_embedding(w2v_model, sentence) for sentence in x_train])

    x_test, y_test = dataset_loader.load_test()
    X_test = np.array(
        [get_sentence_embedding(w2v_model, sentence) for sentence in x_test])

    # Support vector classifier trained on the sentence embeddings.
    clf = SVC(verbose=2, max_iter=10000, **classifier_params)

    clf.fit(X_train, y_train)
    result_storage = ResultStorage(ex, clf)
    result_storage.store_experiment_data(X_test, y_test)
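get_sentence_embedding is a project helper whose implementation is not shown in these snippets. A plausible reading, sketched here as an assumption (gensim 3.x API, out-of-vocabulary words skipped), is that it averages the Word2Vec vectors of the words in the sentence:

def get_sentence_embedding(model, sentence):
    # Average the vectors of all in-vocabulary tokens; fall back to a zero vector.
    tokens = [w for w in sentence.split() if w in model.wv.vocab]
    if not tokens:
        return np.zeros(model.wv.vector_size)
    return np.mean([model.wv[w] for w in tokens], axis=0)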
Example #3
def main(max_df):
    dataset_loader = DatasetLoader()
    X, y = dataset_loader.load_train()
    vectorizer = TfidfVectorizer(strip_accents='ascii',
                                 max_df=max_df,
                                 max_features=50000)
    X_train = vectorizer.fit_transform(X).todense()
    X_train, X_val, y_train, y_val = train_test_split(X_train, y)

    model = keras.models.Sequential([
        keras.layers.Dense(128, input_shape=(50000, ), activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(len(np.unique(y)), activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(X_train,
                        y_train,
                        epochs=15,
                        validation_data=(X_val, y_val))

    X_test, y_test = dataset_loader.load_test()
    # Apply the same fitted vectorizer to the test texts and densify, matching the training data.
    X_test = vectorizer.transform(X_test).todense()

    score = model.evaluate(X_test, y_test)
    print(score)

    # summarize history for accuracy
    # (newer tf.keras versions store these under 'accuracy' / 'val_accuracy' instead of 'acc' / 'val_acc')
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
Example #4
def main(model_conf, _run):
    dataset_loader = DatasetLoader()
    X, y = dataset_loader.load_train()

    # Embed each training sentence with the pre-trained Word2Vec model (3000-dimensional vectors,
    # matching the input_shape of the network below).
    w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
    x_w2v = np.array([get_sentence_embedding(w2v_model, sentence)
                      for sentence in X])

    X_train, X_val, y_train, y_val = train_test_split(x_w2v, y)
    model = keras.models.Sequential([
        keras.layers.Dense(128, input_shape=(3000,), activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(len(np.unique(y)), activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=20, batch_size=1024,
                        validation_data=(X_val, y_val))

    X_test, y_test = dataset_loader.load_test()
    w_test = np.array([get_sentence_embedding(w2v_model, sentence)
                       for sentence in X_test])

    score = model.evaluate(w_test, y_test)

    print(score)
    # summarize history for accuracy
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()
Example #5
def main(clf_params):
    dataset_loader = DatasetLoader()
    w2v_model = Word2Vec.load("word2vec.model")

    # Embed the train and test sentences with the pre-trained Word2Vec model.
    X_train, y_train = dataset_loader.load_train()
    X_train = preprocess_x(X_train, w2v_model)
    X_test, y_test = dataset_loader.load_test()
    X_test = preprocess_x(X_test, w2v_model)

    # 5-fold grid search over the SVC hyperparameters; refit the best model on the full training set.
    grid = GridSearchCV(SVC(),
                        clf_params,
                        n_jobs=-1,
                        cv=5,
                        verbose=2,
                        return_train_score=True,
                        refit=True)
    grid.fit(X_train, y_train)

    result_storage = ResultStorage(ex, grid)
    result_storage.store_experiment_data(X_test, y_test)
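For reference, a hypothetical clf_params grid for the SVC search above (illustrative values, not the original experiment configuration) could be:

clf_params = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto"],
}

GridSearchCV evaluates every combination with 5-fold cross-validation and, because refit=True, retrains the best one on the full training set before it is passed to ResultStorage.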
Example #6
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from utils import DatasetLoader

# This example finds the most "similar" review (in terms of words used) based on a user-generated review. This is done
# by creating a so-called bag-of-words model. Each unique word in the dataset is given an index in a vector. Each review
# is in turn transformed into a vector, where the value at each index represents how many times a specific word is
# present. For instance, the word "dress" may have index 159. If a review has the value 3 at index 159, "dress" is
# mentioned 3 times in the review. The vector representation is a format that can be taken as input by machine learning
# algorithms. (The code below uses a TfidfVectorizer, so the stored values are TF-IDF weights rather than raw counts,
# but the idea is the same.)
#
# In this task, we calculate the cosine similarity of two vectors, and obtain a metric of how similar they are. Note
# that this is in terms of which words are used -- the vectors have no notion of how the different words relate.
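#
# As a toy illustration (hypothetical reviews, not taken from the dataset): with the vocabulary
# (dress, fit, love), "dress dress fit" becomes [2, 1, 0] and "love dress" becomes [1, 0, 1].
# Their cosine similarity is (2*1 + 1*0 + 0*1) / (sqrt(5) * sqrt(2)) ≈ 0.63, i.e. a moderate overlap.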

# Load dataset
dataset = DatasetLoader.load_reviews()

# Transform each text in the dataset to its corresponding vector.
print("Vectorizing dataset")
vectorizer = TfidfVectorizer()
texts = [row.full_text() for row in dataset]
vectorized_dataset = vectorizer.fit_transform(texts)


# Method for finding the review in the dataset most similar to `query`.
def find_most_similar_review(query: str) -> str:
    # find the vector of the query
    vectorized_query = vectorizer.transform([query])

    # Transform all reviews' vectors into their cosine similarity to the query vector. This is a measurement of how
    # similar the vectors are.
    similarities = cosine_similarity(vectorized_query, vectorized_dataset).flatten()

    # Return the text of the review most similar to the query.
    return texts[similarities.argmax()]
Example #7
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

#
# Settings.
#

torch.cuda.set_device(4)
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#
# load datasets
#
batch_size = 32
bert_dim = 300
train_data = DatasetLoader('mr', set_name="train")
vocab = train_data.vocab
test_data = DatasetLoader('mr', set_name="test")
max_seq_len = train_data.nnodes
train_data_loader = DataLoader(dataset=train_data,
                               batch_size=batch_size,
                               shuffle=True)
test_data_loader = DataLoader(dataset=test_data,
                              batch_size=batch_size,
                              shuffle=True)
nhid = 300
vote_dim = 100
nclass = train_data.nclass()
input_cols = ['node_embeddings', 'dependency_graph', 'polarity']

#
Example #8
                                               'real_A': real_A,
                                               'real_B': real_B,
                                               'fake_A': fake_A,
                                               'fake_B': fake_B
                                           })

                loss_dict['loss_D'] = loss_D
                loss_dict['loss_G'] = loss_G
                loss_dict['loss_G_GAN'] = loss_G_GAN
                loss_dict['loss_G_identity'] = loss_G_identity
                loss_dict['loss_G_cycle'] = loss_G_cycle

                # Update learning rates
        client.lr_update()

        return loss_dict


if __name__ == '__main__':

    clear.clear_records(if_clients=True, if_servers=True, if_logs=True)

    clients, server, config = init_federated()

    datasetLoader = DatasetLoader()
    datasetLoader.load_dataset_default()

    for client in clients:
        client.load_dataset_from_dir("clients/" + str(client.id) + "/dataset/")
    train_federated(config, clients, server)
Example #9
from utils import DatasetLoader
import gensim
from gensim.models import Word2Vec

dataset_loader = DatasetLoader()
X, y = dataset_loader.load_train()
print("loaded data")

# Uses the gensim 3.x API (in gensim 4.x the `size` argument is called `vector_size`).
# 3000-dimensional vectors, matching the input dimension used by the downstream classifiers.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=3000,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20)

# gensim expects each document as a list of tokens.
cleaned_text = [x.split() for x in X]
w2v_model.build_vocab(cleaned_text)

print("start training")
w2v_model.train(cleaned_text,
                total_examples=w2v_model.corpus_count,
                epochs=100)
w2v_model.save("word2vec_models/word2vec.model")