Example #1
# DatasetLoader comes from the project's utils module (see the training script
# at the end); ResultStorage and the experiment object `ex` are likewise
# project-specific and defined elsewhere.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from utils import DatasetLoader


def main(preprocess_params, classifier_params):
    dataset_loader = DatasetLoader()
    X_train, y_train = dataset_loader.load_train()
    X_test, y_test = dataset_loader.load_test()

    # Chain TF-IDF features and a random forest into a single estimator.
    clf = Pipeline([("preprocessing", TfidfVectorizer(**preprocess_params)),
                    ("classifier",
                     RandomForestClassifier(n_jobs=-1, **classifier_params))])

    clf.fit(X_train, y_train)
    result_storage = ResultStorage(ex, clf)
    result_storage.store_experiment_data(X_test, y_test)
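These main functions take their parameters as plain arguments and reference an experiment object `ex` (and `_run` in example #4), which suggests they are Sacred experiment entry points with injected configuration. A minimal sketch of what such a setup might look like; the experiment name and config values below are made up for illustration:

from sacred import Experiment

ex = Experiment("text_classification")  # hypothetical name


@ex.config
def config():
    # Hypothetical defaults; the real experiments supply their own.
    preprocess_params = {"max_features": 50000}
    classifier_params = {"n_estimators": 100}


@ex.automain
def main(preprocess_params, classifier_params):
    ...  # body as in example #1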
Example #2
import numpy as np
from gensim.models import Word2Vec
from sklearn.svm import SVC

from utils import DatasetLoader


def main(classifier_params):
    dataset_loader = DatasetLoader()
    x_train, y_train = dataset_loader.load_train()
    # Embed every sentence with a pre-trained Word2Vec model;
    # get_sentence_embedding is sketched after this example.
    w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
    X_train = np.array(
        [get_sentence_embedding(w2v_model, sentence) for sentence in x_train])

    x_test, y_test = dataset_loader.load_test()
    X_test = np.array(
        [get_sentence_embedding(w2v_model, sentence) for sentence in x_test])

    clf = SVC(verbose=2, max_iter=10000, **classifier_params)

    clf.fit(X_train, y_train)
    result_storage = ResultStorage(ex, clf)
    result_storage.store_experiment_data(X_test, y_test)
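get_sentence_embedding is project code that is not shown on this page. A minimal sketch of a typical implementation, assuming it averages the Word2Vec vectors of the tokens the model knows; the tokenization and fallback behavior are assumptions, not taken from the project:

import numpy as np


def get_sentence_embedding(w2v_model, sentence):
    # Hypothetical helper: average the vectors of in-vocabulary tokens,
    # falling back to a zero vector when no token is known.
    tokens = [t for t in sentence.split() if t in w2v_model.wv]
    if not tokens:
        return np.zeros(w2v_model.wv.vector_size)
    return np.mean([w2v_model.wv[t] for t in tokens], axis=0)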
Example #3
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow import keras

from utils import DatasetLoader


def main(max_df):
    dataset_loader = DatasetLoader()
    X, y = dataset_loader.load_train()
    vectorizer = TfidfVectorizer(strip_accents='ascii',
                                 max_df=max_df,
                                 max_features=50000)
    # toarray() rather than todense(): Keras expects an ndarray, not np.matrix.
    X_train = vectorizer.fit_transform(X).toarray()
    X_train, X_val, y_train, y_val = train_test_split(X_train, y)

    model = keras.models.Sequential([
        # Size the input to the actual feature count; TfidfVectorizer only
        # yields max_features columns when the vocabulary is at least that big.
        keras.layers.Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(len(np.unique(y)), activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(X_train,
                        y_train,
                        epochs=15,
                        validation_data=(X_val, y_val))

    X_test, y_test = dataset_loader.load_test()
    # Densify the test matrix the same way as the training matrix.
    X_test = vectorizer.transform(X_test).toarray()

    score = model.evaluate(X_test, y_test)
    print(score)

    # summarize history for accuracy (tf.keras names these history keys
    # 'accuracy'/'val_accuracy'; standalone Keras used 'acc'/'val_acc')
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
Example #4
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from tensorflow import keras

from utils import DatasetLoader


def main(model_conf, _run):
    dataset_loader = DatasetLoader()
    X, y = dataset_loader.load_train()

    w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
    x_w2v = np.array([get_sentence_embedding(w2v_model, sentence)
                      for sentence in X])

    X_train, X_val, y_train, y_val = train_test_split(x_w2v, y)
    model = keras.models.Sequential([
        # 3000 matches the `size` used to train the Word2Vec model (see the
        # training script at the end).
        keras.layers.Dense(128, input_shape=(3000,), activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(len(np.unique(y)), activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=20, batch_size=1024,
                        validation_data=(X_val, y_val))

    X_test, y_test = dataset_loader.load_test()
    w_test = np.array([get_sentence_embedding(w2v_model, sentence)
                       for sentence in X_test])

    score = model.evaluate(w_test, y_test)

    print(score)
    # summarize history for accuracy (tf.keras history keys)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
Example #5
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from utils import DatasetLoader


def main(clf_params):
    dataset_loader = DatasetLoader()
    # Note: unlike the other examples, this one loads the model from the
    # working directory rather than from word2vec_models/.
    w2v_model = Word2Vec.load("word2vec.model")

    X_train, y_train = dataset_loader.load_train()
    X_train = preprocess_x(X_train, w2v_model)  # sketched after this example
    X_test, y_test = dataset_loader.load_test()
    X_test = preprocess_x(X_test, w2v_model)

    # Exhaustive 5-fold grid search over clf_params; refit=True retrains the
    # best estimator on the full training set, so `grid` predicts directly.
    grid = GridSearchCV(SVC(),
                        clf_params,
                        n_jobs=-1,
                        cv=5,
                        verbose=2,
                        return_train_score=True,
                        refit=True)
    grid.fit(X_train, y_train)

    result_storage = ResultStorage(ex, grid)
    result_storage.store_experiment_data(X_test, y_test)
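preprocess_x is also project code that is not shown here. A plausible sketch, assuming it simply stacks one averaged Word2Vec embedding per sentence, reusing the hypothetical get_sentence_embedding from above:

import numpy as np


def preprocess_x(sentences, w2v_model):
    # Hypothetical: one averaged embedding per sentence, as in examples 2 and 4.
    return np.array([get_sentence_embedding(w2v_model, s) for s in sentences])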
The Word2Vec model loaded in the examples above is produced by the following training script:

from gensim.models import Word2Vec

from utils import DatasetLoader

dataset_loader = DatasetLoader()
X, y = dataset_loader.load_train()
print("loaded data")

# gensim 3.x API: in gensim 4+, `size` was renamed to `vector_size`.
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=3000,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20)

cleaned_text = [x.split() for x in X]
w2v_model.build_vocab(cleaned_text)

print("start training")
w2v_model.train(cleaned_text,
                total_examples=w2v_model.corpus_count,
                epochs=100)
w2v_model.save("word2vec_models/word2vec.model")