def load_train_and_test_bow(train_ixes,
                            test_ixes,
                            top_n_to_inc=None,
                            include_auto_labeled=None):  # unused in this variant
    # Load the labeled training sentences (39 tokenized files in total)
    X, Y = load_subtask1_data(train_ixes, tokenized_folder=tokenized_dir)

    if top_n_to_inc is not None:
        # Mix in the top-N auto-labeled Brown-corpus sentences as extra
        # training data
        X_auto, Y_auto = load_subtask1_brown_auto_labeled(top_n=top_n_to_inc)

        X = np.concatenate([X, X_auto])
        Y = np.concatenate([Y, Y_auto])

    X_test, Y_test = load_subtask1_data(test_ixes,
                                        tokenized_folder=tokenized_dir)

    # Fit the bag-of-words vocabulary (unigrams + bigrams) on the train
    # split only, then apply that same vocabulary to the test split
    cvec = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=0)

    cvec_X = cvec.fit_transform(X).toarray()
    cvec_X_test = cvec.transform(X_test).toarray()

    return cvec_X, Y, cvec_X_test, Y_test
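

def _example_bow_split():
    # Hypothetical usage sketch (not from the original source): hold out the
    # last 10 of the 39 tokenized files as the test split and mix in the
    # top-5 auto-labeled sentences; the 29/10 index split is illustrative.
    ixs = list(range(39))
    return load_train_and_test_bow(ixs[:29], ixs[29:], top_n_to_inc=5)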
def fit_committee(models=None, feature_type='bow'):
    if models is None:
        models = dict(nb=MultinomialNB(alpha=3.25),
                      gb=GradientBoostingClassifier(n_estimators=170,
                                                    max_depth=5,
                                                    learning_rate=0.5,
                                                    min_samples_leaf=3,
                                                    min_samples_split=4),
                      dt=DecisionTreeClassifier(criterion='gini',
                                                max_depth=25,
                                                max_leaf_nodes=None,
                                                min_samples_leaf=3,
                                                min_samples_split=4))

    raw_X, Y = load_subtask1_data(list(range(53)))
    if feature_type == 'bow':
        cvec = CountVectorizer(ngram_range=(1, 2),
                               stop_words='english',
                               min_df=3)

        X = cvec.fit_transform(raw_X).toarray()
    else:
        raise ValueError("Unsupported feature_type: %r" % feature_type)

    vote_clf = VotingClassifier(estimators=[(k, v) for k, v in models.items()],
                                n_jobs=3,
                                flatten_transform=False,
                                voting='soft').fit(X, Y)

    return vote_clf
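

def _example_committee_probs(texts):
    # Hypothetical usage sketch: with voting='soft' the committee averages
    # its members' class probabilities, so predict_proba is available.
    # This assumes build_train_count_vectorizer() reproduces the vocabulary
    # fit_committee() built internally (same corpus, same settings).
    clf = fit_committee()
    cvec = build_train_count_vectorizer()
    return clf.predict_proba(cvec.transform(texts).toarray())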
def build_train_count_vectorizer(raw_data=None):
    if raw_data is None:
        raw_data, _ = load_subtask1_data(list(range(53)))

    cvec = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3)

    return cvec.fit(raw_data)
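

def _example_vectorize_new_text():
    # Hypothetical usage sketch: fit the vocabulary once on the training
    # files, then reuse it to vectorize any new sentence consistently
    cvec = build_train_count_vectorizer()
    return cvec.transform(["an example sentence to vectorize"]).toarray()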
def run():
    np.random.seed(42)
    file_ixs = list(range(39))
    np.random.shuffle(file_ixs)

    X, Y = load_subtask1_data(file_ixs[:25])
    X_test, Y_test = load_subtask1_data(file_ixs[25:35])

    tokenizer, sequences = preprocessing.tokenize_texts(X, nb_words=1000)
    # Pad the train sequences to the same fixed length as the test
    # sequences below so they stack into a single tensor
    sequences = pad_sequences(sequences, maxlen=100)

    test_sequences = tokenizer.texts_to_sequences(X_test)
    # TODO: replace with pad_packed_sequence in torch?
    test_sequences = pad_sequences(test_sequences, maxlen=100)

    np_embeddings = loaders.load_glove_wiki_embedding(tokenizer.word_index)

    ######
    print("Torch Time")
    torch_Y = torch.from_numpy(np.array(Y).astype('float32'))
    torch_Y = Variable(torch_Y)

    torch_sequences = torch.LongTensor(sequences.astype('long'))
    var_torch_sequences = Variable(torch_sequences)

    ###
    torch_Y_test = torch.from_numpy(np.array(Y_test).astype('float32'))
    torch_Y_test = Variable(torch_Y_test)

    torch_test_sequences = torch.LongTensor(test_sequences.astype('long'))
    var_torch_test_sequences = Variable(torch_test_sequences)

    # Try various learning rates
    res = {
        lr: train(Task1TorchRNN(np_embeddings, n_lstm_layers=2, hidden_dim=30),
                  var_torch_sequences,
                  torch_Y,
                  var_torch_test_sequences,
                  torch_Y_test,
                  lr=lr)
        for lr in [0.0025, 0.003, 0.005, 0.01, 0.001]
    }

    print(res)
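

def _best_val_acc_per_lr(res):
    # Small helper sketch: train() is assumed to return a list of per-epoch
    # metric dicts with a 'val_acc' key (as the second run() below uses),
    # so report the best validation accuracy seen at each learning rate.
    return {lr: max(perf['val_acc'] for perf in perfs)
            for lr, perfs in res.items()}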
def load_train_and_test_bow(train_ixes,
                            test_ixes,
                            top_n_to_inc=None,
                            include_auto_labeled=None,  # unused in this variant
                            resample=None):
    # Load the labeled training sentences (39 tokenized files in total)
    X, Y = load_subtask1_data(train_ixes, tokenized_folder=tokenized_dir)

    if top_n_to_inc is not None:
        # Mix in the top-N auto-labeled sentences as extra training data
        X_auto, Y_auto = load_auto_labeled(top_n=top_n_to_inc)

        X = np.concatenate([X, X_auto])
        Y = np.concatenate([Y, Y_auto])

    if resample is not None:
        print("Resample N: %s" % str(resample))
        # Draw `resample` rows per class (with replacement) so the classes
        # end up balanced; a DataFrame makes the group-by-then-sample easy
        _df = pd.DataFrame(X)
        _df['target'] = Y
        rs_df = _df.groupby('target').apply(
            lambda df: df.sample(resample, replace=True))
        X = rs_df.drop('target', axis=1).values.reshape(-1)
        Y = rs_df['target'].values.reshape(-1)

    X_test, Y_test = load_subtask1_data(test_ixes,
                                        tokenized_folder=tokenized_dir)

    # Fit the bag-of-words vocabulary (unigrams + bigrams) on the train
    # split only, then apply that same vocabulary to the test split
    cvec = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=0)

    cvec_X = cvec.fit_transform(X).toarray()
    cvec_X_test = cvec.transform(X_test).toarray()

    return cvec_X, Y, cvec_X_test, Y_test
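

def _example_balanced_split():
    # Hypothetical usage sketch: resample=200 draws 200 sentences per class
    # (with replacement), giving a class-balanced bag-of-words training set;
    # the 29/10 file split is illustrative only.
    ixs = list(range(39))
    return load_train_and_test_bow(ixs[:29], ixs[29:], resample=200)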
def load_data(embedding_dim=100, return_holdout=False):
    # 65 labeled files in total: 40 train, 13 test, 12 holdout
    file_ixs = list(range(65))
    X, Y = load_subtask1_data(file_ixs[:40])

    # Add in auto labeled
    X_auto, Y_auto = load_subtask1_brown_auto_labeled()
    X = np.concatenate([X, X_auto])
    Y = np.concatenate([Y, Y_auto])

    # Shuffle so the appended auto-labeled examples are interleaved with
    # the original training data
    ix = list(range(len(X)))
    np.random.shuffle(ix)
    X = X[ix]
    Y = Y[ix]

    X_test, Y_test = load_subtask1_data(file_ixs[40:53])
    test_ix = list(range(len(X_test)))
    np.random.shuffle(test_ix)

    X_test = X_test[test_ix]
    Y_test = Y_test[test_ix]

    tokenizer, sequences = preprocessing.tokenize_texts(X, nb_words=5000)
    # TODO: Masking implementation rather than padding?
    sequences = pad_sequences(sequences, maxlen=100)

    test_sequences = tokenizer.texts_to_sequences(X_test)
    test_sequences = pad_sequences(test_sequences, maxlen=100)

    embeddings = loaders.load_glove_wiki_embedding(tokenizer.word_index,
                                                   embedding_dim=embedding_dim)

    if return_holdout:
        X_holdout, Y_holdout = load_subtask1_data(file_ixs[53:])
        holdout_sequences = tokenizer.texts_to_sequences(X_holdout)
        holdout_sequences = pad_sequences(holdout_sequences, maxlen=100)
        return embeddings, sequences, Y, test_sequences, Y_test, holdout_sequences, Y_holdout
    else:
        return embeddings, sequences, Y, test_sequences, Y_test
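

def _example_load_with_holdout():
    # Usage sketch: with return_holdout=True the final 12 files come back
    # as an extra split, tokenized and padded with the train-fit tokenizer
    (embeddings, seqs, Y,
     test_seqs, Y_test,
     holdout_seqs, Y_holdout) = load_data(embedding_dim=100,
                                          return_holdout=True)
    return holdout_seqs.shape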
def run():
    np.random.seed(42)
    file_ixs = list(range(39))
    np.random.shuffle(file_ixs)

    X, Y = load_subtask1_data(file_ixs[:20])
    X_test, Y_test = load_subtask1_data(file_ixs[20:30])

    # Turn sentences into sequences of integers, with each integer
    # representing a word. Only the top-occurring nb_words are kept
    tokenizer, sequences = preprocessing.tokenize_texts(X, nb_words=1000)

    sequences = pad_sequences(sequences, maxlen=50)

    # Tokenizer does conversion on new texts - treat test as unseen
    test_sequences = tokenizer.texts_to_sequences(X_test)
    # TODO: replace with pad_packed_sequence in torch?
    test_sequences = pad_sequences(test_sequences, maxlen=50)

    np_embeddings = loaders.load_glove_wiki_embedding(tokenizer.word_index)

    ######
    print("Torch Time")
    torch_Y = torch.from_numpy(np.array(Y).astype('float32'))
    torch_Y = Variable(torch_Y)

    # The sequences were padded to a fixed length above, so they stack
    # cleanly into a single LongTensor
    torch_sequences = torch.LongTensor(np.array(sequences).astype('long'))
    var_torch_sequences = Variable(torch_sequences)

    ###
    torch_Y_test = torch.from_numpy(np.array(Y_test).astype('float32'))
    torch_Y_test = Variable(torch_Y_test)

    torch_test_sequences = torch.LongTensor(test_sequences.astype('long'))
    var_torch_test_sequences = Variable(torch_test_sequences)

    # Try various learning rates
    res = dict()
    for lr in [0.001, 0.0009, 0.0011]:  #[0.0001, 0.00017, 0.00025, 0.0035]:
        print("=" * 20)
        print(lr)
        res[lr] = train(Task1TorchRNN(np_embeddings,
                                      n_lstm_layers=2,
                                      hidden_dim=10),
                        var_torch_sequences,
                        torch_Y,
                        var_torch_test_sequences,
                        torch_Y_test,
                        lr=lr,
                        weight_decay=0.0031)

        print(max([perf['val_acc'] for perf in res[lr]]))

    for lr, r in res.items():
        print("lr: %f" % lr)
        print(max([perf['val_acc'] for perf in r]))

    fname = './task1_lstm_torch_results_%d.pkl' % int(time.time())
    print("Saving results to %s" % fname)
    with open(fname, 'wb') as f:
        pickle.dump(res, f)
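

def _load_results(fname):
    # Usage sketch: reload a results pickle written by run(); keys are the
    # learning rates tried, values the per-epoch metrics from train()
    with open(fname, 'rb') as f:
        return pickle.load(f)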