def load_train_and_test_bow(train_ixes, test_ixes,
                            top_n_to_inc=None, include_auto_labeled=None):
    # total of 39
    X, Y = load_subtask1_data(train_ixes, tokenized_folder=tokenized_dir)

    #X_auto = open(os.path.join(ext_data_dir,
    #                           'top_3_auto_labeled_from_brown_external.txt')).readlines()
    if top_n_to_inc is not None:
        X_auto, Y_auto = load_subtask1_brown_auto_labeled(top_n=top_n_to_inc)
        #p = os.path.join(ext_data_dir, include_auto_labeled)
        #X_auto = open(p, encoding='utf-8').readlines()
        #Y_auto = np.ones(len(X_auto))
        X = np.concatenate([X, X_auto])
        Y = np.concatenate([Y, Y_auto])

    X_test, Y_test = load_subtask1_data(test_ixes, tokenized_folder=tokenized_dir)

    # Fit the bag-of-words vocabulary on the training sentences only,
    # then reuse it to transform the test sentences
    cvec = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=0)
    cvec_X = cvec.fit_transform(X).toarray()
    cvec_X_test = cvec.transform(X_test).toarray()

    return cvec_X, Y, cvec_X_test, Y_test
def fit_committee(models=None, feature_type='bow'):
    if models is None:
        # Default committee: naive Bayes, gradient boosting, and a decision tree
        models = dict(nb=MultinomialNB(alpha=3.25),
                      gb=GradientBoostingClassifier(n_estimators=170,
                                                    max_depth=5,
                                                    learning_rate=0.5,
                                                    min_samples_leaf=3,
                                                    min_samples_split=4),
                      dt=DecisionTreeClassifier(criterion='gini',
                                                max_depth=25,
                                                max_leaf_nodes=None,
                                                min_samples_leaf=3,
                                                min_samples_split=4))

    raw_X, Y = load_subtask1_data(list(range(53)))

    if feature_type == 'bow':
        cvec = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3)
        X = cvec.fit_transform(raw_X).toarray()
    else:
        raise ValueError("Unsupported feature_type: %s" % feature_type)

    # Soft voting averages the predicted class probabilities of the estimators
    vote_clf = VotingClassifier(estimators=[(k, v) for k, v in models.items()],
                                n_jobs=3, flatten_transform=False,
                                voting='soft').fit(X, Y)
    return vote_clf
def build_train_count_vectorizer(raw_data=None):
    if raw_data is None:
        raw_data, _ = load_subtask1_data(list(range(53)))
    cvec = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3)
    return cvec.fit(raw_data)
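
# A hedged usage sketch, not part of the original source: one way the two helpers
# above might be combined to score new sentences. It assumes fit_committee() and
# build_train_count_vectorizer() are fit on the same files (0-52) with the same
# CountVectorizer settings, so the transformed features line up with what the
# committee was trained on. predict_with_committee_example is a hypothetical name.
def predict_with_committee_example(new_sentences):
    cvec = build_train_count_vectorizer()         # CountVectorizer fit on the training files
    vote_clf = fit_committee(feature_type='bow')  # soft-voting NB / GBM / decision-tree committee
    new_X = cvec.transform(new_sentences).toarray()
    # Soft voting averages the per-estimator class probabilities
    return vote_clf.predict(new_X), vote_clf.predict_proba(new_X)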
def run():
    np.random.seed(42)
    file_ixs = list(range(39))
    np.random.shuffle(file_ixs)

    X, Y = load_subtask1_data(file_ixs[:25])
    X_test, Y_test = load_subtask1_data(file_ixs[25:35])

    tokenizer, sequences = preprocessing.tokenize_texts(X, nb_words=1000)
    # Pad the training sequences to the same fixed length as the test sequences
    sequences = pad_sequences(sequences, maxlen=100)

    test_sequences = tokenizer.texts_to_sequences(X_test)
    # TODO: replace with pad_packed_sequence in torch?
    test_sequences = pad_sequences(test_sequences, maxlen=100)

    np_embeddings = loaders.load_glove_wiki_embedding(tokenizer.word_index)

    ######
    print("Torch Time")
    torch_Y = torch.from_numpy(np.array(Y).astype('float32'))
    torch_Y = Variable(torch_Y)
    torch_sequences = torch.LongTensor(sequences.astype('long'))
    var_torch_sequences = Variable(torch_sequences)

    ###
    torch_Y_test = torch.from_numpy(np.array(Y_test).astype('float32'))
    torch_Y_test = Variable(torch_Y_test)
    torch_test_sequences = torch.LongTensor(test_sequences.astype('long'))
    var_torch_test_sequences = Variable(torch_test_sequences)

    # Try various learning rates
    res = {
        lr: train(Task1TorchRNN(np_embeddings, n_lstm_layers=2, hidden_dim=30),
                  var_torch_sequences, torch_Y,
                  var_torch_test_sequences, torch_Y_test, lr=lr)
        for lr in [0.0025, 0.003, 0.005, 0.01, 0.001]
    }
    print(res)
def load_train_and_test_bow(train_ixes, test_ixes,
                            top_n_to_inc=None, include_auto_labeled=None,
                            resample=None):
    # total of 39
    X, Y = load_subtask1_data(train_ixes, tokenized_folder=tokenized_dir)

    #X_auto = open(os.path.join(ext_data_dir,
    #                           'top_3_auto_labeled_from_brown_external.txt')).readlines()
    if top_n_to_inc is not None:
        #X_auto, Y_auto = load_subtask1_brown_auto_labeled(top_n=top_n_to_inc)
        X_auto, Y_auto = load_auto_labeled(top_n=top_n_to_inc)
        #p = os.path.join(ext_data_dir, include_auto_labeled)
        #X_auto = open(p, encoding='utf-8').readlines()
        #Y_auto = np.ones(len(X_auto))
        X = np.concatenate([X, X_auto])
        Y = np.concatenate([Y, Y_auto])

    if resample is not None:
        print("Resample N: %s" % str(resample))
        # Easiest to just load into a DF for group-by-then-sample
        _df = pd.DataFrame(X)
        _df['target'] = Y
        rs_df = _df.groupby('target').apply(
            lambda df: df.sample(resample, replace=True))
        X = rs_df.drop('target', axis=1).values.reshape(-1)
        Y = rs_df['target'].values.reshape(-1)

    X_test, Y_test = load_subtask1_data(test_ixes, tokenized_folder=tokenized_dir)

    cvec = CountVectorizer(ngram_range=(1, 2), stop_words='english', min_df=0)
    cvec_X = cvec.fit_transform(X).toarray()
    cvec_X_test = cvec.transform(X_test).toarray()

    return cvec_X, Y, cvec_X_test, Y_test
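
# A hedged usage sketch, not part of the original source: a minimal evaluation of a
# single model on the bag-of-words split returned by load_train_and_test_bow above.
# The train/test index split, top_n_to_inc=3, resample=200, and the MultinomialNB
# baseline are illustrative assumptions, not values taken from the original experiments.
def evaluate_bow_split_example():
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.metrics import accuracy_score, f1_score

    train_ixes, test_ixes = list(range(30)), list(range(30, 39))  # hypothetical split
    X, Y, X_test, Y_test = load_train_and_test_bow(train_ixes, test_ixes,
                                                   top_n_to_inc=3, resample=200)
    clf = MultinomialNB(alpha=3.25).fit(X, Y)
    preds = clf.predict(X_test)
    return {'acc': accuracy_score(Y_test, preds), 'f1': f1_score(Y_test, preds)}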
def load_data(embedding_dim=100, return_holdout=False):
    file_ixs = list(range(65))
    X, Y = load_subtask1_data(file_ixs[:40])

    # Add in auto labeled
    X_auto, Y_auto = load_subtask1_brown_auto_labeled()
    X = np.concatenate([X, X_auto])
    Y = np.concatenate([Y, Y_auto])

    ix = list(range(len(X)))
    np.random.shuffle(ix)
    X = X[ix]
    Y = Y[ix]

    X_test, Y_test = load_subtask1_data(file_ixs[40:53])
    test_ix = list(range(len(X_test)))
    np.random.shuffle(test_ix)
    X_test = X_test[test_ix]
    Y_test = Y_test[test_ix]

    tokenizer, sequences = preprocessing.tokenize_texts(X, nb_words=5000)
    # TODO: Masking implementation rather than padding?
    sequences = pad_sequences(sequences, maxlen=100)

    test_sequences = tokenizer.texts_to_sequences(X_test)
    test_sequences = pad_sequences(test_sequences, maxlen=100)

    embeddings = loaders.load_glove_wiki_embedding(tokenizer.word_index,
                                                   embedding_dim=embedding_dim)

    if return_holdout:
        X_holdout, Y_holdout = load_subtask1_data(file_ixs[53:])
        holdout_sequences = tokenizer.texts_to_sequences(X_holdout)
        holdout_sequences = pad_sequences(holdout_sequences, maxlen=100)
        return (embeddings, sequences, Y, test_sequences, Y_test,
                holdout_sequences, Y_holdout)
    else:
        return embeddings, sequences, Y, test_sequences, Y_test
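
# A hedged usage sketch, not part of the original source: wrapping load_data's output
# in torch tensors the same way the run() functions in this repo do, then handing it
# to train()/Task1TorchRNN. hidden_dim=30 and lr=0.001 are placeholder values, not
# the settings used in the original experiments.
def load_data_to_torch_example():
    import torch
    from torch.autograd import Variable

    embeddings, sequences, Y, test_sequences, Y_test = load_data(embedding_dim=100)

    var_X = Variable(torch.LongTensor(np.array(sequences).astype('long')))
    var_Y = Variable(torch.from_numpy(np.array(Y).astype('float32')))
    var_X_test = Variable(torch.LongTensor(np.array(test_sequences).astype('long')))
    var_Y_test = Variable(torch.from_numpy(np.array(Y_test).astype('float32')))

    model = Task1TorchRNN(embeddings, n_lstm_layers=2, hidden_dim=30)
    return train(model, var_X, var_Y, var_X_test, var_Y_test, lr=0.001)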
def run():
    np.random.seed(42)
    file_ixs = list(range(39))
    np.random.shuffle(file_ixs)

    X, Y = load_subtask1_data(file_ixs[:20])
    X_test, Y_test = load_subtask1_data(file_ixs[20:30])

    # Turn sentences into sequences of integers, with each integer representing
    # a word. Only the top-occurring nb_words are kept
    tokenizer, sequences = preprocessing.tokenize_texts(X, nb_words=1000)
    sequences = pad_sequences(sequences, maxlen=50)

    # Tokenizer does conversion on new texts - treat test as unseen
    test_sequences = tokenizer.texts_to_sequences(X_test)
    # TODO: replace with pad_packed_sequence in torch?
    test_sequences = pad_sequences(test_sequences, maxlen=50)

    np_embeddings = loaders.load_glove_wiki_embedding(tokenizer.word_index)

    ######
    print("Torch Time")
    torch_Y = torch.from_numpy(np.array(Y).astype('float32'))
    torch_Y = Variable(torch_Y)

    #torch_sequences = torch.LongTensor(np.array([np.array(s).astype('int')
    #                                             for s in sequences]))
    #torch_sequences = [torch.LongTensor(np.array(s).astype('long'))
    #                   for s in sequences]
    # Need to pad if doing this way
    torch_sequences = torch.LongTensor(np.array(sequences).astype('long'))
    var_torch_sequences = Variable(torch_sequences)

    ###
    torch_Y_test = torch.from_numpy(np.array(Y_test).astype('float32'))
    torch_Y_test = Variable(torch_Y_test)
    torch_test_sequences = torch.LongTensor(test_sequences.astype('long'))
    var_torch_test_sequences = Variable(torch_test_sequences)

    # Try various learning rates
    res = dict()
    for lr in [0.001, 0.0009, 0.0011]:  # [0.0001, 0.00017, 0.00025, 0.0035]:
        print("=" * 20)
        print(lr)
        res[lr] = train(Task1TorchRNN(np_embeddings, n_lstm_layers=2, hidden_dim=10),
                        var_torch_sequences, torch_Y,
                        var_torch_test_sequences, torch_Y_test,
                        lr=lr, weight_decay=0.0031)
        print(max([perf['val_acc'] for perf in res[lr]]))

    for lr, r in res.items():
        print("lr: %f" % lr)
        print(max([perf['val_acc'] for perf in r]))

    fname = './task1_lstm_torch_results_%d.pkl' % int(time.time())
    print("Saving results to %s" % fname)
    with open(fname, 'wb') as f:
        pickle.dump(res, f)
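
# A hedged sketch, not part of the original source: reading back the per-learning-rate
# results that run() pickles and printing the best validation accuracy for each lr.
# The result structure (a dict mapping lr to a list of per-epoch dicts with a
# 'val_acc' key) is taken from run() above; the filename is whatever run() printed.
def summarize_saved_results_example(fname):
    with open(fname, 'rb') as f:
        res = pickle.load(f)
    for lr, perfs in sorted(res.items()):
        print("lr=%f best val_acc=%f" % (lr, max(p['val_acc'] for p in perfs)))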