def feats_to_vec(features):
    """Convert raw text into a normalized bigram-frequency vector.

    :param features: raw text, passed to utils.text_to_bigrams.
    :return: numpy vector of length len(utils.vocab) holding the relative
        frequency of each in-vocabulary bigram; all zeros if the text
        yields no bigrams at all.
    """
    bigrams = utils.text_to_bigrams(features)
    feature_vector = np.zeros(len(utils.vocab))
    # Fix: the original divided by len(bigrams) unconditionally, which
    # turns the whole vector into NaNs (numpy divide-by-zero) on empty input.
    if not bigrams:
        return feature_vector
    for b in bigrams:
        if b in utils.vocab:  # out-of-vocabulary bigrams are ignored
            feature_vector[utils.F2I[b]] += 1
    return feature_vector / len(bigrams)
def feats_to_vec(features):
    """Convert raw text into a numpy vector of bigram counts.

    :param features: raw text, passed to ut.text_to_bigrams.
    :return: numpy vector of length len(ut.F2I) with raw bigram counts.
    """
    counters = defaultdict(int)
    bigrams = ut.text_to_bigrams(features)
    for bigram in bigrams:
        # Fix: the original did ut.F2I(bigram) — but F2I is a feature->index
        # dict (see the sibling variants), so calling it raises TypeError.
        # Also skip bigrams that are not in the vocabulary.
        if bigram in ut.F2I:
            counters[ut.F2I[bigram]] += 1
    # Fix: the original returned the raw dict despite its own comment
    # ("Should return a numpy vector of features"); materialize the vector.
    feat_vec = np.zeros(len(ut.F2I))
    for idx, count in counters.items():
        feat_vec[idx] = count
    return feat_vec
def feats_to_vec(features):
    """Convert raw text into a bigram-frequency vector normalized by the
    number of in-vocabulary bigrams.

    :param features: raw text, passed to ut.text_to_bigrams.
    :return: numpy vector of length len(ut.F2I); all zeros when no bigram
        of the text appears in the vocabulary.
    """
    bigrams = ut.text_to_bigrams(features)
    # np.zeros already returns an ndarray; the original's extra np.array(...)
    # wrapper was redundant.
    feat_vec = np.zeros(len(ut.F2I))
    matches_counter = 0
    for bigram in bigrams:
        if bigram in ut.F2I:
            feat_vec[ut.F2I[bigram]] += 1
            matches_counter += 1
    # Fix: the original divided by matches_counter unconditionally, which
    # produces NaNs (numpy divide-by-zero) when nothing matched the vocab.
    if matches_counter == 0:
        return feat_vec
    return feat_vec / matches_counter
def feats_to_vec(features):
    """Return a numpy vector of bigram frequencies for *features*,
    normalized by the number of in-vocabulary bigrams.

    :param features: raw text, passed to ut.text_to_bigrams.
    :return: numpy vector of length len(ut.F2I); all zeros when no bigram
        of the text appears in the vocabulary.
    """
    feats = ut.text_to_bigrams(features)
    # np.zeros already returns an ndarray; the original's extra np.array(...)
    # wrapper was redundant.
    feat_vec = np.zeros(len(ut.F2I))
    matches = 0
    for bigram in feats:
        if bigram in ut.F2I:
            feat_vec[ut.F2I[bigram]] += 1
            matches += 1
    # Fix: the original divided by matches unconditionally, which produces
    # NaNs (numpy divide-by-zero) when nothing matched the vocabulary.
    if matches == 0:
        return feat_vec
    return feat_vec / matches
def feats_to_vec(features):
    """Calculates most common features histogram.

    :param features: raw text, passed to utils.text_to_bigrams.
    :return: numpy vector of length len(utils.vocab) with the relative
        frequency of each in-vocabulary bigram; all zeros if the text
        yields no bigrams.
    """
    bigrams = utils.text_to_bigrams(features)
    feat_vec = np.zeros(len(utils.vocab))
    # Fix: the original divided by len(bigrams) unconditionally, which
    # turns the whole vector into NaNs (numpy divide-by-zero) on empty input.
    if not bigrams:
        return feat_vec
    for b in bigrams:
        if b in utils.vocab:  # out-of-vocabulary bigrams are ignored
            feat_vec[utils.F2I[b]] += 1
    return feat_vec / len(bigrams)
def create_test_pred_file(test_data, params):
    """Predict a label for every test example and write the predictions,
    one label per line, to "test.pred".

    NOTE(review): the def line was reconstructed from the call site
    ``create_test_pred_file(TEST, trained_params)`` below — confirm the
    parameter names against the original source.

    :param test_data: iterable of (label, features) pairs; the gold label
        is ignored, only the features are used.
    :param params: trained classifier parameters, passed to ll.predict.
    :return: None
    """
    # Invert L2I once instead of scanning L2I.items() for every example.
    # Fix: the original also fell back to writing the GOLD label when no
    # index matched the prediction, silently masking bad predictions;
    # now an unknown index fails loudly with a KeyError.
    i2l = {i: l for l, i in utils.L2I.items()}
    # Fix: use a context manager so the file is closed even on error.
    with open("test.pred", 'w') as f:
        for _, features in test_data:
            x = feats_to_vec(features)
            y_hat = ll.predict(x, params)
            f.write(i2l[y_hat] + "\n")


LR = 0.001
NUM_ITERATIONS = 30

if __name__ == '__main__':
    # Load the train and dev sets, build the classifier, train it, and
    # write the predictions for the test set.
    params = ll.create_classifier(len(utils.F2I), len(utils.L2I))
    trained_params = train_classifier(utils.TRAIN, utils.DEV,
                                      NUM_ITERATIONS, LR, params)
    TEST = [(l, utils.text_to_bigrams(t)) for l, t in utils.read_data("test")]
    create_test_pred_file(TEST, trained_params)
W, b = params params[0] = grad_W*learning_rate + params[0] params[1] = train_loss = cum_loss / len(train_data) train_accuracy = accuracy_on_dataset(train_data, params) #dev_accuracy = accuracy_on_dataset(dev_data, params) #print (I, train_loss, train_accuracy, dev_accuracy) return params if __name__ == '__main__': # YOUR CODE HERE # write code to load the train and dev sets, set up whatever you need, # and call train_classifier. TRAIN = [(l, text_to_bigrams(t)) for l, t in read_data(open('train', 'r'))] train_vocab = vocabu(TRAIN) L2I = {l: i for i, l in enumerate(list(sorted(set([l for l, t in TRAIN]))))} F2I = {f: i for i, f in enumerate(list(sorted(vocabu(TRAIN))))} features=[] for idx in range(len(TRAIN)): #take the features out of TRAIN features.append(TRAIN[idx][-1]) ##print(F2I[features[1][3]]) features_vec=feats_to_vec(features,F2I) y=labels_to_y(TRAIN,L2I) in_dim = len(F2I) out_dim = len(L2I) params = create_classifier(in_dim, out_dim) # y=y.reshape(-1)
def train_classifier(train_data, dev_data, num_iterations, learning_rate, params):
    """Train a classifier with stochastic gradient descent.

    NOTE(review): the def line was reconstructed from the call site
    ``train_classifier(TRAIN, DEV, 10000, 0.5, params)`` below — confirm
    the parameter names against the original source.

    train_data: list of (label, features) pairs.
    dev_data: held-out (label, features) pairs for per-epoch accuracy.
    num_iterations: number of passes over train_data.
    learning_rate: SGD step size.
    params: list of parameters (initial values)
    """
    for I in range(num_iterations):
        cum_loss = 0.0  # total loss in this iteration.
        random.shuffle(train_data)
        for label, features in train_data:
            x = feats_to_vec(features)  # convert features to a vector.
            y = label  # convert the label to number if needed.
            loss, grads = ll.loss_and_gradients(x, y, params)
            cum_loss += loss
            # Fix: the original did "params = params - learning_rate * grads",
            # which raises TypeError on Python lists; step each parameter
            # element-wise instead.
            params = [p - learning_rate * g for p, g in zip(params, grads)]
        train_loss = cum_loss / len(train_data)
        train_accuracy = accuracy_on_dataset(train_data, params)
        dev_accuracy = accuracy_on_dataset(dev_data, params)
        # Python-3 print (the original used the Python-2 print statement).
        print(I, train_loss, train_accuracy, dev_accuracy)
    return params


if __name__ == '__main__':
    TRAIN = [(l, ut.text_to_bigrams(t)) for l, t in ut.read_data("train")]
    DEV = [(l, ut.text_to_bigrams(t)) for l, t in ut.read_data("dev")]
    from collections import Counter
    fc = Counter()
    for l, feats in TRAIN:
        fc.update(feats)
    # Fix: create_classifier takes an input DIMENSION (an int, as the sibling
    # chunks pass len(F2I)), not the vocabulary set itself.
    # NOTE(review): out_dim is hard-coded to 12 — presumably the number of
    # label classes; confirm it equals len(ut.L2I).
    params = ll.create_classifier(len(ut.vocab), 12)
    trained_params = train_classifier(TRAIN, DEV, 10000, 0.5, params)