word_cnt = 0 correct = 0 for sentence in predicted: for row in sentence: word_cnt += 1 if row['chunk'] == row['pchunk']: correct += 1 return correct / word_cnt if __name__ == '__main__': column_names = ['form', 'pos', 'chunk'] train_file = '../../../corpus/conll2000/train.txt' test_file = '../../../corpus/conll2000/test.txt' train_corpus = conll_reader.read_sentences(train_file) train_corpus = conll_reader.split_rows(train_corpus, column_names) test_corpus = conll_reader.read_sentences(test_file) test_corpus = conll_reader.split_rows(test_corpus, column_names) model = train(train_corpus) predicted = predict(model, test_corpus) accuracy = eval(predicted) print("Accuracy", accuracy) f_out = open('out', 'w', newline='\n') # We write the word (form), part of speech (pos), # gold-standard chunk (chunk), and predicted chunk (pchunk) for sentence in predicted: for row in sentence: f_out.write(row['form'] + ' ' + row['pos'] + ' ' + row['chunk'] +
rows = [rows[i] + ' ' + y_test_predicted[i] for i in range(len(rows))] for row in rows: f_out.write(row + '\n') f_out.write('\n') f_out.close() if __name__ == '__main__': start_time = time.clock() train_corpus = '../../../corpus/conll2000/train.txt' test_corpus = '../../../corpus/conll2000/test.txt' w_size = 2 # The size of the context window to the left and right of the word feature_names = ['word_n2', 'word_n1', 'word', 'word_p1', 'word_p2', 'pos_n2', 'pos_n1', 'pos', 'pos_p1', 'pos_p2'] train_sentences = conll_reader.read_sentences(train_corpus) print("Extracting the features...") X_dict, y = extract_features(train_sentences, w_size, feature_names) print("Encoding the features...") # Vectorize the feature matrix and carry out a one-hot encoding vec = DictVectorizer(sparse=True) X = vec.fit_transform(X_dict) # The statement below will swallow a considerable memory # X = vec.fit_transform(X_dict).toarray() # print(vec.get_feature_names()) training_start_time = time.clock() print("Training the model...") classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
if __name__ == '__main__':
    # Entry point (visible portion): read the training corpus, extract the
    # windowed features, and one-hot encode them ahead of training.

    # time.clock() was removed in Python 3.8. Keep using it where it still
    # exists so timings stay comparable with any other time.clock() calls in
    # this script, and fall back to time.perf_counter() otherwise.
    timer = time.clock if hasattr(time, 'clock') else time.perf_counter
    start_time = timer()
    train_corpus = '../Corpus/train.txt'
    test_corpus = '../Corpus/test.txt'
    w_size = 2  # The size of the context window to the left and right of the word

    # Alternative feature sets kept for reference:
    # feature_names = ['word_n2', 'word_n1', 'word', 'word_p1', 'word_p2',
    #                  'pos_n2', 'pos_n1', 'pos', 'pos_p1', 'pos_p2']
    # feature_names = ['pos_n2', 'pos_n1', 'pos', 'pos_p1', 'pos_p2']
    feature_names = [
        'word_n2', 'word_n1', 'word', 'word_p1', 'word_p2',
        'pos_n2', 'pos_n1', 'pos', 'pos_p1', 'pos_p2',
        'prev_tag_n2', 'prev_tag_n1',
    ]
    train_sentences = conll_reader.read_sentences(train_corpus)

    print("Extracting the features...")
    X_dict, y = extract_features(train_sentences, w_size, feature_names)
    # It is fine that X_dict has the previous 2 chunk tags in its feature
    # vectors: this is training data, and the model needs them to train.

    print("Encoding the features...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(X_dict)
    # The statement below will swallow a considerable amount of memory
    # X = vec.fit_transform(X_dict).toarray()
    # print(vec.get_feature_names())

    training_start_time = timer()
    print("Training the model...")
word_cnt = 0 correct = 0 for sentence in predicted: for row in sentence: word_cnt += 1 if row['chunk'] == row['pchunk']: correct += 1 return correct / word_cnt if __name__ == '__main__': column_names = ['form', 'pos', 'chunk'] train_file = '../../corpus/conll2000/train.txt' test_file = '../../corpus/conll2000/test.txt' train_corpus = conll_reader.read_sentences(train_file) train_corpus = conll_reader.split_rows(train_corpus, column_names) test_corpus = conll_reader.read_sentences(test_file) test_corpus = conll_reader.split_rows(test_corpus, column_names) model = train(train_corpus) predicted = predict(model, test_corpus) accuracy = eval(predicted) print("Accuracy", accuracy) f_out = open('out', 'w') # We write the word (form), part of speech (pos), # gold-standard chunk (chunk), and predicted chunk (pchunk) for sentence in predicted: for row in sentence: f_out.write(row['form'] + ' ' + row['pos'] + ' ' +