Ejemplo n.º 1
0
    word_cnt = 0
    correct = 0
    for sentence in predicted:
        for row in sentence:
            word_cnt += 1
            if row['chunk'] == row['pchunk']:
                correct += 1
    return correct / word_cnt


if __name__ == '__main__':
    # Script entry point: load the train and test corpora, train a model,
    # predict on the test corpus, print the accuracy and open the output file.

    # Column labels handed to conll_reader.split_rows; the output loop below
    # accesses rows by these same keys ('form', 'pos', 'chunk').
    column_names = ['form', 'pos', 'chunk']
    # Paths are relative, so they resolve against the current working directory.
    train_file = '../../../corpus/conll2000/train.txt'
    test_file = '../../../corpus/conll2000/test.txt'

    # Both corpora go through the same two steps: read_sentences, then
    # split_rows with the column names.
    # NOTE(review): conll_reader is not visible in this excerpt -- presumably
    # split_rows yields, per sentence, a list of dicts keyed by column_names.
    train_corpus = conll_reader.read_sentences(train_file)
    train_corpus = conll_reader.split_rows(train_corpus, column_names)
    test_corpus = conll_reader.read_sentences(test_file)
    test_corpus = conll_reader.split_rows(test_corpus, column_names)

    model = train(train_corpus)

    # The rows in `predicted` are later read with the extra key 'pchunk'
    # (see the comment before the output loop).
    predicted = predict(model, test_corpus)
    # NOTE(review): `eval` here must be a function defined in this module
    # (its definition is not visible in this excerpt); if so, it shadows the
    # builtin eval() -- consider renaming it.
    accuracy = eval(predicted)
    print("Accuracy", accuracy)
    # newline='\n' disables newline translation on write, so the file gets
    # '\n' line endings on every platform.
    # NOTE(review): the file is opened without a `with` block; confirm it is
    # closed after the output loop.
    f_out = open('out', 'w', newline='\n')
    # We write the word (form), part of speech (pos),
    # gold-standard chunk (chunk), and predicted chunk (pchunk)
    for sentence in predicted:
        for row in sentence:
            f_out.write(row['form'] + ' ' + row['pos'] + ' ' + row['chunk'] +
Ejemplo n.º 2
0
        rows = [rows[i] + ' ' + y_test_predicted[i] for i in range(len(rows))]
        for row in rows:
            f_out.write(row + '\n')
        f_out.write('\n')
    f_out.close()


if __name__ == '__main__':
    # Script entry point: extract features from the training corpus, one-hot
    # encode them and create the logistic-regression classifier.

    # NOTE(review): time.clock() was deprecated in Python 3.3 and removed in
    # Python 3.8; on current interpreters this line raises AttributeError.
    # time.perf_counter() or time.process_time() are the documented
    # replacements -- change every call site in this script together.
    start_time = time.clock()
    # Despite their names, these two variables hold file paths, not corpora.
    train_corpus = '../../../corpus/conll2000/train.txt'
    test_corpus = '../../../corpus/conll2000/test.txt'
    w_size = 2  # The size of the context window to the left and right of the word
    # One name per feature: five word features and five part-of-speech features.
    # NOTE(review): presumably the _n2/_n1 and _p1/_p2 suffixes denote the two
    # positions before and after the current word, matching w_size = 2 -- confirm
    # against extract_features.
    feature_names = ['word_n2', 'word_n1', 'word', 'word_p1', 'word_p2',
                     'pos_n2', 'pos_n1', 'pos', 'pos_p1', 'pos_p2']

    train_sentences = conll_reader.read_sentences(train_corpus)

    print("Extracting the features...")
    # X_dict is fed to DictVectorizer below; y is presumably the list of
    # target labels (extract_features is not visible in this excerpt).
    X_dict, y = extract_features(train_sentences, w_size, feature_names)

    print("Encoding the features...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(X_dict)
    # The statement below will swallow a considerable memory
    # X = vec.fit_transform(X_dict).toarray()
    # print(vec.get_feature_names())

    # Second timestamp, taken just before training (same time.clock() caveat
    # as above).
    training_start_time = time.clock()
    print("Training the model...")
    # L2-penalised logistic regression solved by liblinear in its dual
    # formulation.
    # NOTE(review): the scikit-learn documentation recommends dual=False when
    # n_samples > n_features -- confirm dual=True is intended here.
    classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
Ejemplo n.º 3
0

if __name__ == '__main__':
    # Script entry point: extract features (including the two previous chunk
    # tags) from the training corpus and one-hot encode them before training.

    # NOTE(review): time.clock() was deprecated in Python 3.3 and removed in
    # Python 3.8; on current interpreters this line raises AttributeError.
    # time.perf_counter() or time.process_time() are the documented
    # replacements -- change every call site in this script together.
    start_time = time.clock()
    # Despite their names, these two variables hold file paths, not corpora.
    train_corpus = '../Corpus/train.txt'
    test_corpus = '../Corpus/test.txt'
    w_size = 2  # The size of the context window to the left and right of the word
    # Earlier feature sets, kept for reference:
    # feature_names = ['word_n2', 'word_n1', 'word', 'word_p1', 'word_p2',
    #                  'pos_n2', 'pos_n1', 'pos', 'pos_p1', 'pos_p2']
    # feature_names = ['pos_n2', 'pos_n1', 'pos', 'pos_p1', 'pos_p2']
    # Active feature set: word and part-of-speech features plus two
    # previous-tag features ('prev_tag_n2', 'prev_tag_n1').
    feature_names = [
        'word_n2', 'word_n1', 'word', 'word_p1', 'word_p2', 'pos_n2', 'pos_n1',
        'pos', 'pos_p1', 'pos_p2', 'prev_tag_n2', 'prev_tag_n1'
    ]

    train_sentences = conll_reader.read_sentences(train_corpus)

    print("Extracting the features...")
    X_dict, y = extract_features(train_sentences, w_size, feature_names)
    # It is fine that X_dict has the previous 2 chunk tags in its feature vectors:
    # this is the training set, and the model needs those tags to be trained.
    print("Encoding the features...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(X_dict)
    # The statement below will swallow a considerable memory
    # X = vec.fit_transform(X_dict).toarray()
    # print(vec.get_feature_names())

    # Second timestamp, taken just before training (same time.clock() caveat
    # as above).
    training_start_time = time.clock()
    print("Training the model...")
Ejemplo n.º 4
0
    word_cnt = 0
    correct = 0
    for sentence in predicted:
        for row in sentence:
            word_cnt += 1
            if row['chunk'] == row['pchunk']:
                correct += 1
    return correct / word_cnt


if __name__ == '__main__':
    # Script entry point: load the train and test corpora, train a model,
    # predict on the test corpus, print the accuracy and open the output file.

    # Column labels handed to conll_reader.split_rows; the output loop below
    # accesses rows by these keys.
    column_names = ['form', 'pos', 'chunk']
    # Paths are relative, so they resolve against the current working directory.
    train_file = '../../corpus/conll2000/train.txt'
    test_file = '../../corpus/conll2000/test.txt'

    # Both corpora go through the same two steps: read_sentences, then
    # split_rows with the column names.
    # NOTE(review): conll_reader is not visible in this excerpt -- presumably
    # split_rows yields, per sentence, a list of dicts keyed by column_names.
    train_corpus = conll_reader.read_sentences(train_file)
    train_corpus = conll_reader.split_rows(train_corpus, column_names)
    test_corpus = conll_reader.read_sentences(test_file)
    test_corpus = conll_reader.split_rows(test_corpus, column_names)

    model = train(train_corpus)

    # The rows in `predicted` are later read with the extra key 'pchunk'
    # (see the comment before the output loop).
    predicted = predict(model, test_corpus)
    # NOTE(review): `eval` here must be a function defined in this module
    # (its definition is not visible in this excerpt); if so, it shadows the
    # builtin eval() -- consider renaming it.
    accuracy = eval(predicted)
    print("Accuracy", accuracy)
    # Opened in text mode with default newline handling, so '\n' is translated
    # to the platform line separator on write.
    # NOTE(review): the file is opened without a `with` block; confirm it is
    # closed after the output loop.
    f_out = open('out', 'w')
    # We write the word (form), part of speech (pos),
    # gold-standard chunk (chunk), and predicted chunk (pchunk)
    for sentence in predicted:
        for row in sentence:
            f_out.write(row['form'] + ' ' + row['pos'] + ' ' +