Example #1
import random

# `cs` is assumed to be a tagged NLTK corpus reader (e.g. nltk.corpus.brown)
def datasets():
    # Pick a random starting point so the 10% test slice is selected at random
    start = random.random()
    if start > 0.9:
        start = 0.9

    end = start + 0.1
    # Generate the list of tagged words from the corpus
    words = cs.tagged_words(tagset='universal')
    startn = round(len(words) * start)
    endn = round(len(words) * end)  # segment the words into two data sets
    training = words[:startn] + words[endn:]
    testing = words[startn:endn]
    # Return both data sets
    return [training, testing]
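A brief usage sketch (an assumption, not part of the original): since datasets() returns flat lists of (word, tag) pairs, each list is wrapped in an outer list so it looks like a single tagged sentence to an NLTK tagger.

import nltk

training, testing = datasets()
print(len(training), len(testing))

# Train a baseline unigram tagger on the 90% split and evaluate on the 10% split
unigram_tagger = nltk.UnigramTagger([training])
print(unigram_tagger.evaluate([testing]))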
Example #2
the performance and ease of use of the method.
"""
# Chink definitions use their own pattern: }<tag>{
"""
7. Carry out the following evaluation tasks for any of the chunkers you have
developed earlier. (Note that most chunking corpora contain some internal
inconsistencies, such that any reasonable rule-based approach will produce errors.)
a. Evaluate your chunker on 100 sentences from a chunked corpus, and report
the precision, recall and F-measure.
b. Use the chunkscore.missed() and chunkscore.incorrect() methods to identify
the mistakes made by your chunker, and discuss them.
"""
# Studying chunker evaluation and the use of ChunkScore
import nltk
from nltk.corpus import conll2000

cp = nltk.RegexpParser("")  # baseline: an empty grammar finds no chunks
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))

grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
chunkscore = nltk.chunk.ChunkScore()
# ChunkScore.score() compares one gold tree with one guessed tree, so score
# the chunker sentence by sentence on the tagged tokens of each gold tree
for gold in test_sents:
    guess = cp.parse(gold.leaves())
    chunkscore.score(gold, guess)
print(chunkscore.missed()[:5])
print(chunkscore.incorrect()[:5])
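For part (a) of exercise 7, the same ChunkScore object can also report precision, recall and F-measure; restricting the evaluation to 100 sentences is just a slice of test_sents (a small sketch):

# Exercise 7a: score the first 100 gold sentences and report the metrics
chunkscore_100 = nltk.chunk.ChunkScore()
for gold in test_sents[:100]:
    chunkscore_100.score(gold, cp.parse(gold.leaves()))
print("Precision:", chunkscore_100.precision())
print("Recall:", chunkscore_100.recall())
print("F-measure:", chunkscore_100.f_measure())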
"""
10. The bigram chunker scores about 90% accuracy. Study its errors and try to
work out why it does not achieve 100% accuracy. Experiment with trigram
chunking. Are you able to improve the performance?
"""


# Examining a second kind of chunker: a bag-of-words chunker
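Exercise 10 refers to the bigram chunker from the NLTK book; a minimal sketch of it, useful for studying its errors (swap nltk.BigramTagger for nltk.TrigramTagger to try trigram chunking):

class BigramChunker(nltk.ChunkParserI):
    """Chunk by relabelling each POS tag with an IOB chunk tag via a bigram tagger."""
    def __init__(self, train_sents):
        # Train on (POS tag, chunk tag) sequences extracted from the gold trees
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        # Predict a chunk tag for each POS tag, then rebuild a chunk tree
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))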
Example #3
    return tag


# Return the POS of a rule (used for list sorting)
def get_key(rule):
    return rule.split()[1]


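The helper get_tags_linear is used in the main block below but is not part of this excerpt; a minimal sketch of what it presumably does (a linear scan collecting every tag a word occurs with), under that assumption:

def get_tags_linear(word, tagged_words):
    # Assumed helper: scan the tagged corpus linearly and collect every tag
    # the given word occurs with (order preserved, no duplicates)
    tags = []
    for w, tag in tagged_words:
        if w == word and tag not in tags:
            tags.append(tag)
    return tags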
if __name__ == '__main__':
    # Get allowed words
    with open('../../allowed_words.txt', 'r') as allowed_words_file:
        allowed_words = allowed_words_file.read().split('\n')

    # Tagged words from corpora
    treebank_tagged_words = list(set(treebank.tagged_words()))
    conll2000_tagged_words = list(set(conll2000.tagged_words()))
    brown_tagged_words = list(set(brown.tagged_words()))
    nps_tagged_words = list(set(nps_chat.tagged_words()))

    vocab_rules = []
    unvocabbed_words = []

    # Find tags that occur with allowed words in the corpora
    for word in allowed_words:
        curr_tags = get_tags_linear(word, treebank_tagged_words)

        if not curr_tags:
            curr_tags = get_tags_linear(word, conll2000_tagged_words)

        if not curr_tags:
            curr_tags = get_tags_linear(word, brown_tagged_words)
Example #4
# `word_vectors` is assumed to be a gensim KeyedVectors model loaded earlier
result = word_vectors.most_similar(positive=['child'], negative=['person'])
print("Most similar to 'child' but dissimilar to 'person':\n", result[:3])

#%%
from nltk.corpus import conll2000
from gensim.models import Word2Vec
import tensorflow as tf
from keras.layers import Dense, Embedding, Activation, Flatten
from keras import Sequential
from keras.utils import to_categorical
import numpy as np
import collections

#%%
train_words = conll2000.tagged_words("train.txt")
test_words = conll2000.tagged_words("test.txt")
print(train_words[:10])
#%%
def get_tag_vocabulary(tagged_words):
    # Map each distinct POS tag to a unique integer id
    tag2id = {}
    for _, tag in tagged_words:
        tag2id.setdefault(tag, len(tag2id))
    return tag2id

# gensim 3.x API; in gensim 4+ this would be word_vectors.key_to_index
word2id = {k: v.index for k, v in word_vectors.vocab.items()}
tag2id = get_tag_vocabulary(train_words)
#%%
def get_int_data(tagged_words, word2id, tag2id):
    X,Y = [],[]
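    # The original excerpt ends here. A plausible continuation (an assumption,
    # not the author's code): map each word and tag to its integer id,
    # skipping words that are missing from the embedding vocabulary.
    for word, tag in tagged_words:
        if word in word2id:
            X.append(word2id[word])
            Y.append(tag2id[tag])
    return np.array(X), to_categorical(Y, num_classes=len(tag2id))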
Example #5
# tagged corpora
print(brown.words())
print(brown.tagged_words())
print(brown.sents())  # doctest: +ELLIPSIS
print(brown.tagged_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(brown.tagged_paras(
    categories='reviews'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('indian')
print(indian.words())  # doctest: +SKIP
print(indian.tagged_words())  # doctest: +SKIP
# nltk.download('universal_tagset')
print(brown.tagged_sents(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(conll2000.tagged_words(
    tagset='universal'))  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# chunked corpora
print(conll2000.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2000.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('conll2002')
print(conll2002.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
for tree in conll2002.chunked_sents()[:2]:
    print(tree)  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('semcor')
print(semcor.words())
print(semcor.chunks())
print(semcor.sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(semcor.chunk_sents())  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
print(list(map(str, semcor.tagged_chunks(tag='both')[:3])))
print([[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]])
# init colorama
colorama.init()

# global constants
CONST_tagset = 'universal'

# global list of gold corpora
# C:\Users\admin\AppData\Roaming\nltk_data\corpora\
corp_names = [
    "brown", "nps_chat", "conll2000", "treebank", "twitter", "nhtsa_0",
    "nhtsa_1", "nhtsa_2", "nhtsa_3", "nhtsa_4", "nhtsa_5", "nhtsa_6"
]
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [