Example #1
import os
import time

from nltk.corpus.reader import TaggedCorpusReader


def main():
    """Train a bigram HMM POS tagger and evaluate it with Viterbi decoding."""
    n = 2  # bigram HMM
    args = parse_arguments()
    treebank = TaggedCorpusReader(
        os.path.split(args.train_f)[0],
        os.path.split(args.train_f)[1])
    observation_space = [item[0] for item in treebank.tagged_words()]  # all words
    state_space = [item[1] for item in treebank.tagged_words()]  # all POS tags

    words = dict.fromkeys(observation_space)
    tags = dict.fromkeys(state_space)

    # HMM parameter estimation: initial, transition and emission probabilities
    start = time.time()
    init_p = [item[1] for item in comp_initial(tags, treebank)]
    trans_p = comp_transition(n, tags, state_space)
    emission_p = comp_emission(words,
                               tags,
                               state_space,
                               treebank,
                               smoothing=args.smoothing)
    end = time.time()
    print("Runtime (training): %.3f s" % (end - start))

    # Evaluate the trained HMM
    treebank = TaggedCorpusReader(
        os.path.split(args.eval_f)[0],
        os.path.split(args.eval_f)[1])
    viterbi_tags = []

    start = time.time()
    for sentence in treebank.tagged_sents():
        test_words = [item[0] for item in sentence]  # strip the gold tags
        O, S, Y, pi, A, B = pre_process(words, tags, test_words, init_p,
                                        trans_p, emission_p)
        # Compute the most likely tag sequence with Viterbi
        if args.log_prob:
            X = viterbi_log(O, S, Y, pi, A, B)
        else:
            X = viterbi(O, S, Y, pi, A, B)
        viterbi_tags.append(X)
    end = time.time()

    print("Runtime (viterbi): %.3f s" % (end - start))
    output_path = "./de-tagger.tt"
    post_processing(viterbi_tags, args.test_f, output_path)
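
The helpers called above (parse_arguments, comp_initial, comp_transition,
comp_emission, pre_process, viterbi, viterbi_log, post_processing) are defined
elsewhere in the project. As a rough sketch of what viterbi(O, S, Y, pi, A, B)
plausibly computes, here is a minimal NumPy implementation of the standard
algorithm; the argument roles (S: state labels, Y: observation index sequence,
pi: initial, A: transition, B: emission probabilities) are inferred from the
call above, not taken from the project.

import numpy as np

# Sketch only: assumes Y is a sequence of observation indices into B's columns.
def viterbi(O, S, Y, pi, A, B):
    T, K = len(Y), len(S)
    delta = np.zeros((T, K))           # best score of a path ending in state k at time t
    psi = np.zeros((T, K), dtype=int)  # argmax backpointers
    delta[0] = pi * B[:, Y[0]]
    for t in range(1, T):
        # scores[i, j] = delta[t-1, i] * A[i, j] * B[j, Y[t]]
        scores = delta[t - 1][:, None] * A * B[:, Y[t]]
        psi[t] = scores.argmax(axis=0)
        delta[t] = scores.max(axis=0)
    path = np.zeros(T, dtype=int)      # backtrace from the best final state
    path[-1] = delta[-1].argmax()
    for t in range(T - 2, -1, -1):
        path[t] = psi[t + 1, path[t + 1]]
    return [S[i] for i in path]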
Example #2

from itertools import chain
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import LineTokenizer


# Excerpted from a class: reads a file whose paragraphs alternate between a
# topic label and the ngrams belonging to that topic.
def take_ngrams_by_topic_from_file(self, ngram_directory, ngram_file):
    corpus = TaggedCorpusReader(ngram_directory,
                                ngram_file,
                                sent_tokenizer=LineTokenizer(blanklines='discard'),
                                encoding='utf-8')
    corpus_paras = corpus.paras()[:]
    topics = corpus_paras[::2]
    for i in range(2):  # flatten paragraphs -> sentences -> tokens
        topics = list(chain(*topics))
    ngram_paras = corpus_paras[1::2]
    ngrams_by_topic_from_file = \
        {topic.encode('utf-8'): list(set(chain(*ngrams)))
         for topic, ngrams in zip(topics, ngram_paras)}
    return ngrams_by_topic_from_file
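
The file layout this method assumes is inferred from the alternating
[::2]/[1::2] slicing, so treat the following as an assumption: each topic
label sits in its own paragraph, followed by a paragraph listing its ngrams.

# Hypothetical contents of ngram_file (blank lines separate paragraphs):
#
#   sports
#
#   world cup
#   home run
#
#   politics
#
#   exit poll
#
# The method would then return (keys are UTF-8-encoded, value order is not
# guaranteed because of the intermediate set()):
#   {'sports': ['world', 'cup', 'home', 'run'], 'politics': ['exit', 'poll']}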
Example #4
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
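
The snippets above presuppose a corpora/cookbook directory containing .pos
files. TaggedCorpusReader's default on-disk format is word/TAG tokens separated
by whitespace, one sentence per line, and paragraphs separated by blank lines;
a self-contained sketch (the file name and tags are illustrative):

import os
import tempfile
from nltk.corpus.reader import TaggedCorpusReader

tmp = tempfile.mkdtemp()
with open(os.path.join(tmp, 'sample.pos'), 'w') as f:
    f.write('The/AT quick/JJ fox/NN ./.\n'   # sentence 1
            'It/PPS runs/VBZ ./.\n')         # sentence 2

sample = TaggedCorpusReader(tmp, r'.*\.pos')
print(sample.tagged_words())  # [('The', 'AT'), ('quick', 'JJ'), ...]
print(sample.sents())         # [['The', 'quick', 'fox', '.'], ['It', 'runs', '.']]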
########## TAGGED CORPUS READER ###############

from nltk.corpus.reader import TaggedCorpusReader
root="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file="brown.pos"
source=root+file

# Use a regex to match all files with the .pos extension
reader = TaggedCorpusReader(root, r'.*\.pos')

print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())


# TaggedCorpusReader uses a default word tokenizer, but we can supply our own
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.words())

# Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', sent_tokenizer=LineTokenizer())
print(reader.words())

# Customizing TaggedCorpusReader's paragraph block reader
# Customizing TaggedCorpusReader's tag separator - Pg 57
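
The two notes above stop short of code. Both customizations go through
constructor parameters that TaggedCorpusReader accepts (para_block_reader and
sep); the '|'-separated corpus file in the second snippet is hypothetical.

# Paragraph block reader: treat each line as its own paragraph
from nltk.corpus.reader import TaggedCorpusReader
from nltk.corpus.reader.util import read_line_block
reader = TaggedCorpusReader(root, r'.*\.pos', para_block_reader=read_line_block)
print(reader.paras())

# Tag separator: read word|TAG tokens instead of the default word/TAG
# (assumes a corpus file that actually uses '|' as the separator)
reader = TaggedCorpusReader(root, r'.*\.pos', sep='|')
print(reader.tagged_words())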