def main(): """main function """ n = 2 # Bigram HMM args = parse_arguments() treebank = TaggedCorpusReader( os.path.split(args.train_f)[0], os.path.split(args.train_f)[1]) observation_space = [item[0] for item in treebank.sents()] # all words state_space = [item[1] for item in treebank.sents()] # all pos tags words = dict.fromkeys(observation_space) tags = dict.fromkeys(state_space) # HMM parameter estimation- initial, transition and emission probablity start = time.time() init_p = [item[1] for item in comp_initial(tags, treebank)] trans_p = comp_transition(n, tags, state_space) emission_p = comp_emission(words, tags, state_space, treebank, smoothing=args.smoothing) end = time.time() print("Runtime (training): %.3f s" % (end - start)) # Test your HMM-trained model treebank = TaggedCorpusReader( os.path.split(args.eval_f)[0], os.path.split(args.eval_f)[1]) viterbi_tags = [] start = time.time() for sentence in treebank.paras(): test_words = [item[0] for item in sentence] O, S, Y, pi, A, B = pre_process(words, tags, test_words, init_p, trans_p, emission_p) # Computes Viterbi's most likely tags if args.log_prob: X = viterbi_log(O, S, Y, pi, A, B) else: X = viterbi(O, S, Y, pi, A, B) viterbi_tags.append(X) end = time.time() print("Runtime (viterbi): %.3f s" % (end - start)) output_path = "./" + "de-tagger.tt" post_processing(viterbi_tags, args.test_f, output_path)
# Python 2 code; requires: from itertools import chain, izip,
# plus TaggedCorpusReader (nltk.corpus.reader) and LineTokenizer (nltk.tokenize).
def take_ngrams_by_topic_from_file(self, ngram_directory, ngram_file):
    corpus = TaggedCorpusReader(ngram_directory, ngram_file,
                                sent_tokenizer=LineTokenizer(blanklines='discard'),
                                encoding='utf-8')
    corpus_paras = corpus.paras()[:]
    # Blank-line-separated blocks are assumed to alternate: a topic label
    # block (even positions), then the block of n-grams for that topic (odd positions).
    k = corpus_paras[::2]
    for i in range(2):        # flatten the paragraph -> sentence -> word nesting
        k = list(chain(*k))
    v = corpus_paras[1::2]
    # Map each topic to the deduplicated list of its n-grams.
    ngrams_by_topic_from_file = \
        {k.encode('utf-8'): list(set(chain(*v)))
         for k, v in dict(izip(k, v)).items()}
    return ngrams_by_topic_from_file
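# A hedged usage sketch for the method above.  The reader configuration
# implies a plain-text file in which blank-line-separated blocks alternate:
# a topic block, then the block of n-gram lines belonging to that topic.
# The class name, file name and output format below are illustrative
# assumptions, not part of the original project.
#
# extractor = NgramTopicReader()   # hypothetical class owning the method
# by_topic = extractor.take_ngrams_by_topic_from_file('data', 'ngrams_by_topic.txt')
# for topic, ngrams in by_topic.items():
#     print(topic, len(ngrams))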
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
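# If the 'corpora/cookbook' data is not installed locally, the reader can be
# tried on a throwaway file.  The directory and file contents below are
# illustrative; by default TaggedCorpusReader expects 'word/TAG' tokens with
# '/' as the separator.
import os, tempfile

tmp_dir = tempfile.mkdtemp()
with open(os.path.join(tmp_dir, 'sample.pos'), 'w') as f:
    f.write("The/AT book/NN is/BEZ good/JJ ./.\n")

sample_reader = TaggedCorpusReader(tmp_dir, r'.*\.pos')
print(sample_reader.tagged_words())   # [('The', 'AT'), ('book', 'NN'), ...]
print(sample_reader.sents())          # [['The', 'book', 'is', 'good', '.']]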
########## TAGGED CORPUS READER ###############
# Python 2 example
from nltk.corpus.reader import TaggedCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file = "brown.pos"
source = root + file

# Using a regex to match all files with extension .pos
reader = TaggedCorpusReader(root, r'.*\.pos')
print reader.words()
print reader.tagged_words()
print reader.sents()
print reader.tagged_sents()
print reader.paras()
print reader.tagged_paras()

# TaggedCorpusReader uses a default word tokenizer, but we can customize it
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', sent_tokenizer=LineTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's paragraph block reader
# Customizing TaggedCorpusReader's tag separator - Pg 57
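# The last two comments above are left unimplemented.  A minimal sketch of
# those customizations, assuming the same root and file pattern as above:
# TaggedCorpusReader accepts para_block_reader and sep keyword arguments.
from nltk.corpus.reader.util import read_line_block

# Treat every line as its own paragraph instead of splitting on blank lines
reader = TaggedCorpusReader(root, r'.*\.pos', para_block_reader=read_line_block)
print reader.paras()

# Use '|' instead of the default '/' as the word/tag separator,
# e.g. for files containing tokens like 'The|AT'
reader = TaggedCorpusReader(root, r'.*\.pos', sep='|')
print reader.tagged_words()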