def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    stdout_old = sys.stdout
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()
    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()
    print('Loop #' + str(counter))
    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')
    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
def main():
    """Main function."""
    n = 2  # Bigram HMM
    args = parse_arguments()

    treebank = TaggedCorpusReader(
        os.path.split(args.train_f)[0], os.path.split(args.train_f)[1])
    observation_space = [item[0] for item in treebank.sents()]  # all words
    state_space = [item[1] for item in treebank.sents()]  # all pos tags
    words = dict.fromkeys(observation_space)
    tags = dict.fromkeys(state_space)

    # HMM parameter estimation: initial, transition and emission probability
    start = time.time()
    init_p = [item[1] for item in comp_initial(tags, treebank)]
    trans_p = comp_transition(n, tags, state_space)
    emission_p = comp_emission(words, tags, state_space, treebank,
                               smoothing=args.smoothing)
    end = time.time()
    print("Runtime (training): %.3f s" % (end - start))

    # Test your HMM-trained model
    treebank = TaggedCorpusReader(
        os.path.split(args.eval_f)[0], os.path.split(args.eval_f)[1])
    viterbi_tags = []
    start = time.time()
    for sentence in treebank.paras():
        test_words = [item[0] for item in sentence]
        O, S, Y, pi, A, B = pre_process(words, tags, test_words,
                                        init_p, trans_p, emission_p)
        # Compute Viterbi's most likely tags
        if args.log_prob:
            X = viterbi_log(O, S, Y, pi, A, B)
        else:
            X = viterbi(O, S, Y, pi, A, B)
        viterbi_tags.append(X)
    end = time.time()
    print("Runtime (viterbi): %.3f s" % (end - start))

    output_path = "./" + "de-tagger.tt"
    post_processing(viterbi_tags, args.test_f, output_path)
def make_pos_model(model_type):
    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')
        return  # bail out instead of falling through with an undefined tagger/file
    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)
    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type, time.time() - now, path))
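# Hypothetical usage sketch (not part of the original source): it assumes
# 'greek_training_set.pos' is in the current working directory and that
# ~/greek_models_cltk/taggers/pos already exists.
if __name__ == '__main__':
    for model in ('unigram', 'bigram', 'trigram', 'backoff', 'tnt'):
        make_pos_model(model)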
def get_brill_tagger(self):
    train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
    traindata = list(train_data.tagged_sents())
    postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    templates = [
        brill.Template(brill.Pos([-1])),
        brill.Template(brill.Pos([1])),
        brill.Template(brill.Pos([-2])),
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
    brill_tagger = trainer.train(traindata, max_rules=10)
    return brill_tagger
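# Hypothetical usage sketch (not part of the original source): `obj` stands for
# an instance of whatever class defines get_brill_tagger above, and the sample
# sentence is only an illustrative placeholder.
import nltk

def tag_with_brill(obj, sentence):
    brill_tagger = obj.get_brill_tagger()                    # train the Brill tagger
    return brill_tagger.tag(nltk.word_tokenize(sentence))    # POS-tag the tokenized sentence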
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words = sentn.split()
    ls = len(sents)
    #lw = len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:", lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test

    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode('utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
def __init__(self, root, fileids='.*', encoding='utf8'):
    """
    Reads all the files in root.

    :param root: Directory.
    :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
    :param encoding: File encoding.
    """
    self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via the WordListCorpusReader method
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via the PlaintextCorpusReader method
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    reader = TaggedCorpusReader(corpusroot, corpusname)
    self.reader_train = reader.tagged_sents()
    self.test_sent = reader.tagged_sents()[1000:]
def read(self, file_path):
    logger.info('Reading instances from file %s', file_path)
    reader = TaggedCorpusReader(
        *os.path.split(file_path),
        sep='\t',
        word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
        sent_tokenizer=BlanklineTokenizer(),
        para_block_reader=lambda s: [s.read()])
    return Dataset([
        self.text_to_instance(*tuple(zip(*tagged_sent)))
        for tagged_sent in reader.tagged_sents()
    ])
def take_ngrams_by_topic_from_file(self, ngram_directory, ngram_file):
    corpus = TaggedCorpusReader(ngram_directory, ngram_file,
                                sent_tokenizer=LineTokenizer(blanklines='discard'),
                                encoding='utf-8')
    corpus_paras = corpus.paras()[:]
    k = corpus_paras[::2]
    for i in range(2):
        k = list(chain(*k))
    v = corpus_paras[1::2]
    ngrams_by_topic_from_file = \
        {k.encode('utf-8'): list(set(chain(*v)))
         for k, v in dict(izip(k, v)).items()}
    return ngrams_by_topic_from_file
def make_morpho_model(language, model_type, feature, train_file, test_file=None):
    test_file = train_file if test_file is None else test_file

    reader_train = TaggedCorpusReader('.', train_file)
    reader_test = TaggedCorpusReader('.', test_file)
    train_sents = reader_train.tagged_sents()
    test_sents = reader_test.tagged_sents()

    verify_tagged_corpus(reader_train)
    verify_tagged_corpus(reader_test)

    tagger = train_tagger(language, model_type, feature, train_sents)
    acc = tagger.evaluate(test_sents)
    baseline = compute_baseline(reader_test.tagged_words())
    kappa = (acc - baseline) / (1 - baseline)
    cm = conf_matrix(tagger, reader_test.words(), reader_test.tagged_words())

    return (tagger, acc, kappa, cm)
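# Hypothetical usage sketch (not part of the original source): the 'latin' and
# 'unigram' arguments and the file names are placeholders, and train_tagger,
# verify_tagged_corpus, compute_baseline and conf_matrix are assumed to be
# defined elsewhere in the same module.
tagger, acc, kappa, cm = make_morpho_model('latin', 'unigram', None,
                                           'train.pos', test_file='test.pos')
print('accuracy: %.3f, kappa: %.3f' % (acc, kappa))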
def read_reviews():
    """Read reviews from the given file(s)."""
    from glob import glob
    filenames = glob("input/food*.parsed")
    sent_end_pattern = r".\/[,\.]"
    reader = TaggedCorpusReader(root=".", fileids=filenames, sep="/",
                                sent_tokenizer=RegexpTokenizer(sent_end_pattern, gaps=True))
    li = reader.sents()
    return li
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via the WordListCorpusReader method
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via the PlaintextCorpusReader method
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    #nltk_old = [(3, 0, 1)]
    #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]
    reader = TaggedCorpusReader(corpusroot, corpusname)
    splitratio = 0.8
    self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents()) * splitratio)]
    self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents()) * splitratio):]
    print "split test ratio: ", int(len(reader.tagged_sents()) * splitratio), "\n"
    print "reader_train len: ", len(self.reader_train)
    print "test_sent len: ", len(self.test_sent)
def __init__(self, corpus_path, corpus_files):
    """
    Construct a Treebank object

    :param corpus_path: path to corpus files
    :param corpus_files: list of filenames for corpus text
    """
    msg("Importing treebank...")

    # get a corpus reader object for our corpus using NLTK
    treebank = TaggedCorpusReader(corpus_path, corpus_files)

    # get all sentences from corpus in a tagged format
    self.tagged_sents = treebank.tagged_sents()

    # get all sentences from corpus in an untagged format
    self.sents = treebank.sents()

    msg("done!\n")
def generate_corpus_from_segmented_reports(self):
    re = ReportEnviroments()
    new_corpus_of_segmented_reports = TaggedCorpusReader(
        re.segmented_reports_corpus_path, '.*',
        sent_tokenizer=LineTokenizer(blanklines='discard'),
        encoding='utf-8')
    raw_segmented_reports = []
    for i in range(len(new_corpus_of_segmented_reports.fileids())):
        raw_segmented_reports.append(
            new_corpus_of_segmented_reports.sents(
                fileids=new_corpus_of_segmented_reports.fileids()[i]))
    cut_of_segmented_reports = []
    topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
    for i in range(len(raw_segmented_reports)):
        cut_of_segmented_reports.append(
            raw_segmented_reports[i]
            [raw_segmented_reports[i].index([topics[0].decode('utf-8')]):
             raw_segmented_reports[i].index([topics[-1].decode('utf-8')]) + 1])
    return cut_of_segmented_reports, topics
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')

        # evaluate crf tagger
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter > 0: break

    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)

    return final_dict
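# Hypothetical usage sketch (not part of the original source): the training
# file name and output directory are placeholders; the function expects a
# blank-line-separated .pos file and writes its train/test folds under the
# given directory before returning mean/sd CRF accuracy.
scores = cltk_pos_cv('greek_training_set.pos', '~/cltk_data/user_data')
print(scores)  # e.g. {'crf': {'mean': ..., 'sd': ...}}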
def setUp(self):
    reader = TaggedCorpusReader('./corpora/oe', 'oe_train.pos')
    os.system('mkdir -p taggers/oe/pos')
    self.sents = reader.tagged_sents()
from nltk import tag
from nltk.tag import brill
from nltk.tag import brill_trainer
import pickle
import random
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
## print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [item.rstrip() for item in ten_parts[counter] if len(item) > 0]  # or: test_set = part
        if counter == 1:
            print(len(test_set[993]), len(test_set[994]),
                  len(test_set[995]), len(test_set[996]))

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item.rstrip() for sublist in training_set_lists
                        for item in sublist if len(item) > 0]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
        test_sents = test_reader.tagged_sents()

        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token for token, tag in test_sent]))

        test_text_path = os.path.join(local_dir, 'test_%d.txt' % counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos' % counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
# http://stevenloria.com/how-to-build-a-text-classification-system-with-python-and-textblob/
import nltk
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize

reader = TaggedCorpusReader('.', 'idn.tsv')

txt1 = """Presiden meresmikan kereta api super cepat Jakarta Bandung."""
sent_tokenize(txt1)
print word_tokenize(sent_tokenize(txt1)[0])
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer

global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names', r'.*\.txt', word_tokenizer=PunktWordTokenizer(), sep="_")  # PATH on Linux
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names',
    r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(),
    sep="_")
name_tags = corpus.tagged_sents()  # Receives the sentences marked with POS tags.
tagger = UnigramTagger(name_tags)  # The UnigramTagger is trained with the tagged sentences passed to it.


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'), (r'no', 'no_'),
                                     (r'not', 'not_'), (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
import sys

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import LineTokenizer

filename = sys.argv[1]
without_extension = filename.split('.')
file_address = filename.split('/')
directory = file_address[:-1]
directory_address = '/'.join('{}'.format(x) for x in directory) + '/'

corpus_reader = TaggedCorpusReader(directory_address, [filename],
                                   sent_tokenizer=LineTokenizer(), sep='|')
corpus = corpus_reader.tagged_sents()

new_tags_only = open(without_extension[0] + '_tag_sets.' + without_extension[1], 'a+')

count = 1
for each in corpus:
    new_tags_only.write(' '.join('{}'.format(x[1]) for x in each))
    new_tags_only.write('\n')
    print(count)
    count += 1

print(without_extension[1] + "Tag extracting finished")
new_tags_only.close()
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate backoff tagger
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []

    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)

    return final_dict
import random

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import brill_trainer
from nltk.tbl import Template
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer(),
                          encoding='ISO-8859-9')

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
# print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
print("Sentence - %s\n" % (sent[x]))
print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ##########
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg", r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos',
                             word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
print(reader.tagged_words(tagset='universal'))  # Mapping tags to universal format; if the tagset is not correct, every TAG will be UNK

## Reading chunk corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data',
# tagged_sentences = nltk.corpus.brown.tagged_sents()
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('/Users/lucasrosenblatt/nltk_data/corpora/oldenglish',
                            'taggedOEnpnounsDone.pos')
tagged_sentences = reader.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))


def features(sentence, index):
    """sentence: [w1, w2, ...], index: the index of the word"""
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }


import pprint
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
def initClassifier(self):
    self.__corpus__ = TaggedCorpusReader(self.__root__, r'.*\.txt', sep='#')
    self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, r'.*\.txt', sep='#')
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())
print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
""" from nltk.corpus.reader import TaggedCorpusReader from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger from nltk.probability import FreqDist from numpy import mean # for kfold validation, not working though # cross-fold validation is just brute forced... #from sklearn.model_selection import KFold #import numpy as np mypath = "C:/Users/Lauren Shin/Documents/LING 111/.final project" EstonianCorpus = TaggedCorpusReader(mypath, "estonianCaps.txt", encoding = "latin-1") sentences = EstonianCorpus.tagged_sents() tags = [tag for _, tag in EstonianCorpus.tagged_words()] mostFrequent = FreqDist(tags).max() default = DefaultTagger(mostFrequent) # cross validation #kf = KFold(n_splits = 3) # ## turns the data into a 2d array #X = np.array(sentences) ## creates a 1d array with same length/number of rows as X
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(reader.tagged_words())
print(reader.tagged_words(tagset='universal'))
# # Brill Tagger

# In[11]:

from nltk.wsd import lesk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import tkinter
from nltk.tag import brill, brill_trainer
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.data import load
from nltk.corpus.reader import TaggedCorpusReader

train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
traindata = list(train_data.tagged_sents())
postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')

templates = [
    brill.Template(brill.Pos([-1])),
    brill.Template(brill.Pos([1])),
    brill.Template(brill.Pos([-2])),
    brill.Template(brill.Pos([2])),
    brill.Template(brill.Pos([-2, -1])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([-3, -2, -1])),
    brill.Template(brill.Pos([1, 2, 3])),
    brill.Template(brill.Pos([-1]), brill.Pos([1])),
    brill.Template(brill.Word([-1])),
    brill.Template(brill.Word([1])),