def get_brill_tagger(self):
    train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
    traindata = list(train_data.tagged_sents())
    postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    templates = [
        brill.Template(brill.Pos([-1])),
        brill.Template(brill.Pos([1])),
        brill.Template(brill.Pos([-2])),
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
    brill_tagger = trainer.train(traindata, max_rules=10)
    return brill_tagger
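A minimal usage sketch for the tagger returned above (the calling object and sentence are hypothetical; assumes the NLTK resources used by the method are installed):

# Hypothetical usage (sketch), not from the original source:
tagger = helper.get_brill_tagger()  # `helper` stands in for an instance of the enclosing class
tokens = "This is a sample sentence".split()
print(tagger.tag(tokens))  # -> list of (word, tag) pairs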
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words = sentn.split()
    ls = len(sents)
    #lw = len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:", lw
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode('utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
def make_pos_model(model_type):
    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')
    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)
    print('Completed training {0} model in {1} seconds to {2}.'.format(model_type, time.time() - now, path))
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    stdout_old = sys.stdout
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()
    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))
    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')
    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = None
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout = stdout_old
def __init__(self, root, fileids='.*', encoding='utf8'):
    """
    Reads all the files in root.
    :param root: Directory.
    :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
    :param encoding: File encoding
    """
    self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via WordListCorpusReader
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via PlaintextCorpusReader
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    reader = TaggedCorpusReader(corpusroot, corpusname)
    self.reader_train = reader.tagged_sents()
    self.test_sent = reader.tagged_sents()[1000:]
def __init__(self,
             sep="/",  # Note that . needs to be escaped
             pattern=chinese_pattern,
             root=None,
             fileids=None):
    """docstring for __init__"""
    TaggedCorpusReader.__init__(
        self, sep=sep, root=root, fileids=fileids,
        sent_tokenizer=RegexpTokenizer(pattern, gaps=True),
        encoding="utf-8")
def load_corpus_reviews(self, begin, end):
    #reader = LazyCorpusLoader()
    reader = TaggedCorpusReader('data/', r'.*\.pos')
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)
    return (pos_sents[begin:end], neg_sents[begin:end])
def list_of_words_for(category, limit=20):
    category_reader = TaggedCorpusReader('corpus', category)
    most_freq_words = []
    for w, t in category_reader.tagged_words():
        if t not in ["PRP", "NC", "$", "$NC"]:
            most_freq_words.append(w.lower())
    pos_counts = collections.Counter(w for w in most_freq_words)
    result = [word for word, count in pos_counts.most_common(limit)]
    return result
def read(self, file_path):
    logger.info('Reading instances from file %s', file_path)
    reader = TaggedCorpusReader(*os.path.split(file_path), sep='\t',
                                word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
                                sent_tokenizer=BlanklineTokenizer(),
                                para_block_reader=lambda s: [s.read()])
    return Dataset([
        self.text_to_instance(*tuple(zip(*tagged_sent)))
        for tagged_sent in reader.tagged_sents()
    ])
def __init__(self, root=None, fileids=None, encoding='utf8'):
    """
    Construct a new MTECorpusReader for a set of documents located at the given root directory.

    Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

    :param root: The root directory for this corpus. (default points to location in multext config file)
    :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
    :param encoding: The encoding of the given files (default is utf8)
    """
    TaggedCorpusReader.__init__(self, root, fileids, encoding)
def __init__(self, file_name, language='', separator='_', ws_delim=True,
             number_of_groups=10, encoding='utf-8'):
    """Initialize the corpus reader."""
    TaggedCorpusReader.__init__(self, root='.', fileids=[file_name],
                                sep=separator, encoding=encoding)
def generate_corpus_from_segmented_reports(self):
    re = ReportEnviroments()
    new_corpus_of_segmented_reports = TaggedCorpusReader(
        re.segmented_reports_corpus_path, '.*',
        sent_tokenizer=LineTokenizer(blanklines='discard'),
        encoding='utf-8')
    raw_segmented_reports = []
    for i in range(len(new_corpus_of_segmented_reports.fileids())):
        raw_segmented_reports.append(
            new_corpus_of_segmented_reports.sents(
                fileids=new_corpus_of_segmented_reports.fileids()[i]))
    cut_of_segmented_reports = []
    topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
    for i in range(len(raw_segmented_reports)):
        cut_of_segmented_reports.append(
            raw_segmented_reports[i][
                raw_segmented_reports[i].index([topics[0].decode('utf-8')]):
                raw_segmented_reports[i].index([topics[-1].decode('utf-8')]) + 1])
    return cut_of_segmented_reports, topics
def take_ngrams_by_topic_from_file(self, ngram_directory, ngram_file):
    corpus = TaggedCorpusReader(ngram_directory, ngram_file,
                                sent_tokenizer=LineTokenizer(blanklines='discard'),
                                encoding='utf-8')
    corpus_paras = corpus.paras()[:]
    k = corpus_paras[::2]
    for i in range(2):
        k = list(chain(*k))
    v = corpus_paras[1::2]
    ngrams_by_topic_from_file = \
        {k.encode('utf-8'): list(set(chain(*v)))
         for k, v in dict(izip(k, v)).items()}
    return ngrams_by_topic_from_file
def read_reviews():
    """ read reviews from the given file(s). """
    from glob import glob
    filenames = glob("input/food*.parsed")
    sent_end_pattern = ".\/[,\.]"
    reader = TaggedCorpusReader(
        root=".",
        fileids=filenames,
        sep="/",
        sent_tokenizer=RegexpTokenizer(sent_end_pattern, gaps=True))
    li = reader.sents()
    return li
class CorpusAnalysis():

    def __init__(self):
        self.punctuation = ['.', ',', ';', '!', '?', '_', '"', '&', "'"]
        self.load()

    def load(self):
        from nltk.corpus.reader import TaggedCorpusReader
        from nltk.tokenize import WordPunctTokenizer
        self.reader = TaggedCorpusReader('../data/', r'.*\.pos')

    def words(self):
        print self.reader.words(['rev_pos.pos'])

    def ngrams(self, words, n=0):
        from nltk.corpus import stopwords
        word_list2 = [w for w in words
                      if not w in stopwords.words('english') and not w in self.punctuation]
        wprev, wprev1, wprev2 = None, None, None
        for i in range(len(word_list2)):
            w = word_list2[i]
            yield (wprev, wprev1, wprev2, w)
            wprev = wprev1
            wprev1 = wprev2
            wprev2 = w

    def freq_dist_words(self):
        from nltk import ConditionalFreqDist
        from nltk.model import NgramModel
        categories = ['rev_neg.pos', 'rev_pos.pos']
        cfd = ConditionalFreqDist((category, word)
                                  for category in categories
                                  for word in c.ngrams(c.reader.words(category)))
        genres = ['rev_neg.pos', 'rev_pos.pos']
        modals = ['location', 'room', 'size', 'staff', 'excellent', 'poor', 'good', 'bad']
        print 'neg :', cfd.__getitem__('rev_neg.pos')
        print 'pos :', cfd.__getitem__('rev_pos.pos')
        #lm = NgramModel(4, self.reader.words(['rev_neg.pos']))

    def freq_dist_tags(self):
        from nltk import ConditionalFreqDist
        from nltk.model import NgramModel
        cfd = ConditionalFreqDist((tag, word)
                                  for (word, tag) in c.reader.tagged_words(self.cat_pos)
                                  if word.isalpha())
        return cfd

    def MI(self):
        pass
def read_sentences_corpus(reader=None):
    #reader = LazyCorpusLoader()  # it's overriding reader
    reader = TaggedCorpusReader('../data/', r'.*\.pos')
    '''
    create a corpus reader with the files in ../data/*.pos
    these files contain tagged sentences and are the basis of the training and test sets.
    '''
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)
    #pos_sents = [[(word.lower(), tag) for word, tag in sent if word not in stopwords.words('english')] for sent in pos_sents]
    #neg_sents = [[(word.lower(), tag) for word, tag in sent if word not in stopwords.words('english')] for sent in neg_sents]
    return (pos_sents, neg_sents)
class CorpusParser:

    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.
        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
        :param encoding: File encoding
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)

    def words(self):
        """
        Returns all the words in the corpora.
        :return: List of words.
        """
        return self._reader.words()

    def tagged_words(self):
        """
        Returns all words of the corpora with their corresponding tag.
        :return: List of tuples (word, tag)
        """
        return self._reader.tagged_words()

    def sentences(self):
        """
        Returns a list of all sentences.
        :return: List of lists of words. Each list represents a sentence, with a list of its words in it.
        """
        return self._reader.sents()

    def tagged_sentences(self):
        """
        Returns a list of all sentences with the tag of each word.
        :return: List of lists of tuples. Each sentence is a list with all its members being tuples (word, tag).
        """
        return self._reader.tagged_sents()
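A minimal usage sketch for the class above (the directory and file pattern are hypothetical; the files are expected to contain word/TAG tokens):

# Hypothetical usage (sketch), not from the original source:
parser = CorpusParser('corpus_dir', r'.*\.pos')  # directory and pattern are illustrative
print(parser.tagged_words()[:10])                # first ten (word, tag) pairs
print(parser.tagged_sentences()[0])              # first tagged sentence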
def __init__(self, corpus_path, corpus_files):
    """
    Construct a Treebank object

    :param corpus_path: path to corpus files
    :param corpus_files: list of filenames for corpus text
    """
    msg("Importing treebank...")

    # get a corpus reader object for our corpus using NLTK
    treebank = TaggedCorpusReader(corpus_path, corpus_files)

    # get all sentences from corpus in a tagged format
    self.tagged_sents = treebank.tagged_sents()

    # get all sentences from corpus in an untagged format
    self.sents = treebank.sents()

    msg("done!\n")
def main():
    """main function """
    n = 2  # Bigram HMM
    args = parse_arguments()
    treebank = TaggedCorpusReader(
        os.path.split(args.train_f)[0], os.path.split(args.train_f)[1])
    observation_space = [item[0] for item in treebank.sents()]  # all words
    state_space = [item[1] for item in treebank.sents()]  # all pos tags
    words = dict.fromkeys(observation_space)
    tags = dict.fromkeys(state_space)

    # HMM parameter estimation: initial, transition and emission probability
    start = time.time()
    init_p = [item[1] for item in comp_initial(tags, treebank)]
    trans_p = comp_transition(n, tags, state_space)
    emission_p = comp_emission(words, tags, state_space, treebank, smoothing=args.smoothing)
    end = time.time()
    print("Runtime (training): %.3f s" % (end - start))

    # Test your HMM-trained model
    treebank = TaggedCorpusReader(
        os.path.split(args.eval_f)[0], os.path.split(args.eval_f)[1])
    viterbi_tags = []
    start = time.time()
    for sentence in treebank.paras():
        test_words = [item[0] for item in sentence]
        O, S, Y, pi, A, B = pre_process(words, tags, test_words, init_p, trans_p, emission_p)
        # Compute Viterbi's most likely tags
        if args.log_prob:
            X = viterbi_log(O, S, Y, pi, A, B)
        else:
            X = viterbi(O, S, Y, pi, A, B)
        viterbi_tags.append(X)
    end = time.time()
    print("Runtime (viterbi): %.3f s" % (end - start))
    output_path = "./" + "de-tagger.tt"
    post_processing(viterbi_tags, args.test_f, output_path)
def make_morpho_model(language, model_type, feature, train_file, test_file=None):
    test_file = train_file if test_file is None else test_file

    reader_train = TaggedCorpusReader('.', train_file)
    reader_test = TaggedCorpusReader('.', test_file)
    train_sents = reader_train.tagged_sents()
    test_sents = reader_test.tagged_sents()

    verify_tagged_corpus(reader_train)
    verify_tagged_corpus(reader_test)

    tagger = train_tagger(language, model_type, feature, train_sents)

    acc = tagger.evaluate(test_sents)
    baseline = compute_baseline(reader_test.tagged_words())
    kappa = (acc - baseline) / (1 - baseline)
    cm = conf_matrix(tagger, reader_test.words(), reader_test.tagged_words())

    return (tagger, acc, kappa, cm)
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via WordListCorpusReader
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via PlaintextCorpusReader
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    #nltk_old = [(3, 0, 1)]
    #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]
    reader = TaggedCorpusReader(corpusroot, corpusname)
    splitratio = 0.8
    self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents()) * splitratio)]
    self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents()) * splitratio):]
    print "split test ratio: ", int(len(reader.tagged_sents()) * splitratio), "\n"
    print "reader_train len: ", len(self.reader_train)
    print "test_sent len: ", len(self.test_sent)
class Classifier:

    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot

    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, '.*\.txt', sep='#')

    def separateSentence(self):
        grammer = r"""
        NP: {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammer)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|.) in this paragraph
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammer = re.compile(',|。')
        return grammer.split(parag)

    def updateFeatures(self, src, dest):
        for key, val in src.items():
            if type(val).__name__ == 'bool' and val:
                dest[key] = val
            elif type(val).__name__ == 'int':
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val

    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non-training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))  # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", file).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features

    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)

    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non-testing data
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, re.match(r"[a-z]+", file).group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
# tagged_sentences = nltk.corpus.brown.tagged_sents()
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('/Users/lucasrosenblatt/nltk_data/corpora/oldenglish',
                            'taggedOEnpnounsDone.pos')
tagged_sentences = reader.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))


def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
    }


import pprint
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')

        # evaluate crf tagger
        crf_accuracy = None
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter > 0: break

    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)

    return final_dict
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = None
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = None
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = None
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate backoff tagger
        backoff_accuracy = None
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = None
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []

    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)

    return final_dict
########## TAGGED CORPUS READER ###############
from nltk.corpus.reader import TaggedCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file = "brown.pos"
source = root + file

# Using a regex to match all files with extension .pos
reader = TaggedCorpusReader(root, r'.*\.pos')
print reader.words()
print reader.tagged_words()
print reader.sents()
print reader.tagged_sents()
print reader.paras()
print reader.tagged_paras()

# TaggedCorpusReader uses a default word tokenizer, but we can customize it
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', sent_tokenizer=LineTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's paragraph block reader
# Customizing TaggedCorpusReader's tag separator - Pg 57
def load(self):
    from nltk.corpus.reader import TaggedCorpusReader
    from nltk.tokenize import WordPunctTokenizer
    self.reader = TaggedCorpusReader('../data/', r'.*\.pos')
def trainPOSTagger(useTnTTagger):
    global __debug_on__
    global pos_tagger
    global adskCorpusRoot

    # Train TNT/Brill POS-tagger using own training data + treebank data from nltk.
    # Tested that using treebank data improves results.
    autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
    train_sents = autodesk.tagged_sents() + treebank.tagged_sents()

    # Use TnT tagger on request
    if useTnTTagger:
        if __debug_on__:
            Service.logger.debug("Using TnT POS tagger...")
        unk_tagger = DefaultTagger('NN')
        pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
        pos_tagger.train(train_sents)
    # Use Brill tagger by default
    else:
        if __debug_on__:
            Service.logger.debug("Using Brill POS tagger...")

        def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
            if not backoff:
                backoff = tagger_classes[0](tagged_sents)
                del tagger_classes[0]
            for cls in tagger_classes:
                tagger = cls(tagged_sents, backoff=backoff)
                backoff = tagger
            return backoff

        word_patterns = [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'.*ould$', 'MD'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*ness$', 'NN'),
            (r'.*ment$', 'NN'),
            (r'.*ful$', 'JJ'),
            (r'.*ious$', 'JJ'),
            (r'.*ble$', 'JJ'),
            (r'.*ic$', 'JJ'),
            (r'.*ive$', 'JJ'),
            (r'.*ic$', 'JJ'),
            (r'.*est$', 'JJ'),
            (r'^a$', 'PREP'),
        ]

        raubt_tagger = backoff_tagger(
            train_sents,
            [nltk.tag.AffixTagger, nltk.tag.UnigramTagger,
             nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
            backoff=nltk.tag.RegexpTagger(word_patterns))

        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
        ]

        trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
        pos_tagger = trainer.train(train_sents, max_rules=200, min_score=3)
# http://stevenloria.com/how-to-build-a-text-classification-system-with-python-and-textblob/
import nltk
from textblob.classifiers import NaiveBayesClassifier
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import sent_tokenize, word_tokenize

reader = TaggedCorpusReader('.', 'idn.tsv')

txt1 = """Presiden meresmikan kereta api super cepat Jakarta Bandung."""
sent_tokenize(txt1)
print word_tokenize(sent_tokenize(txt1)[0])
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [item.rstrip() for item in ten_parts[counter] if len(item) > 0]  # or: test_set = part
        if counter == 1:
            print(len(test_set[993]), len(test_set[994]), len(test_set[995]), len(test_set[996]))

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item.rstrip() for sublist in training_set_lists for item in sublist if len(item) > 0]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
        test_sents = test_reader.tagged_sents()
        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token for token, tag in test_sent]))

        test_text_path = os.path.join(local_dir, 'test_%d.txt' % counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos' % counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
print("Sentence - %s\n" % (sent[x]))
print("Words - %s\n" % (nltk.word_tokenize(sent[x])))

## Reading corpora from text files ##########
## No POS tags, chunks or categories ##########
reader = PlaintextCorpusReader("/Users/atul/nltk_data/corpora/gutenberg", r'^.*\.txt')
files = reader.fileids()
print("File IDs:", files)
print("Number of files:", len(files))
print(reader.words(files[0]))
print(reader.sents(files[0]))

## Reading tagged corpora #####################
reader = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos', tagset='en-brown')
reader1 = TaggedCorpusReader('/Users/atul/nltk_data', r'brown.pos',
                             word_tokenizer=SpaceTokenizer())
print(reader.words())
print(reader.sents())
print(reader.tagged_words())
print(reader.tagged_sents())
# Mapping tags to the universal format; if the tagset is not correct, every TAG will be UNK
print(reader.tagged_words(tagset='universal'))

## Reading chunked corpora #######
reader = ChunkedCorpusReader('/Users/atul/nltk_data',
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(reader.tagged_words())
print(reader.tagged_words(tagset='universal'))
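The readers above expect plain-text files of word/TAG tokens, one sentence per line by default. A minimal, self-contained sketch of writing and reading such a file (the file name and tags are illustrative):

# Hypothetical sketch: write a tiny word/TAG corpus and read it back with TaggedCorpusReader.
from nltk.corpus.reader import TaggedCorpusReader

with open('mini.pos', 'w') as f:
    f.write("The/AT cat/NN sat/VBD ./.\n")
    f.write("Dogs/NNS bark/VB ./.\n")

mini_reader = TaggedCorpusReader('.', 'mini.pos')
print(mini_reader.tagged_words())  # [('The', 'AT'), ('cat', 'NN'), ...]
print(mini_reader.tagged_sents())  # one list of (word, tag) pairs per line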
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer

global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names', r'.*\.txt', word_tokenizer=PunktWordTokenizer(), sep="_")  # path on Linux
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names', r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(), sep="_")
name_tags = corpus.tagged_sents()  # Receives the sentences marked with POS tags.
tagger = UnigramTagger(name_tags)  # The UnigramTagger is trained with these tagged sentences passed to it.


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''),
                                     (r'#', 'hash'),
                                     (r'no', 'no_'),
                                     (r'not', 'not_'),
                                     (r'RT ', ''),
                                     (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'),
                                     (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'),
                                     (r'a[a]+', 'aqa'),
def initClassifier(self):
    self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep='#')
    self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, '.*\.txt', sep='#')
from nltk.tag import brill
from nltk.tag import brill_trainer
import pickle
# imports needed by the snippet below
import random
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lowercase words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
## print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
def setUp(self):
    reader = TaggedCorpusReader('./corpora/oe', 'oe_train.pos')
    os.system('mkdir -p taggers/oe/pos')
    self.sents = reader.tagged_sents()
from nltk.tag import brill_trainer
from nltk.tbl import Template
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer(),
                          encoding='ISO-8859-9')

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lowercase words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
# print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
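The fragments above stop after shuffling the data. A minimal sketch of how training might continue with brill_trainer; the baseline tagger and template set here are assumptions, not taken from the original source:

# Hypothetical continuation (sketch): split the shuffled data and train a Brill tagger.
from nltk.tag import UnigramTagger, DefaultTagger, brill, brill_trainer

cutoff = int(development_size * train)
training_data = tagged_data_list[:cutoff]
evaluation_data = tagged_data_list[cutoff:development_size]

baseline = UnigramTagger(training_data, backoff=DefaultTagger('NN'))  # assumed baseline tagger
trainer = brill_trainer.BrillTaggerTrainer(baseline, brill.fntbl37(), trace=2)
brill_tagger = trainer.train(training_data, max_rules=max_rules, min_score=min_score)
print(brill_tagger.evaluate(evaluation_data))  # accuracy on the held-out slice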
import nltk
from nltk.tag import RegexpTagger
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('corpus', 'tagged_corpus')
train = reader.tagged_sents()

tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)

patterns = [
    (r'^\d+((.|,)\d+)?\.?$', 'NC'),
    (r'^.*\$$', '$'),
    (r'R\$\d+((.|,)\d+)?\.?$', 'NC$'),
    (r'^(R|r)eais$', '$'),
    (r'^(D|d)(o|ó)lares', '$'),
]
tagger3 = RegexpTagger(patterns, backoff=tagger2)


def tag(sent):
    result = tagger3.tag(sent.split())
    return result
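A hypothetical call to the tag() helper above (the input sentence is illustrative):

# Hypothetical usage (sketch): the regexp patterns catch numbers and currency words,
# while other tokens fall back through the bigram/unigram/default chain.
print(tag("O livro custa 20 reais"))  # -> (word, tag) pairs, e.g. '20' tagged 'NC'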
if useTnTTagger:
    storedModel = "/var/log/Terminology/pos_model_tnt.bin"
else:
    storedModel = "/var/log/Terminology/pos_model_brill.bin"

if os.path.isfile(storedModel):
    Service.logger.debug("Loading stored POS tagger model from %s" % storedModel)
    modelFile = open(storedModel, "rb")
    try:
        pos_tagger = cPickle.load(modelFile)
    except Exception, e:
        Service.logger.debug("Exception while loading pickled POS model!")
        Service.logger.debug(Service.traceback.format_exc())
    modelFile.close()
else:
    autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
    train_sents = autodesk.tagged_sents() + treebank.tagged_sents()

    # Use TnT tagger on request
    if useTnTTagger:
        if __debug_on__:
            Service.logger.debug("Using TnT POS tagger...")
        unk_tagger = DefaultTagger('NN')
        pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
        pos_tagger.train(train_sents)
    # Use Brill tagger by default
    else:
        if __debug_on__:
            Service.logger.debug("Using Brill POS tagger...")