def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
    """Tag the tokens of ``sent`` with a trigram -> bigram -> unigram chain.

    Args:
        sent: Text to tag. It is UTF-8 encoded, lower-cased and stripped,
            then split with ``self.regexTokenizer``.
        tagregex: When true, a ``RegexpTagger`` built from
            ``patterns.regex_patterns`` sits between the unigram tagger and
            the default tagger; otherwise the unigram tagger backs off
            directly to the default tagger.
        deftag: Tag handed to the last-resort ``DefaultTagger``.
        verbose: When true, evaluate the finished tagger on
            ``self.test_sent``, store the score in ``self.tagAccuracy`` and
            print it.

    Returns:
        Whatever the trigram tagger's ``tag`` returns for the token sequence.
    """
    encoded = sent.encode('utf-8')
    tokens = self.regexTokenizer(encoded.lower().strip())

    # IMPORTANT: in some situations the default tag has to stay 'XX',
    # because it is used to identify entities.
    fallback = DefaultTagger(deftag)
    if tagregex:
        fallback = RegexpTagger(patterns.regex_patterns, backoff=fallback)

    # Stack the n-gram taggers from the least to the most specific; each one
    # is trained on self.reader_train and backs off to the previous tagger.
    tagger = fallback
    for ngram_tagger in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = ngram_tagger(self.reader_train, backoff=tagger)

    # Accuracy reported by the original author for this combination on the
    # PAN Localization Indonesian dataset "UI-1M-tagged.txt":
    #   with the regex tagger   : 77%
    #   without the regex tagger: > 90%

    if verbose:
        # The larger the document, the longer the accuracy computation
        # takes; recommended for testing only.
        print ("Calculating Tagger Accuracy...")
        self.tagAccuracy = tagger.evaluate(self.test_sent)
        print ("Accuracy is: %s" % (self.tagAccuracy))

    return tagger.tag(tokens)
def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
    """Tokenize ``sent`` and tag it with a trigram/bigram/unigram backoff chain.

    The taggers are rebuilt from ``self.reader_train`` on every call.

    Args:
        sent: Text to tag. It is UTF-8 encoded, lower-cased and stripped
            before tokenization.
        tagregex: When true, tokenize with ``self.regexTokenizer`` and place
            a ``RegexpTagger`` built from ``patterns.regex_patterns``
            between the unigram tagger and the default tagger. When false,
            tokenize with ``self.wordTokenizer`` and let the unigram tagger
            back off directly to the default tagger.
        deftag: Tag handed to the last-resort ``DefaultTagger``.
        verbose: When true, evaluate the finished tagger on
            ``self.test_sent``, store the score in ``self.tagAccuracy`` and
            print it as a percentage.

    Returns:
        Whatever the trigram tagger's ``tag`` returns for the token sequence.
    """
    # NOTE(review): on Python 3 this yields bytes, which the tokenizers may
    # not accept -- confirm against regexTokenizer / wordTokenizer.
    kalimat = sent.encode('utf-8')

    # IMPORTANT: in some situations the default tag has to stay 'XX',
    # because it is used to identify entities.
    backoff_tagger = DefaultTagger(deftag)

    if tagregex:
        text = self.regexTokenizer(kalimat.lower().strip())
        regexp_tagger = RegexpTagger(patterns.regex_patterns,
                                     backoff=backoff_tagger)
        unigram_tagger = UnigramTagger(self.reader_train,
                                       backoff=regexp_tagger)
    else:
        text = self.wordTokenizer(kalimat.lower().strip())
        unigram_tagger = UnigramTagger(self.reader_train,
                                       backoff=backoff_tagger)

    bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)

    # Accuracy reported by the original author for this combination on the
    # PAN Localization Indonesian dataset "UI-1M-tagged.txt":
    #   with the regex tagger   : < 77%
    #   without the regex tagger: > 83%

    if verbose:
        # The larger the document, the longer the accuracy computation
        # takes; recommended for testing only.
        # print is called with a single parenthesised argument so the same
        # text is emitted on Python 2 and the code still parses on Python 3.
        print("Calculating Tagger Accuracy...")
        self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
        print("Accuracy is: %4.2f %%" % (100.0 * self.tagAccuracy))

    return trigram_tagger.tag(text)
def train_trigram(data):
    """Train a trigram tagger on ``data``, pickle it, and print a sample tagging.

    The tagger is written to ``PACKDIR/nib_marburg/trigram.pickle``. A fixed
    sample sentence is then tagged and the result printed.

    Args:
        data: Training data passed straight to ``TrigramTagger``.
    """
    tagger = TrigramTagger(data)

    pickle_path = os.path.join(PACKDIR, "nib_marburg", "trigram.pickle")
    with open(pickle_path, "wb") as out:
        pickle.dump(tagger, out)

    sample_tokens = "uns ist in alten mæren wunders vil geseit".split(" ")
    print(tagger.tag(sample_tokens))