Esempio n. 1
0
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        kalimat = sent.encode('utf-8')

        text = self.regexTokenizer(kalimat.lower().strip())

        ## :> --___<<IMPORTANT>>___--
        ##      Untuk beberapa hal default tagger harus dibiarkan 'XX'
        ##      dengan tujuan identifikasi Entitas
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
           regexp_tagger = RegexpTagger(patterns.regex_patterns,backoff=backoff_tagger)
           unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
           unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)
           
        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)
        
        """
        # Menggunakan dataset pan localization bahasa indonesia "UI-1M-tagged.txt"
        # kombinasi proses tagging diatas menghasilkan tingkat akurasi:
        #      dengan regextagger: 77%
        #      tanpa regextagger : > 90%
        """
        if verbose:
           # Semakin besar dokumen, semakin lama proses perhitungan akurasi
           # disarankan hanya untuk testing
           print ("Calculating Tagger Accuracy...")
           self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
           print ("Accuracy is: %s" % (self.tagAccuracy))
        
        return trigram_tagger.tag(text)
Esempio n. 2
0
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        kalimat = sent.encode('utf-8')

        ## :> --___<<IMPORTANT>>___--
        ##      Untuk beberapa hal default tagger harus dibiarkan 'XX'
        ##      dengan tujuan identifikasi Entitas
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
           text = self.regexTokenizer(kalimat.lower().strip())
           regexp_tagger = RegexpTagger(patterns.regex_patterns,backoff=backoff_tagger)
           unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
           text = self.wordTokenizer(kalimat.lower().strip())
           unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)
           
        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)
        
        """
        # Menggunakan dataset pan localization bahasa indonesia "UI-1M-tagged.txt"
        # kombinasi proses tagging diatas menghasilkan tingkat akurasi:
        #      dengan regextagger: < 77%
        #      tanpa regextagger : > 83%
        """
        if verbose:
           # Semakin besar dokumen, semakin lama proses perhitungan akurasi
           # disarankan hanya untuk testing
           print "Calculating Tagger Accuracy..."
           self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
           print "Accuracy is: %4.2f %%" % (100.0 * self.tagAccuracy)
        
        return trigram_tagger.tag(text)
def train_trigram(data):
    trigram_tagger = TrigramTagger(data)
    with open(os.path.join(PACKDIR, "nib_marburg", "trigram.pickle"),
              "wb") as f:
        pickle.dump(trigram_tagger, f)
    res = trigram_tagger.tag(
        "uns ist in alten mæren wunders vil geseit".split(" "))
    print(res)