def TextTokenizer(sen):
    # Material: syntactic processing - text tokenizing
    # http://blog.pantaw.com/syntatic-proses-text-tokenizing/
    # Indonesian stopword list (defined here but not applied in this function)
    stopwords = ['kah', 'lah', 'pun', 'jah', 'jeh', 'mu', 'ku', 'ke', 'di', 'tapi',
                 'saya', 'kamu', 'mereka', 'dia', 'kita', 'adalah', 'dan', 'jika',
                 'kalau', 'sama', 'yang', 'sekarang', 'nanti', 'besok', 'kemarin',
                 'kemaren', 'nya', 'na', 'at', 'apa', 'ini', 'itu', 'juga', 'ketika',
                 'namun', 'sebab', 'oleh', 'malah', 'memang']
    tok = tokenize()
    kata = tok.WordTokenize(sen, removepunct=False)
    if kata:
        print "sentence after tokenizing: ", kata, "\n"
    return kata
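# Hedged usage sketch for TextTokenizer, assuming the package's `tokenize`
# class is importable and that WordTokenize returns a list of word tokens.
# The sentence and the shown output are only illustrative:
#
#   >>> TextTokenizer("saya sedang makan nasi goreng")
#   ['saya', 'sedang', 'makan', 'nasi', 'goreng']
#
# The exact tokens depend on how tokenize().WordTokenize handles punctuation.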
def rawForLangmodel(self, f, punct_remove=False, to_token=True, min_word=2):
    tok = tokenize()
    table = string.maketrans("", "")
    # Split the raw text into sentences using the sentence-boundary regex
    # pattern[0] (the earlier newline split was immediately overwritten by
    # this one, so it is dropped here).
    words = re.split(r'' + pattern[0] + '', f)
    if punct_remove:
        words = [z.translate(table, string.punctuation) for z in words]
    if to_token:
        words = [tok.WordTokenize(z) for z in words]
        words = filter(lambda x: len(x) >= min_word, words)
    else:
        words = filter(lambda x: len(tok.WordTokenize(x)) >= min_word, words)
    return words
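# Standalone sketch of the same preprocessing steps used by rawForLangmodel,
# with str.split standing in for the package's regex and tokenizer (an
# assumption; it is not the real pattern[0] or WordTokenize): strip
# punctuation per sentence, tokenize, then keep only sentences that have at
# least `min_word` tokens.
import string

def _demo_raw_for_langmodel(text, min_word=2):
    table = string.maketrans("", "")
    sentences = text.split("\n")
    sentences = [s.translate(table, string.punctuation) for s in sentences]
    tokenized = [s.split() for s in sentences]
    return [t for t in tokenized if len(t) >= min_word]

# _demo_raw_for_langmodel("saya makan nasi.\nhalo!")
# -> [['saya', 'makan', 'nasi']]   (the one-word sentence is filtered out)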
def rawForVector(self, f, min_word=2):
    """ Word level vector """
    tok = tokenize()
    t0 = time()
    #print "Splitting sentence for vector processing..."
    table = string.maketrans("", "")
    words = re.split(r'' + pattern[0] + '', f)
    words = [z.translate(table, string.punctuation) for z in words]
    words = filter(lambda x: len(tok.WordTokenize(x)) >= min_word, words)
    words = [tok.WordTokenize(z) for z in words]
    #print "total sentence for process: ", len(words)
    #print "total unique words(vocabulary): ", len(self.word_constructor(words))
    #print("Splitting sentence for vector done in %fs" % (time() - t0))
    #print "\n"
    return words
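# Illustrative follow-up: rawForVector returns a list of token lists, one per
# sentence. A vocabulary of unique words, as hinted at by the commented-out
# word_constructor call above, could then be built with a hypothetical helper
# like this (not part of the class):
def _demo_build_vocab(sentences):
    vocab = set()
    for tokens in sentences:
        vocab.update(tokens)
    return vocab

# _demo_build_vocab([['saya', 'makan'], ['saya', 'minum']])
# -> set(['saya', 'makan', 'minum'])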
def train(self, vect, optimizer=None, separate=True, njump=0, verbose=False):
    """
    In the study of probability, given at least two random variables X, Y, ...
    that are defined on a probability space S, the joint probability
    distribution for X, Y, ... is a probability distribution that gives the
    probability that each of X, Y, ... falls in any particular range or
    discrete set of values specified for that variable.

    So, given the sentence S: "saya sedang makan nasi goreng di warung depan",
    p(w1,w2,...,wn) is called the probability distribution (where w1,w2,...,wn
    are the random variables), i.e. the words (w1,w2,...,wn), over the
    sentence S.
    """
    t0 = time()
    print "Begin training language model..."
    if optimizer == 'modkn':
        """NOTE:
        For now the njump parameter cannot yet be used with this
        Modified Kneser-Ney optimizer implementation.
        """
        print "Using optimizer: ", 'Modified Kneser-Ney'
        modkn = ModifiedKneserNey()
        modkn.kneser_ney_discounting(vect)
        modkn.train()
        #print "proba\t\ttoken\t\tbow\t\tcount"
        tmpN = 1
        for k, v in sorted(modkn.mKNeyEstimate.items(), key=lambda x: x[1][3]):
            #print ("%0.7f\t%s\t\t%0.7f\t\t%d" % (exp(v[0]), k, exp(v[1]), v[2]))
            if len(k.split(' ')) != tmpN:
                self.finalmodel[tmpN] = self.vocab
                self.vocab = {}
                tmpN = len(k.split(' '))
            self.vocab[k] = SimpleVocab(count=int(v[2]), estimator=v[0])
        self.finalmodel[tmpN] = self.vocab
        del self.vocab
    else:
        if optimizer == 'sgt':
            print "Using optimizer: ", 'Simple Good-Turing'
        elif optimizer == 'ls':
            print "Using optimizer: ", 'Laplace'
        else:
            print "Using optimizer: ", 'Maximum Likelihood Estimation'
        tok = tokenize()
        for i in range(1, self.nforgram + 1):
            self.nforgram = i
            self.raw_vocab, self.total_word = constructVocab(vect, self.total_word,
                                                             nforgram=self.nforgram,
                                                             separate=separate,
                                                             njump=njump)
            if optimizer == 'sgt':
                sgtN = float(functools.reduce(operator.add, self.raw_vocab.values()))
                sgt = SimpleGoodTuring(self.raw_vocab, sgtN)
                sgtSmoothProb, p0 = sgt.train(self.raw_vocab)
            for k, v in self.raw_vocab.iteritems():
                # Compute:
                # P(Wi) = C(Wi) / N  <= for the unigram, found via MLE
                if i == 1:  # Unigrams do not use history
                    """ WARNING!!!
                    When SGT smoothing is used, MLE is not applied.
                    """
                    if optimizer == 'ls':
                        V = len(self.raw_vocab)  # <= only used with Laplace smoothing
                        self.vocab[k] = SimpleVocab(count=v,
                                                    estimator=self.MLE(v, self.total_word, ls=True, V=V))
                    elif optimizer == 'sgt':
                        self.vocab[k] = SimpleVocab(count=v, estimator=sgtSmoothProb[k])
                    else:
                        self.vocab[k] = SimpleVocab(count=v,
                                                    estimator=self.MLE(v, self.total_word))
                elif i == 2:
                    """ Keep in mind the motivation behind an n-gram LM:
                    begin with the task of computing P(w|h), the probability
                    of a word w given some history h.
                    """
                    # P(Wi, Wj) = C(Wi, Wj) / N  <= for the bigram, found via MLE,
                    # but what we actually need is P(Wj | Wi), the conditional
                    # distribution of how likely the word Wj is to appear given
                    # the preceding word Wi:
                    # P(Wj | Wi) = P(Wi, Wj) / P(Wi) = C(Wi, Wj) / C(Wi)
                    # C(Wi) => the unigram count
                    CWi = self.finalmodel[i - 1][tok.WordTokenize(k)[0]].count
                    if optimizer == 'ls':
                        #functools.reduce(operator.add, self.raw_vocab.values())
                        V = len(self.raw_vocab)  # <= only used with Laplace smoothing
                        self.vocab[k] = SimpleVocab(count=v,
                                                    estimator=self.MLE(v, CWi, ls=True, V=V))
                    elif optimizer == 'sgt':
                        self.vocab[k] = SimpleVocab(count=v, estimator=sgtSmoothProb[k])
                    else:
                        self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v, CWi))
                elif i == 3:
                    # P(Wi, Wj, Wk) = C(Wi, Wj, Wk) / N  <= for the trigram,
                    # but what we actually need is P(Wk | Wi, Wj), the conditional
                    # distribution of how likely the word Wk is to appear given
                    # the preceding words Wi, Wj.
                    CWi = self.finalmodel[i - 1][' '.join(tok.WordTokenize(k)[:-1])].count
                    if optimizer == 'ls':
                        V = len(self.raw_vocab)  # <= only used with Laplace smoothing
                        self.vocab[k] = SimpleVocab(count=v,
                                                    estimator=self.MLE(v, CWi, ls=True, V=V))
                    elif optimizer == 'sgt':
                        self.vocab[k] = SimpleVocab(count=v, estimator=sgtSmoothProb[k])
                    else:
                        self.vocab[k] = SimpleVocab(count=v, estimator=self.MLE(v, CWi))
            self.finalmodel[i] = self.vocab
            self.vocab = {}
        del self.raw_vocab
        self.perplexity(self.finalmodel, verbose=verbose)
    print ("Training language model done in %fs" % (time() - t0))
    if verbose:
        print "token \t count \t proba \n",
        for k, v in self.finalmodel.iteritems():
            print "######################################################################"
            print k, " - Gram", "\n",
            print "######################################################################"
            for ke, va in v.iteritems():
                print ("%s\t %d\t %0.5f" % (ke, va.count, exp(va.estimator)))
    return self.finalmodel
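# Worked numeric sketch of the estimators referenced in the comments above
# (standalone, not using the class): MLE for a bigram is C(Wi, Wj) / C(Wi);
# Laplace ("ls") smoothing adds 1 to the count and V (the vocabulary size) to
# the denominator. Like the estimators stored in the model, this returns a
# log probability.
from math import log

def _demo_mle(count, history_count, ls=False, V=0):
    if ls:
        return log((count + 1.0) / (history_count + V))
    return log(float(count) / history_count)

# With C("saya makan") = 2, C("saya") = 10 and a vocabulary of 1000 words:
# exp(_demo_mle(2, 10))                   -> 0.2
# exp(_demo_mle(2, 10, ls=True, V=1000))  -> ~0.00297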
def wordTokenizer(self, sent, simple=False):
    if not simple:
        tok = tokenize()
        return tok.WordTokenize(sent)
    else:
        # Simple fallback: split on runs of non-word characters and drop
        # whitespace-only pieces (fixes the undefined `text` variable).
        return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()]
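# Quick check of the `simple=True` branch above: re.split with the '(\W+)?'
# group keeps the separators, and the strip/filter then drops whitespace-only
# pieces, so punctuation survives as its own token:
#
#   >>> [x.strip() for x in re.split('(\W+)?', "saya makan, nasi!") if x.strip()]
#   ['saya', 'makan', ',', 'nasi', '!']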
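# Sketch of how the model returned by train() could be queried, under the
# assumption (from the code above) that finalmodel maps the n-gram order to a
# dict of "w1 ... wn" strings -> SimpleVocab(count, estimator), where
# estimator is a log probability.
from math import exp

def _demo_bigram_proba(finalmodel, w1, w2):
    key = w1 + ' ' + w2
    if 2 in finalmodel and key in finalmodel[2]:
        return exp(finalmodel[2][key].estimator)
    return 0.0  # unseen bigram; a real scorer would back off or smooth instead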