Exemple #1
0
def NgramModel(sen):
    """Tokenize ``sen``, build its n-grams and print how many were produced.

    The sentence is passed through ``TextTokenizer`` and the resulting tokens
    are handed to ``ngram.ngrams`` with ``n=2`` and ``njump=3``. Only the
    number of n-grams is reported; nothing is returned.

    NOTE(review): ``njump`` presumably enables skip-grams in ``ngram.ngrams``;
    confirm against that module.
    """
    # Reference material on syntactic processing with n-grams:
    # http://blog.pantaw.com/syntatic-proses-n-grams/
    kata = TextTokenizer(sen)
    kata = ngram.ngrams(kata, n=2, njump=3)

    # A single pre-formatted argument prints the same text whether ``print``
    # is a statement (Python 2) or a function (Python 3). The double space
    # keeps the established "label, space, count" output format.
    print("Total sample:  %d" % len(kata))
Exemple #2
0
def constructVocab(vect, totalword, nforgram, separate=True, njump=0):
    """Count n-gram frequencies over a collection of tokenized sentences.

    Args:
        vect: iterable of already-tokenized sentences (lists of tokens).
            When ``separate`` is true each list is modified in place:
            ``'<s>'`` is inserted at the front and ``'</s>'`` at the end,
            unless that marker already occurs somewhere in the list.
        totalword: corpus length to report. ``0`` means "compute it" as the
            sum of the plain (non-jump) n-gram counts.
        nforgram: the ``n`` passed to ``ngram.ngrams``.
        separate: whether to add the sentence boundary markers.
        njump: when ``> 0`` and ``nforgram > 1``, n-grams produced with
            ``njump`` are counted in addition to the plain ones.

    Returns:
        ``(freqDist, total_word)`` where ``freqDist`` maps each n-gram,
        joined with single spaces, to its count.
    """
    # Incoming words are already tokenized; for n-grams the keys look like
    # ['saya sedang', 'sedang makan', ..., 'warung depan'].
    freqDist = dict()
    total_word = totalword

    dataset = []
    jumpdataset = []

    # 2015-11-07: plain and jump n-grams are kept in separate lists so that
    # the jump parameter only changes the sample frequencies and never the
    # total word count, which is computed between the two counting passes.
    def scale(voc):
        for sentence_grams in voc:
            for gram in sentence_grams:
                # Join the tokens so that any n (not only bigrams) yields a
                # plain string key.
                key = ' '.join(gram)
                # C(Wi): record how many times this sample occurs.
                freqDist[key] = freqDist.get(key, 0) + 1

    for z in vect:
        # Everything entering the n-gram language model must go through
        # tokenize -> ngram.
        if separate:
            if '<s>' not in z:
                z.insert(0, '<s>')
            if '</s>' not in z:
                z.append('</s>')
        dataset.append(ngram.ngrams(z, n=nforgram))
        if njump > 0 and nforgram > 1:
            jumpdataset.append(ngram.ngrams(z, n=nforgram, njump=njump))

    scale(dataset)

    if totalword == 0:
        # N = sum of the counts of every sample in the sample space.
        # sum() yields 0 for an empty distribution, where a reduce() without
        # an initial value would raise TypeError.
        total_word = sum(freqDist.values())

    if jumpdataset:
        scale(jumpdataset)

    print("Vocab size for N=%i is:%i, with Total Word(corpus length):%i" %
          (nforgram, len(freqDist), total_word))
    return freqDist, total_word
Exemple #3
0
def NgramModel(sen):
    """Tokenize ``sen``, build its n-grams and print them.

    The sentence is passed through ``TextTokenizer`` and the resulting tokens
    are handed to ``ngram.ngrams`` with ``n=2`` and ``njump=3``. The number of
    n-grams is printed first, then each n-gram on its own line with its
    tokens joined by single spaces, then a blank separator. Nothing is
    returned.

    NOTE(review): ``njump`` presumably enables skip-grams in ``ngram.ngrams``;
    confirm against that module.
    """
    # Reference material on syntactic processing with n-grams:
    # http://blog.pantaw.com/syntatic-proses-n-grams/
    kata = TextTokenizer(sen)
    kata = ngram.ngrams(kata, n=2, njump=3)

    # Each print receives one pre-built string, so the output is identical
    # whether ``print`` is a statement (Python 2) or a function (Python 3).
    # The double space keeps the established "label, space, count" format.
    print("Jumlah sample:  %d" % len(kata))
    for z in kata:
        print(' '.join(z))
    # "\n" plus print's own newline leaves an empty line after the listing.
    print("\n")
Exemple #4
0
def NgramModel(sen):
    """Print the n-grams derived from the sentence ``sen``.

    ``sen`` is tokenized with ``TextTokenizer`` and the tokens are expanded
    by ``ngram.ngrams`` using ``n=2`` and ``njump=3``. Output is the n-gram
    count, one space-joined n-gram per line, and a trailing blank separator.
    The function returns ``None``.

    NOTE(review): the exact meaning of ``njump`` is defined by
    ``ngram.ngrams`` and is not visible here; verify before changing it.
    """
    # Reference material on syntactic processing with n-grams:
    # http://blog.pantaw.com/syntatic-proses-n-grams/
    kata = TextTokenizer(sen)
    kata = ngram.ngrams(kata, n=2, njump=3)

    # One formatted string per print call keeps the emitted text the same
    # under both the Python 2 print statement and the Python 3 print
    # function. The two spaces after the colon match the existing output.
    print("Jumlah sample:  %d" % len(kata))
    for z in kata:
        print(' '.join(z))
    # Emits an extra empty line after the listing.
    print("\n")
Exemple #5
0
def constructVocab(vect, totalword, nforgram, separate=True, njump=0):
    """Build an n-gram frequency table from tokenized sentences.

    Args:
        vect: iterable of already-tokenized sentences (lists of tokens).
            With ``separate`` set, each list is mutated in place: ``'<s>'``
            is inserted at index 0 and ``'</s>'`` is appended, each only if
            that marker is not already present anywhere in the list.
        totalword: precomputed corpus length, or ``0`` to have it derived
            from the plain (non-jump) n-gram counts.
        nforgram: n-gram order forwarded to ``ngram.ngrams``.
        separate: add sentence boundary markers before building n-grams.
        njump: if positive and ``nforgram > 1``, also count the n-grams that
            ``ngram.ngrams`` returns for this ``njump`` value.

    Returns:
        A ``(freqDist, total_word)`` tuple; ``freqDist`` is a dict keyed by
        the space-joined n-gram with its occurrence count as value.
    """
    # Incoming words are already tokenized; for n-grams the keys look like
    # ['saya sedang', 'sedang makan', ..., 'warung depan'].
    freqDist = dict()
    total_word = totalword

    dataset = []
    jumpdataset = []

    # 2015-11-07: the jump n-grams are collected separately so that the jump
    # parameter influences only the sample frequencies, not the total word
    # count (which is taken before the jump n-grams are added).
    def scale(voc):
        for grams in voc:
            for tokens in grams:
                # Space-joined key so that orders other than bigram work too.
                sample = ' '.join(tokens)
                # C(Wi): number of occurrences of this sample.
                freqDist[sample] = freqDist.get(sample, 0) + 1

    for z in vect:
        # Everything entering the n-gram language model must go through
        # tokenize -> ngram.
        if separate:
            if '<s>' not in z:
                z.insert(0, '<s>')
            if '</s>' not in z:
                z.append('</s>')
        dataset.append(ngram.ngrams(z, n=nforgram))
        if njump > 0 and nforgram > 1:
            jumpdataset.append(ngram.ngrams(z, n=nforgram, njump=njump))

    scale(dataset)

    if totalword == 0:
        # N = sum of the counts of all samples in the sample space. sum()
        # returns 0 when nothing was counted, whereas reduce() without an
        # initial value raises TypeError on an empty sequence.
        total_word = sum(freqDist.values())

    if jumpdataset:
        scale(jumpdataset)

    print("Vocab size for N=%i is:%i, with Total Word(corpus length):%i" % (nforgram, len(freqDist), total_word))
    return freqDist, total_word