class TamilVUNgram:
    """Unigram, bigram and trigram letter models built from one word-list file.

    Constructing an instance builds all three models from
    ``tamilvu_dictionary_words.txt``; ``save`` writes them out as text files
    using relative paths (i.e. into the current working directory).
    """

    def __init__(self):
        """Build the unigram frequency model, then the bigram and trigram models."""
        source = u'tamilvu_dictionary_words.txt'
        self.filename = source

        self.unigram = Unigram(source)
        self.unigram.frequency_model()
        print(u"--- completed Unigram model ---")

        self.bigram = Bigram(source)
        self.bigram.language_model(verbose=False)

        self.trigram = Trigram(source)
        self.trigram.language_model(verbose=False)
        print(u"--- completed Bigram,Trigram model ---")

    def save(self):
        """Write non-zero unigram/bigram counts, largest first, and save the trigram.

        ``tvu_bigram.txt`` and ``tvu_unigram.txt`` receive one
        ``<key> - <count>`` line per entry; ``tvu_trigram.txt`` is produced by
        the trigram model's own ``save`` method.
        """
        by_count = operator.itemgetter(1)

        with codecs.open("tvu_bigram.txt", "w", "utf-8") as out:
            # Flatten letter2[first][second] into a single table keyed by the
            # concatenated pair, leaving out pairs whose count is zero.
            pair_counts = {
                first + second: count
                for first, followers in self.bigram.letter2.items()
                for second, count in followers.items()
                if count != 0
            }
            ranked_pairs = sorted(pair_counts.items(), key=by_count, reverse=True)
            for pair, count in ranked_pairs:
                out.write(u"%s - %d\n" % (pair, count))

        with codecs.open("tvu_unigram.txt", "w", "utf-8") as out:
            ranked_letters = sorted(
                self.unigram.letter.items(), key=by_count, reverse=True
            )
            for letter, count in ranked_letters:
                if count != 0:
                    out.write(u"%s - %d\n" % (letter, count))

        self.trigram.save(u'tvu_trigram.txt')
        print(u"SAVED tvu_unigram.txt, tvu_bigram.txt")
def get_stats():
    """Dump the letter frequencies of ``out-tamil-words.txt`` in two forms.

    Runs the frequency model of a ``Unigram`` over the word list,
    pretty-prints its ``letter`` table to ``ta_data_freq.txt`` (UTF-8), then
    hands the same table to ``proc_stats`` together with the path
    ``ta_data_freq2.txt``.
    """
    model = Unigram("out-tamil-words.txt")
    model.frequency_model()

    with codecs.open("ta_data_freq.txt", "w", "utf-8") as out:
        # Same output as pprint.pprint(..., stream=out), spelled via the class.
        pprint.PrettyPrinter(stream=out).pprint(model.letter)

    proc_stats(model.letter, "ta_data_freq2.txt")
def get_stats():
    """Write letter-frequency statistics for ``out-tamil-words.txt``.

    A ``Unigram`` is built over the word list and its frequency model is run.
    The resulting ``letter`` table is pretty-printed to ``ta_data_freq.txt``
    (UTF-8) and then passed to ``proc_stats`` along with the output path
    ``ta_data_freq2.txt``.
    """
    unigram = Unigram("out-tamil-words.txt")
    unigram.frequency_model()

    with codecs.open("ta_data_freq.txt", "w", "utf-8") as stream:
        # Equivalent to pprint.pprint(unigram.letter, stream=stream).
        printer = pprint.PrettyPrinter(stream=stream)
        printer.pprint(unigram.letter)

    proc_stats(unigram.letter, u"ta_data_freq2.txt")
def __init__(self):
    """Build unigram, bigram and trigram models from the fixed word-list file.

    The unigram gets its frequency model; the bigram and trigram each get a
    language model built with ``verbose=False``. Progress is reported on
    stdout after the unigram and again after the trigram.
    """
    source = u'tamilvu_dictionary_words.txt'
    self.filename = source

    # Stage 1: unigram frequencies.
    self.unigram = Unigram(source)
    self.unigram.frequency_model()
    print(u"--- completed Unigram model ---")

    # Stage 2: bigram, then trigram, language models.
    self.bigram = Bigram(source)
    self.bigram.language_model(verbose=False)

    self.trigram = Trigram(source)
    self.trigram.language_model(verbose=False)
    print(u"--- completed Bigram,Trigram model ---")
class TamilVUNgram:
    """Unigram, bigram and trigram letter models built from one word-list file.

    Constructing an instance builds all three models from
    ``tamilvu_dictionary_words.txt``; ``save`` writes them out as text files
    using relative paths (i.e. into the current working directory).
    """

    def __init__(self):
        """Build the unigram frequency model, then the bigram and trigram models."""
        self.filename = "tamilvu_dictionary_words.txt"

        self.unigram = Unigram(self.filename)
        self.unigram.frequency_model()
        print("--- completed Unigram model ---")

        self.bigram = Bigram(self.filename)
        self.bigram.language_model(verbose=False)

        self.trigram = Trigram(self.filename)
        self.trigram.language_model(verbose=False)
        print("--- completed Bigram,Trigram model ---")

    def save(self):
        """Write non-zero unigram/bigram counts, largest first, and save the trigram.

        ``tvu_bigram.txt`` and ``tvu_unigram.txt`` receive one
        ``<key> - <count>`` line per entry; ``tvu_trigram.txt`` is produced by
        the trigram model's own ``save`` method.
        """
        # Flatten letter2[first][second] into one table keyed by the
        # concatenated pair. The dict views are iterated directly: nothing is
        # mutated while looping, so no list() copies are needed.
        bigram_counts = {}
        for first, followers in self.bigram.letter2.items():
            for second, count in followers.items():
                if count != 0:
                    bigram_counts[first + second] = count

        self._write_ranked("tvu_bigram.txt", bigram_counts)
        self._write_ranked("tvu_unigram.txt", self.unigram.letter)

        self.trigram.save("tvu_trigram.txt")
        print("SAVED tvu_unigram.txt, tvu_bigram.txt")

    @staticmethod
    def _write_ranked(path, counts):
        """Write the non-zero entries of *counts* to *path*, highest count first.

        Entries with equal counts keep the mapping's iteration order, because
        ``sorted`` is stable even with ``reverse=True``.
        """
        ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
        with codecs.open(path, "w", "utf-8") as fp:
            for key, count in ranked:
                if count == 0:
                    continue
                fp.write("%s - %d\n" % (key, count))
def run(parent, outputfile):
    """Build one unigram letter model from every ``*.word`` file under *parent*.

    The first matching file creates the ``Unigram``; each later file replaces
    its ``corpus`` attribute, and ``frequency_model()`` is run once per file
    on the same object (presumably so counts accumulate across files --
    confirm against ``Unigram``). The model is then saved to *outputfile*, and
    its ``letter`` table, passed through ``get_prob``, is handed to
    ``proc_stats`` with the same path.

    Args:
        parent: Directory searched (non-recursively) for ``*.word`` files.
        outputfile: Path given to both ``Unigram.save`` and ``proc_stats``.

    Returns:
        None. If no file matches, a notice is printed and nothing is written.
    """
    model = None
    for filename in glob.glob(os.path.join(parent, "*.word")):
        # Identity test rather than truthiness, so a model object that happens
        # to evaluate as false is not silently rebuilt from scratch.
        if model is None:
            model = Unigram(filename)
        else:
            model.corpus = Corpus(filename)  # update file
        model.frequency_model()

    if model is None:
        # Previously this fell through to ``None.save(...)`` and crashed with
        # an AttributeError; report the empty input explicitly instead.
        print(u"run: no *.word files found in %s" % (parent,))
        return

    model.save(outputfile)
    proc_stats(get_prob(model.letter), outputfile)
    return
def __init__(self, filename):
    """Run the ``Unigram`` initialiser with *filename*, then start an empty bigram table."""
    Unigram.__init__(self, filename)
    self.bigram = {}
def __init__(self, filename):
    """Initialise the ``Unigram`` base state from *filename* and add an empty ``bigram`` dict."""
    Unigram.__init__(self, filename)
    self.bigram = {}