def train(cmd_args, corpus_files, model):
    """Train the statistical character n-gram model, one language per corpus file.

    For each language: loads the raw UDHR text, collapses all runs of
    whitespace to single spaces, skips files too small to train on, and
    stores per-language n-gram counts plus a Laplace-smoothed distribution
    on *model*. With ``cmd_args.cross_valid`` set, the first
    ``cmd_args.test_len`` characters are held out into ``model.tests``.
    """
    for lang in corpus_files:
        raw = udhr2.raw(lang)
        # Normalize every whitespace run (' ', '\n', '\t', ...) to one space.
        normalized = re.sub(r'\s+', ' ', raw)

        # Pathological/near-empty files (e.g. nku.txt) are excluded entirely.
        if len(normalized) < 1000:
            model.deleted_langs.append(lang)
            continue

        model.ngrams[lang] = {}
        model.smoothed[lang] = []

        if cmd_args.cross_valid:
            # Hold out the leading test_len characters for the test set.
            model.tests[lang] = normalized[:cmd_args.test_len]
            normalized = normalized[cmd_args.test_len:]

        # Character n-gram counts and their Laplace-smoothed distribution.
        model.ngrams[lang] = char_freqs(normalized, cmd_args.n_order)
        model.smoothed[lang] = probability.LaplaceProbDist(
            probability.FreqDist(model.ngrams[lang]))
def distribution(self, tokens, laplace=True):
    """Build a probability distribution over *tokens*.

    Counts each token into a FreqDist and wraps it in a
    LaplaceProbDist (the default) or an MLEProbDist.
    """
    counts = probability.FreqDist()
    for token in tokens:
        counts.inc(token)
    if laplace:
        return probability.LaplaceProbDist(counts)
    return probability.MLEProbDist(counts)
def savelocaldist(self, laplace = True, savetokens = False):
    """Compute and cache a per-document probability distribution.

    Fills ``self.localdist`` (keyed by ``doc.fid``) with a Laplace- or
    MLE-smoothed distribution over each document's tokens. Documents
    that yield no tokens get no entry. With *savetokens* set, each
    document's token stream is also stored on ``doc.terms``.
    """
    self.localdist = dict()
    for document in self.docs:
        if savetokens:
            document.terms = []
        counts = probability.FreqDist()
        for token in document.tokens():
            if savetokens:
                document.terms.append(token)
            counts.inc(token)
        # Only documents that produced at least one token get a distribution.
        if counts.N() > 0:
            dist_cls = probability.LaplaceProbDist if laplace else probability.MLEProbDist
            self.localdist[document.fid] = dist_cls(counts)
def laplace_stuff():
    """Demo of Laplace-smoothed and conditional frequency distributions.

    Builds a FreqDist over a toy sentence, prints Laplace-smoothed
    probabilities, then builds a ConditionalFreqDist conditioned on the
    preceding token (the first token is conditioned on ``None``).
    """
    sent = "am ate ate apple am x."
    sent_tokenized = word_tokenize(sent)
    # Reuse the tokenization instead of calling word_tokenize(sent) twice.
    freq_dist = FreqDist(word.lower() for word in sent_tokenized)
    print(freq_dist.items())
    lap = probability.LaplaceProbDist(freq_dist)
    print(lap.generate())
    print(lap.prob("am"))
    print("Finished freq dist, Starting Cond dist")
    # Conditional probability: condition each token on the preceding one.
    cond_dist = ConditionalFreqDist()
    context = None
    for token in sent_tokenized:
        # BUG FIX: the original did `cond_dist[context] = (outcome)`, which
        # replaced the per-context FreqDist with a bare string instead of
        # counting the outcome. Record the (context -> token) count instead.
        cond_dist[context].inc(token)
        context = token
    print(cond_dist["am"])
    print(cond_dist.items())
def globaldist(self, laplace=True):
    """Return a global probability distribution over the whole document set.

    Aggregates token counts across every document in ``self.docs``
    (preferring a document's cached ``terms`` when present, otherwise
    streaming ``tokens()``). Uses Laplace smoothing by default; pass
    ``laplace=False`` for an MLE distribution. Memory-heavy for large
    document sets: the result is cached on ``self.gdist`` and should be
    cleared after use to free the memory.
    """
    counts = probability.FreqDist()
    for document in self.docs:
        token_stream = document.tokens() if document.terms is None else document.terms
        for token in token_stream:
            counts.inc(token)
    dist_cls = probability.LaplaceProbDist if laplace else probability.MLEProbDist
    self.gdist = dist_cls(counts)
    return self.gdist
def laplace(counter):
    """Return a one-element list holding a Laplace-smoothed distribution
    built from *counter* (a mapping of sample -> observed count).

    BUG FIX: the original built the FreqDist from ``counter.keys()`` only,
    so every sample got a count of 1 and the observed counts were
    discarded — the resulting "distribution" was uniform regardless of
    input. Feed each sample with its actual count so the smoothed
    probabilities reflect the data.
    """
    freq_dist = FreqDist()
    for word, count in counter.items():
        freq_dist.inc(word, count)
    lap = probability.LaplaceProbDist(freq_dist)
    return [lap]