Example #1
import re

from nltk import probability
from nltk.corpus import udhr2  # UDHR2 corpus reader; char_freqs is a project-local helper


def train(cmd_args, corpus_files, model):
    """ Trains a Laplace-smoothed character n-gram model per language. """
    for lang in corpus_files:

        text = udhr2.raw(lang)
        #print("lang:", lang, "; length:", len(text))
        # Replace multiple whitespaces (including ' ', '\n', '\t') with just one ' '
        text = re.sub(r'\s+', ' ', text)

        # Skip empty or near-empty files, like nku.txt
        if len(text) < 1000:
            #print("skipping pathological file", lang)
            model.deleted_langs.append(lang)
            continue

        model.ngrams[lang] = {}
        model.smoothed[lang] = []

        if cmd_args.cross_valid:
            # Hold out the first test_len characters as the test set
            model.tests[lang] = text[:cmd_args.test_len]
            text = text[cmd_args.test_len:]

        # Build ngrams for each language in training
        model.ngrams[lang] = char_freqs(text, cmd_args.n_order)

        model.smoothed[lang] = probability.LaplaceProbDist(
            probability.FreqDist(model.ngrams[lang]))
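
For reference, LaplaceProbDist implements add-one smoothing: an n-gram counted c times out of N total, over B distinct bins, gets probability (c + 1) / (N + B), so unseen n-grams keep nonzero mass. A minimal sketch with made-up bigram counts:

from nltk import probability

ngram_counts = {"th": 5, "he": 3, "ab": 1}  # hypothetical character-bigram counts
smoothed = probability.LaplaceProbDist(probability.FreqDist(ngram_counts))
print(smoothed.prob("th"))  # (5 + 1) / (N + B) with N = 9; with the default bins this is 0.5
print(smoothed.prob("zz"))  # an unseen bigram still receives nonzero probability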
Example #2
# Method of a corpus class; `probability` is nltk.probability.
def distribution(self, tokens, laplace=True):
    """Return a Laplace-smoothed (default) or MLE distribution over tokens."""
    fd = probability.FreqDist()
    for word in tokens:
        fd[word] += 1  # FreqDist.inc() was removed in NLTK 3; use item assignment
    if laplace:
        return probability.LaplaceProbDist(fd)
    else:
        return probability.MLEProbDist(fd)
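
A hypothetical usage (obj stands in for whatever class instance defines this method):

tokens = "the cat sat on the mat".split()
dist = obj.distribution(tokens)   # Laplace-smoothed by default
print(dist.prob("the"))           # smoothed probability of a seen word
print(dist.prob("dog"))           # nonzero even though "dog" never occurred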
Example #3
# Method of a document-collection class; `probability` is nltk.probability.
def savelocaldist(self, laplace=True, savetokens=False):
    """Build a per-document probability distribution, keyed by document id."""
    self.localdist = dict()

    for doc in self.docs:
        if savetokens:
            doc.terms = []
        localfd = probability.FreqDist()
        for tok in doc.tokens():
            if savetokens:
                doc.terms.append(tok)
            localfd[tok] += 1  # FreqDist.inc() is pre-NLTK-3 API
        if localfd.N() > 0:  # skip documents with no tokens
            if laplace:
                self.localdist[doc.fid] = probability.LaplaceProbDist(localfd)
            else:
                self.localdist[doc.fid] = probability.MLEProbDist(localfd)
Example #4
from nltk import probability
from nltk.probability import ConditionalFreqDist, FreqDist
from nltk.tokenize import word_tokenize


def laplace_stuff():
    sent = "am ate ate apple am x."
    sent_tokenized = word_tokenize(sent)
    freq_dist = FreqDist(word.lower() for word in sent_tokenized)
    print(freq_dist.items())
    lap = probability.LaplaceProbDist(freq_dist)
    print(lap.generate())  # sample a random token from the smoothed distribution
    print(lap.prob("am"))
    print("Finished freq dist, starting cond dist")
    # Conditional probability: count each token under the preceding word
    cond_dist = ConditionalFreqDist()
    context = None
    for token in sent_tokenized:
        cond_dist[context][token] += 1  # was `cond_dist[context] = (outcome)`, which overwrote the FreqDist
        context = token
    print(cond_dist["am"])  # FreqDist of words observed after "am"
    print(cond_dist.items())
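
Not in the original snippet, but the same Laplace smoothing extends to the conditional counts via ConditionalProbDist, e.g. as a follow-on step inside laplace_stuff():

from nltk.probability import ConditionalProbDist, LaplaceProbDist

cond_prob = ConditionalProbDist(cond_dist, LaplaceProbDist)
print(cond_prob["am"].prob("ate"))  # smoothed P(next word = "ate" | previous word = "am")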
Example #5
# Method of a document-collection class; `probability` is nltk.probability.
def globaldist(self, laplace=True):
    '''
    Return a global probability distribution for a set of documents.
    May hit memory problems if the document set is too large.
    Uses Laplace smoothing by default.
    Stores the result in self.gdist; clear that attribute after use
    to free the memory.
    '''
    fd = probability.FreqDist()
    for doc in self.docs:
        if doc.terms is None:
            tokens = doc.tokens()
        else:
            tokens = doc.terms
        for tok in tokens:
            fd[tok] += 1  # FreqDist.inc() is pre-NLTK-3 API
    if laplace:
        self.gdist = probability.LaplaceProbDist(fd)
    else:
        self.gdist = probability.MLEProbDist(fd)
    return self.gdist
Example #6
from nltk import probability
from nltk.probability import FreqDist

def laplace(counter):
    # Note: iterating over keys() gives every word a count of 1 (original counts are discarded)
    freq_dist = FreqDist(wrd for wrd in counter.keys())
    lap = probability.LaplaceProbDist(freq_dist)
    return [lap]
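
A hypothetical call with a collections.Counter; because the function iterates over keys, every word ends up equally likely:

from collections import Counter

[lap] = laplace(Counter("am ate ate apple am x".split()))
print(lap.prob("ate"))  # equal to lap.prob("x"): all keys entered with count 1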