Example #1
def getTransitionProb(sm, sents, tagset):
    # P(nextTag|prevTag) = transitionProb[prevTag].prob(nextTag)
    transition = []
    for s in sents:
        tags = [t for (w, t) in s]
        transition += ngrams(tags, 2)

    transitionProb = {}
    for tag in tagset:
        nextTags = [
            nextTag for (prevTag, nextTag) in transition if prevTag == tag
        ]

        if sm == "no":
            transitionProb[tag] = LidstoneProbDist(FreqDist(nextTags),
                                                   0,
                                                   bins=1e5)
        elif sm == "laplace":
            transitionProb[tag] = LidstoneProbDist(FreqDist(nextTags),
                                                   1,
                                                   bins=1e5)
        elif sm == "goodturing":
            transitionProb[tag] = SimpleGoodTuringProbDist(FreqDist(nextTags),
                                                           bins=1e5)
        else:
            transitionProb[tag] = WittenBellProbDist(FreqDist(nextTags),
                                                     bins=1e5)

    return transitionProb
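A minimal, self-contained sketch of how the smoothing branches above differ on a toy count distribution (the tag counts are invented; the Good-Turing branch is omitted because it needs more data to produce a reliable fit):

# Compare the smoothing choices on made-up next-tag counts.
from nltk.probability import FreqDist, LidstoneProbDist, WittenBellProbDist

fd = FreqDist({"NN": 6, "VB": 3, "JJ": 1})

no_smoothing = LidstoneProbDist(fd, 0, bins=1e5)   # sm == "no" (plain MLE)
laplace = LidstoneProbDist(fd, 1, bins=1e5)        # sm == "laplace" (add-one)
witten_bell = WittenBellProbDist(fd, bins=1e5)     # default branch

for name, dist in [("no", no_smoothing), ("laplace", laplace),
                   ("wittenbell", witten_bell)]:
    # Seen tag vs. unseen tag: only the unsmoothed distribution assigns zero.
    print(name, dist.prob("NN"), dist.prob("RB"))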
Example #2
 def raw_words(self, length=100):
     """Generates a list of words using an NLTK NgramModel."""
     if not hasattr(self, '_ngram_model'):
         estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
         self._ngram_model = NgramModel(2, self.model, estimator=estimator)
     return self._ngram_model.generate(length,
                                       [random.choice(self.words)])[1:]
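Note that NgramModel is the pre-3.x NLTK API (nltk.model) and is not available in current releases, so this method only runs against an old NLTK. The estimator itself is just a factory from a FreqDist (plus a bin count, which this lambda ignores) to a smoothed distribution; a toy sketch of what it produces:

# What the estimator callable returns, on made-up counts.
from nltk.probability import FreqDist, LidstoneProbDist

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
pd = estimator(FreqDist(["cat", "cat", "dog"]), None)
print(pd.prob("cat"), pd.prob("bird"))   # seen vs. unseen word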
Example #3
def make_model(nst_infile, picklefile, protocol=-1):
    """ Train a POS probability model on the NST lexicon and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has compounded POS tags (SUC set) as keys (e.g. "NN+NN")
    and smoothed probabilities as values."""
    # Collect all compounds from nst data
    nst_full_compounds = set()
    with open(nst_infile, encoding='UTF-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            comp = fields[3].replace("!", "")
            pos = fields[4]
            if "+" in comp and "_" not in word and not (comp.startswith("+") or comp.startswith("-")):
                nst_full_compounds.add((word, comp, pos))

    # Build POS probability model
    pos_fdist = FreqDist()
    for _w, _c, pos in nst_full_compounds:
        if '+' in pos:
            pos = re.sub(r"\+LN", "", pos)
            pos_fdist[pos] += 1

    pd = LidstoneProbDist(pos_fdist, 0.001, pos_fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as f:
        pickle.dump(pd, f, protocol=protocol)
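A usage sketch for the resulting pickle; the file name and the compounded tag below are placeholders. Once loaded, the model is an ordinary LidstoneProbDist, so lookups go through prob():

# Load the pickled POS model and query it (names are placeholders).
import pickle

with open("nst_pos_model.pickle", "rb") as f:
    pd = pickle.load(f)

print(pd.prob("NN+NN"))   # smoothed probability of a compounded SUC tag
print(pd.max())           # most frequent compounded tag in the lexicon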
Example #4
def _estimator(fdist, *estimator_args, **estimator_kwargs):
    """
    Default estimator function using a LidstoneProbDist.
    """
    # Can't be an instance method of NgramModel, because bound methods
    # can't be pickled.
    return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
def make_model(stats_infile, picklefile, smoothingparam=0.001, min_freq=3, protocol=-1):
    """Train a probability model on a korp statistics file and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has tuples (wordform, MSD-tag) as keys
    and smoothed probabilities as values."""
    fdist = FreqDist()
    with open(stats_infile, encoding='utf-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            # Stop at the first word form that occurs fewer times than min_freq
            # (the statistics file is assumed to be sorted by descending frequency)
            if int(fields[4]) < min_freq:
                break
            # Get rid of all urls
            if word.startswith("http://"):
                continue
            # # Words that only occur once may only contain letters and hyphens
            # if fields[4] == '1' and any(not (c.isalpha() or c == "-") for c in word):
            #     continue
            # if len(word) > 100:
            #     continue
            simple_msd = fields[1][:fields[1].find('.')] if '.' in fields[1] else fields[1]
            fdist[(word, simple_msd)] += int(fields[4])

    pd = LidstoneProbDist(fdist, smoothingparam, fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as p:
        pickle.dump(pd, p, protocol=protocol)
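Querying this model works the same way, except the keys are (wordform, MSD-tag) tuples; the file name, word form and tag below are placeholders:

# Query the pickled (wordform, MSD) model (names are placeholders).
import pickle

with open("stats_model.pickle", "rb") as f:
    pd = pickle.load(f)

print(pd.prob(("och", "KN")))      # smoothed probability of the pair
print(pd.logprob(("och", "KN")))   # the same probability in log2 space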
Example #6
def getEmissionProb(sm, sents, tagset):
    # P(word|tag) = emissionProb[tag].prob(word)
    emission = []
    for s in sents:
        emission += [(w.lower(), t) for (w, t) in s]

    emissionProb = {}
    for tag in tagset:
        words = [w for (w, t) in emission if t == tag]
        if sm == "no":
            emissionProb[tag] = LidstoneProbDist(FreqDist(words), 0, bins=1e5)
        elif sm == "laplace":
            emissionProb[tag] = LidstoneProbDist(FreqDist(words), 1, bins=1e5)
        elif sm == "goodturing":
            emissionProb[tag] = SimpleGoodTuringProbDist(FreqDist(words),
                                                         bins=1e5)
        else:
            emissionProb[tag] = WittenBellProbDist(FreqDist(words), bins=1e5)

    return emissionProb
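A usage sketch that ties this together with getTransitionProb from Example #1, assuming both functions and their nltk imports are in scope and the Penn Treebank sample has been downloaded (nltk.download('treebank')); the 500-sentence slice is arbitrary:

# Build transition and emission distributions from a small tagged corpus.
from nltk.corpus import treebank

sents = treebank.tagged_sents()[:500]
tagset = {t for s in sents for (_, t) in s}

transitionProb = getTransitionProb("laplace", sents, tagset)
emissionProb = getEmissionProb("laplace", sents, tagset)

print(transitionProb["DT"].prob("NN"))   # P(NN follows DT)
print(emissionProb["DT"].prob("the"))    # P("the" | DT); words were lowercased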
Example #7
def get_probs_dist(freq_dist, smoothing=1):
    freq_dists = defaultdict(FreqDist)

    for ngram, freq in freq_dist.items():
        *prefix, cur = ngram
        key = ''.join(prefix)
        freq_dists[key].update({cur: freq_dist[ngram]})

    probs_dist = defaultdict(LidstoneProbDist)

    for prefix, fd in freq_dists.items():
        probs_dist[prefix] = LidstoneProbDist(fd, gamma=smoothing)

    return probs_dist
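For instance, with character trigrams the returned mapping goes from a joined two-character prefix to a Lidstone-smoothed distribution over next characters (the toy string and gamma are arbitrary):

# Character-trigram sketch: prefix "ss" -> distribution over the next character.
from nltk.probability import FreqDist
from nltk.util import ngrams

freq_dist = FreqDist(ngrams("mississippi", 3))
probs = get_probs_dist(freq_dist, smoothing=0.5)
print(probs["ss"].prob("i"))   # P(i | "ss")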
Example #8
def demo_pos():
    # demonstrates POS tagging using supervised training

    print()
    print("HMM POS tagging demo")
    print()

    print("Training HMM...")
    labelled_sequences, tag_set, symbols = load_pos(200)
    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(labelled_sequences[10:],
                    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print("Testing...")
    test_pos(hmm, labelled_sequences[:10], True)
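The same estimator hookup works against the public HMM API directly; load_pos and test_pos above are internal helpers of the NLTK demo, so a sketch on the treebank sample (arbitrary split, requires nltk.download('treebank')) looks like this:

# Supervised HMM tagger with a Lidstone-smoothed estimator (sketch).
from nltk.corpus import treebank
from nltk.probability import LidstoneProbDist
from nltk.tag.hmm import HiddenMarkovModelTrainer

train = treebank.tagged_sents()[100:600]

states = sorted({t for s in train for (_, t) in s})
symbols = sorted({w for s in train for (w, _) in s})

trainer = HiddenMarkovModelTrainer(states, symbols)
tagger = trainer.train_supervised(
    train, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

print(tagger.tag("the old man saw the boat".split()))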
Example #9
 def __init__(self, dataset, capitalize=False):
     self.capitalize = capitalize
     tweets = dataset.split("\n")
     words = []
     for tweet in tweets:
         if "@" in tweet or tweet.startswith("RT"):
             continue
         words += [
             word for word in tweet.split()
             if word[0] not in ["@", "#", ":", "(", ")", "2"]
             and "http://" not in word and "https://" not in word
         ]
     self.words = words
     self.model = nltk.Text(words)
     estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
     self._ngram_model = NgramModel(2, self.model, estimator=estimator)
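This appears to be the class whose raw_words method is shown in Example #2; the same caveat applies, since NgramModel was dropped from NLTK 3.x.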
Example #10
def demo_pos_bw():
    # demonstrates the Baum-Welch algorithm in POS tagging

    print()
    print("Baum-Welch demo for POS tagging")
    print()

    print("Training HMM (supervised)...")
    sentences, tag_set, symbols = load_pos(210)
    symbols = set()
    for sentence in sentences:
        for token in sentence:
            symbols.add(token[_TEXT])
            
    trainer = HiddenMarkovModelTrainer(tag_set, list(symbols))
    hmm = trainer.train_supervised(sentences[10:200],
                    estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    print("Training (unsupervised)...")
    # it's rather slow - so only use 10 samples
    unlabeled = _untag(sentences[200:210])
    hmm = trainer.train_unsupervised(unlabeled, model=hmm, max_iterations=5)
    test_pos(hmm, sentences[:10], True)
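Note the design choice here: Baum-Welch only re-estimates an already supervised HMM, on ten untagged sentences and with max_iterations=5, to keep the demo fast.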
Example #11
 def run(self):
     cfd = ConditionalFreqDist(
         (tuple(self.data_set[i: i + self.n - 1]), self.data_set[i + self.n - 1])
         for i in range(len(self.data_set) - self.n + 1))
     lidstone_estimator = lambda fd: LidstoneProbDist(fd, self.gamma, fd.B() + 1)
     cpd = ConditionalProbDist(cfd, lidstone_estimator)
     self.model = cpd
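A standalone sketch of the same construction on a toy character sequence (the data, n and gamma are invented); the extra bin in fd.B() + 1 reserves probability mass for symbols unseen after a given context:

# Conditional Lidstone model over character trigrams (toy data).
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              LidstoneProbDist)

data, n, gamma = list("mississippi"), 3, 0.01

cfd = ConditionalFreqDist(
    (tuple(data[i:i + n - 1]), data[i + n - 1])
    for i in range(len(data) - n + 1))

cpd = ConditionalProbDist(cfd, lambda fd: LidstoneProbDist(fd, gamma, fd.B() + 1))

print(cpd[("s", "s")].prob("i"))   # P(i | "ss")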