Example #1
import time

import nltk
from nltk import ngrams
from nltk.probability import FreqDist, SimpleGoodTuringProbDist
from pymongo import MongoClient


def create_lm(path_to_data, grades, f_type):
    # prepare_dataset is project-specific: it loads texts tagged with a grade.
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')

    for grade in grades:
        print(grade + " grade")
        start_total = time.time()
        client.drop_database(f_type + '_' + grade)
        db = client[f_type + '_' + grade]

        for n in range(1, 3):  # unigram and bigram models
            print(str(n) + " gram")
            start = time.time()
            fd_dict = dict()
            # TODO: split the dataset by grade up front

            for text in dataset:
                if text.grade == grade:
                    fd_dict[text.name] = FreqDist()
            fd_dict['all'] = FreqDist()

            for text in dataset:
                if text.grade == grade:

                    tokens = nltk.word_tokenize(text.data)
                    tokens_l = [token.lower() for token in tokens]

                    # Leave-one-out: each text's model is trained on every
                    # other text of the same grade; 'all' sees every text.
                    for key in fd_dict:
                        if key != text.name:
                            n_grams = ngrams(tokens_l, n)
                            fd_dict[key].update(n_grams)

            # Smooth each model's counts with Simple Good-Turing and store
            # one document per n-gram in a Mongo collection named after the text.
            for key in fd_dict:
                lm_collection = db[key]
                fd = fd_dict[key]
                sgt = SimpleGoodTuringProbDist(fd)
                prob_many = list()

                for fd_key in fd:
                    prob_many.append({"type": fd_key, "n-gram": n, "count": fd[fd_key], "prob": sgt.prob(fd_key)})

                if prob_many:
                    lm_collection.insert_many(prob_many)

            print(str(time.time() - start) + " sec")

        print(str(time.time() - start_total) + " sec total")
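
A hypothetical invocation (the path, grade labels, and model type are illustrative; prepare_dataset and a local MongoDB instance come from the surrounding project):

create_lm("data/texts", ["7", "8", "9"], "word")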
Example #2
from nltk.probability import SimpleGoodTuringProbDist


def _estimator(fdist, bins):
    """
    Default estimator function using a SimpleGoodTuringProbDist.
    """
    # This must stay a module-level function: instance methods of NgramModel
    # can't be pickled, so the estimator can't live on the class.
    return SimpleGoodTuringProbDist(fdist)
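
The comment above is about picklability: a module-level function pickles by reference, while a lambda or bound method would break pickling of any model that stores its estimator. A minimal sketch of the distinction (the FreqDist is illustrative; nltk warns about the Good-Turing fit on data this small):

import pickle
from nltk.probability import FreqDist

fd = FreqDist("abracadabra")
dist = _estimator(fd, fd.B())   # a SimpleGoodTuringProbDist over fd

pickle.dumps(_estimator)        # fine: resolved by its module-qualified name
# pickle.dumps(lambda f, b: SimpleGoodTuringProbDist(f))  # PicklingError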
Example #3
    def _setTrigram(self, listInput):
        trigramList = ngrams(listInput, 3)
        trigramFreq = self.countProbability(trigramList)
        outerTrigram = {}
        size = len(listInput)

        sgtTri = SimpleGoodTuringProbDist(trigramFreq)

        # Bucket smoothed trigram probabilities by their two-token prefix:
        # outerTrigram[prefix][third_token] = P(trigram) / size
        for trigram in trigramFreq:
            b = trigram[0:2]
            if b in outerTrigram:
                # Bug fix: key the inner dict by the continuation token,
                # not by the prefix itself.
                outerTrigram[b][trigram[2]] = sgtTri.prob(trigram) / size
            else:
                outerTrigram[b] = {trigram[2]: sgtTri.prob(trigram) / size}

        return outerTrigram
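
The method reduces to: smooth the trigram counts, then bucket each probability under its two-token prefix. A standalone sketch of the same idea, assuming countProbability returns an nltk FreqDist (the toy corpus triggers nltk's small-sample fit warning but shows the output shape):

from nltk import ngrams
from nltk.probability import FreqDist, SimpleGoodTuringProbDist

tokens = "the cat sat on the mat and the cat sat and the cat ran".split()
freq = FreqDist(ngrams(tokens, 3))
sgt = SimpleGoodTuringProbDist(freq)

table = {}
for tri in freq:
    table.setdefault(tri[:2], {})[tri[2]] = sgt.prob(tri) / len(tokens)

print(table[("the", "cat")])   # {'sat': ..., 'ran': ...}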
Example #4
    def _setTrigram(self, listInput):
        trigramList = ngrams(listInput, 3)
        trigramFreq = self.countProbability(trigramList)
        outerTrigram = {}
        size = len(listInput)

        sgtTri = SimpleGoodTuringProbDist(trigramFreq)

        # Bucket smoothed trigram probabilities by their two-token prefix:
        # outerTrigram[prefix][third_token] = P(trigram) / size
        for trigram in trigramFreq:
            b = trigram[0:2]
            if b in outerTrigram:
                # Bug fix: key the inner dict by the continuation token,
                # not by the prefix itself.
                outerTrigram[b][trigram[2]] = sgtTri.prob(trigram) / size
            else:
                outerTrigram[b] = {trigram[2]: sgtTri.prob(trigram) / size}

        return outerTrigram
Example #5
    def __init__(self, tokens, frequencies, debug=DEBUG, stop=STOP):
        self.freqdist = frequencies
        self.tokens = tokens | set(self.freqdist)
        self.DEBUG = debug
        self.STOP = stop

        if len(self.freqdist) == 0:
            raise ValueError("No frequencies given!")

        # Good-Turing needs hapaxes (count-1 items): if none exist, shift
        # every count down so the minimum count becomes 1.
        while not self.freqdist.hapaxes():
            warnings.warn("no hapaxes present -- shifting distribution down")
            min_freq = min(self.freqdist.values()) - 1
            for k in self.freqdist:
                self.freqdist[k] -= min_freq

        # Reserve probability mass for unseen events by adding an extra bin.
        if min(self.freqdist.values()) != 0:
            warnings.warn("no unseen present -- adding dummy category")
            self.tokens.add("<dummy>")

        self.sgt = SimpleGoodTuringProbDist(self.freqdist,
                                            bins=len(self.tokens))
        self._get_probabilities()
Example #6
def train(train_set, word_types, tag_set):
    """
    Train an HMM tagger with Good-Turing smoothing.
    Called this way, the HMM knows the whole set of tags and the whole set of
    words, so no word or tag is "unknown" at test time.
    """
    # tag_set and word_types are sets; the trainer expects lists
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_types))
    # GoodTuring smoothing
    # see: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.SimpleGoodTuringProbDist-class.html
    #      http://en.wikipedia.org/wiki/Additive_smoothing
    hmm = trainer.train_supervised(
        train_set,
        estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    return hmm
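
A hypothetical call with a toy tagged corpus (nltk warns about the Good-Turing fit on data this small, but the flow is the same):

train_set = [[("the", "DET"), ("dog", "NOUN"), ("barks", "VERB")],
             [("a", "DET"), ("cat", "NOUN"), ("sleeps", "VERB")]]
word_types = {w for sent in train_set for w, _ in sent}
tag_set = {t for sent in train_set for _, t in sent}

hmm = train(train_set, word_types, tag_set)
print(hmm.tag(["the", "cat"]))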
Example #7
    def _setBigram(self, listInput):

        bigramList = ngrams(listInput, 2)
        bigramFreq = self.countProbability(bigramList)
        outerBigram = {}
        size = len(listInput)

        sgtBig = SimpleGoodTuringProbDist(bigramFreq)

        # Bucket smoothed bigram probabilities by their first token:
        # outerBigram[first][second] = P(bigram) / size
        for bigram in bigramFreq:
            b = bigram[0]
            if b in outerBigram:
                # Bug fix: key the inner dict by the second token,
                # not by the first token itself.
                outerBigram[b][bigram[1]] = sgtBig.prob(bigram) / size
            else:
                outerBigram[b] = {bigram[1]: sgtBig.prob(bigram) / size}

        return outerBigram
Example #8
    def _setBigram(self, listInput):

        bigramList = ngrams(listInput, 2)
        bigramFreq = self.countProbability(bigramList)
        outerBigram = {}
        size = len(listInput)

        sgtBig = SimpleGoodTuringProbDist(bigramFreq)

        # Bucket smoothed bigram probabilities by their first token:
        # outerBigram[first][second] = P(bigram) / size
        for bigram in bigramFreq:
            b = bigram[0]
            if b in outerBigram:
                # Bug fix: key the inner dict by the second token,
                # not by the first token itself.
                outerBigram[b][bigram[1]] = sgtBig.prob(bigram) / size
            else:
                outerBigram[b] = {bigram[1]: sgtBig.prob(bigram) / size}

        return outerBigram
Example #9
def goodturing_estimator(freqdist):
    return SimpleGoodTuringProbDist(freqdist)
Example #10
parser.add_argument("--word-type")
parser.add_argument("-n", type=int) #n-grams

group = parser.add_mutually_exclusive_group()
group.add_argument("--laplace", action="store_true")
group.add_argument("--good-turing", action="store_true")

parser.add_argument("--unknown-word-freq", type=int)
parser.add_argument("-o", required=True)
parsed = parser.parse_args()

estimator = None
if parsed.laplace:
	estimator = lambda fdist, bins: LaplaceProbDist(fdist)
elif parsed.good_turing:
	estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, bins=1e5)

words = []
directory = parsed.src_texts
n = parsed.n
output = parsed.o

for filename in os.listdir(directory):
	with open(directory + "/" + filename, "r", encoding=parsed.text_encoding) as file:
		inp = file.read()

		if filename != ".DS_Store":
			if parsed.word_type == "stem":
				stemmer = Stemmer.Stemmer('russian')
Example #11
import nltk
import pandas as pd
from nltk.book import text1
from collections import Counter
from nltk.probability import SimpleGoodTuringProbDist
from openpyxl import Workbook

#Create your bigrams
bgs = nltk.bigrams(text1)

#compute frequency distribution for bigrams
fdistB = nltk.FreqDist(bgs)

#compute frequency distribution for unigrams
fdistU = nltk.FreqDist(text1)

#apply simple good turing smoothing method
sgt = SimpleGoodTuringProbDist(fdistB)

#most common 25 unigrams
mostCommon = fdistU.most_common(25)

#initialize dataframe for the excel sheet
column = []
for k, v in mostCommon:
    column.append(k)

#data frame

df = pd.DataFrame(index=column, columns=column)

#fill data frame with probability data
#(assumed completion: each cell gets the smoothed probability of the
# corresponding (row, column) bigram)
for k, v in mostCommon:
    for k2, v2 in mostCommon:
        df.loc[k, k2] = sgt.prob((k, k2))
Example #12
def _estimator(fdist, bins):
    return SimpleGoodTuringProbDist(fdist)
Example #13
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # getting a train corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # getting number of sentences

tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())   # frequency of each tag

firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequency of sentence-initial tags
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words)-1):
    TagPair.append((words[i][1], words[i+1][1]))  # adjacent tag pairs (transitions)

TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
Example #14
def simple_good_turing_estimator(fdist, bins):
    return SimpleGoodTuringProbDist(fdist, bins=bins)
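
Unlike the estimators in Examples #2, #9 and #12, this one forwards bins, which sets how many event types exist and therefore how thinly the unseen-probability mass is spread. A small illustration (per nltk's docs, bins defaults to freqdist.B() + 1 when omitted; the tiny FreqDist triggers a fit warning):

from nltk.probability import FreqDist, SimpleGoodTuringProbDist

fd = FreqDist("abracadabra")                        # a:5, b:2, r:2, c:1, d:1
few = simple_good_turing_estimator(fd, fd.B() + 1)
many = simple_good_turing_estimator(fd, 100000)     # far more possible events
print(few.prob("z"), many.prob("z"))                # unseen mass per event shrinks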
Example #15
        current_line = test_file[i]
        word_tag = current_line.split('\t\t')
        words.add(word_tag[0])
        tags.add(word_tag[1])
        current_sentence.append((word_tag[0], word_tag[1]))
        if word_tag[0] == '.':
            test.append(current_sentence)
            current_sentence = []

    tags = list(tags)
    words = list(words)
    trainer = hmm.HiddenMarkovModelTrainer(tags, words)
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: MLEProbDist(fd))
    tagger = trainer.train_supervised(
        train, estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: WittenBellProbDist(fd, bins))
    # tagger = trainer.train_supervised(train, estimator=lambda fd, bins: KneserNeyProbDist(fd, bins))

    print("here")
    predicted = []
    real = []
    for i in range(0, len(test) - 1):
        current = list(zip(*test[i]))
        tagged = tagger.tag(list(current[0]))
        current_tags = list(list(zip(*tagged))[1])
        predicted += current_tags
        real += list(current[1])

    print(tags)
    confusion = confusion_matrix(real, predicted, labels=tags)  # sklearn expects (y_true, y_pred)
Example #16
import warnings
from collections import Counter

import numpy as np
from nltk.probability import SimpleGoodTuringProbDist

# DEBUG and STOP are assumed defaults here; the original module defines them.
DEBUG = False
STOP = 100000


class Simulator(object):
    def __init__(self, tokens, frequencies, debug=DEBUG, stop=STOP):
        self.freqdist = frequencies
        self.tokens = tokens | set(self.freqdist)
        self.DEBUG = debug
        self.STOP = stop

        if len(self.freqdist) == 0:
            raise ValueError("No frequencies given!")

        # Good-Turing needs hapaxes (count-1 items): if none exist, shift
        # every count down so the minimum count becomes 1.
        while not self.freqdist.hapaxes():
            warnings.warn("no hapaxes present -- shifting distribution down")
            min_freq = min(self.freqdist.values()) - 1
            for k in self.freqdist:
                self.freqdist[k] -= min_freq

        # Reserve probability mass for unseen events by adding an extra bin.
        if min(self.freqdist.values()) != 0:
            warnings.warn("no unseen present -- adding dummy category")
            self.tokens.add("<dummy>")

        self.sgt = SimpleGoodTuringProbDist(self.freqdist,
                                            bins=len(self.tokens))
        self._get_probabilities()

    def _get_probabilities(self):
        self.probabilities = {}
        for t in self.tokens:
            self.probabilities[t] = self.sgt.prob(t)
        return self.probabilities

    def dump(self):  # pragma: no cover
        for t in self.tokens:
            print("\t".join([
                "%30s" % t,
                '%d' % self.freqdist[t],
                '%0.4f' % self.sgt.prob(t)
            ]))
        print("HAPAXES: %r" % self.freqdist.hapaxes())
        # freq of freqs:
        print("FreqDist: %r" % sorted(self.freqdist.values(), reverse=True))
        print("Slope: %r" % self.sgt._slope)
        print("Switch at: %r" % self.sgt._switch_at)

    def simulate(self, n=1000):
        keys = list(self.probabilities)
        probs = np.array([self.probabilities[k] for k in keys])
        # Renormalise to guard against floating-point drift; the smoothed
        # probabilities should already sum to approximately 1.
        probs = probs / probs.sum()

        # set up
        itercount = 0  # stopping safety valve
        transcript = []
        complete = False  # have we seen everything?
        seen = Counter({k: 0 for k in self.tokens})  # tokens we've seen
        while not complete:
            for char in np.random.choice(keys, n, replace=True, p=probs):
                transcript.append(char)
                seen[char] += 1
                complete = all(v > 0 for v in seen.values())

                # Return the number of draws it took to observe every token
                # (a coupon-collector experiment over the smoothed distribution).
                if complete:
                    return len(transcript)

                if itercount >= self.STOP:
                    raise StopIteration("Abort Abort!")
                itercount += 1

        raise StopIteration("Failed")