import time

import nltk
from nltk.probability import FreqDist, SimpleGoodTuringProbDist
from nltk.util import ngrams
from pymongo import MongoClient


def create_lm(path_to_data, grades, f_type):
    # prepare_dataset is a project-local helper (not shown here)
    dataset = prepare_dataset(path_to_data, grades)
    client = MongoClient('mongodb://localhost:27017/')
    for grade in grades:
        print(grade + " grade")
        start_total = time.time()
        client.drop_database(f_type + '_' + grade)
        db = client[f_type + '_' + grade]
        for n in range(1, 3):  # unigrams and bigrams
            print(str(n) + " gram")
            start = time.time()
            fd_dict = dict()
            # TODO separate dataset on grades
            for text in dataset:
                if text.grade == grade:
                    fd_dict[text.name] = FreqDist()
            fd_dict['all'] = FreqDist()
            for text in dataset:
                if text.grade == grade:
                    tokens = nltk.word_tokenize(text.data)
                    tokens_l = [token.lower() for token in tokens]
                    # leave-one-out: update every distribution except the text's own
                    for key in fd_dict:
                        if key != text.name:
                            n_grams = ngrams(tokens_l, n)
                            fd_dict[key].update(n_grams)
            for key in fd_dict:
                lm_collection = db[key]
                fd = fd_dict[key]
                sgt = SimpleGoodTuringProbDist(fd)
                prob_many = list()
                for fd_key in fd:
                    prob_many.append({"type": fd_key,
                                      "n-gram": n,
                                      "count": fd[fd_key],
                                      "prob": sgt.prob(fd_key)})
                if prob_many:
                    lm_collection.insert_many(prob_many)
            print(str(time.time() - start) + " sec")
        print(str(time.time() - start_total) + " sec total")
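# Usage sketch, not from the original source: reading a smoothed probability
# back out of the database written by create_lm above. Assumes the same
# localhost MongoDB; 'fiction' / '5' are made-up f_type/grade values.
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['fiction_5']
# pymongo stores the n-gram tuples as BSON arrays, so query with a list
doc = db['all'].find_one({"n-gram": 2, "type": ["the", "cat"]})
if doc is not None:
    print(doc["count"], doc["prob"])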
def _estimator(fdist, bins): """ Default estimator function using a SimpleGoodTuringProbDist. """ # can't be an instance method of NgramModel as they # can't be pickled either. return SimpleGoodTuringProbDist(fdist)
def _setTrigram(self, listInput):
    trigramList = ngrams(listInput, 3)
    trigramFreq = self.countProbability(trigramList)
    outerTrigram = {}
    size = len(listInput)
    sgtTri = SimpleGoodTuringProbDist(trigramFreq)
    for trigram in trigramFreq:
        prefix = trigram[0:2]
        # map the (w1, w2) prefix to {w3: smoothed probability}; the original
        # keyed the if-branch by the prefix itself, which was a bug
        innerTrigram = outerTrigram.setdefault(prefix, {})
        innerTrigram[trigram[2]] = sgtTri.prob(trigram) / size
    return outerTrigram
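# Self-contained sketch of the same nested-dict idea; a plain FreqDist stands
# in for the countProbability helper, which is not shown here.
from nltk.probability import FreqDist, SimpleGoodTuringProbDist
from nltk.util import ngrams

tokens = "a b c a b c a b d".split()
freq = FreqDist(ngrams(tokens, 3))
sgt = SimpleGoodTuringProbDist(freq)
table = {}
for tri in freq:
    table.setdefault(tri[:2], {})[tri[2]] = sgt.prob(tri) / len(tokens)
print(table[("a", "b")])   # {'c': ..., 'd': ...}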
def train(train_set, word_types, tag_set):
    """
    Train a supervised HMM tagger.

    Called this way, the HMM knows the whole set of tags and the whole set
    of words (no "unknown" word and/or tag during test).
    """
    # tag_set and word_types are sets: the trainer needs lists
    trainer = HiddenMarkovModelTrainer(list(tag_set), list(word_types))
    # Good-Turing smoothing
    # see: https://nltk.googlecode.com/svn/trunk/doc/api/nltk.probability.SimpleGoodTuringProbDist-class.html
    #      http://en.wikipedia.org/wiki/Additive_smoothing
    hmm = trainer.train_supervised(
        train_set,
        estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
    return hmm
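# Hedged usage sketch for the function above; the tiny tagged corpus is an
# assumption, and NLTK may warn about Good-Turing fits on frequency tables
# this small.
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.probability import SimpleGoodTuringProbDist

train_set = [[("the", "DET"), ("dog", "NOUN"), ("barks", "VERB")],
             [("a", "DET"), ("cat", "NOUN"), ("sleeps", "VERB")]]
word_types = {w for sent in train_set for w, _ in sent}
tag_set = {t for sent in train_set for _, t in sent}
tagger = train(train_set, word_types, tag_set)
print(tagger.tag(["the", "cat", "barks"]))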
def _setBigram(self, listInput):
    bigramList = ngrams(listInput, 2)
    bigramFreq = self.countProbability(bigramList)
    outerBigram = {}
    size = len(listInput)
    sgtBig = SimpleGoodTuringProbDist(bigramFreq)
    for bigram in bigramFreq:
        first = bigram[0]
        # map the first word to {second word: smoothed probability}; the
        # original keyed the if-branch by the first word itself, a bug
        innerBigram = outerBigram.setdefault(first, {})
        innerBigram[bigram[1]] = sgtBig.prob(bigram) / size
    return outerBigram
def goodturing_estimator(freqdist):
    return SimpleGoodTuringProbDist(freqdist)
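# Sketch of plugging the factory above into an NLTK ConditionalProbDist;
# the toy token stream is an assumption.
from nltk.probability import ConditionalFreqDist, ConditionalProbDist
from nltk.util import bigrams

tokens = "a b a c a b a b a d".split()
cfd = ConditionalFreqDist(bigrams(tokens))
cpd = ConditionalProbDist(cfd, goodturing_estimator)
print(cpd["a"].prob("b"))   # smoothed P(b | a)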
parser.add_argument("--word-type") parser.add_argument("-n", type=int) #n-grams group = parser.add_mutually_exclusive_group() group.add_argument("--laplace", action="store_true") group.add_argument("--good-turing", action="store_true") parser.add_argument("--unknown-word-freq", type=int) parser.add_argument("-o", required=True) parsed = parser.parse_args() estimator = None if parsed.laplace: estimator = lambda fdist, bins: LaplaceProbDist(fdist) elif parsed.good_turing: estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, bins=1e5) words = [] directory = parsed.src_texts n = parsed.n output = parsed.o for filename in os.listdir(directory): with open (directory+"/"+filename, "r") as file: inp = file.read() if parsed.text_encoding: inp = inp.decode(parsed.text_encoding) if filename != ".DS_Store": if parsed.word_type == "stem": stemmer = Stemmer.Stemmer('russian')
import nltk
import pandas as pd
from nltk.book import text1
from collections import Counter
from nltk.probability import SimpleGoodTuringProbDist
from openpyxl import Workbook

# create your bigrams
bgs = nltk.bigrams(text1)
# compute frequency distribution for bigrams
fdistB = nltk.FreqDist(bgs)
# compute frequency distribution for unigrams
fdistU = nltk.FreqDist(text1)
# apply simple good turing smoothing method
sgt = SimpleGoodTuringProbDist(fdistB)
# most common 25 unigrams
mostCommon = fdistU.most_common(25)

# initialize dataframe for the excel sheet
column = []
for k, v in mostCommon:
    column.append(k)

# data frame
df = pd.DataFrame(index=column, columns=column)

# fill data frame with probability data
# (the body below is reconstructed from context: one cell per bigram of
# common unigrams)
for k, v in mostCommon:
    for k2, v2 in mostCommon:
        df.loc[k, k2] = sgt.prob((k, k2))
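# The Workbook import suggests the table ends up in Excel; a hedged sketch
# using pandas' own writer, which uses openpyxl for .xlsx files (the output
# file name is an assumption).
df.to_excel("bigram_probabilities.xlsx")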
def _estimator(fdist, bins):
    return SimpleGoodTuringProbDist(fdist)
from __future__ import division
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # train corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # 12 POS tags

sentslen = len(conllreader.tagged_sents())  # number of sentences
tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())  # frequency of each tag
firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequencies of sentence-initial tags

# initial-state distributions under different smoothers
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# tag-transition distributions
TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words) - 1):
    TagPair.append((words[i][1], words[i + 1][1]))
TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

# emission distributions over (word, tag) pairs
TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
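# Hedged comparison sketch: how the initial-tag smoothers above differ for
# one tag ('NOUN' is assumed to occur sentence-initially in de-train.tt).
for name, dist in [("MLE", A0jMLE), ("Laplace", A0jLap), ("Good-Turing", A0jGT)]:
    print(name, dist.prob('NOUN'))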
def simple_good_turing_estimator(fdist, bins):
    return SimpleGoodTuringProbDist(fdist, bins=bins)
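# Why bins is threaded through: it sets the number of possible event types,
# which fixes how the unseen probability mass is split per type (the toy
# counts below are an assumption).
from nltk.probability import FreqDist, SimpleGoodTuringProbDist

fd = FreqDist({"a": 5, "b": 2, "r": 2, "c": 1, "d": 1})
narrow = simple_good_turing_estimator(fd, bins=6)    # one unseen type
wide = simple_good_turing_estimator(fd, bins=1000)   # many unseen types
print(narrow.prob("z"), wide.prob("z"))              # per-type unseen prob shrinks as bins grows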
# The snippet starts mid-loop; the header below is a plausible reconstruction.
for i in range(len(test_file)):
    current_line = test_file[i]
    word_tag = current_line.split('\t\t')
    words.add(word_tag[0])
    tags.add(word_tag[1])
    current_sentence.append((word_tag[0], word_tag[1]))
    if word_tag[0] == '.':
        test.append(current_sentence)
        current_sentence = []

tags = list(tags)
words = list(words)
trainer = hmm.HiddenMarkovModelTrainer(tags, words)
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: MLEProbDist(fd))
tagger = trainer.train_supervised(
    train, estimator=lambda fd, bins: SimpleGoodTuringProbDist(fd, bins))
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: WittenBellProbDist(fd, bins))
# tagger = trainer.train_supervised(train, estimator=lambda fd, bins: KneserNeyProbDist(fd, bins))
print("here")

predicted = []
real = []
for i in range(0, len(test) - 1):
    current = list(zip(*test[i]))
    tagged = tagger.tag(list(current[0]))
    current_tags = list(list(zip(*tagged))[1])
    predicted += current_tags
    real += list(current[1])

print(tags)
confusion = confusion_matrix(predicted, real, labels=tags)
import warnings
from collections import Counter

import numpy as np
from nltk.probability import SimpleGoodTuringProbDist

# DEBUG and STOP are module-level defaults defined elsewhere in the source


class Simulator(object):

    def __init__(self, tokens, frequencies, debug=DEBUG, stop=STOP):
        self.freqdist = frequencies
        self.tokens = tokens | set(self.freqdist)
        self.DEBUG = debug
        self.STOP = stop

        if len(self.freqdist) == 0:
            raise ValueError("No frequencies given!")

        while not self.freqdist.hapaxes():
            warnings.warn("no hapaxes present -- shifting distribution down")
            min_freq = min(self.freqdist.values()) - 1
            for k in self.freqdist:
                self.freqdist[k] -= min_freq

        if min(self.freqdist.values()) != 0:
            warnings.warn("no unseen present -- adding dummy category")
            self.tokens.add("<dummy>")

        self.sgt = SimpleGoodTuringProbDist(self.freqdist, bins=len(self.tokens))
        self._get_probabilities()

    def _get_probabilities(self):
        self.probabilities = {}
        for t in self.tokens:
            self.probabilities[t] = self.sgt.prob(t)
        return self.probabilities

    def dump(self):  # pragma: no cover
        for t in self.tokens:
            print("\t".join([
                "%30s" % t,
                '%d' % self.freqdist[t],
                '%0.4f' % self.sgt.prob(t)
            ]))
        print("HAPAXES: %r" % self.freqdist.hapaxes())
        # freq of freqs:
        print("FreqDist: %r" % sorted(self.freqdist.values(), reverse=True))
        print("Slope: %r" % self.sgt._slope)
        print("Switch at: %r" % self.sgt._switch_at)

    def simulate(self, n=1000):
        keys = [k for k in self.probabilities]
        probs = [self.probabilities[k] for k in keys]

        # set up
        itercount = 0  # stopping safety valve
        transcript = []
        complete = False  # have we seen everything?
        seen = Counter({k: 0 for k in self.tokens})  # tokens we've seen

        while not complete:
            for char in np.random.choice(keys, n, replace=True, p=probs)[0:n]:
                transcript.append(char)
                seen[char] += 1
                complete = all(v > 0 for v in seen.values())
                # if self.DEBUG:
                #     print(itercount, len(transcript))
                if complete:
                    return len(transcript)
            if itercount >= self.STOP:
                raise StopIteration("Abort Abort!")
            itercount += 1
        raise StopIteration("Failed")
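# Hedged usage sketch for the class above: draw samples until every token
# (including unseen ones) has appeared. The token set and counts are made up,
# and debug/stop are supplied inline in place of the module defaults.
from nltk.probability import FreqDist

fd = FreqDist({"a": 5, "b": 2, "r": 2, "c": 1, "d": 1})
sim = Simulator(tokens={"a", "b", "c", "d", "r", "z"}, frequencies=fd,
                debug=False, stop=10000)
sim.dump()                  # per-token counts and smoothed probabilities
print(sim.simulate(n=100))  # length of transcript once everything was seen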