def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and a string separated by a space
    """
    print('building language models...')
    lines = open(in_file, "r")
    observedFourGrams = set()
    LM = {}
    for line in lines:
        # This loop generates the count language model (no add-one smoothing
        # yet), with punctuation removed.
        word_list = line.split()
        language = word_list[0]
        word_string = ' '.join(word_list[1:]).lower()
        fourGrams = ngrams(list(word_string), 4)
        if language not in LM:
            LM[language] = FreqDist()
        for gram in fourGrams:
            strGram = ''.join(gram)
            LM[language][strGram] += 1
            observedFourGrams.add(strGram)
    for language in LM:
        # Convert each count model to a probabilistic model with add-one
        # (Laplace) smoothing over all observed 4-grams.
        LM[language] = LaplaceProbDist(LM[language], len(observedFourGrams))
    return LM
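
# Usage sketch (hypothetical file name and sample string, not from the original
# source): build the character 4-gram models, then score a line by summing the
# smoothed log-probabilities of its 4-grams under each language model.
from math import log
from nltk.util import ngrams

LM = build_LM('input.train.txt')          # hypothetical training file
line = 'this is just an example sentence'  # arbitrary sample text
for language, dist in LM.items():
    score = sum(log(dist.prob(''.join(g)))
                for g in ngrams(list(line.lower()), 4))
    print(language, score)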
def _load_corpus_counts(cls):
    input_file = os.path.join(settings.DATA_DIR, 'corpus',
                              'jp_char_corpus_counts.gz')
    freq_dist = FreqDist()
    with open(input_file, 'r') as istream:
        istream = gzip.GzipFile(fileobj=istream)
        istream = codecs.getreader('utf8')(istream)
        for line in istream:
            kanji, count = line.split()
            # FreqDist.inc() is the pre-NLTK-3 API; in NLTK 3 and later this
            # would be freq_dist[kanji] += int(count).
            freq_dist.inc(kanji, count=int(count))
    return LaplaceProbDist(freq_dist)
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])
        for sent in sentences:
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev].inc(word)
                self.unigrams.inc(word)
        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        p = 0
        # bigrams() yields (previous, current) pairs; p() expects the current
        # word first, then the previous word.
        for prev, word in bigrams(s):
            p = p + log(self.p(word, prev), 2)
        return p

    def perplexity(self, sents):
        l = 0
        N = 0
        for line in sents:
            l += self.logprob(line)
            N += len(line)
        perplexity = pow(2, -l / N)
        return perplexity
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])
        for sent in sentences:
            # Pad the sentence with a leading None to mark the start of the
            # sentence and a trailing None to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev].inc(word)
                self.unigrams.inc(word)
        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        s = [None] + s + [None]
        logprob = 0.0
        for prev, word in bigrams(s):
            logprob += log(self.p(word, prev), 2)
        return logprob

    def perplexity(self, sents):
        logprob = 0.0
        words = 0
        for s in sents:
            words += len(s)
            logprob += self.logprob(s)
        return pow(2, -logprob / words)
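
# Usage sketch (not from the original source): categories()[1:] above skips the
# first Brown category, which is 'adventure' in the standard NLTK distribution,
# so those sentences can serve as held-out data for a perplexity estimate.
lm = LM()
held_out = nltk.corpus.brown.sents(categories='adventure')
print(lm.perplexity(held_out))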
def _load_corpus_counts(cls):
    input_file = os.path.join(settings.DATA_DIR, 'jp_char_corpus_counts.gz')
    freq_dist = FreqDist()
    with gzip.open(input_file, 'rb') as istream:
        for line in istream:
            kanji, count = line.split()
            # Decode the bytes read from the gzip stream
            kanji = kanji.decode()
            count = count.decode()
            freq_dist[kanji] += int(count)
    return LaplaceProbDist(freq_dist)
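
# Usage sketch: the enclosing class is not shown in the snippet, so CorpusFreqs
# below is a hypothetical placeholder for it, and the gzip file is assumed to
# hold "<kanji> <count>" lines. The returned LaplaceProbDist gives smoothed
# per-character probabilities, so characters absent from the count file still
# receive a small non-zero probability.
char_dist = CorpusFreqs._load_corpus_counts()
print(char_dist.prob('日'))   # a common character
print(char_dist.prob('龠'))   # unseen characters still get non-zero mass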
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])
        for sent in sentences:
            # Pad the sentence with a leading None to mark the start of the
            # sentence and a trailing None to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev][word] += 1
                self.unigrams[word] += 1
        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        prob = 0
        if isinstance(s, str):  # accept either a token list or a raw string
            s = s.split()
        for prev, w in bigrams(s):
            prob += log(self.p(w, prev), 2)
        return prob

    def perplexity(self, sents):
        l = 0
        N = 0
        for s in sents:
            l += self.logprob(s)
            N += len(s)
        return pow(2, -l / N)
def __init__(self, vecs_path, data_path, min_count):
    '''
    Initialize class to track aligned geographic vector spaces

    Arguments:
    * vecs_path - Path to directory containing word2vec-style text files
    * data_path - Path to directory of text files of whitespace-separated
      tokens the vectors are derived from, with matching geography file names
    * min_count - The probability of a word with this frequency in the
      geography with the least training data becomes a lower bound on a word's
      maximum p(word|geo) for inclusion. MUST be at least the word2vec
      minimum frequency
    '''
    self.vector_spaces = []
    self.name2id = dict()
    self.id2name = []
    freq_dists = []
    for f_name in os.listdir(vecs_path):
        self.name2id[f_name] = len(self.id2name)
        self.id2name.append(f_name)
        # Add vector space, frequency distribution for geography
        self.vector_spaces.append(
            KeyedVectors.load_word2vec_format(os.path.join(vecs_path, f_name)))
        print("Loaded " + f_name + " vectors")
        # freq_dists.append(FreqDist(chain(*[l.split() for l in open(os.path.join(data_path, f_name))])))
        fd = FreqDist()
        for l in open(os.path.join(data_path, f_name)):
            for w in l.split():
                fd[w] += 1
        print("Built " + f_name + " frequency distribution")
        freq_dists.append(fd)
    self.num_geos = len(self.id2name)
    # p(word|geography) distributions for each geography, with Laplace smoothing
    prob_dists = [LaplaceProbDist(fd) for fd in freq_dists]
    print("Built probability distributions")
    # Find the probability of a word with frequency min_count in the
    # geographic region with the least data
    min_prob = min(prob_dists, key=lambda pd: pd.freqdist().N()).prob(None) * (min_count + 1)
    # Build the vocab from items whose probabilities in their most
    # overrepresented geography exceed the threshold. This excludes
    # low-probability items without being biased against geographies with
    # less associated data.
    self.vocab = list({w for pd in prob_dists for w in pd.samples()
                       if pd.prob(w) >= min_prob})
    print("Loaded vocabulary")
    self.probs = BaseKeyedVectors(self.num_geos)
    self.probs.add(self.vocab,
                   [np.array([pd.prob(w) for pd in prob_dists]) for w in self.vocab])
    print("Built probability vectors")
    # pmi = log(p(word|geo)/p(word)) = log(p(word|geo)) - log(p(word))
    # Let p(word) = avg p(word|geo) over all geographies; this weights each
    # geographic vector space equally regardless of token count
    pmi = np.log(self.probs.vectors) - np.log(self.probs.vectors.mean(axis=1).reshape(-1, 1))
    self.pmi = KeyedVectors(self.num_geos)
    self.pmi.add(self.vocab, pmi)
    print("Built PMI vectors")
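
# Minimal sketch (not from the original project) of how the min_prob threshold
# above behaves: LaplaceProbDist assigns a small non-zero probability to unseen
# samples, which is what prob(None) returns, and min_prob scales that floor by
# (min_count + 1). The toy counts and min_count value are illustrative only.
from nltk.probability import FreqDist, LaplaceProbDist

toy = LaplaceProbDist(FreqDist("aabbbc"))   # character counts: a=2, b=3, c=1
print(toy.prob("b"))                        # smoothed probability of a seen sample
print(toy.prob(None))                       # smoothed probability of an unseen sample
min_count = 5
print(toy.prob(None) * (min_count + 1))     # threshold analogous to min_prob above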
from __future__ import division
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # training corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET',
          'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # number of sentences
tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())  # frequency of each tag
firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequency of sentence-initial tags

# Initial-tag probabilities: relative frequency plus smoothed variants
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Tag-bigram counts for transition probabilities
TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words) - 1):
    TagPair.append((words[i][1], words[i + 1][1]))
TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

# (word, tag) counts for emission probabilities
TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
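
# Usage sketch (assumes the de-train.tt file above; the tag pair and the word
# 'Haus' are illustrative only): querying the smoothed distributions built above.
print(A0jLap.prob('NOUN'))            # smoothed probability that a sentence starts with NOUN
print(AijLap.prob(('DET', 'NOUN')))   # smoothed probability of the tag bigram (DET, NOUN)
print(BiwGT.prob(('Haus', 'NOUN')))   # Good-Turing probability of the (word, tag) pair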
fr = FreqDist(tokens)
print('X | P(X)')
print('___________________')
# Maximum-likelihood unigram probabilities for in-vocabulary words
for word, count in fr.items():
    if word in vocab:
        print(word, ' | ', round(count / len(tokens), 2))
# Probability mass of all observed tokens that are not in the vocabulary
UNK = sum(count for word, count in fr.items() if word not in vocab) / len(tokens)
print('UNK | ', round(UNK, 2))

print('== UNIGRAMS AFTER LAPLACE SMOOTHING ==')
lpt = LaplaceProbDist(fr)
print('X | P(X)')
print('___________________')
for d in vocab:
    print(d, ' | ', round(lpt.prob(d), 2))
# Smoothed probability mass of the observed out-of-vocabulary tokens
UNK = sum(lpt.prob(word) for word in lpt.freqdist() if word not in vocab)
print('UNK | ', round(UNK, 2))

print('=========== BIGRAMS ===========')
file = open('sampledata.txt', 'r')
filetext = file.read()
filetext = filetext.replace('</s>', '')
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])
        for sent in sentences:
            # Pad the sentence with a leading None to mark the start of the
            # sentence and a trailing None to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev][word] += 1
                self.unigrams[word] += 1
        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

        ############################# modified lm ####################################
        # regular expression:
        self.patterns = [
            (r'.*ing$', 'VBG'),               # gerunds
            (r'.*ed$', 'VBD'),                # simple past
            (r'.*es$', 'VBZ'),                # 3rd singular present
            (r'.*ould$', 'MD'),               # modals
            (r'.*\'s$', 'NN$'),               # possessive nouns
            (r'.*s$', 'NNS'),                 # plural nouns
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')                     # nouns (default)
        ]
        # regular expression (modified):
        self.patternsModified = [
            (r'(.*able|.*ish|.*ible)$', 'JJ'),                                # adjectives             # 1
            (r'(The|the|A|a|An|an)$', 'AT'),                                  # articles               # 2
            (r'(a|an|my|some|the)$', 'DT'),                                   # determiners            # 3
            (r'(our|its|his|their|my|your|her|out|thy|mine|thine)$', 'PP$'),  # possessive determiners # 4
            (r'(.*ily|.*ly)$', 'ADV'),                                        # adverbs                # 5
            (r'(at|in|of|over|with)$', 'PP'),                                 # prepositions           # 6
            (r'(and|because|but|if|or)$', 'CNJ'),                             # conjunctions           # 7
            (r'([\.?!;:]+)$', '.'),                                           # sentence terminators   # 8
            (r'(\,)$', ','),                                                  # comma                  # 9
            (r'(\-)$', '-'),                                                  # dash                   # 10
            (r'.*ing$', 'VBG'),                                               # gerunds
            (r'.*ed$', 'VBD'),                                                # simple past
            (r'.*es$', 'VBZ'),                                                # 3rd singular present
            (r'.*ould$', 'MD'),                                               # modals
            (r'.*\'s$', 'NN$'),                                               # possessive nouns
            (r'.*s$', 'NNS'),                                                 # plural nouns
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),                                  # cardinal numbers
            (r'.*', 'NN')                                                     # nouns (default)
        ]

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        P = 0.0
        s = [None] + s + [None]
        for prevWord, word in bigrams(s):
            P += log(self.p(word, prevWord), 2)
        return P

    def perplexity(self, sents):
        N = 0.0
        l = 0.0
        for sent in sents:
            N += len(sent)
            l += self.logprob(sent)
        return 2**(-l / N)

    def zipfity(self, lst):
        # For the ten most common words, return the word, its count, and the
        # count divided by its frequency rank.
        frekvensOrd = FreqDist()
        frekvens = []
        for setning in lst:
            frekvensOrd += FreqDist(word.lower() for word in setning)
        r = 1
        for word, antall in frekvensOrd.most_common(10):
            frekvens.append([word, antall, antall / r])
            r += 1
        return frekvens

    def regularTagger(self, lst, regex='patterns'):
        regexp_tagger = self.setRegexPatterns(regex)
        tagger = []
        for sentence in lst:
            tagger.append(regexp_tagger.tag(sentence))
        return tagger

    def analyseRegularTagger(self, name, regex='patterns'):
        regexp_tagger = self.setRegexPatterns(regex)
        return regexp_tagger.evaluate(
            nltk.corpus.brown.tagged_sents(categories=name))

    def setRegexPatterns(self, regex):
        if regex == "patterns":
            return RegexpTagger(self.patterns)
        elif regex == "modified":
            return RegexpTagger(self.patternsModified)
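
# Usage sketch (not part of the original class): exercise the helper methods on
# a Brown category. 'news' is just an example category; analyseRegularTagger()
# returns the RegexpTagger's accuracy against the gold Brown tags.
lm = LM()
print(lm.zipfity(nltk.corpus.brown.sents(categories='news')))
print(lm.analyseRegularTagger('news', regex='patterns'))
print(lm.analyseRegularTagger('news', regex='modified'))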
plain_train = get_text(plain_path)

# format the training data
train_data = format_data(cipher_train, plain_train)

# test data
testc_path = cipher_folder + '/test_cipher.txt'
testp_path = cipher_folder + '/test_plain.txt'
testc = get_text(testc_path)
testp = get_text(testp_path)

# format the test data
test_data = format_data(testc, testp)

trainer = hmm.HiddenMarkovModelTrainer()

# Laplace estimator
my_estimator = lambda fdist, bins: LaplaceProbDist(fdist, bins)

if args.laplace_smoothing:
    if args.supplement:
        tagger = train_supervised2(trainer, train_data, extra_text(),
                                   estimator=my_estimator)
    else:
        tagger = trainer.train_supervised(train_data, estimator=my_estimator)
else:
    if args.supplement:
        tagger = train_supervised2(trainer, train_data, extra_text())
    else:
        tagger = trainer.train_supervised(train_data)
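
# Decoding sketch (not from the original script): the trained HMM tagger maps
# each cipher symbol back to a plaintext symbol. Assumes test_data holds
# (cipher_char, plain_char) sequences as produced by format_data above.
correct = total = 0
for sent in test_data:
    cipher_chars = [c for c, _ in sent]
    gold_chars = [p for _, p in sent]
    predicted = [p for _, p in tagger.tag(cipher_chars)]
    correct += sum(1 for g, p in zip(gold_chars, predicted) if g == p)
    total += len(gold_chars)
print('decipherment accuracy:', correct / total)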
def laplace_estimator(fdist, bins):
    return LaplaceProbDist(fdist, bins=bins)
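
# Usage sketch (illustrative, not from the original source): pass the estimator
# to NLTK's supervised HMM trainer so transition and emission counts are
# Laplace-smoothed. A small slice of the Penn Treebank sample shipped with NLTK
# is used here purely as example data.
import nltk
from nltk.tag import hmm

trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(nltk.corpus.treebank.tagged_sents()[:200],
                                  estimator=laplace_estimator)
print(tagger.tag(['The', 'dog', 'barks']))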