def n_gramas_texto(self, texto, N):
    '''
    Searching for n-grams.

    The text is decoded from UTF-8 when it arrives as a ``str``, passed
    through ``self.filtro_caracteres_especiales`` and then through
    ``self.filtro_palabras_cerradas``; the n-grams are taken over the
    sequence returned by the latter.

    In:
        (texto, text) text string
        (N, int) size of the n-gram: 2 is a bigram, 3 is a trigram and so on
    Out:
        (listaNgrama, list) n-gram list, each n-gram joined with single
        spaces.  An empty list is returned when ``N`` is not an int or
        when the text cannot be processed.
    '''
    if not isinstance(N, int):
        return []
    try:
        if isinstance(texto, str):
            texto = unicode(texto, "utf-8", "xmlcharrefreplace")
        texto = self.filtro_caracteres_especiales(texto)
        lista = self.filtro_palabras_cerradas(texto)
        listaNgrama = []
        for Ngramas in ingrams(lista, N, pad_right=True):
            # pad_right completes the trailing n-grams with None.  Test the
            # padding by identity so a genuine token spelled "None" is kept.
            if Ngramas[N - 1] is not None:
                listaNgrama.append(" ".join(Ngramas))
        return listaNgrama
    except Exception:
        # Best-effort contract: any processing failure yields an empty list.
        # Unlike a bare ``except``, KeyboardInterrupt and SystemExit still
        # propagate to the caller.
        return []
def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
    """
    Build an ngram language model over ``n`` consecutive words of the
    training text.  The estimator smooths the probabilities derived from
    the text and may allow generation of ngrams not seen during training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        1.682...

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text
    :type train: list of string
    :param estimator: a function for generating a probability distribution
    :type estimator: a function that takes a ConditionalFreqDist and
          returns a ConditionalProbDist
    :param estimator_args: Extra arguments for estimator, forwarded to
          ConditionalProbDist.  Note: for backward-compatibility, if no
          extra arguments are specified, the number of conditions in the
          underlying ConditionalFreqDist is passed instead.
    :type estimator_args: (any)
    :param estimator_kw_args: Extra keyword arguments for estimator.
    :type estimator_kw_args: (any)
    """
    self._n = n
    if estimator is None:
        estimator = _estimator

    self._ngrams = set()
    # (n - 1) empty strings give the first real word a full context.
    self._prefix = ("",) * (n - 1)

    counts = ConditionalFreqDist()
    for gram in ingrams(chain(self._prefix, train), n):
        self._ngrams.add(gram)
        counts[tuple(gram[:-1])].inc(gram[-1])

    if estimator_args or estimator_kw_args:
        self._model = ConditionalProbDist(counts, estimator,
                                          *estimator_args, **estimator_kw_args)
    else:
        self._model = ConditionalProbDist(counts, estimator, len(counts))

    # Lower-order models are built recursively for back-off.
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator,
                                   *estimator_args, **estimator_kw_args)
def eval_word(self, word):
    '''
    Score a proposed word with the ngram model.

    When ``self._hypothetical_phonotactics`` is set, the word is first added
    to a private deep copy of the model and the score is computed from that
    copy, leaving ``self._ngram_model`` untouched.

    @param word: the proposed word, as a sequence of symbols.
    @return: probability that proposed word is a word.
    '''
    if self._hypothetical_phonotactics:
        # Make a deep copy of the ngram model so that we can update it
        # without modifying the original.
        word_ngram_model = copy.deepcopy(self._ngram_model)
        word_ngram_model.update([word])
    else:
        word_ngram_model = self._ngram_model

    # NOTE: to duplicate the functionality of the OCaml code this should
    # probably use a ProbDist that inherits from Lidstone, otherwise the
    # denominator adjustments and other details are not applied.
    if self._n > 1:
        adjustment = Fraction(1)
        pieces = [self._word_delimiter, word, self._word_delimiter]
    else:
        # Compensate for the lack of an empty word by renormalising with the
        # relative frequency of the word delimiter.
        adjustment = Fraction(1) / (
            Fraction(1) - word_ngram_model.prob(self._word_delimiter,
                                                word_ngram_model._padding))
        pieces = [word, self._word_delimiter]

    # Every probability is read from word_ngram_model.  The original queried
    # self._ngram_model here, which silently discarded the hypothetical
    # update performed above.
    raw_score = self._score_combiner(
        [word_ngram_model.prob(ngram[-1], tuple(ngram[:-1]))
         for ngram in ingrams(chain(*pieces), self._n)])
    return adjustment * raw_score
def N_gramas_lista(self, lista, N):
    """Build word n-grams from a list of tokens.

    Each token is decoded with the ``codificacion`` encoding (a name defined
    outside this function), stripped of the characters in
    ``string.punctuation``, lower-cased and passed through
    ``self.elimina_tildes``.  Tokens that end up empty or blank are dropped.

    In:
        (lista, list) list whose items are all exactly of type ``str``
        (N, int) size of the n-gram
    Out:
        (listaNgrama, list) n-grams joined with single spaces; an empty list
        when the arguments have the wrong type or processing fails.
    """
    if not (isinstance(lista, list) and isinstance(N, int)):
        return []
    if not all(type(x) is str for x in lista):
        return []
    try:
        listaNgrama = []
        remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
        lista = [unicode(s, codificacion) for s in lista]
        lista = [s.translate(remove_punctuation_map).lower() for s in lista]
        lista = [str(self.elimina_tildes(s)) for s in lista]
        lista = filter(None, lista)
        lista = filter(str.strip, lista)
        for Ngramas in ingrams(lista, N, pad_right=True):
            # pad_right completes the trailing n-grams with None.  Test the
            # padding by identity so a genuine token spelled "None" is kept.
            if Ngramas[N - 1] is not None:
                listaNgrama.append(" ".join(Ngramas))
        return listaNgrama
    except Exception:
        # Best-effort contract: any processing failure yields an empty list,
        # but KeyboardInterrupt and SystemExit are no longer swallowed.
        return []
def __init__(self, n, train, estimator=None):
    """
    Build an ngram language model over ``n`` consecutive words of the
    training text.  The estimator smooths the probabilities derived from
    the text and may allow generation of ngrams not seen during training.

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text
    :type train: list(str)
    :param estimator: a function for generating a probability distribution;
        ``_estimator`` is used when omitted
    :type estimator: function(ConditionalFreqDist) -> ConditionalProbDist
    """
    self._n = n
    if estimator is None:
        estimator = _estimator

    self._ngrams = set()
    # (n - 1) empty strings give the first real word a full context.
    self._prefix = ('',) * (n - 1)

    counts = ConditionalFreqDist()
    for gram in ingrams(chain(self._prefix, train), n):
        self._ngrams.add(gram)
        counts[tuple(gram[:-1])].inc(gram[-1])

    self._model = ConditionalProbDist(counts, estimator, len(counts))

    # Lower-order models are built recursively for back-off.
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator)
def from_words(cls, words):
    """Construct a QuadgramCollocationFinder for all quadgrams in the
    given sequence.

    Six frequency distributions are filled from right-padded 4-word
    windows: single words, contiguous bigrams, gapped pairs, contiguous
    trigrams, gapped triples and full quadgrams.  A window stops
    contributing at its first ``None`` padding slot.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    wild2fd = FreqDist()
    tfd = FreqDist()
    wild3fd = FreqDist()
    qfd = FreqDist()

    for first, second, third, fourth in ingrams(words, 4, pad_right=True):
        wfd.inc(first)
        if second is not None:
            bfd.inc((first, second))
            if third is not None:
                wild2fd.inc((first, third))
                tfd.inc((first, second, third))
                if fourth is not None:
                    wild2fd.inc((first, fourth))
                    wild3fd.inc((first, second, fourth))
                    wild3fd.inc((first, third, fourth))
                    qfd.inc((first, second, third, fourth))

    return cls(wfd, bfd, wild2fd, tfd, wild3fd, qfd)
def big_test(version="3.0", max_length=3):
    """Write a WordNet ontology file with Brown-corpus term counts.

    Counts how often each noun entry of WordNet occurs in the Brown corpus
    as an n-gram of 1 to ``max_length`` words (joined with underscores and
    lower-cased), then writes every synset returned by
    ``orderedTraversal(wn)`` through an ``OntologyWriter``.  Each lemma is
    written with its count plus one.

    @param version The version of WordNet to load; it also selects the
        output file name ("wn/wordnet.wn" for "3.0", otherwise
        "wn/wordnet_<version>.wn")
    @param max_length The maximum length of n-grams used for the counts
    """
    from topicmod.util.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    term_counts = defaultdict(int)
    # n-gram orders run from 1 to max_length inclusive.  The original
    # iterated 0..max_length-1, which asked for a meaningless order 0 and
    # never counted n-grams of the maximum length.
    for ngram_length in range(1, max_length + 1):
        for w in ingrams(brown.words(), ngram_length):
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    filename = "wn/wordnet.wn"
    if version != "3.0":
        filename = "wn/wordnet_%s.wn" % version
    o = OntologyWriter(filename)
    for ii in orderedTraversal(wn):
        o.AddSynset(ii.offset,
                    ii.name,
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0, x.name.lower(), term_counts[x.name] + 1)
                     for x in ii.lemmas])
    o.Finalize()
def __init__(self, n, train, pad_left=True, pad_right=False, estimator=None, *estimator_args, **estimator_kwargs):
    """
    Build an ngram model that also stores one back-off weight per context.

    After the parent initialiser runs, the ngram counts, the probability
    model and the lower-order model are rebuilt here so that the
    conditional frequency distribution is kept in ``self._cfd``.  For every
    observed context a weight is stored in ``self._backoff_alphas``:
    (1 - probability mass of the words observed after the context) divided
    by (1 - mass the lower-order model gives to those same words).

    :param n: the order of the language model (ngram size)
    :param train: the training text, a list of strings or a list of
        lists of strings
    :param pad_left: whether to pad the left of each sentence with (n-1)
        empty strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with (n-1)
        empty strings
    :type pad_right: bool
    :param estimator: a function for generating a probability distribution;
        ``_estimator`` is used when omitted
    :param estimator_args: extra positional arguments for the estimator; if
        neither these nor keyword arguments are given, the number of
        conditions is passed instead
    :param estimator_kwargs: extra keyword arguments for the estimator
    """
    super(MyNgramModel, self).__init__(n, train, pad_left, pad_right, estimator,
                                       *estimator_args, **estimator_kwargs)

    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))

    self._n = n
    self._lpad = ('',) * (n - 1) if pad_left else ()
    self._rpad = ('',) * (n - 1) if pad_right else ()

    if estimator is None:
        estimator = _estimator

    self._cfd = ConditionalFreqDist()
    self._ngrams = set()

    # If given a list of strings instead of a list of lists, create
    # enclosing list
    if (train is not None) and isinstance(train[0], basestring):
        train = [train]

    for sent in train:
        for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            self._cfd[context].inc(token)

    if not estimator_args and not estimator_kwargs:
        self._model = ConditionalProbDist(self._cfd, estimator, len(self._cfd))
    else:
        self._model = ConditionalProbDist(self._cfd, estimator,
                                          *estimator_args, **estimator_kwargs)

    # recursively construct the lower-order models
    self._backoff = None
    if n > 1:
        self._backoff = MyNgramModel(n - 1, train, pad_left, pad_right, estimator,
                                     *estimator_args, **estimator_kwargs)

    if self._backoff is not None:
        self._backoff_alphas = dict()

        # For each condition (or context)
        for ctxt in self._cfd.conditions():
            pd = self._model[ctxt]  # prob dist for this context
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0
            total_observed_pr = 0
            # this is the subset of words that we OBSERVED
            for word in self._cfd[ctxt].keys():
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                total_observed_pr += pd.prob(word)

            assert total_observed_pr <= 1 and total_observed_pr > 0
            assert backoff_total_pr <= 1 and backoff_total_pr > 0

            leftover_backoff = 1.0 - backoff_total_pr
            if leftover_backoff == 0:
                # The observed words already hold all of the lower-order
                # mass, so every unobserved word has lower-order probability
                # zero and the weight is never effective.  Store 0.0 instead
                # of dividing by zero (the assert above allows this value).
                alpha_ctxt = 0.0
            else:
                alpha_ctxt = (1.0 - total_observed_pr) / leftover_backoff

            self._backoff_alphas[ctxt] = alpha_ctxt
def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during
    training.  Every utterance is padded on both sides with (n - 1) empty
    strings.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text; may be empty
    @type train: C{list} of C{string} (or C{list} of C{string} C{list}s)
    @param estimator: a function for generating a probability distribution
    @type estimator: a function that takes a C{ConditionalFreqDist} and
          returns a C{ConditionalProbDist}
    @param estimator_args: Extra arguments for C{estimator}.
          These arguments are usually used to specify extra
          properties for the probability distributions of individual
          conditions, such as the number of bins they contain.
          Note: For backward-compatibility, if no arguments are specified,
          the number of bins in the underlying C{ConditionalFreqDist} are
          passed to the estimator as an argument.
    @type estimator_args: (any)
    @param estimator_kw_args: Extra keyword arguments for C{estimator}.
    @type estimator_kw_args: (any)
    """
    self._n = n

    if estimator is None:
        estimator = _estimator

    cfd = ConditionalFreqDist()
    self._ngrams = set()
    self._padding = ('',) * (n - 1)

    # If given a list of strings instead of a list of lists, create
    # enclosing list.  Probe the first item defensively so that empty
    # training data produces an empty model instead of an IndexError.
    try:
        first = train[0]
    except IndexError:
        first = None
    if isinstance(first, basestring):
        train = [train]

    for utterance in train:
        for ngram in ingrams(chain(self._padding, utterance, self._padding), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

    if (not estimator_args) and (not estimator_kw_args):
        self._model = ConditionalProbDist(cfd, estimator, len(cfd))
    else:
        self._model = ConditionalProbDist(cfd, estimator,
                                          *estimator_args, **estimator_kw_args)

    # recursively construct the lower-order models.  The attribute is always
    # defined so that code testing ``self._backoff`` also works on unigram
    # models.
    self._backoff = None
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator,
                                   *estimator_args, **estimator_kw_args)
def quantify_variant(analysis, variant):
    """Count the part-of-speech n-grams of order ``variant + 1``.

    The tag is taken from the second element of each pair returned by
    ``analysis.pos_tags()``.  Every n-gram over that tag sequence is counted
    with ``sparse_dict_increment`` and the resulting keys are mapped through
    ``output_filter_ngram``.
    """
    order = variant + 1
    tags = [tag for (_, tag) in analysis.pos_tags()]

    histogram = {}
    for gram in ingrams(tags, order):
        sparse_dict_increment(histogram, gram)

    return dict((output_filter_ngram(key), count)
                for (key, count) in histogram.items())
def classify(self, sentence, tokenizer_lang, ngram_length=3):
    """Classify every tokenised piece of a sentence and tally the labels.

    The sentence is split with ``self.tokenizers[tokenizer_lang]``.  Each
    piece becomes a dictionary of n-gram counts, the dictionaries are
    wrapped in a ``SparseDataSet`` and each one is classified with
    ``self._classifier``.  Returns a ``FreqDist`` of the predicted labels.
    """
    features = []
    for piece in self.tokenizers[tokenizer_lang].tokenize(sentence):
        counts = {}
        for gram in ingrams(piece, ngram_length):
            counts[gram] = counts.get(gram, 0) + 1
        features.append(counts)

    data = SparseDataSet(features)

    predicted = []
    for index in xrange(len(features)):
        predicted.append(self._labels[self._classifier.classify(data, index)[0]])

    f = FreqDist()
    for label in predicted:
        f.inc(label)
    return f
def update(self, samples, increase_amount=1):
    '''
    Update the underlying frequency distributions given the current list of
    samples.

    Every sample is padded on both sides with ``self._padding``, its ngrams
    are added to ``self._ngrams`` and the (context, token) pairs are passed
    to ``self._model.update``.  Lower-order models are updated recursively.

    @param samples: the utterances to add, each a sequence of tokens.
    @param increase_amount: the amount passed to the model update for each
        (context, token) pair.
    '''
    cond_samples = []
    for utterance in samples:
        for ngram in ingrams(chain(self._padding, utterance, self._padding), self._n):
            self._ngrams.add(ngram)
            cond_samples.append((tuple(ngram[:-1]), ngram[-1]))
    self._model.update(cond_samples, increase_amount)

    # Recursively update lower-order models.  A constructor may leave
    # ``_backoff`` undefined on unigram models, so read it defensively
    # instead of raising AttributeError at the bottom of the recursion.
    backoff = getattr(self, '_backoff', None)
    if backoff:
        backoff.update(samples, increase_amount)
def AddInstance(self, lang, line, ngram_length):
    """Add one labelled training instance.

    ``line`` is split at its first tab: the part before it is the instance
    identifier and the stripped part after it is the sentence.  The n-gram
    counts of the sentence are appended to ``self.features``, the index of
    ``lang`` in ``self.label_names`` to ``self.labels`` and "<lang>-<id>" to
    ``self.id``.  A language not seen before is registered first.

    If ``line`` has no tab, ``find`` returns -1, so the identifier is the
    line without its last character and the sentence is the whole line
    (behaviour kept from the original).
    """
    if lang not in self.label_names:
        self.label_names.append(lang)

    tab = line.find("\t")
    instance_id = line[:tab]
    sentence = line[tab + 1:].strip()

    counts = {}
    # Features come from the sentence only.  The original iterated over the
    # whole line, so the identifier and the tab leaked into the n-grams.
    for gram in ingrams(sentence, ngram_length):
        counts[gram] = counts.get(gram, 0) + 1

    self.features.append(counts)
    self.labels.append(self.label_names.index(lang))
    self.id.append("%s-%s" % (lang, instance_id))
def GetNGrams(self, tokenized_sentences):
    """Collect the n-grams of every tokenized sentence into one flat list.

    Inputs:
        tokenized_sentences = the sentences after tokenizing the file.
    Outputs:
        the n-grams found, in sentence order and not deduplicated.  They are
        produced by ``ingrams`` with ``self.N``, ``self.leftpad`` and
        ``self.rightpad``.
    """
    collected = []
    for sentence_tokens in tokenized_sentences:
        collected += ingrams(sentence_tokens, self.N, self.leftpad, self.rightpad)
    return collected
def prepareBigrams(self, window_size, word):
    """Count windowed bigrams over the stored sentences and build the finder.

    Sentences with fewer than two items are skipped.  For each right-padded
    window whose first word is not ignored, the window is cut just before
    the first word's next occurrence, and the first word is paired with each
    distinct remaining word that is neither padding nor ignored.

    With ``word == ''`` every such pair is counted and the first word is
    counted once per window.  Otherwise only pairs containing ``word`` are
    counted, and the first word is counted only for windows that produced
    at least one pair.  The result is stored as a ``MyBigramCollFinder``.
    """
    def windows():
        # Yield (first word, set of the other words in the cut window).
        for sentence in self.__sentences:
            if len(sentence) <= 1:
                continue
            for window in ingrams(sentence, window_size, pad_right=True):
                head = window[0]
                if head in self.__ignoredColl:
                    continue
                try:
                    window = window[:list(window).index(head, 1)]
                except ValueError:
                    pass
                yield head, set(window[1:])

    wfd = FreqDist()
    bfd = FreqDist()

    if word == '':
        for head, partners in windows():
            wfd.inc(head)
            for other in partners:
                if other is not None and other not in self.__ignoredColl:
                    bfd.inc((head, other))
    else:
        for head, partners in windows():
            matched = False
            for other in partners:
                if other is None or other in self.__ignoredColl:
                    continue
                if head == word or other == word:
                    bfd.inc((head, other))
                    matched = True
            if matched:
                wfd.inc(head)

    self.__bigrams = MyBigramCollFinder(wfd, bfd)
def __init__(self, n, train, estimator, freqtype, padding, backoff, *estimator_args, **estimator_kw_args):
    """
    Build an ngram language model over ``n`` consecutive words of the
    training text, with counts stored using ``freqtype``.  An estimator
    smooths the probabilities derived from the text and may allow generation
    of ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: L{int}
    @param train: the training text; may be empty or None
    @type train: L{list} of L{str} L{list}s
    @param estimator: a function for generating a probability distribution
          (must take FreqDist as first argument, and n as second)
    @type estimator: a function that takes a L{ConditionalFreqDist} and
          returns a L{ConditionalProbDist}
    @param freqtype: the type to use to store the counts in the underlying
          frequency distribution
    @type freqtype: any numeric type
    @param padding: the symbol repeated (n - 1) times on both sides of
          every utterance
    @param backoff: whether or not a lower-order model is built recursively
    @type backoff: L{bool}
    @param estimator_args: Extra arguments for L{estimator}.
    @type estimator_args: (any)
    @param estimator_kw_args: Extra keyword arguments for L{estimator}.
    @type estimator_kw_args: (any)
    """
    self._n = n

    counts = ConditionalFreqDist(counttype=freqtype)
    self._ngrams = set()
    self._padding = (padding,) * (n - 1)

    # Keep the construction parameters on the instance.
    self._estimator = estimator
    self._freqtype = freqtype
    self._estimator_args = estimator_args
    self._estimator_kw_args = estimator_kw_args

    if train:
        for utterance in train:
            padded = chain(self._padding, utterance, self._padding)
            for gram in ingrams(padded, n):
                self._ngrams.add(gram)
                counts[tuple(gram[:-1])].inc(gram[-1])

    self._model = ConditionalProbDist(counts, estimator, self._freqtype, n,
                                      *estimator_args, **estimator_kw_args)

    # Lower-order models are built recursively only when back-off is on.
    if backoff and n > 1:
        self._backoff = PartialCountNgramModel(n - 1, train, estimator, freqtype,
                                               padding, backoff,
                                               *estimator_args, **estimator_kw_args)
    else:
        self._backoff = None
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  By default, bigrams must be contiguous.

    With ``window_size`` greater than 2, the first word of every window is
    paired with each later word of that window.

    :param words: the sequence of words to scan
    :param window_size: number of words per window, at least 2
    :raises ValueError: if ``window_size`` is smaller than 2
    """
    # Validate before allocating anything.  The call form of ``raise`` is
    # valid on both Python 2 and Python 3, unlike ``raise ValueError, "..."``.
    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    wfd = FreqDist()
    bfd = FreqDist()

    for window in ingrams(words, window_size, pad_right=True):
        w1 = window[0]
        wfd.inc(w1)
        for w2 in window[1:]:
            # None is the right-padding symbol, not a real word.
            if w2 is not None:
                bfd.inc((w1, w2))
    return cls(wfd, bfd)
def big_test(hyperparam, treefilename, hyperfilename, vocab, version="3.0", max_length=2):
    """
    Write a WordNet ontology file with Brown-corpus term counts and dump the
    hyperparameter lookup.

    @param hyperparam A function that, given a synset, returns a
    hyperparameter value; its dump() method is called with hyperfilename
    @param treefilename Output file for the ontology; when empty, a default
    name derived from version is used
    @param hyperfilename Output file passed to hyperparam.dump
    @param vocab Passed to the OntologyWriter as its vocab argument
    @param version The version of WordNet we use
    @param max_length The maximum length of n-grams we'll use for
    computing counts
    """
    from python_lib.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    term_counts = defaultdict(int)
    # n-gram orders run from 1 to max_length inclusive, as documented.  The
    # original iterated 0..max_length-1, which asked for a meaningless
    # order 0 and never counted n-grams of the maximum length.
    for ngram_length in range(1, max_length + 1):
        for w in ingrams(brown.words(), ngram_length):
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    print("Done collecting counts")

    if not treefilename:
        treefilename = "wn/wordnet.wn"
        if version != "3.0":
            treefilename = "wn/wordnet_%s.wn" % version

    o = OntologyWriter(treefilename, vocab=vocab, max_leaves=-1)
    for ii in wn.all_synsets('n'):
        o.AddSynset(ii.offset,
                    ii.name,
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0, x.name.lower(), term_counts[x.name] + 1)
                     for x in ii.lemmas],
                    hyperparameter=hyperparam(ii))
    o.Finalize()
    hyperparam.dump(hyperfilename)
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  When window_size > 2, count non-contiguous bigrams, in the
    style of Church and Hanks's (1990) association ratio.

    Raises ValueError when window_size is smaller than 2.
    """
    wfd = FreqDist()
    bfd = FreqDist()

    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    for window in ingrams(words, window_size, pad_right=True):
        head, followers = window[0], window[1:]
        wfd.inc(head)
        for other in followers:
            if other is None:
                # right-padding slot, not a real word
                continue
            bfd.inc((head, other))

    return cls(wfd, bfd, window_size=window_size)
def N_gramas_texto(self, texto, N):
    """Build word n-grams from a text.

    The text is decoded with the ``codificacion`` encoding (a name defined
    outside this function) and tokenised with ``nltk.word_tokenize``.  Each
    token is stripped of the characters in ``string.punctuation``,
    lower-cased and passed through ``self.elimina_tildes``.  Tokens that end
    up empty or blank are dropped.

    In:
        (texto, str) text string
        (N, int) size of the n-gram
    Out:
        (listaNgrama, list) n-grams joined with single spaces; an empty list
        when the arguments have the wrong type or processing fails.
    """
    if not (isinstance(texto, str) and isinstance(N, int)):
        return []
    try:
        remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
        lista = [s.translate(remove_punctuation_map).lower()
                 for s in nltk.word_tokenize(unicode(texto, codificacion))]
        lista = [str(self.elimina_tildes(s)) for s in lista]
        lista = filter(None, lista)
        lista = filter(str.strip, lista)
        listaNgrama = []
        for Ngramas in ingrams(lista, N, pad_right=True):
            # pad_right completes the trailing n-grams with None.  Test the
            # padding by identity so a genuine token spelled "None" is kept.
            if Ngramas[N - 1] is not None:
                listaNgrama.append(" ".join(Ngramas))
        return listaNgrama
    except Exception:
        # Best-effort contract: any processing failure yields an empty list,
        # but KeyboardInterrupt and SystemExit are no longer swallowed.
        return []
def from_words(cls, words, window_size=2):
    """Construct a BigramCollocationFinder for all bigrams in the given
    sequence.  By default, bigrams must be contiguous.

    Each window is cut just before the next occurrence of its first word,
    and the first word is paired once with every distinct remaining word of
    the cut window.

    :param words: the sequence of words to scan
    :param window_size: number of words per window, at least 2
    :raises ValueError: if ``window_size`` is smaller than 2
    """
    # Validate before allocating anything.  The call form of ``raise`` is
    # valid on both Python 2 and Python 3, unlike ``raise ValueError, "..."``.
    if window_size < 2:
        raise ValueError("Specify window_size at least 2")

    wfd = FreqDist()
    bfd = FreqDist()

    for window in ingrams(words, window_size, pad_right=True):
        w1 = window[0]
        try:
            # Stop at the next occurrence of w1.
            window = window[:list(window).index(w1, 1)]
        except ValueError:
            # w1 does not occur again: keep the full window
            pass
        wfd.inc(w1)
        for w2 in set(window[1:]):
            # None is the right-padding symbol, not a real word.
            if w2 is not None:
                bfd.inc((w1, w2))
    return cls(wfd, bfd)
def __init__(self, n, train, estimator=None):
    """
    Build an ngram language model over ``n`` consecutive words of the
    training text.  An estimator smooths the probabilities derived from the
    text and may allow generation of ngrams not seen during training.

    Unigram models use a plain frequency distribution; higher orders use a
    conditional distribution keyed by the (n - 1)-word context.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{string}
    @param estimator: a function for generating a probability distribution;
          called with a frequency distribution and a bin count.  Defaults to
          a maximum-likelihood estimate.
    @type estimator: a function that takes a C{ConditionalFreqDist} and
          returns a C{ConditionalProbDist}
    """
    self._n = n
    self._N = 1 + len(train) - n

    if estimator is None:
        def estimator(fdist, bins):
            return MLEProbDist(fdist)

    if n == 1:
        unigram_counts = FreqDist(train)
        self._model = estimator(unigram_counts, unigram_counts.B())
    else:
        counts = ConditionalFreqDist()
        self._ngrams = set()
        # (n - 1) empty strings give the first real word a full context.
        self._prefix = ('',) * (n - 1)

        for gram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(gram)
            counts[tuple(gram[:-1])][gram[-1]] += 1

        self._model = ConditionalProbDist(counts, estimator, len(counts))

    # Lower-order models are built recursively for back-off.
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator)
def from_words(cls, words):
    """Construct a collocation finder from all 5-word windows in the given
    sequence.

    Eight frequency distributions are filled from right-padded windows:
    single words, contiguous bigrams, gapped pairs, contiguous trigrams,
    gapped triples, contiguous quadgrams, gapped quadruples and full
    five-grams.  A window stops contributing at its first ``None`` padding
    slot.
    """
    wfd = FreqDist()
    bfd = FreqDist()
    wild2fd = FreqDist()
    tfd = FreqDist()
    wild3fd = FreqDist()
    qfd = FreqDist()
    wild4fd = FreqDist()
    pfd = FreqDist()

    for window in ingrams(words, 5, pad_right=True):
        a, b, c, d, e = window

        wfd.inc(a)
        if b is None:
            continue

        bfd.inc((a, b))
        if c is None:
            continue

        wild2fd.inc((a, c))
        tfd.inc((a, b, c))
        if d is None:
            continue

        wild2fd.inc((a, d))
        for triple in ((a, b, d), (a, c, d)):
            wild3fd.inc(triple)
        qfd.inc((a, b, c, d))
        if e is None:
            continue

        wild2fd.inc((a, e))
        for triple in ((a, b, e), (a, c, e), (a, d, e)):
            wild3fd.inc(triple)
        for quad in ((a, c, d, e), (a, b, d, e), (a, b, c, e)):
            wild4fd.inc(quad)
        pfd.inc((a, b, c, d, e))

    return cls(wfd, bfd, wild2fd, tfd, wild3fd, qfd, wild4fd, pfd)
def prob_classify(self, document):
    """
    Score the document under the ngram model of every label.

    Only alphabetic tokens are used, lower-cased.  For each label the score
    starts from the label's log probability and adds the log probability of
    each token given its context; tokens for which the ngram model raises
    RuntimeError are skipped.

    @return: a probability distribution over labels for the given document.
    @rtype: L{ProbDistI <nltk.probability.ProbDistI>}
    """
    logprob = {}

    # The words do not depend on the label: prepare them once.
    words = [w.lower() for w in document if w.isalpha()]

    for label in self.labels():
        # Start with the LOG probability of the label.  The original seeded
        # this with prob(), mixing a plain probability into a sum of logs.
        # NOTE(review): assumes category_probdist follows the ProbDistI
        # interface and therefore offers logprob() - confirm.
        logprob[label] = self.model.category_probdist.logprob(label)

        ngram_model = self.model.ngrams[label]
        prefix = ('',) * (ngram_model._n - 1)

        # Go through each word and calculate P(w | context)
        for ngram in ingrams(chain(prefix, words), ngram_model._n):
            context = tuple(ngram[:-1])
            token = ngram[-1]
            try:
                # The model's logprob is negated before being added.
                # NOTE(review): presumably it returns a negative log
                # probability - confirm.
                logprob[label] += -ngram_model.logprob(token, context)
            except RuntimeError:
                # Unknown word, skip it
                continue

    return DictionaryProbDist(logprob, normalize=True, log=True)
def eval_word(self, word):
    '''
    Score a proposed word with the ngram model.

    When ``self._hypothetical_phonotactics`` is set, the word is first added
    to a private deep copy of the model and the score is computed from that
    copy, leaving ``self._ngram_model`` untouched.

    @param word: the proposed word, as a sequence of symbols.
    @return: probability that proposed word is a word.
    '''
    if self._hypothetical_phonotactics:
        # Make a deep copy of the ngram model so that we can update it
        # without modifying the original.
        word_ngram_model = copy.deepcopy(self._ngram_model)
        word_ngram_model.update([word])
    else:
        word_ngram_model = self._ngram_model

    # NOTE: to duplicate the functionality of the OCaml code this should
    # probably use a ProbDist that inherits from Lidstone, otherwise the
    # denominator adjustments and other details are not applied.
    if self._n > 1:
        adjustment = Fraction(1)
        pieces = [self._word_delimiter, word, self._word_delimiter]
    else:
        # Compensate for the lack of an empty word by renormalising with the
        # relative frequency of the word delimiter.
        adjustment = Fraction(1) / (
            Fraction(1) - word_ngram_model.prob(self._word_delimiter,
                                                word_ngram_model._padding))
        pieces = [word, self._word_delimiter]

    # Every probability is read from word_ngram_model.  The original queried
    # self._ngram_model here, which silently discarded the hypothetical
    # update performed above.
    raw_score = self._score_combiner(
        [word_ngram_model.prob(ngram[-1], tuple(ngram[:-1]))
         for ngram in ingrams(chain(*pieces), self._n)])
    return adjustment * raw_score
def get_nested_noun_phrase_chunks(noun_phrase_chunks):
    """Return the noun phrases nested inside the given noun-phrase chunks.

    Helpful when an original chunk includes tokens which shouldn't really
    be part of the noun phrase, e.g. [('use', 'V'), ('firewalls', 'N')]
    should just be 'firewalls'.

    For every chunk, each shorter n-gram (longest first) is parsed with
    NP_PARSER and kept, as a list, when the parse contains a 'NounPhrase'
    subtree.
    """
    from nltk.util import ingrams

    def is_nounphrase(tagged_phrase):
        # True when any subtree of the parse is labelled 'NounPhrase'.
        return any(subtree.node == 'NounPhrase'
                   for subtree in NP_PARSER.parse(tagged_phrase).subtrees())

    nested = []
    for chunk in noun_phrase_chunks:
        size = len(chunk) - 1
        while size > 0:
            for candidate in ingrams(chunk, size):
                if is_nounphrase(candidate):
                    nested.append(list(candidate))
            size -= 1
    return nested
def get_nested_noun_phrase_chunks(noun_phrase_chunks):
    """Return the noun phrases nested inside the given noun-phrase chunks.

    Helpful when an original chunk includes tokens which shouldn't really
    be part of the noun phrase, e.g. [('use', 'V'), ('firewalls', 'N')]
    should just be 'firewalls'.

    For every chunk, each shorter n-gram (longest first) is parsed with
    NP_PARSER and kept, as a list, when the parse contains a 'NounPhrase'
    subtree.
    """
    from nltk.util import ingrams

    def is_nounphrase(tagged_phrase):
        # True when any subtree of the parse is labelled 'NounPhrase'.
        labels = (subtree.node
                  for subtree in NP_PARSER.parse(tagged_phrase).subtrees())
        return any(label == 'NounPhrase' for label in labels)

    found = []
    for phrase in noun_phrase_chunks:
        for width in xrange(len(phrase) - 1, 0, -1):
            found.extend(list(gram)
                         for gram in ingrams(phrase, width)
                         if is_nounphrase(gram))
    return found
def __init__(self, n, train, estimator=None, factor=0.77):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during
    training.  All words are lower-cased before counting.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{string}
    @param estimator: a function for generating a probability distribution;
          it is called with a frequency distribution, the number of bins,
          the number of training ngrams and the number of empty bins.  When
          omitted, NeyProbDist (absolute discounting) is used for n > 1 and
          LaplaceProbDist for unigrams.
    @type estimator: a function that takes a C{ConditionalFreqDist} and
          returns a C{ConditionalProbDist}
    @param factor: the factor handed to NeyProbDist by the default
          estimator; it is also forwarded to the lower-order models.
    @type factor: C{float}
    """
    self._n = n

    if estimator is None:
        if n > 1:
            # Use smoothing based on Ney et al
            probdist_factory = lambda fdist, bins, n_train, n_0: \
                NeyProbDist(fdist, bins, n_train, n_0, factor, NeyProbDist.ABSOLUTE)
        else:
            # Use simple add-1 smoothing for unigrams
            probdist_factory = lambda fdist, bins, *args: LaplaceProbDist(fdist, bins)
    else:
        probdist_factory = estimator

    # Initialize conditional frequency distribution
    cfd = ConditionalFreqDist()

    # Initialize set of ngrams
    self._ngrams = set()
    self._ngram_count = 0

    # Prefix beginning of document with empty strings
    self._prefix = ('',) * (n - 1)

    # Count the number of training examples
    num_training = 0

    # Loop through each ngram and add to CFD
    for ngram in ingrams(chain(self._prefix, train), n):
        # Lowercase words
        ngram = tuple(w.lower() for w in ngram)

        # Add to known ngrams
        self._ngrams.add(ngram)

        # Add to CFD
        context = tuple(ngram[:-1])
        token = ngram[-1]
        cfd[context].inc(token)
        num_training += 1

    # Calculate vocabulary size (for NeyProbDist)
    v = len(set(train))
    bins = v ** n

    # Number of bins with a count > 0
    self._ngram_count = len(self._ngrams)

    # Gives us number of bins with count = 0
    n_0 = bins - self._ngram_count

    # Create CPD model
    self._model = ConditionalProbDist(cfd, probdist_factory, bins, num_training, n_0)

    # recursively construct the lower-order models; forward ``factor`` so
    # that they do not silently fall back to the default value.
    if n > 1:
        self._backoff = SLINgramModel(n - 1, train, estimator, factor=factor)
def __init__(self, n, train, k=5, v=None, liveDangerously=False, quiet=False):
    """
    Creates a Katz-thresholded Ngram language model to capture patterns in
    n consecutive words of training text.  Uses the KGoodTuringProbDist to
    estimate the conditional and unigram probabilities, to provide coverage
    of Ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{string}
    @param k: The threshold above which counts are assumed to be reliable.
          Defaults to 5.
    @type k: C{Int}
    @param v: The number of unseens of degree 1.  Defaults to the number of
          types in the training set
    @type v: C{Int}
    @param liveDangerously: If False, for each model check that the total
          probability mass after all adjustments is close to 1.  Defaults
          to False.
    @type liveDangerously: C{Boolean}
    @param quiet: Various information will be printed during model
          construction unless this is True, including for the lower-order
          models.  Defaults to False.
    @type quiet: C{Boolean}
    """
    self._n = n
    self._N = 1 + len(train) - n

    fd = FreqDist(train)
    if v is None:
        v = fd.B()
    # Diagnostic output honours ``quiet`` like every other message.
    if not quiet:
        print(('v', v))

    if n == 1:
        # Treat this case specially
        self._model = KGoodTuringProbDist(fd, k, v, liveDangerously, ())
        if not quiet:
            print("%s entries for %s tokens at degree 1, %s" % (
                len(fd), fd.N(), self._model.status))
    else:
        def estimator(fdist, ctxt):
            return KGoodTuringProbDist(fdist, k, v, liveDangerously, ctxt)

        cfd = ConditionalFreqDist()

        for ngram in ingrams(train, n):
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, True)
        if not quiet:
            statuses = {'normal': 0, 'bigSkewed': 0, 'weak': 0, LowHacked: 0}
            for ctx in cfd.conditions():
                statuses[self[ctx].status] += 1
            print("%s conditions at degree %s" % (len(cfd.conditions()), n))
            for s in list(statuses.keys()):
                print(" %s %6d" % (s, statuses[s]))

        # recursively construct the lower-order models, propagating
        # ``quiet`` so that a quiet model stays quiet at every order.
        self._backoff = KBNgramModel(n - 1, train, k, v, liveDangerously, quiet)
def __add_token_ngrams(self, token):
    """Add every n-gram of order ``self.k + 1`` in ``token`` to the histogram.

    Each n-gram is joined into a single string and counted in
    ``self.histogram`` through ``sparse_dict_increment``.
    """
    order = self.k + 1
    for pieces in ingrams(token, order):
        key = ''.join(pieces)
        sparse_dict_increment(self.histogram, key)
def __init__(self, n, train, pad_left=True, pad_right=False,
             estimator=None, *estimator_args, **estimator_kwargs):
    """
    Build an ngram language model of order ``n`` from training text.

    Counts every n-gram of each (optionally padded) sentence, turns the
    counts into a ConditionalProbDist through ``estimator`` and, for
    n > 1, recursively builds the (n-1)-gram model kept in
    ``self._backoff``.  The estimator smooths the probabilities and may
    allow generation of ngrams not seen during training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        0.5776...

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text; a flat list of strings is treated
        as one sentence
    :type train: list(str) or list(list(str))
    :param pad_left: whether to pad the left of each sentence with an
        (n-1)-gram of empty strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with an
        (n-1)-gram of empty strings
    :type pad_right: bool
    :param estimator: a function that takes a ConditionalFreqDist and
        returns a ConditionalProbDist; ``_estimator`` is used when None
    :type estimator: callable
    :param estimator_args: extra positional arguments for the estimator.
        For backward-compatibility, when neither these nor keyword
        arguments are given, ``len(cfd)`` is passed instead.
    :type estimator_args: (any)
    :param estimator_kwargs: extra keyword arguments for the estimator
    :type estimator_kwargs: (any)
    """
    # Programs written for the pre-2.0.2 interface pass the estimator in
    # the pad_left position; stop them here rather than misbehave later.
    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))

    self._n = n

    if pad_left:
        self._lpad = ('',) * (n - 1)
    else:
        self._lpad = ()
    if pad_right:
        self._rpad = ('',) * (n - 1)
    else:
        self._rpad = ()

    if estimator is None:
        estimator = _estimator

    counts = ConditionalFreqDist()
    self._ngrams = set()

    # A flat list of strings is wrapped so that it counts as one sentence.
    if (train is not None) and isinstance(train[0], compat.string_types):
        train = [train]

    for sentence in train:
        padded = chain(self._lpad, sentence, self._rpad)
        for gram in ingrams(padded, n):
            self._ngrams.add(gram)
            history = tuple(gram[:-1])
            word = gram[-1]
            counts[history].inc(word)

    if estimator_args or estimator_kwargs:
        self._model = ConditionalProbDist(counts, estimator,
                                          *estimator_args,
                                          **estimator_kwargs)
    else:
        # Backward-compatible default: hand len(counts) to the estimator.
        self._model = ConditionalProbDist(counts, estimator, len(counts))

    # The lower-order model is built from the same (already wrapped) text.
    if n > 1:
        self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                                   estimator, *estimator_args,
                                   **estimator_kwargs)
def frequency_of_term_in_article(term, tokens):
    """Count the occurrences of ``term`` in an article.

    ``term`` is a tuple of strings; ``tokens`` is a list of sentences,
    where each sentence is a list of words.  Every window of
    ``len(term)`` consecutive words that equals ``term`` counts once.
    """
    occurrences = 0
    for sentence in tokens:
        for window in ingrams(sentence, len(term)):
            if tuple(window) == term:
                occurrences += 1
    return occurrences
def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
    """
    Build an ngram language model of order ``n`` from a token sequence.

    The text is prefixed with n-1 empty strings, every n-gram is counted
    and the counts are turned into a ConditionalProbDist through
    ``estimator``.  For n > 1 the (n-1)-gram model is built recursively
    and kept in ``self._backoff``.  The estimator smooths the
    probabilities and may allow generation of ngrams not seen during
    training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        1.682...

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text; it is iterated once per model order
    :type train: list of string
    :param estimator: a function that takes a ConditionalFreqDist and
        returns a ConditionalProbDist; ``_estimator`` is used when None
    :type estimator: callable
    :param estimator_args: extra positional arguments for the estimator.
        For backward-compatibility, when neither these nor keyword
        arguments are given, ``len(cfd)`` is passed instead.
    :type estimator_args: (any)
    :param estimator_kw_args: extra keyword arguments for the estimator
    :type estimator_kw_args: (any)
    """
    self._n = n

    if estimator is None:
        estimator = _estimator

    counts = ConditionalFreqDist()
    self._ngrams = set()
    # n-1 empty strings placed in front of the text before windowing
    self._prefix = ('', ) * (n - 1)

    padded_text = chain(self._prefix, train)
    for gram in ingrams(padded_text, n):
        self._ngrams.add(gram)
        history = tuple(gram[:-1])
        word = gram[-1]
        counts[history].inc(word)

    if estimator_args or estimator_kw_args:
        self._model = ConditionalProbDist(counts, estimator,
                                          *estimator_args,
                                          **estimator_kw_args)
    else:
        # Backward-compatible default: hand len(counts) to the estimator.
        self._model = ConditionalProbDist(counts, estimator, len(counts))

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator,
                                   *estimator_args, **estimator_kw_args)
def frequency_of_term_in_article(term,tokens):
    """Return how many times ``term`` appears in ``tokens``.

    ``term`` is a tuple of strings and ``tokens`` is a list of sentences
    (each a list of words).  A match is a run of ``len(term)``
    consecutive words of one sentence that equals ``term``.
    """
    matches = [
        1
        for sentence in tokens
        for candidate in ingrams(sentence,len(term))
        if tuple(candidate) == term
    ]
    return len(matches)
def __init__(self, n, train,
             smoothing=1e9,
             maximum_length=20,
             minimum_length=3,
             char_set=string.lowercase + string.punctuation + string.digits,
             patch_char='#'):
    """
    Creates an nchar language model to capture patterns in n consecutive
    characters of the training words, together with a distribution over
    word lengths.

    For n > 1 the (n-1)-char model is built recursively from the same
    training words and stored in C{self._backoff}.

    @param n: the order of the language model (nchar size)
    @type n: C{int}
    @param train: the training words; iterated once per model order, so
        it must be re-iterable
    @type train: C{list} of C{string}
    @param smoothing: second argument (gamma) given to every
        C{LidstoneProbDist} built for a context
    @param maximum_length: words longer than this (after strip/lower)
        are skipped
    @type maximum_length: C{int}
    @param minimum_length: words shorter than this (after strip/lower)
        are skipped
    @type minimum_length: C{int}
    @param char_set: characters kept in the character stream; all other
        characters of a word are dropped.  Word length is measured
        before this filtering.
    @type char_set: C{string}
    @param patch_char: character repeated n-1 times in front of every
        word in the character stream
    @type patch_char: C{string}
    """
    self._smoothing = smoothing
    self._n = n

    self._maximum_length = maximum_length
    self._minimum_length = minimum_length
    self._char_set = char_set

    # The ``bins`` value supplied by ConditionalProbDist is ignored; the
    # bin count is always the alphabet size plus one.
    estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(
        fdist, self._smoothing, len(self._char_set) + 1)

    cfd = ConditionalFreqDist()
    self._ngrams = set()

    self._patch_char = patch_char
    self._prefix = (self._patch_char,) * (n - 1)

    length = nltk.probability.FreqDist()
    word_freq_dist = nltk.probability.FreqDist()

    # One flat character stream: prefix + filtered characters per word.
    char_list = []
    for word in train:
        word = word.strip().lower()
        if len(word) < self._minimum_length or len(word) > self._maximum_length:
            continue

        length.inc(len(word))
        word_freq_dist.inc(word, 1)

        char_list.extend(self._prefix)
        char_list.extend([char for char in word if char in self._char_set])

    self._length = nltk.probability.WittenBellProbDist(length, length.B() + 1)

    for nchar in ingrams(char_list, n):
        self._ngrams.add(nchar)
        context = tuple(nchar[:-1])
        token = nchar[-1]
        cfd[context].inc(token)

    self._model = ConditionalProbDist(cfd, estimator,
                                      len(self._char_set) ** (n - 1))

    # --- per-length weights -------------------------------------------
    # One column per admissible word length.  The 1e-300 seed keeps the
    # divisions below away from an exact zero for unused lengths.
    word_prob_dist = nltk.probability.MLEProbDist(word_freq_dist)
    n_lengths = self._maximum_length - self._minimum_length + 1
    word_model_empirical_frequency = numpy.zeros((1, n_lengths)) + 1e-300
    word_model_square = numpy.zeros((1, n_lengths)) + 1e-300

    for word in word_freq_dist.keys():
        column = len(word) - self._minimum_length
        # Evaluate the model once per word (it used to be called twice).
        model_prob = self.probability_without_length(word)
        word_model_empirical_frequency[0, column] += word_prob_dist.prob(word) * model_prob
        word_model_square[0, column] += model_prob ** 2

    # The dead ``else: lagrangian_parameter = 1.`` alternative and the
    # unused ``total_outcomes`` accumulation were removed.
    lagrangian_parameter = 2 * (1 - numpy.sum(word_model_empirical_frequency / word_model_square)) / numpy.sum(1.0 / word_model_square)

    self._multinomial_length = (word_model_empirical_frequency - lagrangian_parameter / 2) / word_model_square
    self._multinomial_length /= numpy.sum(self._multinomial_length)
    assert numpy.all(self._multinomial_length >= 0), self._multinomial_length

    # recursively construct the lower-order models; this stays last, as
    # in the original statement order
    if n > 1:
        self._backoff = NcharModel(n-1, train, self._smoothing,
                                   maximum_length, minimum_length,
                                   self._char_set, self._patch_char)
# Finish timing the step that precedes this fragment.
t = time() - t
print(str(t) + 's')

# Test generation of CFD
# The trailing comma keeps the Python 2 print statement on the same line,
# so the elapsed time is printed right after the label.
print('Creating CFD...'),
sys.stdout.flush()
t = time()
cat = cr.categories()[0]
n = 3
cfd = ConditionalFreqDist()
# n-1 empty strings in front of the words before windowing
prefix = ('',) * (n - 1)
for ngram in ingrams(chain(prefix, cr.words(categories=[cat])), n):
    context = tuple(ngram[:-1])
    token = ngram[-1]
    cfd[context].inc(token)
t = time() - t
print(str(t) + 's')

t = time()
print('Pickling CFD...'),
sys.stdout.flush()
# Protocol 1 is a binary format, so the file must be opened in binary
# mode; the ``with`` block also guarantees the handle is flushed and
# closed (the original opened in text mode and never closed it).
with open('cfd.p', 'wb') as cfd_file:
    pickle.dump(cfd, cfd_file, protocol=1)
t = time() - t
def __init__(self, n, train, pad_left=True, pad_right=False,
             estimator=None, *estimator_args, **estimator_kwargs):
    """
    Create an ngram language model of order ``n``.

    Each sentence of ``train`` is optionally padded with empty strings,
    its n-grams are counted, and ``estimator`` converts the counts into
    a ConditionalProbDist.  When n > 1, an (n-1)-gram model is built the
    same way and stored in ``self._backoff``.  The estimator smooths the
    probabilities and may allow generation of ngrams not seen during
    training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        0.5776...

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text; a flat list of ``str`` is treated
        as a single sentence
    :type train: list(str) or list(list(str))
    :param pad_left: whether to pad the left of each sentence with an
        (n-1)-gram of empty strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with an
        (n-1)-gram of empty strings
    :type pad_right: bool
    :param estimator: a function that takes a ConditionalFreqDist and
        returns a ConditionalProbDist; ``_estimator`` is used when None
    :type estimator: callable
    :param estimator_args: extra positional arguments for the estimator.
        For backward-compatibility, when neither these nor keyword
        arguments are given, ``len(cfd)`` is passed instead.
    :type estimator_args: (any)
    :param estimator_kwargs: extra keyword arguments for the estimator
    :type estimator_kwargs: (any)
    """
    # Guard against callers of the pre-2.0.2 interface, whose third
    # positional argument was the estimator rather than pad_left.
    assert (isinstance(pad_left, bool))
    assert (isinstance(pad_right, bool))

    self._n = n

    self._lpad = ()
    if pad_left:
        self._lpad = ('',) * (n - 1)
    self._rpad = ()
    if pad_right:
        self._rpad = ('',) * (n - 1)

    if estimator is None:
        estimator = _estimator

    freq_by_context = ConditionalFreqDist()
    self._ngrams = set()

    # If given a list of strings instead of a list of lists, wrap it.
    is_flat_text = (train is not None) and isinstance(train[0], str)
    if is_flat_text:
        train = [train]

    for sent in train:
        for window in ingrams(chain(self._lpad, sent, self._rpad), n):
            self._ngrams.add(window)
            context = tuple(window[:-1])
            outcome = window[-1]
            freq_by_context[context].inc(outcome)

    no_extra_args = not estimator_args and not estimator_kwargs
    if no_extra_args:
        # Backward-compatible default: pass len(freq_by_context).
        self._model = ConditionalProbDist(freq_by_context, estimator,
                                          len(freq_by_context))
    else:
        self._model = ConditionalProbDist(freq_by_context, estimator,
                                          *estimator_args,
                                          **estimator_kwargs)

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                   estimator, *estimator_args,
                                   **estimator_kwargs)
def __init__(
        self,
        n,
        train,
        smoothing=1e9,
        maximum_length=20,
        minimum_length=3,
        char_set=string.lowercase + string.punctuation + string.digits,
        patch_char='#'):
    """
    Creates an nchar language model to capture patterns in n consecutive
    characters of the training words, together with a distribution over
    word lengths.

    For n > 1 the (n-1)-char model is built recursively from the same
    training words and stored in C{self._backoff}.

    @param n: the order of the language model (nchar size)
    @type n: C{int}
    @param train: the training words; iterated once per model order, so
        it must be re-iterable
    @type train: C{list} of C{string}
    @param smoothing: second argument (gamma) given to every
        C{LidstoneProbDist} built for a context
    @param maximum_length: words longer than this (after strip/lower)
        are skipped
    @type maximum_length: C{int}
    @param minimum_length: words shorter than this (after strip/lower)
        are skipped
    @type minimum_length: C{int}
    @param char_set: characters kept in the character stream; all other
        characters of a word are dropped.  Word length is measured
        before this filtering.
    @type char_set: C{string}
    @param patch_char: character repeated n-1 times in front of every
        word in the character stream
    @type patch_char: C{string}
    """
    self._smoothing = smoothing
    self._n = n

    self._maximum_length = maximum_length
    self._minimum_length = minimum_length
    self._char_set = char_set

    # ``bins`` from ConditionalProbDist is ignored on purpose: the bin
    # count is fixed at alphabet size + 1.
    estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(
        fdist, self._smoothing, len(self._char_set) + 1)

    cfd = ConditionalFreqDist()
    self._ngrams = set()

    self._patch_char = patch_char
    self._prefix = (self._patch_char, ) * (n - 1)

    length = nltk.probability.FreqDist()
    word_freq_dist = nltk.probability.FreqDist()

    # Build one flat character stream: prefix + filtered chars per word.
    char_list = []
    for word in train:
        word = word.strip().lower()
        if len(word) < self._minimum_length or len(
                word) > self._maximum_length:
            continue

        length.inc(len(word))
        word_freq_dist.inc(word, 1)

        char_list.extend(self._prefix)
        char_list.extend([char for char in word if char in self._char_set])

    self._length = nltk.probability.WittenBellProbDist(
        length, length.B() + 1)

    for nchar in ingrams(char_list, n):
        self._ngrams.add(nchar)
        context = tuple(nchar[:-1])
        token = nchar[-1]
        cfd[context].inc(token)

    self._model = ConditionalProbDist(cfd, estimator,
                                      len(self._char_set)**(n - 1))

    # Needs self._model, so this must run after the line above and, as in
    # the original statement order, before the backoff model is built.
    self._multinomial_length = self._fit_length_multinomial(word_freq_dist)
    assert numpy.all(
        self._multinomial_length >= 0), self._multinomial_length

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NcharModel(n - 1, train, self._smoothing,
                                   maximum_length, minimum_length,
                                   self._char_set, self._patch_char)

def _fit_length_multinomial(self, word_freq_dist):
    """
    Return the normalised per-length weight row used for
    C{self._multinomial_length}.

    The result is a 1 x (maximum_length - minimum_length + 1) numpy
    array whose entries sum to 1; column i belongs to words of length
    minimum_length + i.

    @param word_freq_dist: frequency distribution of the accepted
        training words
    """
    word_prob_dist = nltk.probability.MLEProbDist(word_freq_dist)
    n_lengths = self._maximum_length - self._minimum_length + 1
    # The 1e-300 seed keeps the divisions below away from an exact zero
    # for lengths that have no training word.
    empirical = numpy.zeros((1, n_lengths)) + 1e-300
    squared = numpy.zeros((1, n_lengths)) + 1e-300

    for word in word_freq_dist.keys():
        column = len(word) - self._minimum_length
        # One model evaluation per word (the original made two).
        model_prob = self.probability_without_length(word)
        empirical[0, column] += word_prob_dist.prob(word) * model_prob
        squared[0, column] += model_prob**2

    # The unreachable ``else: lagrangian_parameter = 1.`` branch and the
    # unused ``total_outcomes`` sum of the original are gone.
    lagrangian_parameter = 2 * (
        1 - numpy.sum(empirical / squared)) / numpy.sum(1.0 / squared)

    weights = (empirical - lagrangian_parameter / 2) / squared
    weights /= numpy.sum(weights)
    return weights
from nltk.corpus import genesis
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.util import ingrams

# Tokens contains the words for Genesis and Reuters Trade
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# The "tokens" of this demo are the overlapping character trigrams of the
# sentence.  The earlier ``tokens = sent.split()`` was overwritten before
# any use, so it has been dropped.
sent = "abraham lincoln be bear feb 12 1809"
splitNgrams = list(ingrams(list(sent), 3))
tokens = ["".join(x) for x in splitNgrams]

# N-gram language model with 3-grams
# Without an estimator, it assumes Good-Turing.
# The estimator is passed by keyword: on NgramModel signatures that take
# pad_left/pad_right before it, a positional estimator lands in pad_left
# and trips the isinstance(pad_left, bool) assertion.  The keyword form
# works with both signatures.
model = NgramModel(3, tokens, estimator=estimator)
print("Model: " + str(model))

# Character trigrams of a second sentence; report the last one.
sent2 = "abe lincoln was born in 1809"
splitNgrams2 = list(ingrams(list(sent2), 3))
tokens2 = ["".join(x) for x in splitNgrams2]
print("Word: " + tokens2[-1])
def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
    """
    Build an ngram language model of order n from training text.

    Every utterance is padded on both sides with n-1 empty strings, its
    n-grams are counted and C{estimator} converts the counts into a
    C{ConditionalProbDist}.  For n > 1 the (n-1)-gram model is built
    recursively and stored in C{self._backoff}.  The estimator smooths
    the probabilities and may allow generation of ngrams not seen during
    training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text; a flat list of strings is treated
        as a single utterance
    @type train: C{list} of C{string} (or C{list} of C{string} C{list}s)
    @param estimator: a function that takes a C{ConditionalFreqDist} and
        returns a C{ConditionalProbDist}; C{_estimator} is used when None
    @type estimator: callable
    @param estimator_args: Extra arguments for C{estimator}.  For
        backward-compatibility, when neither these nor keyword arguments
        are given, C{len(cfd)} is passed instead.
    @type estimator_args: (any)
    @param estimator_kw_args: Extra keyword arguments for C{estimator}.
    @type estimator_kw_args: (any)
    """
    self._n = n

    if estimator is None:
        estimator = _estimator

    counts = ConditionalFreqDist()
    self._ngrams = set()
    # n-1 empty strings, used on both ends of every utterance
    self._padding = ('', ) * (n - 1)

    # If given a list of strings instead of a list of lists, wrap it.
    given_flat_text = isinstance(train[0], basestring)
    if given_flat_text:
        train = [train]

    for utterance in train:
        padded = chain(self._padding, utterance, self._padding)
        for gram in ingrams(padded, n):
            self._ngrams.add(gram)
            history = tuple(gram[:-1])
            word = gram[-1]
            counts[history].inc(word)

    if estimator_args or estimator_kw_args:
        self._model = ConditionalProbDist(counts, estimator,
                                          *estimator_args,
                                          **estimator_kw_args)
    else:
        # Backward-compatible default: hand len(counts) to the estimator.
        self._model = ConditionalProbDist(counts, estimator, len(counts))

    # recursively construct the lower-order models
    if n > 1:
        self._backoff = NgramModel(n - 1, train, estimator,
                                   *estimator_args, **estimator_kw_args)
def __init__(self, n, train, pad_left=False, pad_right=False,
             estimator=None, *estimator_args, **estimator_kwargs):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities
    derived from the text and may allow generation of ngrams not seen
    during training.

    For n == 1 the model is a single distribution over tokens.  For
    n > 1 it is a ConditionalProbDist keyed by (n-1)-token contexts, a
    recursively built (n-1)-gram model is stored in ``self._backoff``
    and a backoff weight per context is stored in
    ``self._backoff_alphas``.

    :param n: the order of the language model (ngram size); must be > 0
    :type n: C{int}
    :param train: the training text.  A non-sequence iterable is copied
        into a list; a flat sequence of strings is treated as a single
        sentence; non-sequence sentences inside a sequence are replaced
        in place by lists.
    :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string}
    :param pad_left: whether to pad the left of each sentence with an
        (n-1)-gram of <s>
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with </s>
    :type pad_right: bool
    :param estimator: a function for generating a probability
        distribution---defaults to MLEProbDist
    :type estimator: a function that takes a C{ConditionalFreqDist} and
        returns a C{ConditionalProbDist}
    :param estimator_args: Extra arguments for estimator.  These may only
        be given together with an estimator.  For backward-compatibility,
        when no extra arguments are specified for an n > 1 model,
        ``len(cfd)`` is passed to ConditionalProbDist instead.
    :type estimator_args: (any)
    :param estimator_kwargs: Extra keyword arguments for the estimator
    :type estimator_kwargs: (any)
    """
    # protection from cryptic behavior for calling programs
    # that use the pre-2.0.2 interface
    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))

    # make sure n is greater than zero, otherwise print it
    assert (n > 0), n

    # For explicitness save the check whether this is a unigram model
    self.is_unigram_model = (n == 1)
    # save the ngram order number
    self._n = n
    # save left and right padding
    self._lpad = ('<s>',) * (n - 1) if pad_left else ()
    # Need _rpad even for unigrams or padded entropy will give
    # wrong answer because '</s>' will be treated as unseen...
    self._rpad = ('</s>',) if pad_right else ()
    self._padLen = len(self._lpad)+len(self._rpad)

    self._N=0
    delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent

    if estimator is None:
        # Emptiness is tested by truth value.  The former
        # ``estimator_args is ()`` compared identity with a literal,
        # which depends on the interpreter sharing one empty tuple and
        # raises a SyntaxWarning on Python 3.8+.
        assert (not estimator_args) and (not estimator_kwargs),\
            "estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs)
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # Given backoff, a generator isn't acceptable
    if not isinstance(train,collections.abc.Sequence):
        train=list(train)
    self._W = len(train)
    # Coerce to list of list -- note that this means to train charGrams,
    # requires exploding the words ahead of time
    if train is not None:
        if isinstance(train[0], compat.string_types):
            train = [train]
            self._W=1
        elif not isinstance(train[0],collections.abc.Sequence):
            # if you mix strings and generators, you have only yourself
            # to blame!
            # Replacement is done in place, as before, so the caller's
            # container ends up holding the materialised lists.
            for i, sent in enumerate(train):
                train[i]=list(sent)

    if n == 1:
        if pad_right:
            sents=(chain(s,self._rpad) for s in train)
        else:
            sents=train
        fd=FreqDist()
        for s in sents:
            fd.update(s)
        if not estimator_args and not estimator_kwargs:
            self._model = estimator(fd,fd.B())
        else:
            self._model = estimator(fd,fd.B(),
                                    *estimator_args, **estimator_kwargs)
        self._N=fd.N()
    else:
        cfd = ConditionalFreqDist()
        self._ngrams = set()

        for sent in train:
            self._N+=len(sent)+delta
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token]+=1
        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator,
                                              *estimator_args,
                                              **estimator_kwargs)

    # recursively construct the lower-order models
    if not self.is_unigram_model:
        self._backoff = NgramModel(n-1, train,
                                   pad_left, pad_right,
                                   estimator,
                                   *estimator_args,
                                   **estimator_kwargs)

        # Code below here in this method, and the _words_following and _alpha method, are from
        # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
        self._backoff_alphas = dict()
        # For each condition (or context)
        for ctxt in cfd.conditions():
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0.0
            total_observed_pr = 0.0

            # this is the subset of words that we OBSERVED following
            # this context.
            # i.e. Count(word | context) > 0
            for word in self._words_following(ctxt, cfd):
                total_observed_pr += self.prob(word, ctxt)
                # we also need the total (n-1)-gram probability of
                # words observed in this n-gram context
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

            # Snap sums that are within tolerance of 1 to exactly 1 so
            # that beta below becomes exactly 0.
            if isclose(total_observed_pr,1.0):
                total_observed_pr=1.0
            else:
                assert 0.0 <= total_observed_pr <= 1.0,\
                    "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
            # beta is the remaining probability weight after we factor out
            # the probability of observed words.
            # As a sanity check, both total_observed_pr and backoff_total_pr
            # must be GE 0, since probabilities are never negative
            beta = 1.0 - total_observed_pr

            if beta!=0.0:
                # The backoff mass must stay strictly below 1, otherwise
                # the denominator below would be zero.
                assert (0.0 <= backoff_total_pr < 1.0), \
                    "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                alpha_ctxt = beta / (1.0 - backoff_total_pr)
            else:
                assert ((0.0 <= backoff_total_pr < 1.0) or
                        isclose(1.0,backoff_total_pr)), \
                    "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                alpha_ctxt = 0.0

            self._backoff_alphas[ctxt] = alpha_ctxt