Example #1
 def n_gramas_texto(self,texto,N):
         '''
           Search for n-grams in a text string.

           In:
                 (texto, str) input text string
                 (N, int) size of the n-gram: 2 for bigrams, 3 for trigrams, and so on
           Out:
                 (listaNgrama, list) list of n-gram strings
         '''
         if  isinstance(N, int):
                 try:
                         if isinstance(texto, str):
                            texto = unicode(texto, "utf-8", "xmlcharrefreplace")
                         texto = self.filtro_caracteres_especiales(texto)
                         lista = self.filtro_palabras_cerradas(texto)
                         listaNgrama=[]
                         for Ngramas in ingrams(lista, N, pad_right=True):
                                 if (str(Ngramas[N-1])!= "None"):
                                         listaNgrama.append(" ".join(Ngramas))
                         return listaNgrama
                 except TypeError:
                         return []
                 except UnicodeDecodeError:
                         return []    
                 except:
                         return []
         else:
                 return []
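
The right-padded iteration above yields trailing tuples that end in None; the check on Ngramas[N-1] drops those incomplete n-grams. A minimal standalone sketch of the same pad-and-filter idea, written against the current nltk.util.ngrams (ingrams is the legacy name of the same helper); the sample sentence is invented:

from nltk.util import ngrams  # `ingrams` is the older name for this helper

def ngram_strings(tokens, n):
    # Join each complete n-gram into a space-separated string, dropping right-padded tails.
    out = []
    for gram in ngrams(tokens, n, pad_right=True):
        if gram[n - 1] is not None:  # padded, incomplete n-grams end in None
            out.append(" ".join(gram))
    return out

print(ngram_strings("el gato negro duerme".split(), 2))
# ['el gato', 'gato negro', 'negro duerme']
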
Example #2
    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +ELLIPSIS
            1.682...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list of string
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
              returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kw_args: Extra keyword arguments for estimator.
        :type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ("",) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        if (not estimator_args) and (not estimator_kw_args):
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator, *estimator_args, **estimator_kw_args)
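
The heart of the constructor is the (context, token) counting loop over padded n-grams. Below is a minimal sketch of just that step, using the current ConditionalFreqDist indexing (cfd[context][token] += 1) in place of the long-removed inc() method; the toy training list is invented:

from itertools import chain
from nltk.probability import ConditionalFreqDist
from nltk.util import ngrams  # legacy name: ingrams

train = "the cat sat on the mat".split()
n = 3
prefix = ("",) * (n - 1)

cfd = ConditionalFreqDist()
for gram in ngrams(chain(prefix, train), n):
    context, token = tuple(gram[:-1]), gram[-1]
    cfd[context][token] += 1  # count how often `token` follows `context`

print(cfd[("the", "cat")]["sat"])  # 1
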
Example #3
    def eval_word(self, word):
        '''
            @return: probability that proposed word is a word.
        '''
        if self._hypothetical_phonotactics:
            # Make a deep copy of the ngram model so that we can update it without modifying the original
            word_ngram_model = copy.deepcopy(self._ngram_model)
            word_ngram_model.update([word])
        else:
            word_ngram_model = self._ngram_model

        # If I want to duplicate functionality of OCaml code, should probably make a ProbDist that inherits from Lidstone,
        # otherwise this won't do the denominator adjustments and other stuff.
        # for ngram in ingrams(chain(self._ngram_model._padding, word, self._ngram_model._padding), self._n):
        #     print("Ngram: {}\tProb for {} given {}: {}".format(ngram, ngram[-1], tuple(ngram[:-1]), self._ngram_model.prob(tuple(ngram[:-1]), ngram[-1])), file=sys.stderr)

        if self._n > 1:
            adjustment = Fraction(1)
        else:
            # Need to compensate for the lack of an empty word by getting rel freq of word_delimiter somehow
            # print("word delimiter prob: {}".format(self._ngram_model.prob(self._word_delimiter, self._ngram_model._padding)), file=sys.stderr)
            adjustment = Fraction(1) / (Fraction(1) - self._ngram_model.prob(self._word_delimiter, self._ngram_model._padding))

        # print([self._ngram_model.prob(ngram[-1], tuple(ngram[:-1])) for ngram in
        #                                     ingrams(chain(*([self._word_delimiter, word, self._word_delimiter] if self._n > 1 else [word, self._word_delimiter])),
        #                                                   self._n)], file=sys.stderr)

        raw_score = self._score_combiner([self._ngram_model.prob(ngram[-1], tuple(ngram[:-1])) for ngram in
                                            ingrams(chain(*([self._word_delimiter, word, self._word_delimiter] if self._n > 1 else [word, self._word_delimiter])),
                                                          self._n)])
        # print("adjustment: {:.10e}\traw score: {:.10e}".format(float(adjustment), float(raw_score)), file=sys.stderr)

        return adjustment * raw_score
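
In the unigram branch the adjustment simply renormalizes the model over everything except the word delimiter: if the delimiter carries probability p, the remaining mass 1 - p is rescaled back to 1. A tiny worked check with Fraction; the value of p is invented, not read from the model above:

from fractions import Fraction

p_delimiter = Fraction(1, 5)  # assumed P(word delimiter)
adjustment = Fraction(1) / (Fraction(1) - p_delimiter)
print(adjustment)  # 5/4, i.e. the leftover 4/5 of the mass is rescaled to 1
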
Example #4
 def N_gramas_lista(self,lista,N):
         if  isinstance(lista, list) and isinstance(N, int):
                 if (all(type(x) is str for x in lista)):
                       try:
                                 listaNgrama=[]
                                 remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
                                 lista=[unicode(s,codificacion) for s in lista]
                                 lista=[s.translate(remove_punctuation_map).lower() for s in lista]
                                 lista=[str(self.elimina_tildes(s)) for s in lista]
                                 lista= filter(None,lista)
                                 lista= filter(str.strip, lista)    
                                 for Ngramas in ingrams(lista, N, pad_right=True):
                                         if (str(Ngramas[N-1])!= "None"):
                                                 listaNgrama.append(" ".join(Ngramas))
                                 return listaNgrama
                       except TypeError:
                                 return []
                       except UnicodeDecodeError:
                                 return []  
                       except:
                                 return []
                 else:
                        return []  
         else:
                 return []
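
The remove_punctuation_map idiom builds a str.translate table that maps every punctuation code point to None, which deletes it. A Python 3 sketch of the same normalization pipeline (the original runs under Python 2, hence unicode() and filter(); the accent-stripping helper elimina_tildes is omitted here):

import string

remove_punctuation_map = {ord(ch): None for ch in string.punctuation}

def normaliza(tokens):
    # Strip punctuation, lowercase, and drop empty or whitespace-only tokens.
    cleaned = (t.translate(remove_punctuation_map).lower() for t in tokens)
    return [t for t in cleaned if t.strip()]

print(normaliza(["Hola,", "mundo!", "--", "NLTK"]))
# ['hola', 'mundo', 'nltk']
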
Example #5
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str)
        :param estimator: a function for generating a probability distribution
        :type estimator: function(ConditionalFreqDist) -> ConditionalProbDist
        """

        self._n = n

        if estimator is None:
            estimator = _estimator
        
        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('',) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, len(cfd))

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, estimator)
Example #6
    def from_words(cls, words):
        """Construct a QuadgramCollocationFinder for all quadgrams in the given
        sequence.
        """
        wfd = FreqDist()
        bfd = FreqDist()
        wild2fd = FreqDist()
        tfd = FreqDist()
        wild3fd = FreqDist()
        qfd = FreqDist()

        for w1, w2, w3, w4 in ingrams(words, 4, pad_right=True):
            wfd.inc(w1)
            if w2 is None:
                continue
            bfd.inc((w1, w2))
            if w3 is None:
                continue
            wild2fd.inc((w1, w3))
            tfd.inc((w1, w2, w3))
            if w4 is None:
                continue
            wild2fd.inc((w1, w4))
            wild3fd.inc((w1, w2, w4))
            wild3fd.inc((w1, w3, w4))
            qfd.inc((w1, w2, w3, w4))
        return cls(wfd, bfd, wild2fd, tfd, wild3fd, qfd)
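
The early continue guards exist because right padding fills incomplete windows with None, and the wild2fd/wild3fd distributions then count skip-grams anchored at w1 (for example (w1, w3), which skips one word). A quick look at what the padded window iteration actually yields, over a made-up word list:

from nltk.util import ngrams  # legacy name: ingrams

print(list(ngrams("a b c".split(), 4, pad_right=True)))
# [('a', 'b', 'c', None), ('b', 'c', None, None), ('c', None, None, None)]
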
Example #7
def big_test(version="3.0", max_length=3):
    from topicmod.util.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    term_counts = defaultdict(int)

    for ngram_length in xrange(max_length):
        token = 0
        for w in ingrams(brown.words(), ngram_length):
            token += 1
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    filename = "wn/wordnet.wn"
    if version != "3.0":
        filename = "wn/wordnet_%s.wn" % version
    o = OntologyWriter(filename)
    for ii in orderedTraversal(wn):
        o.AddSynset(ii.offset,
                    ii.name,
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0, x.name.lower(), term_counts[x.name] + 1)
                     for x in ii.lemmas])
    o.Finalize()
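
The "_".join(w).lower() normalization matches WordNet's convention of storing multi-word lemmas with underscores. load_wn is a project-specific loader, so the quick check below uses NLTK's bundled WordNet reader instead (assumes the wordnet corpus has been downloaded):

from nltk.corpus import wordnet as wn

print(bool(wn.synsets("ice_cream", "n")))  # True: the joined bigram is a noun lemma
print(wn.morphy("ice_cream"))              # 'ice_cream' (already in lemma form)
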
Example #8
    def __init__(self, n, train, pad_left=True, pad_right=False,estimator=None, *estimator_args, **estimator_kwargs):
        super(MyNgramModel,self).__init__(n,train,pad_left,pad_right,estimator,*estimator_args, **estimator_kwargs)
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))
        
        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        self._cfd = ConditionalFreqDist()
        self._ngrams = set()
        
            
        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], basestring):
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                self._cfd[context].inc(token)

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(self._cfd, estimator, len(self._cfd))
        else:
            self._model = ConditionalProbDist(self._cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        self._backoff = None
        if n > 1:
            self._backoff = MyNgramModel(n-1, train, pad_left, pad_right, estimator, *estimator_args, **estimator_kwargs)
        
            if self._backoff is not None:
                self._backoff_alphas = dict()
    
            # For each condition (or context)
                for ctxt in self._cfd.conditions():
                    pd = self._model[ctxt] # prob dist for this context
                    backoff_ctxt = ctxt[1:]
                    backoff_total_pr = 0
                    total_observed_pr = 0
                    for word in self._cfd[ctxt].keys(): # this is the subset of words that we OBSERVED                    
                        backoff_total_pr += self._backoff.prob(word,backoff_ctxt) 
                        total_observed_pr += pd.prob(word)        
                    assert total_observed_pr <= 1 and total_observed_pr > 0
                    assert backoff_total_pr <= 1 and backoff_total_pr > 0
                    alpha_ctxt = (1.0-total_observed_pr) / (1.0-backoff_total_pr)
        
                    self._backoff_alphas[ctxt] = alpha_ctxt
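
alpha_ctxt is the usual Katz back-off weight: the probability mass the higher-order model leaves unassigned, divided by the mass the lower-order model gives to the same observed words, so that the backed-off distribution still sums to one. A small numeric check with invented masses:

from fractions import Fraction

total_observed_pr = Fraction(7, 10)  # assumed mass the trigram model gives to the observed words
backoff_total_pr = Fraction(1, 2)    # assumed mass the bigram backoff gives to those same words
alpha = (1 - total_observed_pr) / (1 - backoff_total_pr)
print(alpha)  # 3/5; each unseen word gets alpha * P_backoff(word | shorter context)
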
Example #9
    def __init__(self, n, train, estimator=None, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string} (or C{list} of C{string} C{list}s)
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        @param estimator_args: Extra arguments for C{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying C{ConditionalFreqDist} are passed to
            the estimator as an argument.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for C{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._padding = ('',) * (n - 1)

        # If given a list of strings instead of a list of lists, create enclosing list
        if isinstance(train[0], basestring):
            train = [train]

        for utterance in train:
            for ngram in ingrams(chain(self._padding, utterance, self._padding), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

        if (not estimator_args) and (not estimator_kw_args):
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, estimator, *estimator_args, **estimator_kw_args)
Example #10
def quantify_variant(analysis, variant):
    n = variant + 1

    d = {}

    all_pos_tags = [ pos for (_, pos) in analysis.pos_tags() ]

    for ngram in ingrams(all_pos_tags, n):
        sparse_dict_increment(d, ngram)

    return {output_filter_ngram(k): v for (k, v) in d.items()}
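
Since n = variant + 1, variant 0 counts unigram POS tags, variant 1 counts bigrams, and so on; sparse_dict_increment and output_filter_ngram are project helpers not shown here. A rough equivalent of the counting step with a Counter, assuming the helper merely increments a key:

from collections import Counter
from nltk.util import ngrams  # legacy name: ingrams

pos_tags = ["DT", "NN", "VBD", "DT", "NN"]
counts = Counter(ngrams(pos_tags, 2))  # variant = 1 -> bigrams of tags
print(counts[("DT", "NN")])            # 2
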
Example #11
    def classify(self, sentence, tokenizer_lang, ngram_length=3):
        features = []
        for ii in self.tokenizers[tokenizer_lang].tokenize(sentence):
            d = {}
            for jj in ingrams(ii, ngram_length):
                d[jj] = d.get(jj, 0) + 1
            features.append(d)
        data = SparseDataSet(features)

        f = FreqDist()
        for ii in [self._labels[self._classifier.classify(data, x)[0]] for x in xrange(len(features))]:
            f.inc(ii)
        return f
Example #12
    def update(self, samples, increase_amount=1):
        '''
        Update the underlying frequency distributions given the current list of samples.
        '''
        cond_samples = []
        for utterance in samples:
            for ngram in ingrams(chain(self._padding, utterance, self._padding), self._n):
                self._ngrams.add(ngram)
                cond_samples.append((tuple(ngram[:-1]), ngram[-1]))
        self._model.update(cond_samples, increase_amount)

        # Recursively update lower-order models
        if self._backoff:
            self._backoff.update(samples, increase_amount)
Example #13
    def AddInstance(self, lang, line, ngram_length):
        if not lang in self.label_names:
            self.label_names.append(lang)

        id = line[:line.find("\t")]
        sentence = line[line.find("\t")+1:].strip()

        d = {}
        for ii in ingrams(line, ngram_length):
            d[ii] = d.get(ii, 0) + 1

        self.features.append(d)
        self.labels.append(self.label_names.index(lang))
        self.id.append("%s-%s" % (lang, id))
Example #14
    def GetNGrams(self,tokenized_sentences):
        # Description = Takes the tokenized sentences and outputs all of the n-grams from a given text
        # Inputs:
            # Tokenized Sentences = The sentences after we tokenize the file to be used.
        # Outputs:
            # n_grams = The list of n_grams found in the file, not unique

       # This function simply turns all of the words into a really long vector of ngrams
        n_grams = []
        for token_sent in tokenized_sentences:
            sent_n_grams = ingrams(token_sent,self.N,self.leftpad,self.rightpad)
            n_grams.extend(sent_n_grams)
        
        return n_grams
Example #15
 def prepareBigrams(self, window_size, word):
     wfd = FreqDist()
     bfd = FreqDist()
     
     if word == '':
         for sentence in self.__sentences:
             if len(sentence) > 1:
                 for window in ingrams(sentence, window_size, pad_right=True):
                     if window[0] not in self.__ignoredColl:
                         w1 = window[0]
                         try:
                             window = window[:list(window).index(w1, 1)]
                         except ValueError:
                             pass
                         wfd.inc(w1)
                         for w2 in set(window[1:]):
                             if w2 is not None and w2 not in self.__ignoredColl:
                                 bfd.inc((w1, w2))
     else:
         for sentence in self.__sentences:
             if len(sentence) > 1:
                 for window in ingrams(sentence, window_size, pad_right=True):
                     if window[0] not in self.__ignoredColl:
                         w1 = window[0]
                         try:
                             window = window[:list(window).index(w1, 1)]
                         except ValueError:
                             pass
                         bigramOK = False
                         for w2 in set(window[1:]):
                             if w2 is not None and w2 not in self.__ignoredColl and (w1 == word or w2==word):
                                 bfd.inc((w1, w2))
                                 bigramOK = True
                         if bigramOK:
                             wfd.inc(w1)
                             
     self.__bigrams = MyBigramCollFinder(wfd, bfd)
Example #16
    def __init__(self, n, train, estimator, freqtype, padding, backoff, *estimator_args, **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: L{int}
        @param train: the training text
        @type train: L{list} of L{str} (or L{list} of L{str} L{list}s)
        @param estimator: a function for generating a probability distribution (must take FreqDist as first argument, and n as second)
        @type estimator: a function that takes a L{ConditionalFreqDist} and returns a L{ConditionalProbDist}
        @param freqtype: the type to use to store the counts in the underlying frequency distribution
        @type freqtype: any numeric type
        @param padding: the symbol used to pad each utterance on both sides before counting ngrams
        @type padding: L{str}
        @param backoff: whether or not we should use Katz back-off
        @type backoff: L{bool}
        @param estimator_args: Extra arguments for L{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for L{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        cfd = ConditionalFreqDist(counttype=freqtype)
        self._ngrams = set()
        self._padding = (padding,) * (n - 1)
        self._estimator = estimator
        self._freqtype = freqtype
        self._estimator_args = estimator_args
        self._estimator_kw_args = estimator_kw_args

        if train:
            for utterance in train:
                for ngram in ingrams(chain(self._padding, utterance, self._padding), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context].inc(token)

        self._model = ConditionalProbDist(cfd, estimator, self._freqtype, n, *estimator_args, **estimator_kw_args)

        # recursively construct the lower-order models
        self._backoff = PartialCountNgramModel(n - 1, train, estimator, freqtype, padding, backoff, *estimator_args, **estimator_kw_args) if (backoff and n > 1) else None
Example #17
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  By default, bigrams must be contiguous.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError, "Specify window_size at least 2"

        for window in ingrams(words, window_size, pad_right=True):
            w1 = window[0]
            wfd.inc(w1)
            for w2 in window[1:]:
                if w2 is not None:
                    bfd.inc((w1, w2))
        return cls(wfd, bfd)
Example #18
def big_test(hyperparam, treefilename, hyperfilename, vocab, version="3.0",
             max_length=2):
    """
    @param hyperparam A function that, given a synset, returns a hyperparameter
    value

    @param version The version of WordNet we use

    @param max_length The maximum length of n-grams we'll use for computing
    counts
    """
    from python_lib.wordnet import load_wn
    from nltk.corpus import brown
    from nltk.util import ingrams

    wn = load_wn(version)

    term_counts = defaultdict(int)

    for ngram_length in xrange(max_length):
        token = 0
        for w in ingrams(brown.words(), ngram_length):
            token += 1
            normalized = "_".join(w).lower()
            if wn.synsets(normalized, 'n'):
                term_counts[wn.morphy(normalized)] += 1

    print("Done collecting counts")

    if not treefilename:
        treefilename = "wn/wordnet.wn"
        if version != "3.0":
            treefilename = "wn/wordnet_%s.wn" % version

    o = OntologyWriter(treefilename, vocab=vocab, max_leaves=-1)
    for ii in wn.all_synsets('n'):
        o.AddSynset(ii.offset,
                    ii.name,
                    [x.offset for x in ii.hyponyms() + ii.instance_hyponyms()],
                    [(0, x.name.lower(), term_counts[x.name] + 1)
                     for x in ii.lemmas], hyperparameter=hyperparam(ii))
    o.Finalize()

    #hyperparam.dump(filename + "_hyp.lookup")
    hyperparam.dump(hyperfilename)
Example #19
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  When window_size > 2, count non-contiguous bigrams, in the
        style of Church and Hanks's (1990) association ratio.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError("Specify window_size at least 2")

        for window in ingrams(words, window_size, pad_right=True):
            w1 = window[0]
            wfd.inc(w1)
            for w2 in window[1:]:
                if w2 is not None:
                    bfd.inc((w1, w2))
        return cls(wfd, bfd, window_size=window_size)
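
In current NLTK the same windowed counting sits behind nltk.collocations.BigramCollocationFinder.from_words. A short usage sketch of that public API (the word list is invented; with window_size=3 the finder also counts gapped pairs, in the Church and Hanks sense):

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

words = "the quick brown fox jumps over the lazy dog".split()
finder = BigramCollocationFinder.from_words(words, window_size=3)
print(finder.nbest(BigramAssocMeasures().pmi, 3))  # three highest-PMI (possibly gapped) pairs
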
Example #20
 def N_gramas_texto(self,texto,N):
         if  isinstance(texto, str) and isinstance(N, int):
                 try:
                         remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
                         lista=[s.translate(remove_punctuation_map).lower() for s in nltk.word_tokenize(unicode(texto,codificacion))]
                         lista=[str(self.elimina_tildes(s)) for s in lista]
                         lista= filter(None,lista)
                         lista= filter(str.strip, lista)
                         listaNgrama=[]
                         for Ngramas in ingrams(lista, N, pad_right=True):
                                 if (str(Ngramas[N-1])!= "None"):
                                         listaNgrama.append(" ".join(Ngramas))
                         return listaNgrama
                 except TypeError:
                         return []
                 except UnicodeDecodeError:
                         return []    
                 except:
                         return []
         else:
                 return []
Example #21
    def from_words(cls, words, window_size=2):
        """Construct a BigramCollocationFinder for all bigrams in the given
        sequence.  By default, bigrams must be contiguous.
        """
        wfd = FreqDist()
        bfd = FreqDist()

        if window_size < 2:
            raise ValueError, "Specify window_size at least 2"

        for window in ingrams(words, window_size, pad_right=True):
            w1 = window[0]
            try:
                window = window[:list(window).index(w1, 1)]
            except ValueError:
                pass
            wfd.inc(w1)
            for w2 in set(window[1:]):
                if w2 is not None:
                    bfd.inc((w1, w2))
        return cls(wfd, bfd)
Example #22
    def __init__(self, n, train, estimator=None):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        """
        self._n = n
        self._N = 1 + len(train) - n

        if estimator is None:
            def estimator(fdist, bins): return MLEProbDist(fdist)

        if n == 1:
            fd = FreqDist(train)
            self._model = estimator(fd, fd.B())
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()
            self._prefix = ('',) * (n - 1)

            for ngram in ingrams(chain(self._prefix, train), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1

            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator)
Example #23
    def from_words(cls, words):
        """Construct a QuadgramCollocationFinder for all quadgrams in the given
        sequence.
        """
        wfd = FreqDist()
        bfd = FreqDist()
        wild2fd = FreqDist()
        tfd = FreqDist()
        wild3fd = FreqDist()
        qfd = FreqDist()
        wild4fd = FreqDist()
        pfd = FreqDist()

        for w1, w2, w3, w4, w5 in ingrams(words, 5, pad_right=True):
            wfd.inc(w1)
            if w2 is None:
                continue
            bfd.inc((w1, w2))
            if w3 is None:
                continue
            wild2fd.inc((w1, w3))
            tfd.inc((w1, w2, w3))
            if w4 is None:
                continue
            wild2fd.inc((w1, w4))
            wild3fd.inc((w1, w2, w4))
            wild3fd.inc((w1, w3, w4))
            qfd.inc((w1, w2, w3, w4))
            if w5 is None:
                continue
            wild2fd.inc((w1, w5))
            wild3fd.inc((w1, w2, w5))
            wild3fd.inc((w1, w3, w5))
            wild3fd.inc((w1, w4, w5))
            wild4fd.inc((w1, w3, w4, w5))
            wild4fd.inc((w1, w2, w4, w5))
            wild4fd.inc((w1, w2, w3, w5))
            pfd.inc((w1, w2, w3, w4, w5))
        return cls(wfd, bfd, wild2fd, tfd, wild3fd, qfd, wild4fd, pfd)
Example #24
    def prob_classify(self, document):
        """
        @return: a probability distribution over labels for the given document.
        @rtype: L{ProbDistI <nltk.probability.ProbDistI>}
        """
        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}

        # Loop through each possible label and calculate the
        # (log) probability of document under that label
        for label in self.labels():
            # Probability of category
            logprob[label] = self.model.category_probdist.prob(label)

            # Extract ngram model
            ngram_model = self.model.ngrams[label]

            prefix = ('',) * (ngram_model._n - 1)

            # Prepare words
            words = [w.lower() for w in document if w.isalpha()]

            # Go through each word and calculate P(w | context)
            for ngram in ingrams(chain(prefix, words), ngram_model._n):
                context = tuple(ngram[:-1])
                token = ngram[-1]

                try:
                    logprob[label] += -ngram_model.logprob(token, context)
                except RuntimeError:
                    # Unknown word, skip it
                    #logger.debug(label + ': Ignoring unknown word: ' + token)
                    continue

            #logger.debug(label + ': ' + str(logprob[label]))

        # Return probability for each label
        return DictionaryProbDist(logprob, normalize=True, log=True)
Example #25
    def eval_word(self, word):
        '''
            @return: probability that proposed word is a word.
        '''
        if self._hypothetical_phonotactics:
            # Make a deep copy of the ngram model so that we can update it without modifying the original
            word_ngram_model = copy.deepcopy(self._ngram_model)
            word_ngram_model.update([word])
        else:
            word_ngram_model = self._ngram_model

        # If I want to duplicate functionality of OCaml code, should probably make a ProbDist that inherits from Lidstone,
        # otherwise this won't do the denominator adjustments and other stuff.
        # for ngram in ingrams(chain(self._ngram_model._padding, word, self._ngram_model._padding), self._n):
        #     print("Ngram: {}\tProb for {} given {}: {}".format(ngram, ngram[-1], tuple(ngram[:-1]), self._ngram_model.prob(tuple(ngram[:-1]), ngram[-1])), file=sys.stderr)

        if self._n > 1:
            adjustment = Fraction(1)
        else:
            # Need to compensate for the lack of an empty word by getting rel freq of word_delimiter somehow
            # print("word delimiter prob: {}".format(self._ngram_model.prob(self._word_delimiter, self._ngram_model._padding)), file=sys.stderr)
            adjustment = Fraction(1) / (Fraction(1) - self._ngram_model.prob(
                self._word_delimiter, self._ngram_model._padding))

        # print([self._ngram_model.prob(ngram[-1], tuple(ngram[:-1])) for ngram in
        #                                     ingrams(chain(*([self._word_delimiter, word, self._word_delimiter] if self._n > 1 else [word, self._word_delimiter])),
        #                                                   self._n)], file=sys.stderr)

        raw_score = self._score_combiner([
            self._ngram_model.prob(ngram[-1], tuple(ngram[:-1]))
            for ngram in ingrams(
                chain(
                    *([self._word_delimiter, word, self._word_delimiter] if
                      self._n > 1 else [word, self._word_delimiter])), self._n)
        ])
        # print("adjustment: {:.10e}\traw score: {:.10e}".format(float(adjustment), float(raw_score)), file=sys.stderr)

        return adjustment * raw_score
Example #26
def get_nested_noun_phrase_chunks(noun_phrase_chunks):
    # grabs any nested noun phrases
    # helpful for when original noun phrase includes tokens
    # which shouldn't really be part of the noun-phrase 
    # e.g. [('use', 'V'), ('firewalls', 'N')] should just be 'firewalls'
    
    def is_nounphrase(tagged_phrase):
        # Determine whether a tagged phrase matches the
        # noun-phrase regular expression tag sequence
        for subtree in NP_PARSER.parse(tagged_phrase).subtrees():
            if subtree.node == 'NounPhrase':
                return True
        return False

    from nltk.util import ingrams

    nested_noun_phrase_chunks = []
    for np in noun_phrase_chunks:
        for ngramSize in xrange(len(np)-1,0,-1):
            for ngram in ingrams(np,ngramSize):
                if is_nounphrase(ngram):
                    nested_noun_phrase_chunks.append(list(ngram))
    return nested_noun_phrase_chunks
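
The two nested loops enumerate every contiguous proper sub-span of a chunk, longest first, and keep the spans the parser accepts as noun phrases. NP_PARSER is project-specific, so the sketch below only shows the enumeration, over a made-up chunk:

from nltk.util import ngrams  # legacy name: ingrams

np_chunk = [("use", "V"), ("firewalls", "N")]
for size in range(len(np_chunk) - 1, 0, -1):
    for gram in ngrams(np_chunk, size):
        print(list(gram))
# [('use', 'V')]
# [('firewalls', 'N')]
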
Example #27
def get_nested_noun_phrase_chunks(noun_phrase_chunks):
    # grabs any nested noun phrases
    # helpful for when original noun phrase includes tokens
    # which shouldn't really be part of the noun-phrase
    # e.g. [('use', 'V'), ('firewalls', 'N')] should just be 'firewalls'

    def is_nounphrase(tagged_phrase):
        # Determine whether a tagged phrase matches the
        # noun-phrase regular expression tag sequence
        for subtree in NP_PARSER.parse(tagged_phrase).subtrees():
            if subtree.node == 'NounPhrase':
                return True
        return False

    from nltk.util import ingrams

    nested_noun_phrase_chunks = []
    for np in noun_phrase_chunks:
        for ngramSize in xrange(len(np) - 1, 0, -1):
            for ngram in ingrams(np, ngramSize):
                if is_nounphrase(ngram):
                    nested_noun_phrase_chunks.append(list(ngram))
    return nested_noun_phrase_chunks
Example #28
    def __init__(self, n, train, estimator=None, factor=0.77):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        """

        self._n = n

        if estimator is None:
            if n > 1:
                # Use smoothing based on Ney et al
                probdist_factory = lambda fdist, bins, n_train, n_0: \
                                NeyProbDist(fdist, bins, n_train, n_0, factor, NeyProbDist.ABSOLUTE)
                                
                #probdist_factory = lambda fdist, bins, *args: LaplaceProbDist(fdist, bins)
            else:
                # Use simple add-1 smoothing for unigrams
                probdist_factory = lambda fdist, bins, *args: LaplaceProbDist(fdist, bins)
        else:
            probdist_factory = estimator

        # Initialize conditional frequency distribution
        cfd = ConditionalFreqDist()

        # Initialize set of ngrams
        self._ngrams = set()
        self._ngram_count = 0

        # Prefix beginning of document with empty strings
        self._prefix = ('',) * (n - 1)

        # Count the number of training examples
        num_training = 0

        # Loop through each ngram and add to CFD
        for ngram in ingrams(chain(self._prefix, train), n):
            # Lowercase words
            ngram = tuple(w.lower() for w in ngram)

            # Add to known ngrams
            self._ngrams.add(ngram)

            # Add to CFD
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

            num_training += 1

        # Calculate vocabulary size (for NeyProbDist)
        v = len(set(train))
        bins = v ** n

        # Number of bins with a count > 0
        self._ngram_count = len(self._ngrams)

        # Gives us number of bins with count = 0
        n_0 = bins - self._ngram_count

        # Create CPD model
        self._model = ConditionalProbDist(cfd, probdist_factory, bins, num_training, n_0)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = SLINgramModel(n-1, train, estimator)
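
The n_0 handed to NeyProbDist is the number of n-gram bins that were never observed: the full space of v**n possible n-grams over a vocabulary of size v, minus the distinct n-grams actually seen. A quick arithmetic illustration with invented numbers:

v = 1000             # assumed vocabulary size
n = 2
observed = 150000    # assumed number of distinct bigrams seen in training
bins = v ** n        # 1,000,000 possible bigrams
n_0 = bins - observed
print(n_0)           # 850000 bins with a zero count
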
Example #29
    def __init__(self, n, train, k=5, v=None,
                 liveDangerously=False, quiet=False):
        """
        Creates a Katz-thresholded Ngram language model to capture
        patterns in n consecutive words of training text.
        Uses the KGoodTuringProbDist to estimate the conditional and unigram
        probabilities, providing coverage of Ngrams not seen during training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param k: The threshold above which counts are assumed
                  to be reliable.  Defaults to 5.
        @type  k: C{Int}
        @param v: The number of unseens of degree 1.  Defaults to the
                  number of types in the training set
        @type  v: C{Int}
        @param liveDangerously: If False, for each model check that
                                the total probability mass after all
                                adjustments is close to 1.  Defaults
                                to False.
        @type  liveDangerously: C{Boolean}
        @param quiet: Various information will be printed during model
                       construction unless this is True.  Defaults to False.
        @type  quiet: C{Boolean}
        """
        self._n = n
        self._N = 1 + len(train) - n
        fd = FreqDist(train)
        if v is None:
            v = fd.B()
        print(('v', v))
        if n == 1:
            # Treat this case specially
            self._model = KGoodTuringProbDist(fd, k, v, liveDangerously, ())
            if not quiet:
                print("%s entries for %s tokens at degree 1, %s" % (len(fd),
                                                                    fd.N(),
                                                                    self._model.status))
        else:
            def estimator(fdist, ctxt): return KGoodTuringProbDist(fdist, k, v,
                                                                   liveDangerously,
                                                                   ctxt)

            cfd = ConditionalFreqDist()

            for ngram in ingrams(train, n):
                # self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

            self._model = ConditionalProbDist(cfd, estimator, True)
            if not quiet:
                statuses = {'normal': 0, 'bigSkewed': 0,
                            'weak': 0, 'lowHacked': 0}
                for ctx in cfd.conditions():
                    statuses[self[ctx].status] += 1
                print("%s conditions at degree %s" %
                      (len(cfd.conditions()), n))
                for s in list(statuses.keys()):
                    print(" %s %6d" % (s, statuses[s]))

            # recursively construct the lower-order models
            self._backoff = KBNgramModel(n - 1, train, k, v, liveDangerously)
Example #30
 def __add_token_ngrams(self, token):
     for current_ngram in ingrams(token, self.k + 1):
         sparse_dict_increment(self.histogram, ''.join(current_ngram))
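
Here token is a single string, so ingrams yields character (k+1)-grams and joining them produces substring features. A minimal sketch with a Counter standing in for sparse_dict_increment, which is assumed to be a simple increment helper:

from collections import Counter
from nltk.util import ngrams  # legacy name: ingrams

k = 2
histogram = Counter()
for gram in ngrams("banana", k + 1):
    histogram["".join(gram)] += 1

print(histogram)  # Counter({'ana': 2, 'ban': 1, 'nan': 1})
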
Example #31
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +ELLIPSIS
            0.5776...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()


        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], compat.string_types):
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                                       estimator, *estimator_args, **estimator_kwargs)
Example #32
def frequency_of_term_in_article(term, tokens):
    # term is a tuple of strings
    # tokens is a list of sentences, where sentences are a list of words
    return sum(1 for sent in tokens for ngram in ingrams(sent, len(term))
               if tuple(ngram) == term)
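
A quick sanity check of the counting expression, rewritten against the current nltk.util.ngrams name; the term and token lists are invented:

from nltk.util import ngrams  # legacy name: ingrams

def frequency_of_term_in_article(term, tokens):
    # term is a tuple of strings; tokens is a list of sentences, each a list of words
    return sum(1 for sent in tokens for gram in ngrams(sent, len(term))
               if tuple(gram) == term)

tokens = [["the", "cat", "sat"], ["the", "cat", "ran"], ["a", "dog", "sat"]]
print(frequency_of_term_in_article(("the", "cat"), tokens))  # 2
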
Example #33
 def __add_token_ngrams(self, token):
     for current_ngram in ingrams(token, self.k + 1):
         sparse_dict_increment(self.histogram, ''.join(current_ngram))
Example #34
    def __init__(self,
                 n,
                 train,
                 estimator=None,
                 *estimator_args,
                 **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator)
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +ELLIPSIS
            1.682...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list of string
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
              returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kw_args: Extra keyword arguments for estimator.
        :type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._prefix = ('', ) * (n - 1)

        for ngram in ingrams(chain(self._prefix, train), n):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        if (not estimator_args) and (not estimator_kw_args):
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args,
                                              **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator,
                                       *estimator_args, **estimator_kw_args)
Example #35
def frequency_of_term_in_article(term,tokens):
    # term is a tuple of strings
    # tokens is a list of sentences, where sentences are a list of words
    return sum(1 for sent in tokens for ngram in ingrams(sent,len(term)) if tuple(ngram) == term)
Example #36
    def __init__(self,
                 n,
                 train,
                 smoothing=1e9,
                 #lagrangian_parameter=1.,
                 #estimator=None,
                 maximum_length=20,
                 minimum_length=3,
                 char_set=string.lowercase + string.punctuation + string.digits,
                 #char_set=string.lowercase,
                 patch_char='#'):
        """
        Creates an nchar language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (nchar size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        """

        self._smoothing = smoothing;
        #self.lagrangian_parameter = lagrangian_parameter;

        self._n = n

        self._maximum_length = maximum_length;
        self._minimum_length = minimum_length;
        self._char_set = char_set;
        
        #estimator = lambda fdist, bins: nltk.probability.WittenBellProbDist(fdist, len(char_set));
        estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(fdist, self._smoothing, len(self._char_set)+1);
        #estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(fdist, 1e-9, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.GoodTuringProbDist(fdist, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.SimpleGoodTuringProbDist(fdist, len(self._char_set));

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._patch_char = patch_char;
        self._prefix = (self._patch_char,) * (n - 1)
        
        length = nltk.probability.FreqDist();
        word_freq_dist = nltk.probability.FreqDist();
        char_list = [];
        for word in train:
            word = word.strip().lower();
            if len(word)<self._minimum_length or len(word)>self._maximum_length:
                continue;
            length.inc(len(word));
            word_freq_dist.inc(word, 1);
            char_list.extend(self._prefix);
            char_list.extend([char for char in word if char in self._char_set]);
        self._length = nltk.probability.WittenBellProbDist(length, length.B()+1);
        #self._length = nltk.probability.WittenBellProbDist(length, self._maximum_length);
        
        #context_freq_dist = nltk.probability.FreqDist();
        #for nchar in ingrams(chain(self._prefix, train), n):
        for nchar in ingrams(char_list, n):
            self._ngrams.add(nchar)
            context = tuple(nchar[:-1])
            #context_freq_dist.inc(context);
            token = nchar[-1]
            cfd[context].inc(token)
        #self._context = nltk.probability.WittenBellProbDist(context_freq_dist, len(self._char_set)**(n-1)+1);

        '''
        if n==3:
            cond = 0;
            for x in self._char_set:
                for y in self._char_set:
                    print (x, y), context_freq_dist[(x, y)], self._context.prob((x, y));
                    cond += self._context.prob((x, y));
            print 'cond is', cond
        '''
        
        #self._model = ConditionalProbDist(cfd, estimator, len(cfd));
        #print self._char_set;
        self._model = ConditionalProbDist(cfd, estimator, len(self._char_set) ** (n - 1));

        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========
        '''
        consonant_freq_dist = nltk.probability.FreqDist();
        consonant_condition_freq_dist = nltk.probability.ConditionalFreqDist();
        for word in train:
            #word = re.sub(r'aeiou', ' ', word);
            word = word[0] + re.sub('aeiouy', ' ', word[1:]);
            
            consonant_list = word.split();
            #consonant_list = ['#', '#'] + consonant_list;
            for temp in consonant_list:
                consonant_freq_dist.inc(temp, 1);
                
        consonant_freq_dist.plot()
        '''
        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========        
        word_prob_dist = nltk.probability.MLEProbDist(word_freq_dist);

        word_model_empirical_frequency = numpy.zeros((1, self._maximum_length - self._minimum_length + 1)) + 1e-300;
        word_model_square = numpy.zeros((1, self._maximum_length - self._minimum_length + 1)) + 1e-300;
        
        #word_model_empirical_frequency_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));
        #word_model_square_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));
        
        total_outcomes = 0;
        for x in xrange(self._minimum_length, self._maximum_length+1):
            total_outcomes += len(self._char_set) ** x;

        for word in word_freq_dist.keys():
            word_model_empirical_frequency[0, len(word)-self._minimum_length] += word_prob_dist.prob(word) * self.probability_without_length(word);
            #word_model_empirical_frequency[0, len(word)-self._minimum_length] += 1.0/total_outcomes * self.probability_without_length(word);
            word_model_square[0, len(word)-self._minimum_length] += self.probability_without_length(word) ** 2;
            
            #word_model_empirical_frequency_old[0, len(word)-self._minimum_length] += word_prob_dist.prob(word) * self.probability_without_length(word);
            #word_model_square_old[0, len(word)-self._minimum_length] += self.probability_without_length(word) ** 2;
        
        #print "alpha is", 2 * (1-numpy.sum(word_model_empirical_frequency / word_model_square))/numpy.sum(1.0/word_model_square)
        #print word_model_empirical_frequency, word_model_square

        #sum_word_model_square_inverse = numpy.sum(1.0 / word_model_square);
        #sum_word_model_empirical_frequency_over_word_model_square = numpy.sum(word_model_empirical_frequency / word_model_square);
        #self._multinomial_length = (word_model_empirical_frequency * sum_word_model_square_inverse - sum_word_model_empirical_frequency_over_word_model_square + 1) / (word_model_square * sum_word_model_square_inverse);
        #print sum_word_model_square_inverse, sum_word_model_empirical_frequency_over_word_model_square;
        #print self._multinomial_length, numpy.sum(self._multinomial_length);
            
        if True:
            lagrangian_parameter = 2 * (1-numpy.sum(word_model_empirical_frequency / word_model_square))/numpy.sum(1.0/word_model_square)
        else:
            lagrangian_parameter = 1.;
        #print "lagrangian parameter is", lagrangian_parameter
        self._multinomial_length = (word_model_empirical_frequency - lagrangian_parameter / 2) / word_model_square;
        self._multinomial_length /= numpy.sum(self._multinomial_length);
        
        #print self._multinomial_length, numpy.sum(self._multinomial_length);
        assert numpy.all(self._multinomial_length>=0), self._multinomial_length;

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NcharModel(n-1, train, self._smoothing, maximum_length,
                 minimum_length, self._char_set, self._patch_char);
Example #37
t = time() - t
print str(t) + 's'

# Test generation of CFD
print 'Creating CFD...',
sys.stdout.flush()
t = time()

cat = cr.categories()[0]

n = 3

cfd = ConditionalFreqDist()
prefix = ('',) * (n - 1)

for ngram in ingrams(chain(prefix, cr.words(categories=[cat])), n):
    context = tuple(ngram[:-1])
    token = ngram[-1]
    cfd[context].inc(token)

t = time() - t
print str(t) + 's'


t = time()
print 'Pickling CFD...',
sys.stdout.flush()

pickle.dump(cfd, open('cfd.p', 'wb'), protocol=1)  # 'wb': protocol 1 is a binary pickle format

t = time() - t
print str(t) + 's'
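
For symmetry with the timing above, a minimal sketch of loading the pickled CFD back (a hypothetical continuation; it reuses the `pickle`, `sys`, and `time` imports the script above already assumes, and the `cfd.p` file it writes):

print 'Unpickling CFD...',
sys.stdout.flush()
t = time()

# 'rb' because protocol 1 is a binary pickle format
cfd2 = pickle.load(open('cfd.p', 'rb'))

t = time() - t
print str(t) + 's'
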
Example No. 38
0
    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

            >>> from nltk.corpus import brown
            >>> from nltk.probability import LidstoneProbDist
            >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
            ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
            ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
            ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
            ... # doctest: +ELLIPSIS
            0.5776...

        :param n: the order of the language model (ngram size)
        :type n: int
        :param train: the training text
        :type train: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert (isinstance(pad_left, bool))
        assert (isinstance(pad_right, bool))

        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()

        # If given a list of strings instead of a list of lists, create enclosing list
        if (train is not None) and isinstance(train[0], str):
            train = [train]

        for sent in train:
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                       estimator, *estimator_args, **estimator_kwargs)
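
To make the padding behaviour above concrete, here is a small standalone sketch (assuming NLTK 2.x, where `ingrams` lives in `nltk.util`; later releases expose the same function as `ngrams`) showing the trigrams produced when the left pad of empty strings is prepended to a short sentence:

from itertools import chain
from nltk.util import ingrams

n = 3
lpad = ('',) * (n - 1)   # what pad_left=True prepends for a trigram model
sent = ['the', 'cat', 'sat']

# the first two trigrams carry the padded context, the last is fully lexical
print list(ingrams(chain(lpad, sent), n))
# -> [('', '', 'the'), ('', 'the', 'cat'), ('the', 'cat', 'sat')]
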
Example No. 39
0
    def __init__(
            self,
            n,
            train,
            smoothing=1e9,
            #lagrangian_parameter=1.,
            #estimator=None,
            maximum_length=20,
            minimum_length=3,
            char_set=string.lowercase + string.punctuation + string.digits,
            #char_set=string.lowercase,
            patch_char='#'):
        """
        Creates an nchar language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (nchar size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string}
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        """

        self._smoothing = smoothing
        #self.lagrangian_parameter = lagrangian_parameter;

        self._n = n

        self._maximum_length = maximum_length
        self._minimum_length = minimum_length
        self._char_set = char_set

        #estimator = lambda fdist, bins: nltk.probability.WittenBellProbDist(fdist, len(char_set));
        estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(
            fdist, self._smoothing,
            len(self._char_set) + 1)
        #estimator = lambda fdist, bins: nltk.probability.LidstoneProbDist(fdist, 1e-9, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.GoodTuringProbDist(fdist, len(self._char_set));
        #estimator = lambda fdist, bins: nltk.probability.SimpleGoodTuringProbDist(fdist, len(self._char_set));

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._patch_char = patch_char
        self._prefix = (self._patch_char, ) * (n - 1)

        length = nltk.probability.FreqDist()
        word_freq_dist = nltk.probability.FreqDist()
        char_list = []
        for word in train:
            word = word.strip().lower()
            if len(word) < self._minimum_length or len(
                    word) > self._maximum_length:
                continue
            length.inc(len(word))
            word_freq_dist.inc(word, 1)
            char_list.extend(self._prefix)
            char_list.extend([char for char in word if char in self._char_set])
        self._length = nltk.probability.WittenBellProbDist(
            length,
            length.B() + 1)
        #self._length = nltk.probability.WittenBellProbDist(length, self._maximum_length);

        #context_freq_dist = nltk.probability.FreqDist();
        #for nchar in ingrams(chain(self._prefix, train), n):
        for nchar in ingrams(char_list, n):
            self._ngrams.add(nchar)
            context = tuple(nchar[:-1])
            #context_freq_dist.inc(context);
            token = nchar[-1]
            cfd[context].inc(token)
        #self._context = nltk.probability.WittenBellProbDist(context_freq_dist, len(self._char_set)**(n-1)+1);
        '''
        if n==3:
            cond = 0;
            for x in self._char_set:
                for y in self._char_set:
                    print (x, y), context_freq_dist[(x, y)], self._context.prob((x, y));
                    cond += self._context.prob((x, y));
            print 'cond is', cond
        '''

        #self._model = ConditionalProbDist(cfd, estimator, len(cfd));
        #print self._char_set;
        self._model = ConditionalProbDist(cfd, estimator,
                                          len(self._char_set)**(n - 1))

        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========
        '''
        consonant_freq_dist = nltk.probability.FreqDist();
        consonant_condition_freq_dist = nltk.probability.ConditionalFreqDist();
        for word in train:
            #word = re.sub(r'aeiou', ' ', word);
            word = word[0] + re.sub('aeiouy', ' ', word[1:]);
            
            consonant_list = word.split();
            #consonant_list = ['#', '#'] + consonant_list;
            for temp in consonant_list:
                consonant_freq_dist.inc(temp, 1);
                
        consonant_freq_dist.plot()
        '''
        #========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ========== ==========
        word_prob_dist = nltk.probability.MLEProbDist(word_freq_dist)

        word_model_empirical_frequency = numpy.zeros(
            (1, self._maximum_length - self._minimum_length + 1)) + 1e-300
        word_model_square = numpy.zeros(
            (1, self._maximum_length - self._minimum_length + 1)) + 1e-300

        #word_model_empirical_frequency_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));
        #word_model_square_old = numpy.zeros((1, self._maximum_length - self._minimum_length + 1));

        total_outcomes = 0
        for x in xrange(self._minimum_length, self._maximum_length + 1):
            total_outcomes += len(self._char_set)**x

        for word in word_freq_dist.keys():
            word_model_empirical_frequency[
                0, len(word) - self._minimum_length] += word_prob_dist.prob(
                    word) * self.probability_without_length(word)
            #word_model_empirical_frequency[0, len(word)-self._minimum_length] += 1.0/total_outcomes * self.probability_without_length(word);
            word_model_square[
                0, len(word) -
                self._minimum_length] += self.probability_without_length(
                    word)**2

            #word_model_empirical_frequency_old[0, len(word)-self._minimum_length] += word_prob_dist.prob(word) * self.probability_without_length(word);
            #word_model_square_old[0, len(word)-self._minimum_length] += self.probability_without_length(word) ** 2;

        #print "alpha is", 2 * (1-numpy.sum(word_model_empirical_frequency / word_model_square))/numpy.sum(1.0/word_model_square)
        #print word_model_empirical_frequency, word_model_square

        #sum_word_model_square_inverse = numpy.sum(1.0 / word_model_square);
        #sum_word_model_empirical_frequency_over_word_model_square = numpy.sum(word_model_empirical_frequency / word_model_square);
        #self._multinomial_length = (word_model_empirical_frequency * sum_word_model_square_inverse - sum_word_model_empirical_frequency_over_word_model_square + 1) / (word_model_square * sum_word_model_square_inverse);
        #print sum_word_model_square_inverse, sum_word_model_empirical_frequency_over_word_model_square;
        #print self._multinomial_length, numpy.sum(self._multinomial_length);

        if True:
            lagrangian_parameter = 2 * (1 - numpy.sum(
                word_model_empirical_frequency /
                word_model_square)) / numpy.sum(1.0 / word_model_square)
        else:
            lagrangian_parameter = 1.
        #print "lagrangian parameter is", lagrangian_parameter
        self._multinomial_length = (
            word_model_empirical_frequency -
            lagrangian_parameter / 2) / word_model_square
        self._multinomial_length /= numpy.sum(self._multinomial_length)

        #print self._multinomial_length, numpy.sum(self._multinomial_length);
        assert numpy.all(
            self._multinomial_length >= 0), self._multinomial_length

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NcharModel(n - 1, train, self._smoothing,
                                       maximum_length, minimum_length,
                                       self._char_set, self._patch_char)
Example No. 40
0
from nltk.corpus import genesis
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.util import ingrams


# Tokens could contain the words for Genesis and Reuters Trade (left commented out below)
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))

# estimator for smoothing the N-gram model
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sent = "abraham lincoln be bear feb 12 1809"
tokens = sent.split()
splitNgrams = list(ingrams(list(sent), 3))

tokens = ["".join(x) for x in splitNgrams]

# N-gram language model with 3-grams
# Without an estimator, NgramModel falls back to its default _estimator.
model = NgramModel(3, tokens, estimator)
print "Model: " + str(model)

sent2 = "abe lincoln was born in 1809"

splitNgrams2 = list(ingrams(list(sent2), 3))
tokens2 = ["".join(x) for x in splitNgrams2]


print "Word: " + tokens2[-1]
Example No. 41
0
    def __init__(self,
                 n,
                 train,
                 estimator=None,
                 *estimator_args,
                 **estimator_kw_args):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        @param n: the order of the language model (ngram size)
        @type n: C{int}
        @param train: the training text
        @type train: C{list} of C{string} (or C{list} of C{string} C{list}s)
        @param estimator: a function for generating a probability distribution
        @type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        @param estimator_args: Extra arguments for C{estimator}.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying C{ConditionalFreqDist} are passed to
            the estimator as an argument.
        @type estimator_args: (any)
        @param estimator_kw_args: Extra keyword arguments for C{estimator}.
        @type estimator_kw_args: (any)
        """

        self._n = n

        if estimator is None:
            estimator = _estimator

        cfd = ConditionalFreqDist()
        self._ngrams = set()
        self._padding = ('', ) * (n - 1)

        # If given a list of strings instead of a list of lists, create enclosing list
        if isinstance(train[0], basestring):
            train = [train]

        for utterance in train:
            for ngram in ingrams(
                    chain(self._padding, utterance, self._padding), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context].inc(token)

        if (not estimator_args) and (not estimator_kw_args):
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator, *estimator_args,
                                              **estimator_kw_args)

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = NgramModel(n - 1, train, estimator,
                                       *estimator_args, **estimator_kw_args)
Example No. 42
0
    def __init__(self, n, train, pad_left=False, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        Creates an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities derived
        from the text and may allow generation of ngrams not seen during
        training.

        :param n: the order of the language model (ngram size)
        :type n: C{int}
        :param train: the training text
        :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string} 
        :param estimator: a function for generating a probability distribution---defaults to MLEProbDist
        :type estimator: a function that takes a C{ConditionalFreqDist} and
              returns a C{ConditionalProbDist}
        :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with </s>
        :type pad_right: bool
        :param estimator_args: Extra arguments for estimator.
            These arguments are usually used to specify extra
            properties for the probability distributions of individual
            conditions, such as the number of bins they contain.
            Note: For backward-compatibility, if no arguments are specified, the
            number of bins in the underlying ConditionalFreqDist are passed to
            the estimator as an argument.
        :type estimator_args: (any)
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        """

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # make sure n is greater than zero; the assertion message shows the offending value
        assert (n > 0), n

        # For explicitness save the check whether this is a unigram model
        self.is_unigram_model = (n == 1)
        # save the ngram order number
        self._n = n
        # save left and right padding
        self._lpad = ('<s>',) * (n - 1) if pad_left else ()
        # Need _rpad even for unigrams or padded entropy will give
        #  wrong answer because '</s>' will be treated as unseen...
        self._rpad = ('</s>',) if pad_right else ()
        self._padLen = len(self._lpad)+len(self._rpad)

        self._N=0
        delta = 1+self._padLen-n        # len(sent)+delta == ngrams in sent

        if estimator is None:
            assert (estimator_args == ()) and (estimator_kwargs == {}),\
                   "estimator_args (%s) or _kwargs supplied (%s), but no estimator" % (estimator_args, estimator_kwargs)
            estimator = lambda fdist, bins: MLEProbDist(fdist)

        # Given backoff, a generator isn't acceptable
        if not isinstance(train,collections.abc.Sequence):
          train=list(train)
        self._W = len(train)
        # Coerce to list of list -- note that this means to train charGrams,
        #  requires exploding the words ahead of time 
        if train is not None:
            if isinstance(train[0], compat.string_types):
                train = [train]
                self._W=1
            elif not isinstance(train[0],collections.abc.Sequence):
                # if you mix strings and generators, you have only yourself
                #  to blame!
                for i in range(len(train)):
                    train[i]=list(train[i])

        if n == 1:
            if pad_right:
                sents=(chain(s,self._rpad) for s in train)
            else:
                sents=train
            fd=FreqDist()
            for s in sents:
                fd.update(s)
            if not estimator_args and not estimator_kwargs:
                self._model = estimator(fd,fd.B())
            else:
                self._model = estimator(fd,fd.B(),
                                        *estimator_args, **estimator_kwargs)
            self._N=fd.N()
        else:
            cfd = ConditionalFreqDist()
            self._ngrams = set()

            for sent in train:
                self._N+=len(sent)+delta
                for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                    self._ngrams.add(ngram)
                    context = tuple(ngram[:-1])
                    token = ngram[-1]
                    cfd[context][token]+=1
            if not estimator_args and not estimator_kwargs:
                self._model = ConditionalProbDist(cfd, estimator, len(cfd))
            else:
                self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)

        # recursively construct the lower-order models
        if not self.is_unigram_model:
            self._backoff = NgramModel(n-1, train,
                                        pad_left, pad_right,
                                        estimator,
                                        *estimator_args,
                                        **estimator_kwargs)

            # Code below here in this method, and the _words_following and _alpha method, are from
            # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
            self._backoff_alphas = dict()
            # For each condition (or context)
            for ctxt in cfd.conditions():
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                # this is the subset of words that we OBSERVED following
                # this context.
                # i.e. Count(word | context) > 0
                for word in self._words_following(ctxt, cfd):
                    total_observed_pr += self.prob(word, ctxt)
                    # we also need the total (n-1)-gram probability of
                    # words observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
                if isclose(total_observed_pr,1.0):
                    total_observed_pr=1.0
                else:
                    assert 0.0 <= total_observed_pr <= 1.0,\
                           "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
                # beta is the remaining probability weight after we factor out
                # the probability of observed words.
                # As a sanity check, both total_observed_pr and backoff_total_pr
                # must be GE 0, since probabilities are never negative
                beta = 1.0 - total_observed_pr

                if beta!=0.0:
                    assert (0.0 <= backoff_total_pr < 1.0), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = beta / (1.0 - backoff_total_pr)
                else:
                    assert ((0.0 <= backoff_total_pr < 1.0) or
                            isclose(1.0,backoff_total_pr)), \
                           "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                    alpha_ctxt = 0.0

                self._backoff_alphas[ctxt] = alpha_ctxt
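
The `_words_following` and `_alpha` helpers referenced in the comment above, and the Katz-backoff `prob` that consumes `_backoff_alphas`, are not shown in this snippet. The following is a minimal sketch consistent with the nltk.org module the example cites; treat it as an approximation rather than the exact code.

    def _words_following(self, context, cond_freq_dist):
        # all word types observed after this context during training
        return cond_freq_dist[context].keys()

    def _alpha(self, context):
        # backoff weight for a context; contexts without a stored alpha get 1.0
        assert not self.is_unigram_model, "alphas are undefined for unigram models"
        return self._backoff_alphas.get(tuple(context), 1.0)

    def prob(self, word, context):
        # Katz backoff: use the n-gram estimate when the n-gram was observed in
        # training, otherwise scale the (n-1)-gram estimate by alpha(context)
        if self.is_unigram_model:
            return self._model.prob(word)
        context = tuple(context)
        if (context + (word,)) in self._ngrams:
            return self._model[context].prob(word)
        return self._alpha(context) * self._backoff.prob(word, context[1:])
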