Example #1
from nltk import word_tokenize
from nltk.probability import FreqDist, LaplaceProbDist, ConditionalFreqDist

def laplace_stuff():
    sent = "am ate ate apple am x."
    sent_tokenized = word_tokenize(sent)
    freq_dist = FreqDist(word.lower() for word in sent_tokenized)
    print(freq_dist.items())
    lap = LaplaceProbDist(freq_dist)
    print(lap.generate())
    print(lap.prob("am"))
    print("Finished freq dist, starting cond dist")
    # Conditional frequencies: count each token under its preceding word
    cond_dist = ConditionalFreqDist()
    context = None
    for token in sent_tokenized:
        outcome = token
        cond_dist[context][outcome] += 1
        context = token
    print(cond_dist["am"])
    print(cond_dist.items())
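
A minimal follow-on sketch (not part of the original example) contrasting Laplace smoothing with an unsmoothed MLE estimate: an unseen word gets zero probability under MLE, but nonzero mass once every count is incremented by one.

from nltk.probability import FreqDist, MLEProbDist, LaplaceProbDist

fd = FreqDist("am ate ate apple am".split())
mle = MLEProbDist(fd)
lap = LaplaceProbDist(fd)
print(mle.prob("banana"))  # 0.0 -- MLE gives unseen words no mass
print(lap.prob("banana"))  # > 0 -- add-one smoothing reserves mass for unseen words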
Example #2
from functools import reduce
from collections import defaultdict
from nltk.probability import FreqDist, ConditionalFreqDist as CFD
from nltk.metrics import f_measure

class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """
    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = (tokens[i - 1].lower() if i != 0 else '*START*')
        right = (tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*')
        return (left, right)

    def __init__(self,
                 tokens,
                 context_func=None,
                 filter=None,
                 key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i))
            for i, w in enumerate(tokens))
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w))
            for i, w in enumerate(tokens))

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping each word to a 'similarity score',
        indicating how often that word and the given word appear in the
        same contexts.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += self._context_to_words[c][
                        word] * self._context_to_words[c][w]
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:",
                             " ".join(empty))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(c for w in words for c in self._word_to_contexts[w]
                          if c in common)
            return fd
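
A hypothetical usage sketch for the class above; the token list and the query words are invented for illustration.

tokens = "the cat sat on the mat the dog sat on the rug".split()
idx = ContextIndex(tokens, key=lambda w: w.lower())
print(idx.similar_words("cat"))             # ['dog'] -- shares the ('the', 'sat') context
print(idx.common_contexts(["cat", "dog"]))  # FreqDist over the contexts both words share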
Example #3
from nltk.corpus import movie_reviews
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

testNegWords = movie_reviews.words(categories=['neg'])
testPosWords = movie_reviews.words(categories=['pos'])

for word in testNegWords:
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1
for word in testPosWords:
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] += 1
print(word_fd.N(), word_fd.B(), word_fd.most_common(20))
print(label_word_fd.N(), label_word_fd.conditions(), label_word_fd.items())
print(label_word_fd['pos'].N(), label_word_fd['neg'].N())



# n_ii = label_word_fd[label][word]   count of `word` with `label`
# n_ix = word_fd[word]                count of `word` across all labels
# n_xi = label_word_fd[label].N()     count of all words with `label`
# n_xx = label_word_fd.N()            count of all words across all labels
#
#          w1    ~w1
#       ------ ------
#   w2 | n_ii | n_oi | = n_xi
#       ------ ------
#  ~w2 | n_io | n_oo |
#       ------ ------
#          =
#        n_ix
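
A hedged sketch of how these contingency counts are typically plugged into an association measure to rank words as sentiment features; BigramAssocMeasures.chi_sq takes the counts in exactly this (n_ii, (n_ix, n_xi), n_xx) layout. The cutoff of 100 is an illustrative choice.

from nltk.metrics import BigramAssocMeasures

word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(
        label_word_fd['pos'][word], (freq, label_word_fd['pos'].N()),
        label_word_fd.N())
    neg_score = BigramAssocMeasures.chi_sq(
        label_word_fd['neg'][word], (freq, label_word_fd['neg'].N()),
        label_word_fd.N())
    word_scores[word] = pos_score + neg_score

# the most label-discriminative words come first
best_words = sorted(word_scores, key=word_scores.get, reverse=True)[:100]
print(best_words[:10])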
Example #4
# `allwords` is assumed to be a token list built earlier in the source script
A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: 创建一个所给数据的频率分布
B(): 不同单词的个数
N(): 所有单词的个数
tabulate(20): 把前20组数据以表格的形式显示出来
fd2.plot(20,cumulative=True): 参数cumulative 对数据进行累计 
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("不同单词的个数:%d" % fd2.B())
print("所有单词的个数:%d" % fd2.N())
fd2.tabulate(20)  #把前20组数据 以表格的形式显示出来
fd2.plot(20)
fd2.plot(20, cumulative=True)
"""
freq('the')  #单词the出现的频率
ConditionalFreqDist( ): 条件频率统计的函数,研究类别之间的系统性的差异
"""
from nltk.corpus import inaugural
print(fd2.freq('the'))  #单词the出现的频率
cfd = ConditionalFreqDist((fileid, len(w)) for fileid in inaugural.fileids()
                          for w in inaugural.word(fileid)
                          if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
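
As a follow-on sketch, the same conditional distribution can be tabulated instead of plotted; the sample range is an illustrative choice, counting how often each word length from 1 to 10 occurs per address.

cfd.tabulate(samples=range(1, 11))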