Example #1
def fun_2_1():
    from nltk.util import ngrams
    from nltk.corpus import alpino

    # A unigram represents a single token. The following code generates unigrams for the Alpino corpus
    print(alpino.words())
    unigrams = ngrams(alpino.words(), 1)
    for i in unigrams:
        # print i
        pass

    # Consider another example, generating quadgrams or fourgrams (4-grams) from the Alpino corpus
    quadgrams = ngrams(alpino.words(), 4)
    for i in quadgrams:
        # print i
        pass

    # A bigram is a pair of tokens. To find bigrams in a text, first build a list of
    # lowercase words from the text, then create a BigramCollocationFinder instance.
    # BigramAssocMeasures, found in the nltk.metrics package, can be used to look for
    # bigrams in the text
    from nltk.collocations import BigramCollocationFinder
    from nltk.corpus import webtext
    from nltk.metrics import BigramAssocMeasures
    tokens = [t.lower() for t in webtext.words('grail.txt')]
    words = BigramCollocationFinder.from_words(tokens)
    print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))

    # In the code above, we can add a word filter that removes stop words and punctuation
    from nltk.corpus import stopwords
    set1 = set(stopwords.words('english'))
    stops_filter = lambda w: len(w) < 3 or w in set1
    words.apply_word_filter(stops_filter)
    print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))

    # Here, the number of bigrams returned can be changed to a different value.
    # Another way to generate bigrams from text is to use a collocation finder, as the following code shows
    import nltk
    text1 = "Hardwork is the key to success. Never give up!"
    word = nltk.tokenize.wordpunct_tokenize(text1)
    finder = BigramCollocationFinder.from_words(word)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    value = finder.score_ngrams(bigram_measures.raw_freq)
    print(sorted(bigram for bigram, score in value))

    # To generate fourgrams and their frequencies, the following code can be used
    text = "Hello how are you doing ? I hope you find the book interesting"
    tokens = nltk.wordpunct_tokenize(text)
    fourgrams = nltk.collocations.QuadgramCollocationFinder.from_words(tokens)
    for fourgram, freq in fourgrams.ngram_fd.items():
        print(fourgram, freq)
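A minimal follow-up sketch: besides apply_word_filter, NLTK's collocation finders also provide apply_freq_filter, which drops n-grams below a minimum count (this assumes the webtext corpus has been downloaded, e.g. via nltk.download('webtext')):

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures

tokens = [t.lower() for t in webtext.words('grail.txt')]
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(3)  # keep only bigrams that occur at least 3 times
print(finder.nbest(BigramAssocMeasures.likelihood_ratio, 10))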
Example #2
    def __init__(self, config, dont_ask=False):
        RulesetCommon.__init__(self, config, dont_ask)

        if config["worddb"]["type"] == "file":
            self.setWordDbFile(config["worddb"]["name"])
        else:
            from nltk.corpus import alpino
            self.worddb = set(alpino.words())

        self.language = LANGUAGE
        self.lang_id = self.db.languages[LANGUAGE]
        self.vowels = ["a", "e", "i", "u", "o"]
        self.double_chars = [
            "oe", "ou", "au", "ij", "ui", "ie", "ei", "eu", "oi", "ai"
        ]
Example #3
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
trigrams_tokens = ngrams(alpino.words(), 3)
for i in trigrams_tokens:
    print(i)
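A related sketch: nltk.util.ngrams can also pad sequence boundaries, which is useful when n-grams should mark where a sentence starts and ends (the '<s>' and '</s>' pad symbols here are arbitrary choices, not part of the corpus):

from nltk.util import ngrams

sent = ['the', 'cat', 'sat']
padded = ngrams(sent, 3, pad_left=True, pad_right=True,
                left_pad_symbol='<s>', right_pad_symbol='</s>')
print(list(padded))  # first tuple: ('<s>', '<s>', 'the')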
Example #4
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())
quadgrams = ngrams(alpino.words(), 4)  # quadgrams (4-grams)
print(quadgrams)
# for i in unigrams:
#     print(i)

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stop_words  # shorter than 3 characters or a stop word
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)  # create the finder instance
print(words)
words.apply_word_filter(stops_filter)
res = words.nbest(BigramAssocMeasures.likelihood_ratio, 5)  # top 5 bigrams by likelihood ratio
print(res)

# Generate bigrams with a collocation finder
import nltk
text1 = "Hardwork is the key to success. Never give up!"
word = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(word)
bigram_measures = BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))
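The same pattern extends beyond bigrams. A minimal sketch using TrigramCollocationFinder on the same corpus (assuming webtext is installed):

from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.corpus import webtext

tokens = [t.lower() for t in webtext.words('grail.txt')]
tri_finder = TrigramCollocationFinder.from_words(tokens)
print(tri_finder.nbest(TrigramAssocMeasures.likelihood_ratio, 5))  # top 5 trigrams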
Example #5
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
bigrams_tokens = ngrams(alpino.words(), 2)
for i in bigrams_tokens:
    print(i)
Example #6
            'English: Brown Corpus (Learned)':
                lambda: brown.words(categories='learned'),
            'English: Brown Corpus (Science Fiction)':
                lambda: brown.words(categories='science_fiction'),
            'English: Brown Corpus (Romance)':
                lambda: brown.words(categories='romance'),
            'English: Brown Corpus (Humor)':
                lambda: brown.words(categories='humor'),
            'English: NPS Chat Corpus':
                lambda: nps_chat.words(),
            'English: Wall Street Journal Corpus':
                lambda: treebank.words(),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.words(),
            'Dutch: Alpino Corpus':
                lambda: alpino.words(),
            'Hindi: Indian Languages Corpus':
                lambda: indian.words(files='hindi.pos'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.words(),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.words(),
            'Portuguese: Machado Corpus (Brazil)':
                lambda: machado.words(),
            'Spanish: CESS-ESP Corpus':
                lambda: cess_esp.words()
           }

class CollocationsView:
    _BACKGROUND_COLOUR = '#FFF'  # white
Example #7
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
bigrams_tokens = ngrams(alpino.words(), 2)
for i in bigrams_tokens:
    print(i) 
Example #8
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
unigrams = ngrams(alpino.words(), 1)
for i in unigrams:
    print(i)
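Printing every unigram is slow; a frequency distribution summarizes them instead. A minimal sketch with nltk.FreqDist (assuming the alpino corpus is installed):

from nltk import FreqDist
from nltk.corpus import alpino

fdist = FreqDist(alpino.words())
print(fdist.most_common(10))  # the 10 most frequent tokens with their counts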
Example #9
POLL_INTERVAL = 100

_DEFAULT = "English: Brown Corpus (Humor)"
_CORPORA = {
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()
Example #10
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
quadgrams = ngrams(alpino.words(), 4)
for i in quadgrams:
    print(i)
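When several n-gram orders are needed at once, nltk.util.everygrams generates them in a single pass. A minimal sketch on a toy sentence:

from nltk.util import everygrams

sent = ['hello', 'how', 'are', 'you']
print(list(everygrams(sent, min_len=1, max_len=3)))  # unigrams, bigrams and trigrams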
Example #11
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
unigrams = ngrams(alpino.words(), 1)
for i in unigrams:
    print(i)
Example #12
"""统计语言建模"""
"""计算句子中某种语言模式出现概率的统计模型 把自然语言作为模型进行统计分析"""
import nltk
from nltk import ngrams, BigramCollocationFinder, BigramAssocMeasures, unique_list, KneserNeyProbDist
from nltk.corpus import alpino, webtext, stopwords
"""单词分组 util.py"""
n = 4
grams = ngrams(alpino.words(), n)
# for i in grams:
#     print(i)
out = list(ngrams([1, 2, 3, 4, 5], 3))
print(out)  # [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

stop_words = set(stopwords.words('english'))  # avoid shadowing the built-in set
stops_filter = lambda w: len(w) < 3 or w in stop_words
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)
words.apply_word_filter(stops_filter)
out = words.nbest(BigramAssocMeasures.likelihood_ratio, 10)
print(out)
"""最大似然估计的目的就是:利用已知的样本结果,反推最有可能(最大概率)导致这样结果的参数值。"""
"""最大似然估计wiki https://zh.wikipedia.org/zh-cn/%E6%9C%80%E5%A4%A7%E4%BC%BC%E7%84%B6%E4%BC%B0%E8%AE%A1"""
"""隐马尔科夫模型估计 HMM"""
corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:700]
print(len(corpus))
tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
print(len(tag_set))
"""平滑"""
# gt = lambda fd, bins:SimpleGoodTuringProbDist(fd, bins=1e5)
# train_and_test(gt)
corpus = [[((x[0], y[0], z[0]), (x[1], y[1], z[1]))
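Example #12 imports KneserNeyProbDist but the snippet is cut off before it is used. A minimal sketch of Kneser-Ney smoothing, assuming (as in NLTK's probability module) that it is fed a frequency distribution of trigrams:

import nltk
from nltk import KneserNeyProbDist
from nltk.util import ngrams

tokens = "hello how are you doing i hope you are doing well".split()
freq_dist = nltk.FreqDist(ngrams(tokens, 3))
kn = KneserNeyProbDist(freq_dist)  # default discount of 0.75
for trigram in kn.samples():
    print(trigram, kn.prob(trigram))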
Example #13
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

quadgrams = ngrams(alpino.words(), 4)

for i in quadgrams:
    print(i)
    break  # stop after the first quadgram instead of exiting the interpreter
Example #14
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
quadgrams = ngrams(alpino.words(), 4)
for i in quadgrams:
    print(i)
Example #15
'''
@author:KongWeiKun
@file: wordFrequency.py
@time: 18-3-31 1:17 PM
@contact: [email protected]
'''
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())
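Beyond the raw word list, the Alpino corpus reader also exposes part-of-speech-tagged views. A minimal sketch (assuming the corpus is installed, e.g. via nltk.download('alpino')):

from nltk.corpus import alpino

print(alpino.tagged_words()[:10])  # (word, tag) pairs
print(alpino.tagged_sents()[0])    # first sentence as tagged tokens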
Example #16
    'English: Brown Corpus (Learned)':
    lambda: brown.words(categories='learned'),
    'English: Brown Corpus (Science Fiction)':
    lambda: brown.words(categories='science_fiction'),
    'English: Brown Corpus (Romance)':
    lambda: brown.words(categories='romance'),
    'English: Brown Corpus (Humor)':
    lambda: brown.words(categories='humor'),
    'English: NPS Chat Corpus':
    lambda: nps_chat.words(),
    'English: Wall Street Journal Corpus':
    lambda: treebank.words(),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.words(),
    'Dutch: Alpino Corpus':
    lambda: alpino.words(),
    'Hindi: Indian Languages Corpus':
    lambda: indian.words(files='hindi.pos'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.words(),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.words(),
    'Portuguese: Machado Corpus (Brazil)':
    lambda: machado.words(),
    'Spanish: CESS-ESP Corpus':
    lambda: cess_esp.words()
}


class CollocationsView:
    _BACKGROUND_COLOUR = '#FFF'  # white
Example #17
    "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(),
    "English: Brown Corpus": lambda: brown.words(),
    "English: Brown Corpus (Press)": lambda: brown.words(
        categories=["news", "editorial", "reviews"]
    ),
    "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"),
    "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"),
    "English: Brown Corpus (Science Fiction)": lambda: brown.words(
        categories="science_fiction"
    ),
    "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"),
    "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"),
    "English: NPS Chat Corpus": lambda: nps_chat.words(),
    "English: Wall Street Journal Corpus": lambda: treebank.words(),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.words(),
    "Dutch: Alpino Corpus": lambda: alpino.words(),
    "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(),
    "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(),
    "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(),
}


class CollocationsView:
    _BACKGROUND_COLOUR = "#FFF"  # white

    def __init__(self):
        self.queue = q.Queue()
        self.model = CollocationsModel(self.queue)
        self.top = Tk()