Example 1
import json
from nltk import FreqDist
from nltk.corpus import knbc

cp = knbc.words()
dist = FreqDist(cp)

# total number of tokens in the corpus (sum of all frequency counts)
total_count = dist.N()

if __name__ == "__main__":
    line = input()
    # len(dist) is the vocabulary size |C| (number of distinct types)
    out = {"corpus_size": len(dist), "total_count": total_count}
    for word in line.split(" "):
        out[word] = dist[word]  # FreqDist returns 0 for unseen words
    print(json.dumps(out))
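# The JSON above carries raw counts plus the type and token totals, which is
# everything a simple unigram model needs. A minimal sketch of one consumer,
# assuming add-one smoothing (the smoothing and this helper are not part of
# the original script):
import json

def unigram_prob(payload, word, alpha=1.0):
    # Laplace-smoothed unigram probability from the script's JSON output
    count = payload.get(word, 0)
    vocab = payload["corpus_size"]    # distinct types |C|
    total = payload["total_count"]    # token count
    return (count + alpha) / (total + alpha * vocab)

# illustrative payload only; the counts are made up, not real KNB figures
payload = json.loads('{"corpus_size": 5, "total_count": 100, "犬": 3}')
print(unigram_prob(payload, "犬"))   # (3 + 1) / (100 + 5) ≈ 0.0381
print(unigram_prob(payload, "猫"))   # unseen word: 1 / 105 ≈ 0.0095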
Example 2

import csv
from nltk.corpus import jeita, knbc

# Kansai-dialect -> standard-form dictionary, built from word_list.csv
dictionary = {}
# unigram and bigram frequency tables
frequency_word = {}
frequency_bigram = {}

# Minimal stand-ins for helpers the excerpt uses but does not show;
# their behavior is inferred from how they are called below.
def bigram(prev, word):
    return (prev, word)

def bigram_freq_given(bg):
    return frequency_bigram.get(bg, 0)

def word_freq(word):
    return frequency_word.get(word, 0)

# out.write(line + ',' + tagged + '\n')

if __name__ == '__main__':
    with open('word_list.csv', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            kansai = row[0].replace('〜', '')
            standard = row[1].replace('〜', '').split('・')
            if kansai not in dictionary:
                dictionary[kansai] = standard

    total = 0
    prev = 'BOS'  # beginning-of-sentence marker
    # count unigram and bigram frequencies over the KNB corpus
    for word in knbc.words():
        bg = bigram(prev, word)
        if bigram_freq_given(bg) == 0:
            frequency_bigram[bg] = 1
        else:
            frequency_bigram[bg] += 1
        if word_freq(word) == 0:
            frequency_word[word] = 1
        else:
            frequency_word[word] += 1
        prev = word
        # total += 1
        # print(word + ', ' + str(frequency_word[word]))
    prev = 'BOS'
    for word in jeita.words():
        bg = bigram(prev, word)
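# The two tables above are the raw material for a maximum-likelihood bigram
# model, P(w_i | w_{i-1}) = freq(w_{i-1}, w_i) / freq(w_{i-1}). A minimal
# sketch under that assumption (this helper is hypothetical, not from the
# original script, and does no smoothing):
def bigram_prob(prev, word):
    pair_count = frequency_bigram.get(bigram(prev, word), 0)
    context_count = frequency_word.get(prev, 0)
    # note: 'BOS' itself is never added to frequency_word, so a fuller
    # implementation would count sentence starts separately
    return pair_count / context_count if context_count else 0.0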
Example 3
#JPop Band Name and Hits Generator
#aka. chickenberry
#Jonisha McKiddy | Julie Evans

###import/corpus###
import re
import nltk
from nltk.corpus import knbc
jc = knbc.words()
c = nltk.corpus.words.words()
from nltk.corpus import PlaintextCorpusReader
#portal=r"C:\Users\JuJuBee Marie\Google Drive\linguistics\comp ling\chickenberry"
corpus_root = r"C:\Users\ses71_000\Desktop\programming"
pc = PlaintextCorpusReader(corpus_root, 'portal 12text.txt')
#cfd_pc=nltk.ConditionalFreqDist(nltk.bigrams(pc))
#cpd_pc=nltk.ConditionalProbDist(cfd_pc, nltk.MLEProbDist)

###dictionary###
f = open(r'C:\Users\ses71_000\Desktop\edict.csv', encoding='utf-8')
#def searchable(w):
#[word for line in f for word in line.split()]
#print(w, word)

#srch=[word for word in f for word in line.split()]

read = f.readlines()
char = {}
for line in read:
    # strip the runs of empty CSV fields, then split the line on '|'
    l = line.replace(',,,,,,,,,,,,,,,,,,,,', '')
    p = re.split(r'\|', l)
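# The loop above cleans and splits each dictionary line but never stores the
# result, so `char` stays empty. One plausible completion, assuming the first
# '|'-separated field is the headword (the real edict.csv layout is not shown):
for line in read:
    l = line.replace(',,,,,,,,,,,,,,,,,,,,', '')
    fields = [fld.strip() for fld in re.split(r'\|', l) if fld.strip()]
    if len(fields) >= 2:                  # skip malformed lines
        char[fields[0]] = fields[1:]      # headword -> remaining fields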
import nltk
from nltk.corpus import jeita
from nltk.corpus import knbc

jfull_t = nltk.Text(jeita.words())           # create NLTK Text from JEITA Corpus
kfull_t = nltk.Text(knbc.words())            # create NLTK Text from KNB Corpus

# the frequency distribution gives the frequency of each vocabulary item
fdist_jfull = nltk.FreqDist(jfull_t)
print("50 most common words in the full JEITA corpus:")
print(fdist_jfull.most_common(50))
fdist_kfull = nltk.FreqDist(kfull_t)
print("50 most common words in the full KNB corpus:")
print(fdist_kfull.most_common(50))

# words that appear in the same context (same words on either side) as '人'
# (Text.similar prints its results and returns None, so no print() wrapper)
jfull_t.similar("人")
kfull_t.similar("人")

# Text and frequency distribution for the single JEITA file a0010.chasen,
# reconstructed from the prints below so this excerpt runs on its own
jsingle_t = nltk.Text(jeita.words('a0010.chasen'))
fdist_j = nltk.FreqDist(jsingle_t)

print(
    "words in a0010.chasen with 3 or more characters, that appear 3 or more times:"
)
print(sorted(w for w in set(jsingle_t) if len(w) >= 3 and fdist_j[w] >= 3))
print("words in a0010.chasen ending in しい:")
print(sorted(w for w in set(jsingle_t) if w.endswith('しい')))
print("words in a0010.chasen starting with 見:")
print(sorted(w for w in set(jsingle_t) if w.startswith('見')))
print("words in a0010.chasen which contain 山:")
print(sorted(w for w in set(jsingle_t) if '山' in w))
print("words in a0010.chasen which contain 上 or 下:")
print(sorted(w for w in set(jsingle_t) if '上' in w or '下' in w))


# a collocation is a sequence of words that occur together unusually often
print(
    "collocations (words that occur together unusually often) in the JEITA corpus:"
)
jfull_t.collocations()   # collocations() prints its results itself
print(