# Beispiel #1 (score: 0)
def fun5():
    """Demonstrate NLTK's Swadesh comparative wordlists.

    The Swadesh corpus contains lists of ~200 common words in several
    languages (identified by ISO-639 two-letter codes).  ``entries()``
    pairs the words of two languages, which can then be turned into a
    simple translation dictionary.
    """
    from nltk.corpus import swadesh

    # Available language codes and the English wordlist.
    print(swadesh.fileids())
    print(swadesh.words('en'))

    # Pair French words with their English cognates and build a
    # French -> English lookup table from the pairs.
    fr2en = swadesh.entries(['fr', 'en'])
    print(fr2en)
    translate = dict(fr2en)
    print(translate['chien'])
    print(translate['jeter'])
# Beispiel #2 (score: 0)
    def _calculate_languages_ratios(self, text):
        """
        Calculate the likelihood of the given text being written in each of
        several languages and return a dictionary that looks like
        {'french': 2, 'spanish': 4, 'english': 0}.

        @param text: Text whose language is to be detected
        @type text: str

        @return: Dictionary mapping each language to the number of its
            Swadesh core words seen in the analyzed text
        @rtype: dict
        """
        languages_ratios = {}

        tokens = wordpunct_tokenize(text)
        # The lowered token set is loop-invariant: build it once here
        # instead of rebuilding it for every language in the loop below.
        words_set = {word.lower() for word in tokens}

        # Score each language by how many of its Swadesh core words
        # appear in the analyzed text.
        for language in swadesh.fileids():
            common_elements = words_set.intersection(swadesh.words(language))
            languages_ratios[language] = len(common_elements)  # language "score"

        return languages_ratios
# Beispiel #3 (score: 0)
def compareWordlist():
    """Explore NLTK's Swadesh comparative wordlists and build a small
    multilingual -> English translation dictionary.

    Note: the original body contained bare expression statements (REPL
    leftovers) that discarded their results and a Python 2 ``print``;
    results are now printed and the syntax is Python 3.
    """
    print(swadesh.fileids())
    print(swadesh.words('en'))

    # French-English cognate pairs -> simple lookup dictionary.
    fr2en = swadesh.entries(['fr', 'en'])
    print(fr2en)

    translate = dict(fr2en)
    print(translate['chien'])
    print(translate['jeter'])

    # Extend the same dictionary with German and Spanish source words.
    de2en = swadesh.entries(['de', 'en'])    # German-English
    es2en = swadesh.entries(['es', 'en'])    # Spanish-English
    translate.update(dict(de2en))
    translate.update(dict(es2en))
    print(translate['Hund'])
    print(translate['perro'])

    # Compare the same entries side by side across several languages.
    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
    entries = swadesh.entries(languages)  # build once, not per iteration
    for i in [139, 140, 141, 142]:
        print(entries[i])
# Beispiel #4 (score: 0)
def test_swadesh(model, lang) -> Tuple[Optional[float], Optional[List]]:
    """Evaluate *model* against the Swadesh word list for *lang*.

    Returns ``(accuracy, errors)``; both are ``None`` when no Swadesh
    corpus exists for the requested language.
    """
    accuracy, errors = None, None
    if lang not in set(swadesh.fileids()):
        logging.error('No Swadesh corpus for "{}"'.format(lang))
    else:
        logging.info('Testing model on Swadesh list for {}...'.format(lang))
        # Some Swadesh entries contain several space-separated words
        # (contextual definitions), so keep only each entry's first word.
        headwords = [entry.split()[0].casefold()
                     for entry in swadesh.words(fileids=lang)]
        accuracy, errors = test_accuracy(headwords, model)
    return accuracy, errors
# Beispiel #5 (score: 0)
# NLTK ships Swadesh wordlists: a tabular lexicon of ~200 common words per
# language, used in the quantitative assessment of the genealogical
# relatedness of languages.

from nltk.corpus import swadesh

# Language identifiers (two-letter codes) available in the corpus.
language_codes = swadesh.fileids()
print(language_codes)

print()  # blank separator line

# The ~200 common German words from the Swadesh list.
print(swadesh.words("de"))
# Beispiel #6 (score: 0)
                               for name in names.words(fileid))
cfd.plot()

# CMU Pronouncing Dictionary: a list of (word, phoneme-list) pairs.
entries = nltk.corpus.cmudict.entries()
len(entries)  # NOTE(review): bare expression — result is discarded outside a REPL

for entry in entries[42371:42379]:
    print(entry)

# Words whose pronunciation ends with these four phonemes.
# NOTE(review): 'IHO' looks like a typo for the Arpabet symbol 'IH0' — confirm.
syllable = ['N', 'IHO', 'K', 'S']
[word for word, pron in entries if pron[-4:] == syllable]

# Words pronounced with a final 'M' but spelled with a final 'n'.
[w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']

from nltk.corpus import swadesh
swadesh.fileids()    # language codes available in the Swadesh corpus
swadesh.words('en')  # ~200 common English words

# French-English cognate pairs -> simple translation dictionary.
fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']
translate['jeter']

# Extend the dictionary with German and Spanish source words.
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']
# Beispiel #7 (score: 0)
# Look up word pronunciations in the pronouncing dictionary.
# NOTE(review): `prondict` must be defined earlier in the file — not visible here.
text = ['natural', 'language', 'processing']
pron_list = [ph for w in text for ph in prondict[w][0]]
print("word pronoun list= ", pron_list)

# The [0] (used above) is needed because 'natural' has two pronunciations;
# taking just one of them is enough.
# NOTE(review): this second comprehension drops the [0], so each `ph` is a
# whole pronunciation list rather than a phoneme — the comment/code placement
# looks swapped relative to the line above; confirm intent.
pron_list = [ph for w in text for ph in prondict[w]]
print("'natural' pronoun list= ", pron_list)
print("prondict['natural']=", prondict['natural'])

# P70 2.4.3 Comparative wordlists (Swadesh wordlists)
# Lists of ~200 common words in several languages; can be used to compare
# two languages or to translate words between languages.
from nltk.corpus import swadesh

print("swadesh.fileids()= ", swadesh.fileids())
print("swadesh.words('en')= ", swadesh.words('en'))

# French-English cognate pairs -> simple translation dictionary.
fr2en = swadesh.entries(['fr', 'en'])
print("fr2en= ", fr2en[:13])
translate = dict(fr2en)
print("translate= ", translate)
print("translate['chien']= ", translate['chien'])

# Extend the French->English dictionary with German and Spanish entries.
de2en = swadesh.entries(['de', 'en'])
translate.update(dict(de2en))
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(es2en))
print("translate= ", translate)

print("translate['jeter']= ", translate['jeter'])
# Beispiel #8 (score: 0)
        word = cfdist[word].max()

# Build a conditional frequency distribution over Genesis bigrams: given a
# word, cfd[word] counts how often each following word occurs.
text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams) 
# NOTE(review): the lines below are a pasted interactive-session transcript
# (`>>>` prompts and their printed outputs) — they are not valid module-level
# Python and would need to be rewritten as statements to execute as a script.
>>> cfd['living']
FreqDist({'creature': 7, 'thing': 4, 'substance': 2, ',': 1, '.': 1, 'soul': 1})
>>> generate_model(cfd, 'living')
living creature that he said , and the land of the land of the land
# basically, we get a conditional frequency distribution of bigrams so we can see which word pairs occur frequently
# then the function will reset the context word (starting with seed word) every time = predictive text

# symbols in the CMU Pronouncing Dictionary are from the Arpabet
# Swadesh wordlists: lists of about 200 common words in several languages. Access like below:
from nltk.corpus import swadesh
>>> swadesh.fileids()
['be', 'bg', 'bs', 'ca', 'cs', 'cu', 'de', 'en', 'es', 'fr', 'hr', 'it', 'la', 'mk',
'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sw', 'uk']
# can create a simple dictionary for these words
>>> fr2en = swadesh.entries(['fr', 'en'])
>>> fr2en
[('je', 'I'), ('tu, vous', 'you (singular), thou'), ('il', 'he'), ...]
>>> translate = dict(fr2en)
>>> translate['chien']
'dog'
>>> translate['jeter']
'throw'
>>> de2en = swadesh.entries(['de', 'en'])    # German-English
>>> es2en = swadesh.entries(['es', 'en'])    # Spanish-English
>>> translate.update(dict(de2en)) # this is basically just adding onto the previous translate dictionary with french in it
>>> translate.update(dict(es2en))
# Beispiel #9 (score: 0)
__author__ = 'lizhifeng'
from nltk.corpus import swadesh

# Available Swadesh language codes and the English wordlist.
# (Converted from Python 2 `print` statements to Python 3 calls.)
print(swadesh.fileids())
print(swadesh.words('en'))

# French-English cognate pairs.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)

# Simple French -> English lookup built from the pairs.
translate = dict(fr2en)
print(translate["chien"])
# Beispiel #10 (score: 0)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import swadesh

# NOTE(review): the bare expressions below look like a pasted REPL session;
# outside an interactive shell their results are silently discarded.
swadesh.fileids()    # language codes in the Swadesh corpus
swadesh.words("en")  # ~200 common English words
fr2en = swadesh.entries(["fr", "en"])  # French-English cognate pairs
fr2en
translate = dict(fr2en)  # French -> English lookup
translate["chien"]
translate["jeter"]
# Beispiel #11 (score: 0)
import nltk

# Using corpus swadesh
from nltk.corpus import swadesh


# Heading to be provided with the help of streamlit.
# NOTE(review): `st` (streamlit) is not imported in this excerpt — confirm an
# `import streamlit as st` exists elsewhere in the file.
st.title('Translate English words to various languages')
st.markdown('If using mobile please tap on top left arrrow button to visualize sidebar')
st.sidebar.title('Translate English words to various languages')

st.markdown('This is a dashboard where you can hover over the country in a map to know the language codes of that place\'s native language and then translate English words')


# Languages whose words are present in the swadesh corpus.
languages = swadesh.fileids()

# iso_alpha codes of countries, used as locations in the plotly_express map.
country_codes = ['BLR' , 'BGR' , 'BIH' , 'AND' ,'CZE' , 'MNE','DEU' , 'USA' , 'ESP' , 'FRA' , 'HRV' , 'CHE', 'ITA' , 'MKD' , 'ABW', 'POL',
 'PRT' , 'ROU' , 'RUS' ,'SVK' , 'SVN' , 'SRB', 'KEN' ,'UKR'  ]

country_names = ['Belarus' , 'Bulgaria' , 'Bosnia and Herzengove' , 'Andorra' , 'Czech Republic', 'Montenegro' ,'Germany','USA' , 'Spain' , 'France' , 'Croatia' ,'Switzerland' ,'Italy' ,'Republic of North Macedonia','Aruba','Poland' ,'Portugal' ,'Romania' , 'Russia' , 'Slovakia' , 'Slovenia', 'Serbia' , 'Kenya' , 'Ukraine']

# Pair each language code positionally with a country name / ISO-3 code.
# zip() replaces the index-based dict comprehensions; it is equivalent when
# the lists have matching lengths.
# NOTE(review): this relies on swadesh.fileids() returning exactly these 24
# languages in this order — fragile; verify against the installed corpus.
lang2country_name = dict(zip(languages, country_names))

lang2country = dict(zip(languages, country_codes))

# Country_codes converted to pandas dataframe so as to use them in maps
# Beispiel #12 (score: 0)
from nltk.corpus import swadesh

# Available language codes, then the English Swadesh wordlist.
print(swadesh.fileids(), '\n')
print(swadesh.words('en'), '\n')

# French-English cognate pairs -> simple translation dictionary.
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en, '\n')

translate = dict(fr2en)
print(translate['chien'])
print(translate['jeter'], '\n')

de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English

# Fold the German and Spanish pairs into the same lookup table.
translate.update(dict(de2en))
translate.update(dict(es2en))

print(translate['Hund'])
print(translate['perro'], '\n')

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']

# Build the multilingual entry table once instead of re-reading the corpus
# on every loop iteration (the original called swadesh.entries() four times).
multi_entries = swadesh.entries(languages)
for i in [139, 140, 141, 142]:
    print(multi_entries[i])