def generate_language_df(test_column, rows=50):
    """Build a DataFrame whose *test_column* holds random UDHR sentences.

    Roughly 40% of the rows get English sentences, 40% Bulgarian and the
    remainder German, each drawn (with replacement) from the NLTK UDHR
    corpus.  Columns 'A', 'B', 'C' are filled with random floats.

    Parameters
    ----------
    test_column : str
        Name of the column that receives the sentence strings.
    rows : int, optional
        Number of rows in the returned frame (default 50).

    Returns
    -------
    pandas.DataFrame
    """
    import random
    # Local imports keep the function self-contained, matching the
    # original's style of importing inside the body.
    import numpy as np
    import pandas as pd
    from nltk.corpus import udhr

    random.seed(55)  # deterministic sentence selection

    sents_eng = udhr.sents('English-Latin1')
    sents_bg = udhr.sents('Bulgarian_Balgarski-UTF8')
    sents_ger = udhr.sents('German_Deutsch-Latin1')

    cnt_en = int(rows * 0.4)
    cnt_bg = int(rows * 0.4)
    # BUGFIX: derive the German count from the remainder so the three
    # counts always sum to `rows`.  The original int(rows * 0.2) could
    # fall short whenever `rows` was not a multiple of 5, making the
    # column assignment below raise a length-mismatch ValueError.
    cnt_de = rows - cnt_en - cnt_bg

    df = pd.DataFrame(np.random.randn(rows, 4),
                      columns=['A', 'B', 'C', test_column])

    # One random sentence (joined to a single string) per row, grouped
    # by language in the same order as the original: en, bg, de.
    texts = []
    for sents, count in ((sents_eng, cnt_en),
                         (sents_bg, cnt_bg),
                         (sents_ger, cnt_de)):
        texts.extend(' '.join(random.choice(sents)) for _ in range(count))

    df.loc[:, test_column] = texts
    return df
def demo():
    """Identify the language of several UDHR excerpts with TextCat.

    For each corpus file, builds one long sample string from all but the
    last sentence, prints a 140-character snippet, and prints TextCat's
    language guess with its human-readable name.
    """
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]
    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()
    for cur_lang in langs:
        raw_sentences = udhr.sents(cur_lang)
        # All sentences except the last one, each word prefixed with a
        # single space (same layout the original loops produced).
        sample = "".join(
            " " + " ".join(sent)
            for sent in raw_sentences[: len(raw_sentences) - 1]
        )

        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        print(f"Language detection: {guess} ({friendly[guess]})")
        print("#" * 140)
def demo():
    """Detect the language of UDHR text samples using TextCat.

    Assembles a sample text per language from the UDHR corpus (all
    sentences except the last), then prints a snippet, the detected
    language code and its friendly name.
    """
    from nltk.corpus import udhr

    langs = [
        'Kurdish-UTF8',
        'Abkhaz-UTF8',
        'Farsi_Persian-UTF8',
        'Hindi-UTF8',
        'Hawaiian-UTF8',
        'Russian-UTF8',
        'Vietnamese-UTF8',
        'Serbian_Srpski-UTF8',
        'Esperanto-UTF8',
    ]
    friendly = {
        'kmr': 'Northern Kurdish',
        'abk': 'Abkhazian',
        'pes': 'Iranian Persian',
        'hin': 'Hindi',
        'haw': 'Hawaiian',
        'rus': 'Russian',
        'vie': 'Vietnamese',
        'srp': 'Serbian',
        'epo': 'Esperanto',
    }

    tc = TextCat()
    for cur_lang in langs:
        raw_sentences = udhr.sents(cur_lang)
        n_sents = len(raw_sentences) - 1

        # Collect every word of the first n_sents sentences, each
        # prefixed by a space, then join into one sample string.
        pieces = []
        for idx in range(n_sents):
            for token in raw_sentences[idx]:
                pieces.append(' ' + token)
        sample = ''.join(pieces)

        print('Language snippet: ' + sample[0:140] + '...')
        guess = tc.guess_language(sample)
        print('Language detection: %s (%s)' % (guess, friendly[guess]))
        print('#' * 140)
def runLeaveOutSentTrial(language):
    """Run one leave-one-sentence-out language-identification trial.

    Holds a random sentence of *language*'s UDHR text out, trains a
    trigram model for *language* on the remaining sentences (and one per
    other language from its full word list), then predicts the held-out
    sentence's language.  Returns (test_sentence, prediction).
    """
    sents = udhr.sents(language)
    held_out = random.randint(0, len(sents) - 1)
    test_sent = list(filterWords(sents[held_out]))

    # Training words: every sentence except the held-out one, passed
    # through the same word filter as the test sentence.
    train_words = []
    for idx in range(len(sents)):
        if idx != held_out:
            train_words.extend(filterWords(sents[idx]))

    # One (language, trigrams) pair per candidate; the target language's
    # model is built from the leave-one-out training words only.
    trigram_sets = [(language, makeTrigrams(train_words))]
    trigram_sets.extend(
        (lang, makeTrigrams(udhr.words(lang)))
        for lang in LANGUAGES
        if lang != language
    )

    grammars = makeTrigramGrammars(trigram_sets)
    return test_sent, predictSentLanguage(test_sent, grammars)
def runLeaveOutSentTrial(language):
    """Run one leave-one-sentence-out n-gram classification trial.

    Picks a random sentence of *language*'s UDHR text as the test
    sentence, trains an NGramClassifier on all remaining sentences (plus
    the full word list of every other language), and classifies the
    held-out sentence.  Returns (test_sentence, prediction).
    """
    sents = udhr.sents(language)
    held_out = random.randint(0, len(sents) - 1)
    test_sent = sents[held_out]

    # Flatten every non-held-out sentence into one training word list.
    train_words = []
    for idx in range(len(sents)):
        if idx != held_out:
            train_words += sents[idx]

    # Candidate corpora: leave-one-out words for the target language,
    # full word lists for all the others.
    corpora = [(language, train_words)]
    for lang in LANGUAGES:
        if lang != language:
            corpora.append((lang, udhr.words(lang)))

    classifier = NGramClassifier(N, corpora)
    return test_sent, classifier.classifySent(test_sent)
'Greenlandic_Inuktikut-Latin1', 'Hungarian_Magyar-Latin1',
    'Ibibio_Efik-Latin1' ]  # , 'Chinese_Mandarin-UTF8']

# Conditional frequency distribution of word lengths per language.
cfd = nltk.ConditionalFreqDist(
    (lang, len(word)) for lang in languages for word in udhr.words(lang))
cfd.plot(cumulative=True)
cfd.tabulate(samples=range(10), cumulative=True)
cfd.tabulate(conditions=['English-Latin1', 'German_Deutsch-Latin1'],
             samples=range(10), cumulative=True)

# Chinese is character-based, so it cannot be read in word-by-word.
chinese_mandarin_raw = udhr.raw('Chinese_Mandarin-UTF8')
print(chinese_mandarin_raw)
chinese_mandarin_words = udhr.words('Chinese_Mandarin-UTF8')
chinese_mandarin_words
chinese_mandarin_sents = udhr.sents('Chinese_Mandarin-UTF8')
chinese_mandarin_sents


def generate_model(cfdist, word, num=15):
    """Greedily print *num* words: starting from *word*, repeatedly
    emit the most frequent successor according to *cfdist*."""
    for i in range(num):
        print(word, end=' ')
        # Follow the single most likely next word (greedy chain).
        word = cfdist[word].max()


# 1.8. Structure of text corpora
raw = gutenberg.raw('burgess-busterbrown.txt')
print(raw[1:20])
words = gutenberg.words('burgess-busterbrown.txt')
print(words[1:20])
sents = gutenberg.sents('burgess-busterbrown.txt')
#!/usr/bin/python3
"""Word-by-word gloss of the Spanish UDHR text into English.

Every word found in the Swadesh Spanish list is replaced by its English
entry; any other word prints as "UNK".
"""
import nltk.corpus as corpus
from nltk.corpus import udhr
from nltk.corpus import swadesh

text = udhr.sents('Spanish-Latin1')
spanish_to_english = swadesh.entries(['es', 'en'])
trans = dict(spanish_to_english)
# PERF: a set gives O(1) membership tests; the original tested each
# token against the Swadesh word *list*, an O(n) scan per word.
es = set(swadesh.words('es'))

for sentence in text:
    for word in sentence:
        if word in es:
            print(trans[word], end=' ')
        else:
            print("UNK", end=' ')
    print('')
import nltk
from nltk.corpus import udhr as u

# The full text of the declaration in Ibibio-Efik.
print(u.raw('Ibibio_Efik-Latin1'))

# The length (in words) of the text in Amahuaca and in Greenlandic,
# and which one is longer.
word_lenA = len(u.words('Amahuaca'))
word_lenG = len(u.words('Greenlandic_Inuktikut-Latin1'))
print('\nAmahuaca one has %s words and Greenland one has %s words.'
      % (word_lenA, word_lenG))
if word_lenA > word_lenG:
    print('Amahuaca one is longer.')
elif word_lenA == word_lenG:
    # BUGFIX: the original reported Greenland as longer on a tie.
    print('Both are the same length.')
else:
    print('Greenland one is longer.')

# The first sentence of the text in Turkish.
sentences = u.sents('Turkish_Turkce-Turkish')
# BUGFIX: the first sentence is index 0; the original indexed 1
# (the second sentence) despite its own comment.
sentence1 = ' '.join(sentences[0])
print('\n', sentence1)
import json
from nltk.corpus import udhr
from pandas import DataFrame
from pandas import read_csv

# BUGFIX: DataFrame.from_csv was deprecated in pandas 0.21 and removed
# in 1.0; read_csv is the supported replacement.
df = read_csv('language_speakers', index_col='language')


def exportToJSON(langName, passage):
    """Write every sentence of *passage* (an iterable of token lists)
    to passages/<langName>.json as a JSON array."""
    sentences = list(passage)
    with open("passages/" + langName + ".json", 'w') as f:
        json.dump(sentences, f)


for lang in udhr.fileids():
    # 'Language-Encoding' -> 'Language' (encoding suffix stripped).
    langName = ' '.join(lang.split('-')[:-1])
    try:
        print(' '.join(udhr.sents(lang)[0])[:50] + '...', langName)
        # Export only languages present in the speakers table with more
        # than 1 (million, presumably — confirm against the CSV) native
        # speakers.
        if langName in df.index and df.loc[langName].get('speakers_native(m)') > 1:
            exportToJSON(langName, udhr.sents(lang))
    except AssertionError:
        # Some corpus files fail to tokenize; skip them and report.
        print('could not print... ', lang)