def languages_freq(langlist, input_text):
    """Score each candidate language against *input_text* via Spearman correlation.

    :param langlist: iterable of UDHR fileids of the candidate languages.
    :param input_text: tokens of the text to classify.
    :returns: ``[[language, spearman_score], ...]`` in *langlist* order.
    """
    fdistinput = nltk.FreqDist(input_text)
    result = []
    # Iterate the caller's langlist -- the original ignored this parameter
    # and read the module-level Latin1_langs instead.
    for language in langlist:
        lang_freqdist = nltk.FreqDist(udhr.words(language))
        result.append([language,
                       nltk.spearman_correlation(lang_freqdist, fdistinput)])
    return result
def predict_language(sample_fd, training_set_fds, debug=False):
    """
    Score every training language against the sample using the Spearman
    coefficient, trained on UDHR translations.

    :param sample_fd: Frequency distribution of the text to classify.
    :type sample_fd: FreqDist
    :param training_set_fds: Dictionary of language to frequency distribution.
    :type training_set_fds: dict
    :returns: (language, score) pairs sorted best match first.
    :rtype: list
    """
    scores = dict()
    if debug:
        stdout.write('Finding best match')
    # .items() instead of the Python-2-only .iteritems() -- works on 2 and 3.
    for language, language_fd in training_set_fds.items():
        # make copies so we don't alter the originals
        sfd = dict(sample_fd)
        lfd = dict(language_fd)
        # make sure both frequency distributions have only the keys they have in common
        # delete_differences(sfd, lfd)
        scores[language] = spearman_correlation(
            ranks_from_sequence(sfd),
            ranks_from_sequence(lfd)
        )
        if debug:
            stdout.write('.')
            stdout.flush()
    if debug:
        # newline is debug output too; keep stdout clean otherwise
        stdout.write('\n')
    return sorted(scores.items(), key=lambda x: x[-1], reverse=True)
def guesslanguage(text):
    """Guess the language of *text*.

    Correlates (Spearman) the frequencies of the words shared between *text*
    and each Latin1-encoded UDHR translation; returns a
    ``(language, best_score)`` tuple.
    """
    # Frequency distribution over the tokenized input text.
    fdist_text = nltk.FreqDist(nltk.Text(nltk.wordpunct_tokenize(text)))
    best_guess = ('', 0)
    for lang in nltk.corpus.udhr.fileids():
        # Only consider languages available in Latin1 encoding.
        if not lang.endswith('-Latin1'):
            continue
        print(lang)
        fdist_lang = nltk.FreqDist(nltk.corpus.udhr.words(fileids=lang))
        # Words occurring in both the input text and this language's UDHR text.
        shared = list(set(fdist_lang.keys()) & set(fdist_text.keys()))
        pairs_text = [(word, fdist_text[word]) for word in shared]
        pairs_lang = [(word, fdist_lang[word]) for word in shared]
        spearman = nltk.spearman_correlation(pairs_text, pairs_lang)
        # A score of exactly 0.0 is excluded; otherwise keep the best so far
        # (any nonzero score beats the initial 0 placeholder).
        if spearman != 0.0 and (best_guess[1] == 0 or spearman > best_guess[1]):
            best_guess = (lang[:-7], spearman)
    return best_guess
def calculate_similarity(language_freq_dict, text_freq_dict, score_dict):
    """Return the similarity scores for the frequency distribution of the user
    text compared to the frequency distribution of each specified language in
    the udhr.

    :param language_freq_dict: mapping of language name -> frequency distribution.
    :param text_freq_dict: mapping holding the user text's distribution under
        the ``'text_fd'`` key.
    :param score_dict: mapping to fill with language -> Spearman score;
        mutated in place and also returned.
    """
    # .items() instead of the Python-2-only .iteritems(), so this runs on
    # Python 3 as well (identical behavior on Python 2).
    for language, lang_freq_dist in language_freq_dict.items():
        score_dict[language] = nltk.spearman_correlation(
            nltk.ranks_from_sequence(text_freq_dict['text_fd']),
            nltk.ranks_from_sequence(lang_freq_dist))
    return score_dict
def spearman(mystery_ranks, language_ranks):
    """Spearman correlation bit.

    Compares the ranks for the mystery text with the ranks of every other
    language, returning one score per entry of *language_ranks* (same order).
    """
    return [spearman_correlation(ranking, mystery_ranks)
            for ranking in language_ranks]
def ch03_43_translate():
    """Guess the language of a few sample phrases by Spearman-correlating
    their bigram frequency distributions against UDHR reference distributions
    (English, French, German, Spanish)."""
    from nltk.corpus import udhr
    en_fd = bigram_freqdist(udhr.words("English-Latin1"))
    fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1"))
    de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1"))
    es_fd = bigram_freqdist(udhr.words("Spanish-Latin1"))
    phrases = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"]
    # 'phrase' rather than 'input': don't shadow the builtin.
    for phrase in phrases:
        words = phrase.lower().split(" ")
        # TODO: remove keys present in reference set
        # Hoisted out of the per-language comparison: the sample's
        # distribution does not depend on the reference language.
        sample_fd = bigram_freqdist(words)
        ranks = [nltk.spearman_correlation(ref_fd, sample_fd)
                 for ref_fd in (en_fd, fr_fd, de_fd, es_fd)]
        # print() call, not the Python-2-only print statement.
        print(phrase, ranks)
def ch03_43_translate():
    """Guess the language of a few sample phrases by Spearman-correlating
    their bigram frequency distributions against UDHR reference distributions.

    NOTE(review): this is a duplicate of an earlier ch03_43_translate
    definition in this file; at import time this later definition wins.
    Consider deleting one of the two.
    """
    from nltk.corpus import udhr
    en_fd = bigram_freqdist(udhr.words("English-Latin1"))
    fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1"))
    de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1"))
    es_fd = bigram_freqdist(udhr.words("Spanish-Latin1"))
    phrases = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"]
    # 'phrase' rather than 'input': don't shadow the builtin.
    for phrase in phrases:
        words = phrase.lower().split(" ")
        # TODO: remove keys present in reference set
        # Loop-invariant: the sample's distribution doesn't depend on the
        # reference language, so compute it once per phrase.
        sample_fd = bigram_freqdist(words)
        ranks = [nltk.spearman_correlation(ref_fd, sample_fd)
                 for ref_fd in (en_fd, fr_fd, de_fd, es_fd)]
        # print() call, not the Python-2-only print statement.
        print(phrase, ranks)
def detectLanguage(targetText):
    """Return ``[(udhr_fileid, spearman_score), ...]`` comparing *targetText*'s
    word-frequency ranking against five Latin1 UDHR translations."""
    # source word freq dist
    fileNameList = ['English-Latin1', 'Spanish-Latin1', 'German_Deutsch-Latin1',
                    'French_Francais-Latin1', 'Italian-Latin1']
    wordsList = [nltk.corpus.udhr.words(fileName) for fileName in fileNameList]
    freqDistList = [nltk.FreqDist(words) for words in wordsList]
    # target word freq dist
    targetWordList = nltk.word_tokenize(targetText)
    targetFreqDist = nltk.FreqDist(targetWordList)
    # compare each source freq dist with target dist.
    # spearman_correlation compares *rankings*, not raw counts, so convert
    # each FreqDist (iterated most-frequent-first) into (word, rank) pairs.
    correlationList = [
        nltk.spearman_correlation(
            list(nltk.ranks_from_sequence(targetFreqDist)),
            list(nltk.ranks_from_sequence(freqDist)))
        for freqDist in freqDistList]
    # list() so the result is a concrete list on Python 3 (zip is lazy there).
    return list(zip(fileNameList, correlationList))
def guessLanguage(text):
    """Best-guess the language of *text*.

    Spearman-correlates the frequencies of the words shared between *text*
    and each Latin1 UDHR translation; prints the winning (language, score)
    pair and returns the language name.
    """
    fdist_text = nltk.FreqDist(nltk.Text(nltk.wordpunct_tokenize(text)))
    best_lang, best_score = '', -float("inf")
    for language in nltk.corpus.udhr.fileids():
        # Skip non-Latin1 encodings.
        if not language.endswith('-Latin1'):
            continue
        fdist_lang = nltk.FreqDist(nltk.corpus.udhr.words(language))
        shared = list(set(fdist_text.keys()) & set(fdist_lang.keys()))
        ranks_text = [(word, fdist_text[word]) for word in shared]
        ranks_lang = [(word, fdist_lang[word]) for word in shared]
        score = nltk.spearman_correlation(ranks_text, ranks_lang)
        # score == 0.0 when only one item is ranked, which should be excluded
        if score != 0.0 and score > best_score:
            best_lang, best_score = language[:-7], score
    print((best_lang, best_score))
    return best_lang
def language(texto):
    """Return the Latin1 UDHR fileid whose word-frequency ranking best matches
    *texto* (Spearman correlation), formatted as ``'<fileid>,corr:<score>'``."""
    fd = nltk.FreqDist(word_tokenize(texto))
    # -inf rather than an arbitrary finite sentinel (-10000): Spearman scores
    # here can be arbitrarily negative, so a finite sentinel could beat every
    # real language and leave langFinal at its placeholder value.
    correlationMax = -float('inf')
    langFinal = '-Latin1'
    for lang in udhr.fileids():
        if lang[-7:] == '-Latin1':
            fdu = nltk.FreqDist(word_tokenize(udhr.raw(lang)))
            # Compare the two rank orderings (most-frequent-first).
            correlation = nltk.spearman_correlation(
                list(ranks_from_sequence(fd)),
                list(ranks_from_sequence(fdu)))
            if correlation > correlationMax:
                langFinal = lang
                correlationMax = correlation
    return langFinal + ',corr:' + str(correlationMax)
def guess_language(words):
    """Return the label of the reference bigram distribution (module-level
    en/fr/de/es FreqDists) with the highest Spearman correlation to *words*."""
    ranks = [nltk.spearman_correlation(ref_fd, bigram_freqdist(words))
             for ref_fd in [en_fd, fr_fd, de_fd, es_fd]]
    print(ranks)
    # Highest-scoring (score, label) pair wins; return its label.
    return sorted(label_ranks(ranks), reverse=True)[0][1]
# [u'also', u'back', u'even', u'first', u'get', u'got', u'like', u'new', u'one', u'said', u'time', u'women', u'would'] fem_sig_ws=find_signature(fem,sign_length=1000,remove_stopwords=False) fem_sig_ns=find_signature(fem,sign_length=1000,remove_stopwords=True) mal_sig_ws=find_signature(mal,sign_length=1000,remove_stopwords=False) mal_sig_ns=find_signature(mal,sign_length=1000,remove_stopwords=True) # test_src=['http://www.elle.com','http://www.cosmopolitan.com', 'http://www.maxim.com', 'http://www.huffingtonpost.com', 'http://www.glamour.com', '' 'http://cnn.com', 'http://www.time.com', 'http://www.ted.com'] test_src=['http://www.elle.com','http://www.cosmopolitan.com', 'http://www.maxim.com'] lst=[] for src in test_src: txt=extract_articles(src,num=10) txt_sig_ws=find_signature(txt,sign_length=1000,remove_stopwords=False) txt_sig_ns=find_signature(txt,sign_length=1000,remove_stopwords=True) lst=lst+ [ [ src, nltk.spearman_correlation(txt_sig_ws, fem_sig_ws), nltk.spearman_correlation(txt_sig_ns, fem_sig_ns), nltk.spearman_correlation(txt_sig_ws, mal_sig_ws), nltk.spearman_correlation(txt_sig_ns, mal_sig_ns) ] ] # ['http://www.elle.com', -3.7880321254308464, -7.913619616139419, -4.392126028805952, -11.42968626823723] # ['http://www.cosmopolitan.com', -0.7200827425758876, -2.0631836483509884, -2.343372184363161, -5.471496896822967] # ['http://www.maxim.com', -6.033499667091246, -19.795347645052622, -4.359960113887133, -10.742962959669567] # 29 Write a recursive function that pretty prints a trie in alphabetically sorted order, e.g.: # chair: 'flesh' # ---t: 'cat' # --ic: 'stylish' # ---en: 'dog' # def insert(trie, key, value): # if key:
def detect_lang(text):
    """Return ``(best_language, scores)``: the UDHR language whose
    word-frequency ranking best matches *text*, plus all Spearman scores
    (in the same order as the candidate language list)."""
    langlist = ['English-Latin1', 'Spanish-Latin1',
                'German_Deutsch-Latin1', 'French_Francais-Latin1']
    langfdist = [nltk.FreqDist(nltk.corpus.udhr.words(lang)) for lang in langlist]
    fdist = nltk.FreqDist(nltk.word_tokenize(text))
    # spearman_correlation compares *rankings*, not raw counts: convert each
    # FreqDist (iterated most-frequent-first) into (word, rank) pairs.
    SC = [nltk.spearman_correlation(
              list(nltk.ranks_from_sequence(fdist)),
              list(nltk.ranks_from_sequence(lfdist)))
          for lfdist in langfdist]
    return langlist[SC.index(max(SC))], SC