Example #1
0
def languages_freq(langlist, input_text):
    """Score every Latin-1 UDHR language against *input_text*.

    Builds a frequency distribution of the input and computes the
    Spearman correlation against each language's UDHR word distribution.
    (``langlist`` is accepted but unused; the module-level
    ``Latin1_langs`` list drives the iteration.)

    :returns: list of ``[language, correlation]`` pairs.
    """
    input_fdist = nltk.FreqDist(input_text)
    scores = []
    for lang in Latin1_langs:
        lang_fdist = nltk.FreqDist(udhr.words(lang))
        scores.append([lang, nltk.spearman_correlation(lang_fdist, input_fdist)])
    return scores
def predict_language(sample_fd, training_set_fds, debug=False):
    """ Predict language using the spearman coefficient trained on UDHR translations.
    :param sample_fd: Frequency distribution of the sample text.
    :type sample_fd: FreqDist
    :param training_set_fds: Dictionary of language to frequency distribution.
    :type training_set_fds: dict
    :param debug: When True, write progress dots to stdout.
    :returns: ``(language, score)`` pairs sorted best match first.
    :rtype: list
    """
    scores = dict()

    if debug:
        stdout.write('Finding best match')

    # .items() instead of the Python-2-only .iteritems() (works on both 2 and 3)
    for language, language_fd in training_set_fds.items():
        # make copies so we don't alter the originals
        sfd = dict(sample_fd)
        lfd = dict(language_fd)

        # make sure both frequency distributions have only the keys they have in common
        # delete_differences(sfd, lfd)

        scores[language] = spearman_correlation(
            ranks_from_sequence(sfd),
            ranks_from_sequence(lfd)
        )

        if debug:
            stdout.write('.')
            stdout.flush()

    # only emit the trailing newline when progress output was printed;
    # previously this wrote to stdout even with debug=False
    if debug:
        stdout.write('\n')

    return sorted(scores.items(), key=lambda x: x[-1], reverse=True)
Example #3
0
def guesslanguage(text):
    """Guess the language of *text* via Spearman correlation with the UDHR.

    :returns: tuple ``(language, score)`` for the best-scoring Latin-1
        UDHR language (``('', 0)`` when nothing matched).
    """
    # Tokenize the text and build a frequency distribution of its words.
    fdist_text = nltk.FreqDist(nltk.Text(nltk.wordpunct_tokenize(text)))
    # Best (language, Spearman score) pair seen so far.
    best_guess = ('', 0)
    # Iterate over the languages with a Latin-1 encoded translation.
    for lang in nltk.corpus.udhr.fileids():
        if not lang.endswith('-Latin1'):
            continue
        # Frequency distribution of the UDHR translation for this language.
        fdist_lang = nltk.FreqDist(nltk.corpus.udhr.words(fileids=lang))
        # Only words occurring in both texts can be rank-compared.
        intersection = list(set(fdist_lang.keys()) & set(fdist_text.keys()))
        # Parallel (word, count) lists over the shared vocabulary.
        dict_text = [(word, fdist_text[word]) for word in intersection]
        dict_lang = [(word, fdist_lang[word]) for word in intersection]
        spearman = nltk.spearman_correlation(dict_text, dict_lang)
        # spearman == 0.0 means only one shared item was ranked — skip it.
        # Accept a non-zero score when we have no guess yet (score still 0)
        # or when it beats the current best; this is the original
        # two-clause condition simplified.
        if spearman != 0.0 and (best_guess[1] == 0 or spearman > best_guess[1]):
            best_guess = (lang[:-7], spearman)
    # Best guess with the language name and its correlation score.
    return best_guess
Example #4
0
def calculate_similarity(language_freq_dict, text_freq_dict, score_dict):
    """Return the similarity scores for the frequency distribution of the user text 
    compared to the frequency distribution of each specified language in the udhr.

    :param language_freq_dict: mapping of language name -> FreqDist.
    :param text_freq_dict: dict holding the user text's FreqDist under the
        ``'text_fd'`` key.
    :param score_dict: dict to populate with language -> Spearman score.
    :returns: ``score_dict``, updated in place.
    """
    # .items() instead of the Python-2-only .iteritems() (works on both 2 and 3)
    for language, lang_freq_dist in language_freq_dict.items():
        score_dict[language] = nltk.spearman_correlation(
            nltk.ranks_from_sequence(text_freq_dict['text_fd']), 
            nltk.ranks_from_sequence(lang_freq_dist))
    return score_dict
Example #5
0
def spearman(mystery_ranks, language_ranks):
    """Compare the mystery text's ranks against each language's ranks.

    :returns: list of Spearman correlation scores, one per entry of
        *language_ranks*, in the same order.
    """
    return [spearman_correlation(ranks, mystery_ranks)
            for ranks in language_ranks]
Example #6
0
def ch03_43_translate():
  """Guess the language of short phrases by correlating letter-bigram
  frequencies against UDHR reference texts (NLP-with-Python ch. 3, ex. 43).

  NOTE(review): Python 2 code (print statement); will not parse on Python 3.
  """
  from nltk.corpus import udhr
  # Reference bigram frequency distributions built from UDHR translations.
  en_fd = bigram_freqdist(udhr.words("English-Latin1"))
  fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1"))
  de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1"))
  es_fd = bigram_freqdist(udhr.words("Spanish-Latin1"))
  inputs = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"]
  for input in inputs:
    words = input.lower().split(" ")
    # TODO: remove keys present in reference set
    # Spearman correlation of the phrase's bigrams against each reference.
    ranks = map(lambda x : nltk.spearman_correlation(x, bigram_freqdist(words)),
      [en_fd, fr_fd, de_fd, es_fd])
    print input, ranks
Example #7
0
def ch03_43_translate():
    """Guess the language of short phrases by correlating letter-bigram
    frequencies against UDHR reference texts (duplicate of the example
    above with 4-space indentation).

    NOTE(review): Python 2 code (print statement); will not parse on Python 3.
    """
    from nltk.corpus import udhr
    # Reference bigram frequency distributions built from UDHR translations.
    en_fd = bigram_freqdist(udhr.words("English-Latin1"))
    fr_fd = bigram_freqdist(udhr.words("French_Francais-Latin1"))
    de_fd = bigram_freqdist(udhr.words("German_Deutsch-Latin1"))
    es_fd = bigram_freqdist(udhr.words("Spanish-Latin1"))
    inputs = ["Nice day", "Guten Tag", "Buenas Dias", "Tres Bien"]
    for input in inputs:
        words = input.lower().split(" ")
        # TODO: remove keys present in reference set
        # Spearman correlation of the phrase's bigrams against each reference.
        ranks = map(
            lambda x: nltk.spearman_correlation(x, bigram_freqdist(words)),
            [en_fd, fr_fd, de_fd, es_fd])
        print input, ranks
def detectLanguage(targetText):
    """Rank five reference languages by similarity to *targetText*.

    Correlates the target's word frequency distribution with the UDHR
    distribution of each candidate language and pairs each file id with
    its Spearman score.
    """
    # word frequency distributions for the reference UDHR translations
    fileNameList = ['English-Latin1', 'Spanish-Latin1', 'German_Deutsch-Latin1', 'French_Francais-Latin1', 'Italian-Latin1']
    freqDistList = [nltk.FreqDist(nltk.corpus.udhr.words(fileName))
                    for fileName in fileNameList]

    # frequency distribution of the text under test
    targetFreqDist = nltk.FreqDist(nltk.word_tokenize(targetText))

    # correlate the target distribution with each reference distribution
    correlationList = [nltk.spearman_correlation(targetFreqDist, freqDist)
                       for freqDist in freqDistList]

    # pairs of (language file id, factor of similarity)
    return zip(fileNameList, correlationList)
Example #9
0
def guessLanguage(text):
    """Return the Latin-1 UDHR language whose word frequencies best
    correlate (Spearman) with those of *text*; prints each improvement."""
    sample_fd = nltk.FreqDist(nltk.Text(nltk.wordpunct_tokenize(text)))
    best_lang, best_score = '', -float("inf")
    for language in nltk.corpus.udhr.fileids():
        if not language.endswith('-Latin1'):
            continue
        lang_fd = nltk.FreqDist(nltk.corpus.udhr.words(language))
        # rank comparison only makes sense over the shared vocabulary
        shared = list(set(sample_fd.keys()) & set(lang_fd.keys()))
        sample_counts = [(w, sample_fd[w]) for w in shared]
        lang_counts = [(w, lang_fd[w]) for w in shared]
        score = nltk.spearman_correlation(sample_counts, lang_counts)
        # score == 0.0 when only one item is ranked, which should be excluded
        if score != 0.0 and score > best_score:
            best_lang, best_score = language[:-7], score
            print((best_lang, best_score))
    return best_lang
Example #10
0
def language(texto):
    """Identify the Latin-1 UDHR language best matching *texto*.

    Correlates rank sequences of the input's word frequencies against
    every Latin-1 UDHR translation and returns the string
    ``'<fileid>,corr:<score>'`` for the winner.
    """
    sample_fd = nltk.FreqDist(word_tokenize(texto))
    best_corr = -10000
    best_lang = '-Latin1'

    for fileid in udhr.fileids():
        if fileid.endswith('-Latin1'):
            ref_fd = nltk.FreqDist(word_tokenize(udhr.raw(fileid)))
            corr = nltk.spearman_correlation(
                list(ranks_from_sequence(sample_fd)),
                list(ranks_from_sequence(ref_fd)))
            if corr > best_corr:
                best_lang, best_corr = fileid, corr

    return best_lang + ',corr:' + str(best_corr)
Example #11
0
def guess_language(words):
    """Return the label of the reference language whose bigram
    frequencies best correlate with those of *words* (relies on the
    module-level en/fr/de/es frequency distributions and label_ranks)."""
    ranks = [nltk.spearman_correlation(ref_fd, bigram_freqdist(words))
             for ref_fd in [en_fd, fr_fd, de_fd, es_fd]]
    print(ranks)
    return sorted(label_ranks(ranks), reverse=True)[0][1]
Example #12
0
# [u'also', u'back', u'even', u'first', u'get', u'got', u'like', u'new', u'one', u'said', u'time', u'women', u'would']

# Gender-signature experiment: correlate word-frequency "signatures" of
# scraped articles against female- and male-corpus signatures.
# NOTE(review): find_signature / extract_articles / fem / mal are defined
# elsewhere in the file — their exact semantics not visible here.
fem_sig_ws=find_signature(fem,sign_length=1000,remove_stopwords=False)
fem_sig_ns=find_signature(fem,sign_length=1000,remove_stopwords=True)
mal_sig_ws=find_signature(mal,sign_length=1000,remove_stopwords=False)
mal_sig_ns=find_signature(mal,sign_length=1000,remove_stopwords=True)

# test_src=['http://www.elle.com','http://www.cosmopolitan.com', 'http://www.maxim.com', 'http://www.huffingtonpost.com', 'http://www.glamour.com', '' 'http://cnn.com', 'http://www.time.com', 'http://www.ted.com']
test_src=['http://www.elle.com','http://www.cosmopolitan.com', 'http://www.maxim.com']
lst=[]
for src in test_src:
    txt=extract_articles(src,num=10)
    # signatures of the scraped text, with and without stopwords
    txt_sig_ws=find_signature(txt,sign_length=1000,remove_stopwords=False)
    txt_sig_ns=find_signature(txt,sign_length=1000,remove_stopwords=True)
    # one row per site: Spearman correlation of the site's signature
    # against each reference signature (female/male x with/without stopwords)
    lst=lst+ [ [ src,
                 nltk.spearman_correlation(txt_sig_ws, fem_sig_ws),
                 nltk.spearman_correlation(txt_sig_ns, fem_sig_ns),
                 nltk.spearman_correlation(txt_sig_ws, mal_sig_ws),
                 nltk.spearman_correlation(txt_sig_ns, mal_sig_ns) ] ]
# ['http://www.elle.com', -3.7880321254308464, -7.913619616139419, -4.392126028805952, -11.42968626823723]
# ['http://www.cosmopolitan.com', -0.7200827425758876, -2.0631836483509884, -2.343372184363161, -5.471496896822967]
# ['http://www.maxim.com', -6.033499667091246, -19.795347645052622, -4.359960113887133, -10.742962959669567]

# 29 Write a recursive function that pretty prints a trie in alphabetically sorted order, e.g.:
# chair: 'flesh'
# ---t: 'cat'
# --ic: 'stylish'
# ---en: 'dog'

# def insert(trie, key, value):
#     if key:
Example #13
0
def detect_lang(text):
    """Guess which of four candidate languages *text* is written in.

    :returns: tuple of (best-matching language name, list of Spearman
        scores in the same order as the candidate list).
    """
    candidates = ['English-Latin1', 'Spanish-Latin1', 'German_Deutsch-Latin1', 'French_Francais-Latin1']
    reference_fds = [nltk.FreqDist(nltk.corpus.udhr.words(lang)) for lang in candidates]
    sample_fd = nltk.FreqDist(nltk.word_tokenize(text))
    scores = []
    for ref_fd in reference_fds:
        scores.append(nltk.spearman_correlation(sample_fd, ref_fd))
    return candidates[scores.index(max(scores))], scores