def fun5():
    # Comparative wordlists
    # Another example of a tabular lexicon is the comparative wordlist. NLTK includes the
    # so-called Swadesh wordlists, lists of about 200 common words in several languages.
    # The language identifiers use ISO 639 two-letter codes.
    from nltk.corpus import swadesh
    print swadesh.fileids()
    print swadesh.words('en')
    # Cognate words in multiple languages can be accessed with the entries() method by
    # specifying a list of languages; the result can also be converted into a plain dictionary.
    fr2en = swadesh.entries(['fr', 'en'])
    print fr2en
    translate = dict(fr2en)
    print translate['chien']
    print translate['jeter']
def test_util():
    count = 0
    error = 0
    cachedStopWords = set(stopwords.words("english"))
    cachedCommonWords = set(swadesh.words('en'))
    removewords = cachedStopWords.union(cachedCommonWords)
    for i in Test_class:
        path = os.path.join(test, i)
        for path, subdirs, files in os.walk(path):
            for name in files:
                doc_words = []
                flag = 0
                f = open(os.path.join(path, name), "r")
                for line in f:
                    if flag == 1:
                        tokenizer = nltk.RegexpTokenizer(r'\w+')
                        tokens = tokenizer.tokenize(line)
                        # keep tokens longer than two characters that are neither stopwords nor Swadesh common words
                        filtered_word = [
                            word.lower() for word in tokens
                            if word.lower() not in removewords and len(word) > 2
                        ]
                        for k in filtered_word:
                            doc_words.append(k)
                    if ("Lines:" in line):
                        flag = 1
                test_cls = testNB(doc_words)
                if (i == test_cls):
                    count += 1
                else:
                    error += 1
                    count += 1
    print "Accuracy :", float((float(count - error) / float(count)) * 100)
def _calculate_languages_ratios(self, text):
    """
    Calculate the likelihood of the given text being written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of unique stopwords seen in the analyzed text
    @rtype: dict
    """
    languages_ratios = {}

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, compute the number of unique Swadesh words
    # appearing in the analyzed text.
    for language in swadesh.fileids():
        stopwords_set = set(swadesh.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
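# A minimal usage sketch for the scores computed above (an illustration, not part of the
# original module): the best guess is simply the language whose Swadesh list shares the most
# words with the text. The detect_language helper below is hypothetical.
from nltk.corpus import swadesh
from nltk.tokenize import wordpunct_tokenize


def detect_language(text):
    """Return the Swadesh fileid (e.g. 'fr') whose word list overlaps the text the most."""
    words_set = set(word.lower() for word in wordpunct_tokenize(text))
    scores = {lang: len(words_set & set(swadesh.words(lang)))
              for lang in swadesh.fileids()}
    return max(scores, key=scores.get)  # language with the highest overlap wins


print(detect_language("le chien et le chat mangent du pain"))  # likely 'fr'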
def Vocabulary(n):
    cachedStopWords = set(stopwords.words("english"))
    cachedCommonWords = set(swadesh.words('en'))
    removewords = cachedStopWords.union(cachedCommonWords)
    for i in classlist:
        local_cnt = collections.Counter()
        count = 0
        path = os.path.join(root, i)
        for path, subdirs, files in os.walk(path):
            for name in files:
                n += 1
                count += 1
                flag = 0
                f = open(os.path.join(path, name), "r")
                for line in f:
                    if flag == 1:
                        tokenizer = nltk.RegexpTokenizer(r'\w+')
                        tokens = tokenizer.tokenize(line)
                        filtered_word = [
                            word.lower() for word in tokens
                            if word.lower() not in removewords
                        ]
                        for k in filtered_word:
                            cnt[k] += 1
                            local_cnt[k] += 1
                    if ("Lines:" in line):
                        flag = 1
        Nc[i] = count
        List_cnt[i] = local_cnt
    return n
def dump_swadesh():
    # a distinct name keeps the imported swadesh corpus from being shadowed
    j = 0
    f = open("sw.txt", "wb")
    length = [207, 207, 207, 207, 207, 174, 207, 207]
    top = 0
    all = 0
    for i in swadesh.words():
        s = str(j) + "-" + str(j % length[top])
        #s = str(j)
        #f.write( (s+": " + i + "\r\n").encode('utf-8')); j+=1
        f.write((i + "\r\n").encode('utf-8'))
        j += 1
        all = all + 1
        #if (all == length[top]): all = 0; top+=1
    f.close()
    #from nltk.corpus import swadesh
def compareWordlist():
    swadesh.fileids()
    swadesh.words('en')

    fr2en = swadesh.entries(['fr', 'en'])
    fr2en
    translate = dict(fr2en)
    translate['chien']
    translate['jeter']

    de2en = swadesh.entries(['de', 'en'])  # German-English
    es2en = swadesh.entries(['es', 'en'])  # Spanish-English
    translate.update(dict(de2en))
    translate.update(dict(es2en))
    translate['Hund']
    translate['perro']

    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
    for i in [139, 140, 141, 142]:
        print swadesh.entries(languages)[i]
def test_swadesh(model, lang) -> Tuple[Optional[float], Optional[List]]:
    swadesh_langs = set(swadesh.fileids())
    if lang in swadesh_langs:
        logging.info('Testing model on Swadesh list for {}...'.format(lang))
        # some entries in the swadesh list have multiple words
        # because they include contextual definitions,
        # so we need to take only the first word
        words = swadesh.words(fileids=lang)
        words = [word.split()[0].casefold() for word in words]
        accuracy, errors = test_accuracy(words, model)
    else:
        logging.error('No Swadesh corpus for "{}"'.format(lang))
        accuracy = None
        errors = None
    return accuracy, errors
def removeStopwords(self, inlist):
    stop_words1 = stopwords.words('english')
    stop_words2 = swadesh.words('en')
    finalstopwords = stop_words1 + stop_words2
    # here we can add any other words that we need to add to stopwords
    finalstopwords = finalstopwords + [
        'ref', 'also', 'title', 'http', 'image', 'cite', 'nbsp', 'disambiguation',
        'article', 'articles', 'pages', 'page', 'wikipedia', 'retrieved',
        'category', 'categories'
    ]
    templist = []
    for items in inlist:
        templist.append(items.lower())
    for words in finalstopwords:
        while words in templist:
            templist.remove(words)
    return templist
def preprocessGICorpus():
    giCorpus = {}
    corpus_root = os.getcwd() + "/GICorpus/"
    filelists = PlaintextCorpusReader(corpus_root, '.*\.txt', encoding='utf-8')
    wnl = nltk.WordNetLemmatizer()
    print filelists.fileids()
    for file in filelists.fileids():
        wordlist = filelists.words(file)
        print "Printing size of " + file + " original wordlist: " + str(len(wordlist))
        trimmedWordlist = [
            x for x in wordlist
            if not (x in swadesh.words('en')) and len(x) >= 1
        ]
        # lemmatizedWordlist = [wnl.lemmatize(t) for t in trimmedWordlist]
        taggedWordlist = nltk.pos_tag(trimmedWordlist)
        print "Printing size of " + file + " trimmed wordlist: " + str(len(trimmedWordlist))
        giCorpus[file] = taggedWordlist
        # fd = FreqDist(w for w in taggedWordlist)
    return giCorpus
def getSwadesh(bSave=False):  #W path="sw.txt"):
    j = 0
    f = open("sw.txt", "wb")
    length = [207, 207, 207, 207, 207, 174, 207, 207]
    top = 0
    all = 0
    j = 0
    for i in swadesh.words():
        s = str(j) + "-" + str(j % length[top])
        #s = str(j)
        #f.write( (s+": " + i + "\r\n").encode('utf-8')); j+=1
        if (bSave):
            f.write((i + "\r\n").encode('utf-8'))
        else:
            print(i)
        j += 1
        all = all + 1
        #if (all == length[top]): all = 0; top+=1
    if (bSave):
        e = swadesh.entries()
        for n in e:
            f.write((str(n) + "\r\n").encode('utf-8'))
    if (bSave):
        f.close()
def content_fraction2(text):
    compared_words = swadesh.words('en')
    content2 = [w for w in text if w.lower() not in compared_words]
    return content2
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import swadesh

swadesh.fileids()
swadesh.words("en")

fr2en = swadesh.entries(["fr", "en"])
fr2en

translate = dict(fr2en)
translate["chien"]
translate["jeter"]
#!/usr/local/bin/python2.7.3
# -*- coding: utf-8 -*-

# Given three files in English, Swedish and Greek, the script will produce
# 3 cleaned versions of them, 3 tokenized versions of the cleaned versions,
# and 3 files containing the lines that were erased.

import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import swadesh

english_stopwords = stopwords.words('english')
swedish_stopwords = stopwords.words('swedish')
english_common_words = swadesh.words('en')

# Swedish/English look-alike words are removed from the English stopword list
sv_eng_sim_in_sw_stopwords = ['i', 'till', 'dig', 'under']
for word in sv_eng_sim_in_sw_stopwords:
    if word in english_stopwords:
        english_stopwords.remove(word)

# words common to English and Swedish are removed from the English Swadesh list
common_words_eng_swed = ['I', 'not', 'all', 'small', 'man', 'dog',
                         'bark', 'fat', 'hand', 'drink', 'live', 'hit', 'dig', 'lie', 'fall',
                         'river', 'lake', 'salt', 'sand', 'red', 'full', 'bad', 'far', 'in', 'and']
for word in common_words_eng_swed:
    if word in english_common_words:
        english_common_words.remove(word)


def read_files():
    path_el = 'corpus_el.txt'
from nltk.corpus import swadesh
import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

#import sys, locale, os
#print(sys.stdout.encoding)
#print(sys.stdout.isatty())
#print(locale.getpreferredencoding())
#print(sys.getfilesystemencoding())
#print(os.environ["PYTHONIOENCODING"])
#print(chr(246), chr(9786), chr(9787))

j = 0
f = open("sw.txt", "wb")
length = [207, 207, 207, 207, 207, 174, 207, 207]
top = 0
all = 0
for i in swadesh.words():
    s = str(j) + "-" + str(j % length[top])
    #s = str(j)
    #f.write( (s+": " + i + "\r\n").encode('utf-8')); j+=1
    f.write((i + "\r\n").encode('utf-8'))
    j += 1
    all = all + 1
    #if (all == length[top]): all = 0; top+=1
f.close()
#from nltk.corpus import swadesh
# Look up word pronunciations in the dictionary
text = ['natural', 'language', 'processing']
pron_list = [ph for w in text for ph in prondict[w][0]]
print("word pronunciation list= ", pron_list)
# [0] is used because 'natural' has two pronunciations; taking just one of them is enough
pron_list = [ph for w in text for ph in prondict[w]]
print("'natural' pronunciation list= ", pron_list)
print("prondict['natural']=", prondict['natural'])

# P70 2.4.3 Comparative wordlists (Swadesh wordlists)
# Lists of about 200 common words in several languages; they can be used to compare the
# differences between two languages, and also to translate words between languages.
from nltk.corpus import swadesh
print("swadesh.fileids()= ", swadesh.fileids())
print("swadesh.words('en')= ", swadesh.words('en'))
fr2en = swadesh.entries(['fr', 'en'])
print("fr2en= ", fr2en[:13])
translate = dict(fr2en)
print("translate= ", translate)
print("translate['chien']= ", translate['chien'])
de2en = swadesh.entries(['de', 'en'])
translate.update(dict(de2en))
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(es2en))
print("translate= ", translate)
print("translate['jeter']= ", translate['jeter'])
print("translate['Hund']= ", translate['Hund'])
from nltk.corpus import swadesh

print(swadesh.fileids(), '\n')
print(swadesh.words('en'), '\n')

fr2en = swadesh.entries(['fr', 'en'])
print(fr2en, '\n')
translate = dict(fr2en)
print(translate['chien'])
print(translate['jeter'], '\n')

de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English
translate.update(dict(de2en))
translate.update(dict(es2en))
print(translate['Hund'])
print(translate['perro'], '\n')

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
use of the .xml database.
"""

from lxml import etree
from datetime import date
from datetime import timedelta
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import numpy as np
import codecs

week_days = ['Lun', 'Mar', 'Mier', 'Jue', 'Vier', 'Sab', 'Dom']

# Auxiliary functions and things
from nltk.corpus import swadesh
from nltk.corpus import stopwords

common_words = swadesh.words('es') + stopwords.words('spanish')


def phrase_variation(phrase):
    phrase_lower = phrase.lower()
    phrase_upper = phrase.upper()
    phrase_capitalize = phrase.capitalize()
    phrase_title = phrase.title()
    phrases2check = [phrase, phrase_lower,
                     phrase_upper, phrase_capitalize,
                     phrase_title]
    return phrases2check


# Class Note
from nltk.corpus import swadesh

## What is inside swadesh: which languages are available
print(swadesh.fileids())

## See which words swadesh has in English
print(swadesh.words('en'))

## Object that helps define a dictionary for translating words from French to English
fr2en = swadesh.entries(['fr', 'en'])
print(fr2en)

## Create a French-to-English dictionary
translate = dict(fr2en)
print(translate['chien'])  ## translate the French word "chien" into English
def mostCommon(dataArray):
    import nltk
    from nltk.corpus import stopwords
    from nltk.corpus import swadesh
    from nltk import bigrams
    import re
    import json
    import operator
    from collections import Counter, OrderedDict
    import string
    from bs4 import UnicodeDammit
    #
    # additional words to omit
    # extra_words = ['be', 'psb']
    extra_words = []
    #
    stringAll = ''
    emoticons_str = r"""
        (?:
            [:=;] # Eyes
            [oO\-]? # Nose (optional)
            [D\)\]\(\]/\\OpP] # Mouth
        )"""
    regex_str = [
        emoticons_str,
        r'<[^>]+>',  # HTML tags
        r'(?:@[\w_]+)',  # @-mentions
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
        r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
        r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
        r'(?:[\w_]+)',  # other words
        r'(?:[\uD83C-\uDBFF\uDC00-\uDFFF]+)',
        r'(?:\S)'  # anything else
    ]
    tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
    emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

    #
    def tokenize(s):
        return tokens_re.findall(s)

    #
    def preprocess(s, lowercase=False):
        tokens = tokenize(s)
        if lowercase:
            tokens = [
                token if emoticon_re.search(token) else token.lower()
                for token in tokens
            ]
        return tokens

    #
    for row in dataArray:
        for item in row:
            stringAll = stringAll + item
            stringAll = stringAll + " "
    #print('stringAll...' + '\n' + stringAll)
    #
    count_all = Counter()
    # count everything
    terms_all = [term for term in preprocess(stringAll) if len(term) > 1]
    #
    # Count terms only once, equivalent to Document Frequency
    terms_single = set(terms_all)
    #
    # build up the words to omit...
    punctuation = list(string.punctuation)
    common_words = swadesh.words('en')
    stop_words = stopwords.words('english')
    omitted_words = punctuation + common_words + stop_words + extra_words
    #
    # Create a list with all the terms EXCLUDING punctuation
    terms_nostop = [
        term for term in preprocess(stringAll)
        if term not in omitted_words and len(term) > 1
    ]
    # Count hashtags only
    terms_hash = [
        term for term in preprocess(stringAll)
        if term.startswith('#') and len(term) > 1
    ]
    # Count terms only (no hashtags, no mentions)...startswith() takes a tuple (not a list) if we pass a list of inputs
    terms_only = [
        term for term in preprocess(stringAll)
        if term not in stop_words and not term.startswith(('#', '@')) and len(term) > 1
    ]
    # Count duples (co-occurrences)
    terms_bigram = bigrams(terms_nostop)
    # Update the counter
    count_all.update(terms_nostop)
    # Print the most frequent words
    print(count_all.most_common(20))
    #
    sorted_dictionary = OrderedDict(
        sorted(count_all.items(), key=lambda t: t[1], reverse=True))
    print(sorted_dictionary.keys())
    # sort by VAL in descending order
    #print( sorted(count_all.values()) )
    #count_all_sorted = sorted(count_all.items(), key=operator.itemgetter(1))
    #print(count_all_sorted)
    '''
    # ask user for number which Most_Common_Count must be above, then create new dict with Top_Counts
    from tkinter.simpledialog import askstring
    shortdict_tresh = askstring("PromptWindow", "Enter cutoff integer in POS most-common-count")
    shortdict_num = int(shortdict_tresh)
    shortdict = dict((key,value) for key, value in sorted_dictionary.items() if value > shortdict_num)
    '''
    shortdict = dict((key, value) for key, value in sorted_dictionary.items())
    #print('shortdict... ')
    #print(str(shortdict))
    dict_title = str(shortdict)
    #
    # take the most-common counts and calculate a weighted score per string in the lemma dataArray...
    processList = []
    for row in dataArray:
        #print(row)
        count = 0
        ratingCounter = 0
        row = row.split()
        for word in row:
            #print(word)
            for key, value in shortdict.items():
                if word == key:
                    count = count + value
        processList.append(count)
    #print('processList... ')
    #print(processList)
    return dict_title, processList
#!/usr/bin/python3
import nltk.corpus as corpus
from nltk.corpus import udhr
from nltk.corpus import swadesh

text = udhr.sents('Spanish-Latin1')
es = swadesh.words('es')
spanish_to_english = swadesh.entries(['es', 'en'])
trans = dict(spanish_to_english)

for sentence in text:
    for i in range(len(sentence)):
        if sentence[i] in es:
            print(trans[sentence[i]], end=' ')
        else:
            print("UNK", end=' ')
    print('')
cfd.plot()

entries = nltk.corpus.cmudict.entries()
len(entries)
for entry in entries[42371:42379]:
    print(entry)

syllable = ['N', 'IH0', 'K', 'S']
[word for word, pron in entries if pron[-4:] == syllable]
[w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']

from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')
fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']
translate['jeter']
de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
__author__ = 'lizhifeng'
from nltk.corpus import swadesh

print swadesh.fileids()
print swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
print fr2en

translate = dict(fr2en)
print translate["chien"]
# An example of a tabular lexicon is the comparative wordlist. NLTK includes so-called
# Swadesh wordlists, lists of about 200 common words in several languages. The Swadesh list
# is used in the quantitative assessment of the genealogical relatedness of languages.
from nltk.corpus import swadesh

print(swadesh.fileids())    # prints out the language identifiers (two-letter codes).
print()                     # prints out an empty line.
print(swadesh.words("de"))  # prints out about 200 common German words from swadesh.
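# The remark above about genealogical relatedness can be illustrated with a rough sketch
# (not from the original source): counting how many identical surface forms two Swadesh
# lists share gives a crude, purely orthographic proxy for how close two languages are.
from nltk.corpus import swadesh


def swadesh_overlap(lang1, lang2):
    """Fraction of identical lowercase entries shared by the two Swadesh lists."""
    words1 = set(w.lower() for w in swadesh.words(lang1))
    words2 = set(w.lower() for w in swadesh.words(lang2))
    return len(words1 & words2) / min(len(words1), len(words2))


# Closely related languages tend to share more identical entries than distant ones.
print(swadesh_overlap('es', 'pt'))  # Spanish vs. Portuguese
print(swadesh_overlap('es', 'de'))  # Spanish vs. German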
def get_frequncy_dist(dir_path):
    files = os.listdir(dir_path)
    all_words = 0
    words_wt_freq = {}
    '''get words'''
    for filename in files:
        if (filename.endswith('.srt')):
            file_handler = open(dir_path + '\\' + filename, 'r')
            for line in file_handler:
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if (sword.isalpha()):
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str(all_words - 1))
    logger.debug('# unique words: ' + str(len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())

    lemmatized_words_wt_freq = {}
    for word in words_wt_freq.keys():
        lemmatized_word = nltk.WordNetLemmatizer().lemmatize(word)
        if (word != lemmatized_word and lemmatized_word != None):
            lemmatized_words_wt_freq[lemmatized_word] = lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
            #print(lemmatized_word, word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())
    logger.debug('# words after lemmatized: ' + str(lemmatized_size) + " diff: " + str(len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {}  # Save memory

    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = []
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)
    filtered_words = []
    out_file = open(dir_path + '\\wfd.csv', 'w')
    out_file.write('Word, Type, Frequency \n')
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering stop words: ' + str(len(filtered_words)) + " diff: " + str(len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = []  # save memory

    '''wordnet has 155k'''
    usual_words = []
    for word in filtered_words:
        if (len(wordnet.synsets(word)) != 0):
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unused words: ' + str(len(usual_words)) + " diff: " + str(lemmatized_size - len(usual_words)))
    filtered_words = []  # save memory

    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if (tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO', 'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']):
            if (en.is_adverb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('ADV,' + word)
            elif (en.is_adjective(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('ADJ,' + word)
            elif (en.is_verb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('VB,' + word)
            elif (en.is_noun(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                #print ('N,' + word)
            else:
                if (tag in ['VBZ', 'NNS']):
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                        #print (word , new_word,tag)
                elif (tag == 'VBG'):
                    new_word = en.verb.infinitive(word)
                    if new_word != None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif (tag == 'JJS'):
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]
                    #print (word,tag)
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering unwanted pos:' + str(len(tag_filtered_words_wt_freq.keys())) + " diff: " + str(len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {}  # save memory
    usual_words = []  # save memory

    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word]
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug('# words after filtering basic words: ' + str(len(non_basic_words_wt_freq.keys())) + " diff: " + str(len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {}  # save memory

    fh = open(os.path.join(base.app_root(), 'etc\\basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word]
    logger.debug('# words after filtering my words: ' + str(len(new_words_wt_freq.keys())) + " diff: " + str(len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())

    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write(word + ',lexicon,' + str(frequency) + '\n')
    out_file.close()
    return new_words_wt_freq