Code example #1
def fun5():
    # Comparative wordlists
    # Another example of a tabular lexicon is the comparative wordlist. NLTK includes
    # the so-called Swadesh wordlists: lists of about 200 common words in several
    # languages. The language identifiers are ISO 639 two-letter codes.
    from nltk.corpus import swadesh
    print swadesh.fileids()
    print swadesh.words('en')
    # Cognate words across several languages can be accessed with the entries()
    # method by passing a list of languages. The result can also be converted
    # into a simple dictionary.
    fr2en = swadesh.entries(['fr', 'en'])
    print fr2en
    translate = dict(fr2en)
    print translate['chien']
    print translate['jeter']
Code example #2
def test_util():
    count = 0
    error = 0
    cachedStopWords = set(stopwords.words("english"))
    cachedCommonWords = set(swadesh.words('en'))
    removewords = cachedStopWords.union(cachedCommonWords)
    for i in Test_class:
        path = os.path.join(test, i)
        for path, subdirs, files in os.walk(path):
            for name in files:
                doc_words = []
                flag = 0
                f = open(os.path.join(path, name), "r")
                for line in f:
                    if flag == 1:
                        tokenizer = nltk.RegexpTokenizer(r'\w+')
                        tokens = tokenizer.tokenize(line)
                        filtered_word = [
                            word.lower() for word in tokens
                            if word.lower() not in removewords and len(word) > 2
                        ]
                        for k in filtered_word:
                            doc_words.append(k)
                    if ("Lines:" in line):
                        flag = 1
                test_cls = testNB(doc_words)
                if (i == test_cls):
                    count += 1
                else:
                    error += 1
                    count += 1
    print "Accuracy :", float((float(count - error) / float(count)) * 100)
Code example #3
File: article.py  Project: kozikowskik/newspaper
    def _calculate_languages_ratios(self, text):
        """
        Calculate the likelihood that the given text is written in each of several
        languages, and return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}

        @param text: Text whose language is to be detected
        @type text: str

        @return: Dictionary mapping each language to the number of unique stopwords seen in the analyzed text
        @rtype: dict
        """

        languages_ratios = {}

        tokens = wordpunct_tokenize(text)
        words = [word.lower() for word in tokens]

        # For each language in NLTK's Swadesh corpus, count the unique common words that appear in the analyzed text
        for language in swadesh.fileids():
            stopwords_set = set(swadesh.words(language))
            words_set = set(words)
            common_elements = words_set.intersection(stopwords_set)

            languages_ratios[language] = len(common_elements)  # language "score"

        return languages_ratios
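A minimal usage sketch (not part of the original article.py): once the ratios dictionary is returned, the language with the highest score can be taken as the detected language. The ratios value below is a hypothetical result, not real output.

ratios = {'fr': 2, 'es': 4, 'en': 12}   # hypothetical output of _calculate_languages_ratios
most_likely_language = max(ratios, key=ratios.get)
print(most_likely_language)  # -> 'en'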
Code example #4
def Vocabulary(n):
    cachedStopWords = set(stopwords.words("english"))
    cachedCommonWords = set(swadesh.words('en'))
    removewords = cachedStopWords.union(cachedCommonWords)
    for i in classlist:
        local_cnt = collections.Counter()
        count = 0
        path = os.path.join(root, i)
        for path, subdirs, files in os.walk(path):
            for name in files:
                n += 1
                count += 1
                flag = 0
                f = open(os.path.join(path, name), "r")
                for line in f:
                    if flag == 1:
                        tokenizer = nltk.RegexpTokenizer(r'\w+')
                        tokens = tokenizer.tokenize(line)
                        filtered_word = [
                            word.lower() for word in tokens
                            if word.lower() not in removewords
                        ]
                        for k in filtered_word:
                            cnt[k] += 1
                            local_cnt[k] += 1
                    if ("Lines:" in line):
                        flag = 1
        Nc[i] = count
        List_cnt[i] = local_cnt
    return n
Code example #5
def dump_swadesh(path="sw.txt"):
    # Write every word in the (multilingual) Swadesh corpus to a file, one per line.
    # The original defined this as `def swadesh():`, which shadowed the imported
    # corpus and used undefined names (f, j, length, top, all); this version is
    # self-contained.
    from nltk.corpus import swadesh
    with open(path, "wb") as f:
        for word in swadesh.words():
            f.write((word + "\r\n").encode('utf-8'))
Code example #6
def compareWordlist():

    swadesh.fileids()
    swadesh.words('en')

    fr2en = swadesh.entries(['fr', 'en'])
    fr2en

    translate = dict(fr2en)
    translate['chien']
    translate['jeter']

    de2en = swadesh.entries(['de', 'en'])    # German-English
    es2en = swadesh.entries(['es', 'en'])    # Spanish-English
    translate.update(dict(de2en))
    translate.update(dict(es2en))
    translate['Hund']
    translate['perro']

    languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
    for i in [139, 140, 141, 142]:
        print swadesh.entries(languages)[i]
Code example #7
File: predict_length.py  Project: anna-hope/talaffoz
def test_swadesh(model, lang) -> Tuple[Optional[float], Optional[List]]:
    swadesh_langs = set(swadesh.fileids())
    if lang in swadesh_langs:
        logging.info('Testing model on Swadesh list for {}...'.format(lang))
        # some entries in the swadesh list have multiple words
        # because they include contextual definitions
        # so we need to only take the first word
        words = swadesh.words(fileids=lang)
        words = [word.split()[0].casefold() for word in words]
        accuracy, errors = test_accuracy(words, model)
    else:
        logging.error('No Swadesh corpus for "{}"'.format(lang))
        accuracy = None
        errors = None
    return accuracy, errors
Code example #8
File: testImplemented.py  Project: Wiki-G/wikiG-app
    def removeStopwords(self, inlist):
        stop_words1 = stopwords.words('english')
        stop_words2 = swadesh.words('en')
        finalstopwords = stop_words1 + stop_words2
        #here we can add any other words that we need to add to stopwords
        finalstopwords = finalstopwords + ['ref','also','title','http','image','cite','nbsp','disambiguation','article','articles','pages','page','wikipedia','retrieved','category','categories']
        templist = []
        for items in inlist:
            templist.append(items.lower())

        for words in finalstopwords:
            while words in templist:
                templist.remove(words)

        return templist
Code example #9
    def removeStopwords(self, inlist):
        stop_words1 = stopwords.words('english')
        stop_words2 = swadesh.words('en')
        finalstopwords = stop_words1 + stop_words2
        #here we can add any other words that we need to add to stopwords
        finalstopwords = finalstopwords + [
            'ref', 'also', 'title', 'http', 'image', 'cite', 'nbsp',
            'disambiguation', 'article', 'articles', 'pages', 'page',
            'wikipedia', 'retrieved', 'category', 'categories'
        ]
        templist = []
        for items in inlist:
            templist.append(items.lower())

        for words in finalstopwords:
            while words in templist:
                templist.remove(words)

        return templist
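A minimal alternative sketch (not from the original project): the nested `while ... remove()` loop above rescans templist for every stopword, which is quadratic. The same filtering can be done in a single pass with a set lookup.

from nltk.corpus import stopwords, swadesh

def remove_stopwords_fast(inlist):
    # Same filtering as removeStopwords above, but using a set for O(1) membership tests.
    stopset = set(stopwords.words('english')) | set(swadesh.words('en'))
    stopset |= {'ref', 'also', 'title', 'http', 'image', 'cite', 'nbsp',
                'disambiguation', 'article', 'articles', 'pages', 'page',
                'wikipedia', 'retrieved', 'category', 'categories'}
    return [item.lower() for item in inlist if item.lower() not in stopset]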
Code example #10
def preprocessGICorpus():
    giCorpus = {}
    corpus_root = os.getcwd() + "/GICorpus/"

    filelists = PlaintextCorpusReader(corpus_root, '.*\.txt', encoding='utf-8')
    wnl = nltk.WordNetLemmatizer()
    print filelists.fileids()
    for file in filelists.fileids():
        wordlist = filelists.words(file)
        print "Printing size of  " + file + " original wordlist: " + str(
            len(wordlist))
        trimmedWordlist = [
            x for x in wordlist
            if not (x in swadesh.words('en')) and len(x) >= 1
        ]
        # lemmatizedWordlist = [wnl.lemmatize(t) for t in trimmedWordlist]
        taggedWordlist = nltk.pos_tag(trimmedWordlist)
        print "Printing size of  " + file + " trimmed wordlist: " + str(
            len(trimmedWordlist))
        giCorpus[file] = taggedWordlist
        # fd = FreqDist(w for w in taggedWordlist)
    return giCorpus
Code example #11
def getSwadesh(bSave=False):  #W path="sw.txt"):
    j = 0
    f = open("sw.txt", "wb")
    length = [207, 207, 207, 207, 207, 174, 207, 207]
    top = 0
    all = 0
    j = 0
    for i in swadesh.words():
        s = str(j) + "-" + str(j % length[top])
        #s = str(j)
        #f.write( (s+": " + i + "\r\n").encode('utf-8')); j+=1
        if (bSave): f.write((i + "\r\n").encode('utf-8'))
        else: print(i)
        j += 1
        all = all + 1
        #if (all == length[top]): all = 0; top+=1

    if (bSave):
        e = swadesh.entries()
        for n in e:
            f.write((str(n) + "\r\n").encode('utf-8'))
    if (bSave): f.close()
Code example #12
def content_fraction2(text):
    compared_words = swadesh.words('en')
    content2 = [w for w in text if w.lower() not in compared_words]
    return content2
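Despite its name, content_fraction2 returns the list of remaining content words rather than a ratio. A minimal usage sketch (not from the original snippet), with a hypothetical token list, showing how the fraction itself could be computed:

text = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']  # hypothetical tokens
content = content_fraction2(text)
print(content)
print(len(content) / len(text))  # fraction of words not on the Swadesh list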
Code example #13
File: 06.py  Project: kouheiszk/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import swadesh

swadesh.fileids()
swadesh.words("en")
fr2en = swadesh.entries(["fr", "en"])
fr2en
translate = dict(fr2en)
translate["chien"]
translate["jeter"]
Code example #14
#!/usr/local/bin/python2.7.3
# -*- coding: utf-8 -*-
#given three files in English, Swedish and Greek,
#the script will produce 3 cleaned versions of them,
#3 tokenized versions of the cleaned versions,
#and 3 files containing the lines that were erased
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import swadesh
english_stopwords = stopwords.words('english')
swedish_stopwords = stopwords.words('swedish')

english_common_words = swadesh.words('en')

sv_eng_sim_in_sw_stopwords = ['i', 'till', 'dig', 'under']
for word in sv_eng_sim_in_sw_stopwords:
    if word in english_stopwords:
        english_stopwords.remove(word)

common_words_eng_swed = ['I', 'not', 'all', 'small', 'man', 'dog',\
 'bark', 'fat', 'hand', 'drink', 'live', 'hit', 'dig', 'lie', 'fall', \
 'river', 'lake', 'salt', 'sand', 'red', 'full', 'bad', 'far', 'in', 'and']

for word in common_words_eng_swed:
    if word in english_common_words:
        english_common_words.remove(word)


def read_files():
    path_el = 'corpus_el.txt'
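The snippet above is truncated at read_files(). A minimal sketch, assuming the standard NLTK tokenizer, of the kind of per-line cleaning step the header comments describe, reusing the english_stopwords and english_common_words lists built above; the original script's actual cleaning logic is not shown here.

from nltk.tokenize import word_tokenize

def clean_english_line(line):
    # Drop the adjusted English stopwords and Swadesh common words from one line.
    tokens = word_tokenize(line.lower())
    return [t for t in tokens
            if t not in english_stopwords and t not in english_common_words]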
Code example #15
File: cleaner.py  Project: limbero/project
#!/usr/local/bin/python2.7.3
# -*- coding: utf-8 -*-
#given three files in English, Swedish and Greek,
#the script will produce 3 cleaned versions of them,
#3 tokenized versions of the cleaned versions,
#and 3 files containing the lines that were erased
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import swadesh
english_stopwords = stopwords.words('english')
swedish_stopwords = stopwords.words('swedish')

english_common_words = swadesh.words('en')

sv_eng_sim_in_sw_stopwords = ['i', 'till', 'dig', 'under']
for word in sv_eng_sim_in_sw_stopwords:
	if word in english_stopwords:
		english_stopwords.remove(word)

common_words_eng_swed = ['I', 'not', 'all', 'small', 'man', 'dog',\
 'bark', 'fat', 'hand', 'drink', 'live', 'hit', 'dig', 'lie', 'fall', \
 'river', 'lake', 'salt', 'sand', 'red', 'full', 'bad', 'far', 'in', 'and']

for word in common_words_eng_swed:
	if word in english_common_words:
		english_common_words.remove(word)


def read_files():
	path_el = 'corpus_el.txt'
Code example #16
from nltk.corpus import swadesh
import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

#import sys, locale, os
#print(sys.stdout.encoding)
#print(sys.stdout.isatty())
#print(locale.getpreferredencoding())
#print(sys.getfilesystemencoding())
#print(os.environ["PYTHONIOENCODING"])
#print(chr(246), chr(9786), chr(9787))

j = 0
f = open("sw.txt", "wb")

length = [207, 207, 207, 207, 207, 174, 207, 207]
top = 0
all = 0

for i in swadesh.words():
    s = str(j) + "-" + str(j % length[top])

    #s = str(j)
    #f.write( (s+": " + i + "\r\n").encode('utf-8')); j+=1
    f.write((i + "\r\n").encode('utf-8'))
    j += 1
    all = all + 1
    #if (all == length[top]): all = 0; top+=1
f.close()
#from nltk.corpus import swadesh
Code example #17
File: C0204.py  Project: zhuyuanxiang/NLTK-Python-CN
# Look up the pronunciations of words in the dictionary
text = ['natural', 'language', 'processing']
pron_list = [ph for w in text for ph in prondict[w][0]]
print("word pronoun list= ", pron_list)

# The [0] is needed because 'natural' has two pronunciations; taking one of them is enough
pron_list = [ph for w in text for ph in prondict[w]]
print("'natural' pronunciation list= ", pron_list)
print("prondict['natural']=", prondict['natural'])

# P70 2.4.3 Comparative wordlists (Swadesh wordlists)
# Lists of about 200 common words in several languages; useful for comparing the
# differences between two languages and for translating words between languages
from nltk.corpus import swadesh

print("swadesh.fileids()= ", swadesh.fileids())
print("swadesh.words('en')= ", swadesh.words('en'))

fr2en = swadesh.entries(['fr', 'en'])
print("fr2en= ", fr2en[:13])
translate = dict(fr2en)
print("translate= ", translate)
print("translate['chien']= ", translate['chien'])

de2en = swadesh.entries(['de', 'en'])
translate.update(dict(de2en))
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(es2en))
print("translate= ", translate)

print("translate['jeter']= ", translate['jeter'])
print("translate['Hund']= ", translate['Hund'])
Code example #18
from nltk.corpus import swadesh

print(swadesh.fileids(), '\n')
print(swadesh.words('en'), '\n')

fr2en = swadesh.entries(['fr', 'en'])
print(fr2en, '\n')

translate = dict(fr2en)
print(translate['chien'])
print(translate['jeter'], '\n')

de2en = swadesh.entries(['de', 'en'])  # German-English
es2en = swadesh.entries(['es', 'en'])  # Spanish-English

translate.update(dict(de2en))
translate.update(dict(es2en))

print(translate['Hund'])
print(translate['perro'], '\n')

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']

for i in [139, 140, 141, 142]:
    print(swadesh.entries(languages)[i])
Code example #19
File: corpus.py  Project: spinto88/Lector_y_analisis
use of the .xml database.
"""
from lxml import etree
from datetime import date
from datetime import timedelta
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import numpy as np
import codecs

week_days = ['Lun', 'Mar', 'Mier', 'Jue', 'Vier', 'Sab', 'Dom']

# Auxiliary functions and things
from nltk.corpus import swadesh
from nltk.corpus import stopwords
common_words = swadesh.words('es') + stopwords.words('spanish')

def phrase_variation(phrase):

    phrase_lower = phrase.lower()
    phrase_upper = phrase.upper()
    phrase_capitalize = phrase.capitalize()
    phrase_title = phrase.title()

    phrases2check = [phrase, phrase_lower, \
                 phrase_upper, phrase_capitalize, \
                 phrase_title]

    return phrases2check

# Class Note 
Code example #20
File: traducciones.py  Project: Naranjoinc/NLP
from nltk.corpus import swadesh

## What is inside swadesh: which languages it covers
print(swadesh.fileids())

## See which words swadesh has in English
print(swadesh.words('en'))

## Object used to build a dictionary for translating words from French to English
fr2es = swadesh.entries(['fr', 'en'])
print(fr2es)

## Build the French-to-English dictionary
translate = dict(fr2es)
print(translate['chien'])  ## translate the French word 'chien' into English
Code example #21
File: pos_count.py  Project: 3pin/textbot
def mostCommon(dataArray):
    import nltk
    from nltk.corpus import stopwords
    from nltk.corpus import swadesh
    from nltk import bigrams
    import re
    import json
    import operator
    from collections import Counter, OrderedDict
    import string
    from bs4 import UnicodeDammit
    #
    # additional words to omit
    # extra_words = ['be', 'psb']
    extra_words = []
    #
    stringAll = ''
    emoticons_str = r"""
        (?:
            [:=;] # Eyes
            [oO\-]? # Nose (optional)
            [D\)\]\(\]/\\OpP] # Mouth
        )"""
    regex_str = [
        emoticons_str,
        r'<[^>]+>',  # HTML tags
        r'(?:@[\w_]+)',  # @-mentions
        r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
        r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
        r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
        r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
        r'(?:[\w_]+)',  # other words
        r'(?:[\uD83C-\uDBFF\uDC00-\uDFFF]+)',
        r'(?:\S)'  # anything else
    ]
    tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')',
                           re.VERBOSE | re.IGNORECASE)
    emoticon_re = re.compile(r'^' + emoticons_str + '$',
                             re.VERBOSE | re.IGNORECASE)

    #
    def tokenize(s):
        return tokens_re.findall(s)

    #
    def preprocess(s, lowercase=False):
        tokens = tokenize(s)
        if lowercase:
            tokens = [
                token if emoticon_re.search(token) else token.lower()
                for token in tokens
            ]
        return tokens

    #
    for row in dataArray:
        for item in row:
            stringAll = stringAll + item
        stringAll = stringAll + " "
    #print('stringAll...' + '\n' + stringAll)
    #
    count_all = Counter()
    # count everything
    terms_all = [term for term in preprocess(stringAll) if len(term) > 1]
    #
    # Count terms only once, equivalent to Document Frequency
    terms_single = set(terms_all)
    #
    # build up the words to omit...
    punctuation = list(string.punctuation)
    common_words = swadesh.words('en')
    stop_words = stopwords.words('english')
    omitted_words = punctuation + common_words + stop_words + extra_words
    #
    # Create a list with all the terms EXCLUDING punctuations
    terms_nostop = [
        term for term in preprocess(stringAll)
        if term not in omitted_words and len(term) > 1
    ]
    # Count hashtags only
    terms_hash = [
        term for term in preprocess(stringAll)
        if term.startswith('#') and len(term) > 1
    ]
    # Count terms only (no hashtags, no mentions); startswith() takes a tuple (not a list) when matching several prefixes
    terms_only = [
        term for term in preprocess(stringAll) if term not in stop_words
        and not term.startswith(('#', '@')) and len(term) > 1
    ]
    # Count word pairs (co-occurrences)
    terms_bigram = bigrams(terms_nostop)
    # Update the counter
    count_all.update(terms_nostop)
    # Print the most frequent words
    print(count_all.most_common(20))
    #
    sorted_dictionary = OrderedDict(
        sorted(count_all.items(), key=lambda t: t[1], reverse=True))
    print(sorted_dictionary.keys())
    # sort by VAL in descending order
    #print( sorted(count_all.values()) )
    #count_all_sorted = sorted(count_all.items(), key=operator.itemgetter(1))
    #print(count_all_sorted)
    '''
    # ask user for number which Most_Common_Count must be above, then create new dict with Top_Counts
    from tkinter.simpledialog import askstring
    shortdict_tresh = askstring("PromptWindow", "Enter cutoff integer in POS most-common-count")
    shortdict_num = int(shortdict_tresh)
    shortdict = dict((key,value) for key, value in sorted_dictionary.items() if value > shortdict_num)
    '''

    shortdict = dict((key, value) for key, value in sorted_dictionary.items())
    #print('shortdict... ')
    #print(str(shortdict))
    dict_title = str(shortdict)
    #
    # take the most-common counts and calculate a weighted score per string in the lemma dataArray...
    processList = []
    for row in dataArray:
        #print(row)
        count = 0
        ratingCounter = 0
        row = row.split()
        for word in row:
            #print(word)
            for key, value in shortdict.items():
                if word == key:
                    count = count + value
        processList.append(count)
    #print('processList... ')
    #print(processList)
    return dict_title, processList
Code example #22
#!/usr/bin/python3

import nltk.corpus as corpus
from nltk.corpus import udhr
from nltk.corpus import swadesh

text = udhr.sents('Spanish-Latin1')
es = swadesh.words('es')
spanish_to_english = swadesh.entries(['es', 'en'])
trans = dict(spanish_to_english)

for sentence in text:
    for i in range(len(sentence)):
        if sentence[i] in es:
            print(trans[sentence[i]], end=' ')
        else:
            print("UNK", end=' ')
    print('')
Code example #23
cfd.plot()

entries = nltk.corpus.cmudict.entries()
len(entries)

for entry in entries[42371:42379]:
    print(entry)

syllable = ['N', 'IHO', 'K', 'S']
[word for word, pron in entries if pron[-4:] == syllable]

[w for w, pron in entries if pron[-1] == 'M' and w[-1] == 'n']

from nltk.corpus import swadesh
swadesh.fileids()
swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
fr2en
translate = dict(fr2en)
translate['chien']
translate['jeter']

de2en = swadesh.entries(['de', 'en'])
es2en = swadesh.entries(['es', 'en'])
translate.update(dict(de2en))
translate.update(dict(es2en))
translate['Hund']
translate['perro']

languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
Code example #24
File: comparatie.4.3.py  Project: loveclj/python
__author__ = 'lizhifeng'
from nltk.corpus import  swadesh

print swadesh.fileids()
print swadesh.words('en')

fr2en = swadesh.entries(['fr', 'en'])
print fr2en

translate = dict(fr2en)
print translate["chien"]
Code example #25
# An example of a tabular lexicon is the comparative wordlist. NLTK includes so-called Swadesh wordlists, lists of about 200 common words in several languages. The Swadesh list is used in the quantitative assessment of the genealogical relatedness of languages. 

from nltk.corpus import swadesh

print(swadesh.fileids()) 

# prints out the language identifiers (two-letter code).
print() # prints out an empty line.

print(swadesh.words("de"))

# prints out 200 common German words from swadesh.
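A short follow-on sketch (not part of the original snippet): because the Swadesh lists are aligned across languages, entries() pairs them up, which is what makes the comparative use described above possible. The German-English pair here is chosen only as an example.

de2en = swadesh.entries(["de", "en"])   # aligned German-English word pairs
print(de2en[:5])

# entries spelled identically in both languages: a very rough similarity signal
identical = [de for de, en in de2en if de.lower() == en.lower()]
print(len(identical), identical[:10])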
Code example #26
def get_frequncy_dist(dir_path):
    files = os.listdir(dir_path)

    all_words = 0
    words_wt_freq = {}   
    '''get words'''
    for filename in files:
        if (filename.endswith('.srt')):
            file_handler = open(dir_path + '\\' + filename, 'r')
            for line in file_handler :
                for word in line.strip().split():
                    sword = word.strip(punctuation)
                    if (sword.isalpha()):
                        lword = sword.lower()
                        words_wt_freq[lword] = words_wt_freq.get(lword, 0) + 1
                        all_words += 1
            file_handler.close()
    logger.debug('# all words: ' + str (all_words - 1))
    logger.debug('# unique words: ' + str (len(words_wt_freq.keys())))
    lexical_diversity_for_freq(words_wt_freq.values())
    
    lemmatized_words_wt_freq = {}
    for word in words_wt_freq.keys():
        lemmatized_word = nltk.WordNetLemmatizer().lemmatize(word)
        if (word != lemmatized_word and lemmatized_word != None):
            lemmatized_words_wt_freq[lemmatized_word] = lemmatized_words_wt_freq.get(lemmatized_word, 0) + words_wt_freq.get(word)
            #print(lemmatized_word, word)
        else:
            lemmatized_words_wt_freq[word] = words_wt_freq.get(word)
    lemmatized_size = len(lemmatized_words_wt_freq.keys())            
    logger.debug ('# words after lemmatized: ' + str (lemmatized_size) + " diff: " + str (len(words_wt_freq.keys()) - lemmatized_size))
    lexical_diversity_for_freq(lemmatized_words_wt_freq.values())
    words_wt_freq = {} # Save memory

    
    stopwords_en = stopwords.words('english')
    male_names = names.words('male.txt')
    female_names = names.words('female.txt')
    comparative = swadesh.words('en')
    ignore_list = []
    ignore_list.extend(stopwords_en)
    ignore_list.extend(male_names)
    ignore_list.extend(female_names)
    ignore_list.extend(comparative)            
    filtered_words = []

    out_file = open(dir_path + '\\wfd.csv', 'w')
    out_file.write ('Word, Type, Frequency \n')
        
    for word in lemmatized_words_wt_freq.keys():
        if len(word) > 2 and word not in ignore_list:
            filtered_words.append(word)   
        else:
            out_file.write(word + ',stop words,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering stop words: ' + str (len(filtered_words)) + " diff: " + str (len(lemmatized_words_wt_freq.keys()) - len(filtered_words)))
    ignore_list = [] #save memory

    '''wordnet has 155k'''                                 
    usual_words = []
    for word in  filtered_words:
        if (len(wordnet.synsets(word)) != 0):
            usual_words.append(word)
        else:
            out_file.write(word + ',not in wordnet,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unused words: ' + str (len(usual_words)) + " diff: " + str (lemmatized_size - len(usual_words)))
    filtered_words = [] # save memory 

    tag_filtered_words_wt_freq = {}
    words_wt_tags = nltk.pos_tag(usual_words)
    for (word, tag) in words_wt_tags:
        if (tag not in ['EX', 'DET', 'CNJ', 'FW', 'MD', 'NP', 'NUM', 'PRO', 'P', 'TO', 'UH', 'WH', 'WP', 'NNP', 'MOD']):
            if(en.is_adverb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADV,' + word)
            elif (en.is_adjective(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('ADJ,' + word)
            elif (en.is_verb(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('VB,' + word)
            elif (en.is_noun(word)):
                tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]  
                #print ('N,' + word) 
            else:
                if (tag in ['VBZ', 'NNS']):
                    if word.endswith('s'):
                        new_word = word[:-1]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                        #print (word , new_word,tag)    
                elif (tag == 'VBG'):
                    new_word = en.verb.infinitive(word)
                    if new_word != None and word != new_word:
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)
                elif (tag == 'JJS'):
                    if word.endswith('est'):
                        new_word = word[:-3]
                        tag_filtered_words_wt_freq[new_word] = lemmatized_words_wt_freq[word] + tag_filtered_words_wt_freq.get(new_word, 0)     
                else:
                    tag_filtered_words_wt_freq[word] = lemmatized_words_wt_freq[word]        
                    #print (word,tag)   
        else:
            out_file.write(word + ',unwanted pos,' + str(lemmatized_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering unwanted pos:' + str (len(tag_filtered_words_wt_freq.keys())) + " diff: " + str (len(usual_words) - len(tag_filtered_words_wt_freq.keys())))
    lexical_diversity_for_freq(tag_filtered_words_wt_freq.values())
    lemmatized_words_wt_freq = {} # save memory
    usual_words = [] #save memory

    basic_english_vocab = en.basic.words
    non_basic_words = set(tag_filtered_words_wt_freq.keys()).difference(basic_english_vocab)
    non_basic_words_wt_freq = {}
    for non_basic_word in non_basic_words:
        non_basic_words_wt_freq[non_basic_word] = tag_filtered_words_wt_freq[non_basic_word] 
    words_in_both = set(tag_filtered_words_wt_freq.keys()).intersection(basic_english_vocab)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words,' + str(tag_filtered_words_wt_freq.get(word)) + '\n')
    logger.debug ('# words after filtering basic words: ' + str (len(non_basic_words_wt_freq.keys())) + " diff: " + str (len(tag_filtered_words_wt_freq.keys()) - len(non_basic_words_wt_freq.keys())))
    lexical_diversity_for_freq(non_basic_words_wt_freq.values())
    tag_filtered_words_wt_freq = {} #save memory


    fh = open(os.path.join(base.app_root(), 'etc\\basic_words.csv'), 'r')
    my_words = [word.lower() for line in fh for word in line.strip().split()]
    fh.close()
    new_words = set(non_basic_words).difference(my_words)
    words_in_both = set(non_basic_words).intersection(my_words)
    for word in words_in_both:
        out_file.write(word + ',en.basic.words.mine,' + str(non_basic_words_wt_freq.get(word)) + '\n')    
    new_words_wt_freq = {}
    for new_word in new_words:
        new_words_wt_freq[new_word] = non_basic_words_wt_freq[new_word] 
    logger.debug ('# words after filtering my words: ' + str (len(new_words_wt_freq.keys())) + " diff: " + str (len(non_basic_words_wt_freq.keys()) - len(new_words_wt_freq.keys())))
    lexical_diversity_for_freq(new_words_wt_freq.values())
    
    sorted_words = sorted(new_words_wt_freq.items(), key=itemgetter(1, 0))
    for (word, frequency) in sorted_words:
        out_file.write (word + ',lexicon,' + str(frequency) + '\n')
    out_file.close()
    
    return new_words_wt_freq